diff mbox series

[1/2] hw/block/nvme: fix zone boundary check for append

Message ID 20210119135500.265403-2-its@irrelevant.dk (mailing list archive)
State New, archived
Headers show
Series hw/block/nvme: zoned fixes | expand

Commit Message

Klaus Jensen Jan. 19, 2021, 1:54 p.m. UTC
From: Klaus Jensen <k.jensen@samsung.com>

When a zone append is processed the controller checks that validity of
the write before assigning the LBA to the append command. This causes
the boundary check to be wrong.

Fix this by checking the write *after* assigning the LBA. Remove the
append special case from the nvme_check_zone_write and open code it in
nvme_do_write, assigning the slba when basic sanity checks have been
performed. Then check the validity of the resulting write like any other
write command.

In the process, also fix a missing endianness conversion for the zone
append ALBA.

Reported-by: Niklas Cassel <Niklas.Cassel@wdc.com>
Cc: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c | 46 ++++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

Comments

Dmitry Fomichev Jan. 26, 2021, 4:55 a.m. UTC | #1
On Tue, 2021-01-19 at 14:54 +0100, Klaus Jensen wrote:
> From: Klaus Jensen <k.jensen@samsung.com>
> 
> When a zone append is processed the controller checks that validity of
> the write before assigning the LBA to the append command. This causes
> the boundary check to be wrong.
> 
> Fix this by checking the write *after* assigning the LBA. Remove the
> append special case from the nvme_check_zone_write and open code it in
> nvme_do_write, assigning the slba when basic sanity checks have been
> performed. Then check the validity of the resulting write like any other
> write command.
> 

Klaus,

I tested this series and it works fine. I however think that the problem that
Niklas has found can be fixed in the much simpler way by applying the
following to the existing code -

--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1152,6 +1152,9 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
                 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
                 status = NVME_INVALID_FIELD;
             }
+            if (zone->w_ptr + nlb > nvme_zone_wr_boundary(zone)) {
+                status = NVME_ZONE_BOUNDARY_ERROR;
+            }
             if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
                 trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
                 status = NVME_INVALID_FIELD;

I am going to send a few patches that take this approach, please take a look. In my
testing, it works just as well :)

> 
> In the process, also fix a missing endianness conversion for the zone
> append ALBA.

Great catch! This could be placed to a separate patch though...
A few more comments below.


Reported-by: Niklas Cassel <Niklas.Cassel@wdc.com>
Cc: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c | 46 ++++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 309c26db8ff7..f05dea657b01 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1133,7 +1133,7 @@ static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
 
 static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
                                       NvmeZone *zone, uint64_t slba,
-                                      uint32_t nlb, bool append)
+                                      uint32_t nlb)
 {
     uint16_t status;
 
@@ -1147,16 +1147,8 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
         trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
     } else {
         assert(nvme_wp_is_valid(zone));
-        if (append) {
-            if (unlikely(slba != zone->d.zslba)) {
-                trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
-                status = NVME_INVALID_FIELD;
-            }
-            if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
-                trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
-                status = NVME_INVALID_FIELD;
-            }
-        } else if (unlikely(slba != zone->w_ptr)) {
+
+        if (unlikely(slba != zone->w_ptr)) {
             trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
                                                zone->w_ptr);
             status = NVME_ZONE_INVALID_WRITE;
@@ -1294,10 +1286,9 @@ static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req,
     }
 }
 
-static uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
-                                     uint32_t nlb)
+static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
+                                 uint32_t nlb)
 {
-    uint64_t result = zone->w_ptr;
     uint8_t zs;
 
     zone->w_ptr += nlb;
@@ -1313,8 +1304,6 @@ static uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
             nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
         }
     }
-
-    return result;
 }

 static inline bool nvme_is_write(NvmeRequest *req)
@@ -1692,7 +1681,24 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
     if (ns->params.zoned) {
         zone = nvme_get_zone_by_slba(ns, slba);

-        status = nvme_check_zone_write(n, ns, zone, slba, nlb, append);
+        if (append) {

This is what I see as a drawback of this approach.
You have to move the ZA checks that were previously nicely tucked in
nvme_check_zone_write() to the spot below and now this validation is done
in two different places for appends and regular writes. This can be avoided.
 
+            if (unlikely(slba != zone->d.zslba)) {
+                trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
+                status = NVME_INVALID_FIELD;
+                goto invalid;
+            }
+
+            if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
+                trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
+                status = NVME_INVALID_FIELD;
+                goto invalid;
+            }
+
+            slba = zone->w_ptr;
+            res->slba = cpu_to_le64(slba);

It is a bit premature to set the result here since the nvme_check_zone_write() below
can still fail. As I recall, ALBA is only returned by a successful command. Again,
good find about endianness!

Regards,
Dmitry

+        }
+
+        status = nvme_check_zone_write(n, ns, zone, slba, nlb);
         if (status != NVME_SUCCESS) {
             goto invalid;
         }
@@ -1702,11 +1708,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
             goto invalid;
         }
 

-        if (append) {
-            slba = zone->w_ptr;
-        }
-
-        res->slba = nvme_advance_zone_wp(ns, zone, nlb);
+        nvme_advance_zone_wp(ns, zone, nlb);
     }
 

     data_offset = nvme_l2b(ns, slba);
Klaus Jensen Jan. 26, 2021, 6:58 a.m. UTC | #2
On Jan 26 04:55, Dmitry Fomichev wrote:
> On Tue, 2021-01-19 at 14:54 +0100, Klaus Jensen wrote:
> > From: Klaus Jensen <k.jensen@samsung.com>
> > 
> > When a zone append is processed the controller checks that validity of
> > the write before assigning the LBA to the append command. This causes
> > the boundary check to be wrong.
> > 
> > Fix this by checking the write *after* assigning the LBA. Remove the
> > append special case from the nvme_check_zone_write and open code it in
> > nvme_do_write, assigning the slba when basic sanity checks have been
> > performed. Then check the validity of the resulting write like any other
> > write command.
> > 
> 
> Klaus,
> 
> I tested this series and it works fine. I however think that the problem that
> Niklas has found can be fixed in the much simpler way by applying the
> following to the existing code -
> 
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1152,6 +1152,9 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
>                  trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
>                  status = NVME_INVALID_FIELD;
>              }
> +            if (zone->w_ptr + nlb > nvme_zone_wr_boundary(zone)) {
> +                status = NVME_ZONE_BOUNDARY_ERROR;
> +            }
>              if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
>                  trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
>                  status = NVME_INVALID_FIELD;
> 
> I am going to send a few patches that take this approach, please take a look. In my
> testing, it works just as well :)
> 
> > 
> > In the process, also fix a missing endianness conversion for the zone
> > append ALBA.
> 
> Great catch! This could be placed to a separate patch though...
> A few more comments below.
> 
> 
> Reported-by: Niklas Cassel <Niklas.Cassel@wdc.com>
> Cc: Dmitry Fomichev <dmitry.fomichev@wdc.com>
> Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> ---
>  hw/block/nvme.c | 46 ++++++++++++++++++++++++----------------------
>  1 file changed, 24 insertions(+), 22 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 309c26db8ff7..f05dea657b01 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1133,7 +1133,7 @@ static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
>  
>  static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
>                                        NvmeZone *zone, uint64_t slba,
> -                                      uint32_t nlb, bool append)
> +                                      uint32_t nlb)
>  {
>      uint16_t status;
>  
> @@ -1147,16 +1147,8 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
>          trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
>      } else {
>          assert(nvme_wp_is_valid(zone));
> -        if (append) {
> -            if (unlikely(slba != zone->d.zslba)) {
> -                trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
> -                status = NVME_INVALID_FIELD;
> -            }
> -            if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
> -                trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
> -                status = NVME_INVALID_FIELD;
> -            }
> -        } else if (unlikely(slba != zone->w_ptr)) {
> +
> +        if (unlikely(slba != zone->w_ptr)) {
>              trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
>                                                 zone->w_ptr);
>              status = NVME_ZONE_INVALID_WRITE;
> @@ -1294,10 +1286,9 @@ static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req,
>      }
>  }
>  
> -static uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
> -                                     uint32_t nlb)
> +static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
> +                                 uint32_t nlb)
>  {
> -    uint64_t result = zone->w_ptr;
>      uint8_t zs;
>  
>      zone->w_ptr += nlb;
> @@ -1313,8 +1304,6 @@ static uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
>              nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
>          }
>      }
> -
> -    return result;
>  }
> 
>  static inline bool nvme_is_write(NvmeRequest *req)
> @@ -1692,7 +1681,24 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
>      if (ns->params.zoned) {
>          zone = nvme_get_zone_by_slba(ns, slba);
> 
> -        status = nvme_check_zone_write(n, ns, zone, slba, nlb, append);
> +        if (append) {
> 
> This is what I see as a drawback of this approach.
> You have to move the ZA checks that were previously nicely tucked in
> nvme_check_zone_write() to the spot below and now this validation is done
> in two different places for appends and regular writes. This can be avoided.
> 

OK. However this means that other commands that write (Write Zeroes,
Copy) to zones has to go through that special case branch even though it
is a special case of regular writes. This is completely irrelevant for
performance so that's not my point, I just found it cleaner to tuck it
away as a special case in write.

> +            if (unlikely(slba != zone->d.zslba)) {
> +                trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
> +                status = NVME_INVALID_FIELD;
> +                goto invalid;
> +            }
> +
> +            if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
> +                trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
> +                status = NVME_INVALID_FIELD;
> +                goto invalid;
> +            }
> +
> +            slba = zone->w_ptr;
> +            res->slba = cpu_to_le64(slba);
> 
> It is a bit premature to set the result here since the nvme_check_zone_write() below
> can still fail. As I recall, ALBA is only returned by a successful command. Again,
> good find about endianness!
> 

There has always been a branch in finalize that clears it to zero on
error.

My patch also has the effect of not setting ALBA for regular writes. To
change that you are gonna have to add a 'if (append)' special case in
nvme_do_write anyway.
diff mbox series

Patch

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 309c26db8ff7..f05dea657b01 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1133,7 +1133,7 @@  static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
 
 static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
                                       NvmeZone *zone, uint64_t slba,
-                                      uint32_t nlb, bool append)
+                                      uint32_t nlb)
 {
     uint16_t status;
 
@@ -1147,16 +1147,8 @@  static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
         trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
     } else {
         assert(nvme_wp_is_valid(zone));
-        if (append) {
-            if (unlikely(slba != zone->d.zslba)) {
-                trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
-                status = NVME_INVALID_FIELD;
-            }
-            if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
-                trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
-                status = NVME_INVALID_FIELD;
-            }
-        } else if (unlikely(slba != zone->w_ptr)) {
+
+        if (unlikely(slba != zone->w_ptr)) {
             trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
                                                zone->w_ptr);
             status = NVME_ZONE_INVALID_WRITE;
@@ -1294,10 +1286,9 @@  static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req,
     }
 }
 
-static uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
-                                     uint32_t nlb)
+static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
+                                 uint32_t nlb)
 {
-    uint64_t result = zone->w_ptr;
     uint8_t zs;
 
     zone->w_ptr += nlb;
@@ -1313,8 +1304,6 @@  static uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
             nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
         }
     }
-
-    return result;
 }
 
 static inline bool nvme_is_write(NvmeRequest *req)
@@ -1692,7 +1681,24 @@  static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
     if (ns->params.zoned) {
         zone = nvme_get_zone_by_slba(ns, slba);
 
-        status = nvme_check_zone_write(n, ns, zone, slba, nlb, append);
+        if (append) {
+            if (unlikely(slba != zone->d.zslba)) {
+                trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
+                status = NVME_INVALID_FIELD;
+                goto invalid;
+            }
+
+            if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
+                trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
+                status = NVME_INVALID_FIELD;
+                goto invalid;
+            }
+
+            slba = zone->w_ptr;
+            res->slba = cpu_to_le64(slba);
+        }
+
+        status = nvme_check_zone_write(n, ns, zone, slba, nlb);
         if (status != NVME_SUCCESS) {
             goto invalid;
         }
@@ -1702,11 +1708,7 @@  static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
             goto invalid;
         }
 
-        if (append) {
-            slba = zone->w_ptr;
-        }
-
-        res->slba = nvme_advance_zone_wp(ns, zone, nlb);
+        nvme_advance_zone_wp(ns, zone, nlb);
     }
 
     data_offset = nvme_l2b(ns, slba);