diff mbox series

[V4,5/6] block/rbd: add write zeroes support

Message ID 20210702090935.15300-6-pl@kamp.de (mailing list archive)
State New, archived
Headers show
Series block/rbd: migrate to coroutines and add write zeroes support | expand

Commit Message

Peter Lieven July 2, 2021, 9:09 a.m. UTC
this patch wittingly sets BDRV_REQ_NO_FALLBACK and silently ignores BDRV_REQ_MAY_UNMAP
for older librbd versions.

The rationale for this is as following (citing Ilya Dryomov current RBD maintainer):
---8<---
a) remove the BDRV_REQ_MAY_UNMAP check in qemu_rbd_co_pwrite_zeroes()
   and as a consequence always unmap if librbd is too old

   It's not clear what qemu's expectation is but in general Write
   Zeroes is allowed to unmap.  The only guarantee is that subsequent
   reads return zeroes, everything else is a hint.  This is how it is
   specified in the kernel and in the NVMe spec.

   In particular, block/nvme.c implements it as follows:

   if (flags & BDRV_REQ_MAY_UNMAP) {
       cdw12 |= (1 << 25);
   }

   This sets the Deallocate bit.  But if it's not set, the device may
   still deallocate:

   """
   If the Deallocate bit (CDW12.DEAC) is set to '1' in a Write Zeroes
   command, and the namespace supports clearing all bytes to 0h in the
   values read (e.g., bits 2:0 in the DLFEAT field are set to 001b)
   from a deallocated logical block and its metadata (excluding
   protection information), then for each specified logical block, the
   controller:
   - should deallocate that logical block;

   ...

   If the Deallocate bit is cleared to '0' in a Write Zeroes command,
   and the namespace supports clearing all bytes to 0h in the values
   read (e.g., bits 2:0 in the DLFEAT field are set to 001b) from
   a deallocated logical block and its metadata (excluding protection
   information), then, for each specified logical block, the
   controller:
   - may deallocate that logical block;
   """

   https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-2021.06.02-Ratified-1.pdf

b) set BDRV_REQ_NO_FALLBACK in supported_zero_flags

   Again, it's not clear what qemu expects here, but without it we end
   up in a ridiculous situation where specifying the "don't allow slow
   fallback" switch immediately fails all efficient zeroing requests on
   a device where Write Zeroes is always efficient:

   $ qemu-io -c 'help write' | grep -- '-[zun]'
    -n, -- with -z, don't allow slow fallback
    -u, -- with -z, allow unmapping
    -z, -- write zeroes using blk_co_pwrite_zeroes

   $ qemu-io -f rbd -c 'write -z -u -n 0 1M' rbd:foo/bar
   write failed: Operation not supported
--->8---

Signed-off-by: Peter Lieven <pl@kamp.de>
---
 block/rbd.c | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

Comments

Ilya Dryomov July 2, 2021, 12:24 p.m. UTC | #1
On Fri, Jul 2, 2021 at 11:09 AM Peter Lieven <pl@kamp.de> wrote:
>
> this patch wittingly sets BDRV_REQ_NO_FALLBACK and silently ignores BDRV_REQ_MAY_UNMAP
> for older librbd versions.
>
> The rationale for this is as following (citing Ilya Dryomov current RBD maintainer):
> ---8<---
> a) remove the BDRV_REQ_MAY_UNMAP check in qemu_rbd_co_pwrite_zeroes()
>    and as a consequence always unmap if librbd is too old
>
>    It's not clear what qemu's expectation is but in general Write
>    Zeroes is allowed to unmap.  The only guarantee is that subsequent
>    reads return zeroes, everything else is a hint.  This is how it is
>    specified in the kernel and in the NVMe spec.
>
>    In particular, block/nvme.c implements it as follows:
>
>    if (flags & BDRV_REQ_MAY_UNMAP) {
>        cdw12 |= (1 << 25);
>    }
>
>    This sets the Deallocate bit.  But if it's not set, the device may
>    still deallocate:
>
>    """
>    If the Deallocate bit (CDW12.DEAC) is set to '1' in a Write Zeroes
>    command, and the namespace supports clearing all bytes to 0h in the
>    values read (e.g., bits 2:0 in the DLFEAT field are set to 001b)
>    from a deallocated logical block and its metadata (excluding
>    protection information), then for each specified logical block, the
>    controller:
>    - should deallocate that logical block;
>
>    ...
>
>    If the Deallocate bit is cleared to '0' in a Write Zeroes command,
>    and the namespace supports clearing all bytes to 0h in the values
>    read (e.g., bits 2:0 in the DLFEAT field are set to 001b) from
>    a deallocated logical block and its metadata (excluding protection
>    information), then, for each specified logical block, the
>    controller:
>    - may deallocate that logical block;
>    """
>
>    https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-2021.06.02-Ratified-1.pdf
>
> b) set BDRV_REQ_NO_FALLBACK in supported_zero_flags
>
>    Again, it's not clear what qemu expects here, but without it we end
>    up in a ridiculous situation where specifying the "don't allow slow
>    fallback" switch immediately fails all efficient zeroing requests on
>    a device where Write Zeroes is always efficient:
>
>    $ qemu-io -c 'help write' | grep -- '-[zun]'
>     -n, -- with -z, don't allow slow fallback
>     -u, -- with -z, allow unmapping
>     -z, -- write zeroes using blk_co_pwrite_zeroes
>
>    $ qemu-io -f rbd -c 'write -z -u -n 0 1M' rbd:foo/bar
>    write failed: Operation not supported
> --->8---
>
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---
>  block/rbd.c | 32 +++++++++++++++++++++++++++++++-
>  1 file changed, 31 insertions(+), 1 deletion(-)
>
> diff --git a/block/rbd.c b/block/rbd.c
> index be0471944a..149317d33c 100644
> --- a/block/rbd.c
> +++ b/block/rbd.c
> @@ -63,7 +63,8 @@ typedef enum {
>      RBD_AIO_READ,
>      RBD_AIO_WRITE,
>      RBD_AIO_DISCARD,
> -    RBD_AIO_FLUSH
> +    RBD_AIO_FLUSH,
> +    RBD_AIO_WRITE_ZEROES
>  } RBDAIOCmd;
>
>  typedef struct BDRVRBDState {
> @@ -705,6 +706,10 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
>          }
>      }
>
> +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
> +    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
> +#endif
> +
>      /* When extending regular files, we get zeros from the OS */
>      bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
>
> @@ -827,6 +832,18 @@ static int coroutine_fn qemu_rbd_start_co(BlockDriverState *bs,
>      case RBD_AIO_FLUSH:
>          r = rbd_aio_flush(s->image, c);
>          break;
> +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
> +    case RBD_AIO_WRITE_ZEROES: {
> +        int zero_flags = 0;
> +#ifdef RBD_WRITE_ZEROES_FLAG_THICK_PROVISION
> +        if (!(flags & BDRV_REQ_MAY_UNMAP)) {
> +            zero_flags = RBD_WRITE_ZEROES_FLAG_THICK_PROVISION;
> +        }
> +#endif
> +        r = rbd_aio_write_zeroes(s->image, offset, bytes, c, zero_flags, 0);
> +        break;
> +    }
> +#endif
>      default:
>          r = -EINVAL;
>      }
> @@ -897,6 +914,16 @@ static int coroutine_fn qemu_rbd_co_pdiscard(BlockDriverState *bs,
>      return qemu_rbd_start_co(bs, offset, count, NULL, 0, RBD_AIO_DISCARD);
>  }
>
> +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
> +static int
> +coroutine_fn qemu_rbd_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
> +                                      int count, BdrvRequestFlags flags)
> +{
> +    return qemu_rbd_start_co(bs, offset, count, NULL, flags,
> +                             RBD_AIO_WRITE_ZEROES);
> +}
> +#endif
> +
>  static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
>  {
>      BDRVRBDState *s = bs->opaque;
> @@ -1120,6 +1147,9 @@ static BlockDriver bdrv_rbd = {
>      .bdrv_co_pwritev        = qemu_rbd_co_pwritev,
>      .bdrv_co_flush_to_disk  = qemu_rbd_co_flush,
>      .bdrv_co_pdiscard       = qemu_rbd_co_pdiscard,
> +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
> +    .bdrv_co_pwrite_zeroes  = qemu_rbd_co_pwrite_zeroes,
> +#endif
>
>      .bdrv_snapshot_create   = qemu_rbd_snap_create,
>      .bdrv_snapshot_delete   = qemu_rbd_snap_remove,
> --
> 2.17.1
>
>

Reviewed-by: Ilya Dryomov <idryomov@gmail.com>

Thanks,

                Ilya
diff mbox series

Patch

diff --git a/block/rbd.c b/block/rbd.c
index be0471944a..149317d33c 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -63,7 +63,8 @@  typedef enum {
     RBD_AIO_READ,
     RBD_AIO_WRITE,
     RBD_AIO_DISCARD,
-    RBD_AIO_FLUSH
+    RBD_AIO_FLUSH,
+    RBD_AIO_WRITE_ZEROES
 } RBDAIOCmd;
 
 typedef struct BDRVRBDState {
@@ -705,6 +706,10 @@  static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
         }
     }
 
+#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
+    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
+#endif
+
     /* When extending regular files, we get zeros from the OS */
     bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
 
@@ -827,6 +832,18 @@  static int coroutine_fn qemu_rbd_start_co(BlockDriverState *bs,
     case RBD_AIO_FLUSH:
         r = rbd_aio_flush(s->image, c);
         break;
+#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
+    case RBD_AIO_WRITE_ZEROES: {
+        int zero_flags = 0;
+#ifdef RBD_WRITE_ZEROES_FLAG_THICK_PROVISION
+        if (!(flags & BDRV_REQ_MAY_UNMAP)) {
+            zero_flags = RBD_WRITE_ZEROES_FLAG_THICK_PROVISION;
+        }
+#endif
+        r = rbd_aio_write_zeroes(s->image, offset, bytes, c, zero_flags, 0);
+        break;
+    }
+#endif
     default:
         r = -EINVAL;
     }
@@ -897,6 +914,16 @@  static int coroutine_fn qemu_rbd_co_pdiscard(BlockDriverState *bs,
     return qemu_rbd_start_co(bs, offset, count, NULL, 0, RBD_AIO_DISCARD);
 }
 
+#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
+static int
+coroutine_fn qemu_rbd_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
+                                      int count, BdrvRequestFlags flags)
+{
+    return qemu_rbd_start_co(bs, offset, count, NULL, flags,
+                             RBD_AIO_WRITE_ZEROES);
+}
+#endif
+
 static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
     BDRVRBDState *s = bs->opaque;
@@ -1120,6 +1147,9 @@  static BlockDriver bdrv_rbd = {
     .bdrv_co_pwritev        = qemu_rbd_co_pwritev,
     .bdrv_co_flush_to_disk  = qemu_rbd_co_flush,
     .bdrv_co_pdiscard       = qemu_rbd_co_pdiscard,
+#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
+    .bdrv_co_pwrite_zeroes  = qemu_rbd_co_pwrite_zeroes,
+#endif
 
     .bdrv_snapshot_create   = qemu_rbd_snap_create,
     .bdrv_snapshot_delete   = qemu_rbd_snap_remove,