diff mbox series

[v2] block/rbd: add preallocation support

Message ID 20190506122322.90035-1-sgarzare@redhat.com (mailing list archive)
State New, archived
Headers show
Series [v2] block/rbd: add preallocation support | expand

Commit Message

Stefano Garzarella May 6, 2019, 12:23 p.m. UTC
This patch adds the support of preallocation (off/full) for the RBD
block driver.
If available, we use rbd_writesame() to quickly fill the image when
full preallocation is required.

Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
---
v2:
- Use 4 KiB buffer for rbd_writesame() [Jason]
- Use "rbd_concurrent_management_ops" and "stripe_unit" to limit
  concurrent ops to the backing cluster [Jason]
---
 block/rbd.c          | 188 +++++++++++++++++++++++++++++++++++++++----
 qapi/block-core.json |   4 +-
 2 files changed, 175 insertions(+), 17 deletions(-)

Comments

Max Reitz June 25, 2019, 4:06 p.m. UTC | #1
On 06.05.19 14:23, Stefano Garzarella wrote:
> This patch adds the support of preallocation (off/full) for the RBD
> block driver.
> If available, we use rbd_writesame() to quickly fill the image when
> full preallocation is required.
> 
> Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> ---
> v2:
> - Use 4 KiB buffer for rbd_writesame() [Jason]
> - Use "rbd_concurrent_management_ops" and "stripe_unit" to limit
>   concurrent ops to the backing cluster [Jason]
> ---
>  block/rbd.c          | 188 +++++++++++++++++++++++++++++++++++++++----
>  qapi/block-core.json |   4 +-
>  2 files changed, 175 insertions(+), 17 deletions(-)

This patch conflicts a bit with the rbd file growth patch, so that would
need to be resolved in a v3.

But still, that doesn’t prevent me from reviewing it as it is:

> diff --git a/block/rbd.c b/block/rbd.c
> index 0c549c9935..bc54395e1c 100644
> --- a/block/rbd.c
> +++ b/block/rbd.c

[...]

> @@ -331,6 +333,147 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)

[...]

> +static int qemu_rbd_do_truncate(rados_t cluster, rbd_image_t image,
> +                                int64_t offset, PreallocMode prealloc,
> +                                Error **errp)
> +{

[...]

> +#ifdef LIBRBD_SUPPORTS_WRITESAME
> +        uint64_t stripe_unit, writesame_max_size;
> +        int max_concurrent_ops;
> +
> +        max_concurrent_ops = qemu_rbd_get_max_concurrent_ops(cluster);
> +        ret = rbd_get_stripe_unit(image, &stripe_unit);
> +        if (ret < 0) {
> +            error_setg_errno(errp, -ret, "Failed to get stripe unit");
> +            goto out;
> +        }
> +
> +        /*
> +         * We limit the rbd_writesame() size to avoid to spawn more then

s/then/than/

> +         * 'rbd_concurrent_management_ops' concurrent operations.
> +         */
> +        writesame_max_size = MIN(stripe_unit * max_concurrent_ops, INT_MAX);
> +#endif /* LIBRBD_SUPPORTS_WRITESAME */
> +
> +        buf = g_malloc(buf_size);
> +        /*
> +         * Some versions of rbd_writesame() discards writes of buffers with
> +         * all zeroes. In order to avoid this behaviour, we set the first byte
> +         * to one.
> +         */
> +        buf[0] = 1;

But I guess you’ll need to rewrite it as zero later, or the
“bdrv_rbd.bdrv_has_zero_init = bdrv_has_zero_init_1” is no longer quite
true.

[...]

> @@ -449,6 +603,16 @@ static int coroutine_fn qemu_rbd_co_create_opts(const char *filename,
>                                                     BLOCK_OPT_CLUSTER_SIZE, 0);
>      rbd_opts->has_cluster_size = (rbd_opts->cluster_size != 0);
>  
> +    prealloc = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
> +    rbd_opts->preallocation = qapi_enum_parse(&PreallocMode_lookup, prealloc,
> +                                              PREALLOC_MODE_OFF, &local_err);

You also need to set rbd_opts->has_preallocation to true.

> +    g_free(prealloc);
> +    if (local_err) {
> +        ret = -EINVAL;
> +        error_propagate(errp, local_err);
> +        goto exit;
> +    }
> +
>      options = qdict_new();
>      qemu_rbd_parse_filename(filename, options, &local_err);
>      if (local_err) {

[...]

> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index 7ccbfff9d0..db25a4065b 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -4277,13 +4277,15 @@
>  #                   point to a snapshot.
>  # @size             Size of the virtual disk in bytes
>  # @cluster-size     RBD object size
> +# @preallocation    Preallocation mode (allowed values: off, full)

There should be a "(Since: 4.1)" note here.

Max

>  #
>  # Since: 2.12
>  ##
>  { 'struct': 'BlockdevCreateOptionsRbd',
>    'data': { 'location':         'BlockdevOptionsRbd',
>              'size':             'size',
> -            '*cluster-size' :   'size' } }
> +            '*cluster-size' :   'size',
> +            '*preallocation':   'PreallocMode' } }
>  
>  ##
>  # @BlockdevVmdkSubformat:
>
Stefano Garzarella June 26, 2019, 7:46 a.m. UTC | #2
On Tue, Jun 25, 2019 at 06:06:02PM +0200, Max Reitz wrote:
> On 06.05.19 14:23, Stefano Garzarella wrote:
> > This patch adds the support of preallocation (off/full) for the RBD
> > block driver.
> > If available, we use rbd_writesame() to quickly fill the image when
> > full preallocation is required.
> > 
> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > ---
> > v2:
> > - Use 4 KiB buffer for rbd_writesame() [Jason]
> > - Use "rbd_concurrent_management_ops" and "stripe_unit" to limit
> >   concurrent ops to the backing cluster [Jason]
> > ---
> >  block/rbd.c          | 188 +++++++++++++++++++++++++++++++++++++++----
> >  qapi/block-core.json |   4 +-
> >  2 files changed, 175 insertions(+), 17 deletions(-)
> 
> This patch conflicts a bit with the rbd file growth patch, so that would
> need to be resolved in a v3.

Sure, I'll rebase this patch in a v3!

> 
> But still, that doesn’t prevent me from reviewing it as it is:
> 
> > diff --git a/block/rbd.c b/block/rbd.c
> > index 0c549c9935..bc54395e1c 100644
> > --- a/block/rbd.c
> > +++ b/block/rbd.c
> 
> [...]
> 
> > @@ -331,6 +333,147 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
> 
> [...]
> 
> > +static int qemu_rbd_do_truncate(rados_t cluster, rbd_image_t image,
> > +                                int64_t offset, PreallocMode prealloc,
> > +                                Error **errp)
> > +{
> 
> [...]
> 
> > +#ifdef LIBRBD_SUPPORTS_WRITESAME
> > +        uint64_t stripe_unit, writesame_max_size;
> > +        int max_concurrent_ops;
> > +
> > +        max_concurrent_ops = qemu_rbd_get_max_concurrent_ops(cluster);
> > +        ret = rbd_get_stripe_unit(image, &stripe_unit);
> > +        if (ret < 0) {
> > +            error_setg_errno(errp, -ret, "Failed to get stripe unit");
> > +            goto out;
> > +        }
> > +
> > +        /*
> > +         * We limit the rbd_writesame() size to avoid to spawn more then
> 
> s/then/than/

Ooo, I'll fix it!

> 
> > +         * 'rbd_concurrent_management_ops' concurrent operations.
> > +         */
> > +        writesame_max_size = MIN(stripe_unit * max_concurrent_ops, INT_MAX);
> > +#endif /* LIBRBD_SUPPORTS_WRITESAME */
> > +
> > +        buf = g_malloc(buf_size);
> > +        /*
> > +         * Some versions of rbd_writesame() discards writes of buffers with
> > +         * all zeroes. In order to avoid this behaviour, we set the first byte
> > +         * to one.
> > +         */
> > +        buf[0] = 1;
> 
> But I guess you’ll need to rewrite it as zero later, or the
> “bdrv_rbd.bdrv_has_zero_init = bdrv_has_zero_init_1” is no longer quite
> true.

Yes, and I should use g_malloc0. I'll check if the next rewrites is too
slow, in this case I'll use rbd_writesame() only for ceph version where
zeroed buffer is supported.

> 
> [...]
> 
> > @@ -449,6 +603,16 @@ static int coroutine_fn qemu_rbd_co_create_opts(const char *filename,
> >                                                     BLOCK_OPT_CLUSTER_SIZE, 0);
> >      rbd_opts->has_cluster_size = (rbd_opts->cluster_size != 0);
> >  
> > +    prealloc = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
> > +    rbd_opts->preallocation = qapi_enum_parse(&PreallocMode_lookup, prealloc,
> > +                                              PREALLOC_MODE_OFF, &local_err);
> 
> You also need to set rbd_opts->has_preallocation to true.

I'll fix.

> 
> > +    g_free(prealloc);
> > +    if (local_err) {
> > +        ret = -EINVAL;
> > +        error_propagate(errp, local_err);
> > +        goto exit;
> > +    }
> > +
> >      options = qdict_new();
> >      qemu_rbd_parse_filename(filename, options, &local_err);
> >      if (local_err) {
> 
> [...]
> 
> > diff --git a/qapi/block-core.json b/qapi/block-core.json
> > index 7ccbfff9d0..db25a4065b 100644
> > --- a/qapi/block-core.json
> > +++ b/qapi/block-core.json
> > @@ -4277,13 +4277,15 @@
> >  #                   point to a snapshot.
> >  # @size             Size of the virtual disk in bytes
> >  # @cluster-size     RBD object size
> > +# @preallocation    Preallocation mode (allowed values: off, full)
> 
> There should be a "(Since: 4.1)" note here.

I'll add the note!

Thanks for the review,
Stefano
diff mbox series

Patch

diff --git a/block/rbd.c b/block/rbd.c
index 0c549c9935..bc54395e1c 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -13,6 +13,7 @@ 
 
 #include "qemu/osdep.h"
 
+#include "qemu/units.h"
 #include <rbd/librbd.h>
 #include "qapi/error.h"
 #include "qemu/error-report.h"
@@ -63,6 +64,7 @@ 
 #define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)
 
 #define RBD_MAX_SNAPS 100
+#define RBD_DEFAULT_CONCURRENT_OPS 10
 
 /* The LIBRBD_SUPPORTS_IOVEC is defined in librbd.h */
 #ifdef LIBRBD_SUPPORTS_IOVEC
@@ -331,6 +333,147 @@  static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
     }
 }
 
+static int qemu_rbd_get_max_concurrent_ops(rados_t cluster)
+{
+    char buf[16];
+    int ret, max_concurrent_ops;
+
+    ret = rados_conf_get(cluster, "rbd_concurrent_management_ops", buf,
+                         sizeof(buf));
+    if (ret < 0) {
+        return RBD_DEFAULT_CONCURRENT_OPS;
+    }
+
+    ret = qemu_strtoi(buf, NULL, 10, &max_concurrent_ops);
+    if (ret < 0) {
+        return RBD_DEFAULT_CONCURRENT_OPS;
+    }
+
+    return max_concurrent_ops;
+}
+
+static int qemu_rbd_do_truncate(rados_t cluster, rbd_image_t image,
+                                int64_t offset, PreallocMode prealloc,
+                                Error **errp)
+{
+    uint64_t current_length;
+    char *buf = NULL;
+    int ret;
+
+    ret = rbd_get_size(image, &current_length);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Failed to get file length");
+        goto out;
+    }
+
+    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
+        error_setg(errp, "Cannot use preallocation for shrinking files");
+        ret = -ENOTSUP;
+        goto out;
+    }
+
+    switch (prealloc) {
+    case PREALLOC_MODE_FULL: {
+        uint64_t current_offset = current_length;
+        int buf_size = 4 * KiB;
+        ssize_t bytes;
+
+#ifdef LIBRBD_SUPPORTS_WRITESAME
+        uint64_t stripe_unit, writesame_max_size;
+        int max_concurrent_ops;
+
+        max_concurrent_ops = qemu_rbd_get_max_concurrent_ops(cluster);
+        ret = rbd_get_stripe_unit(image, &stripe_unit);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to get stripe unit");
+            goto out;
+        }
+
+        /*
+         * We limit the rbd_writesame() size to avoid to spawn more then
+         * 'rbd_concurrent_management_ops' concurrent operations.
+         */
+        writesame_max_size = MIN(stripe_unit * max_concurrent_ops, INT_MAX);
+#endif /* LIBRBD_SUPPORTS_WRITESAME */
+
+        buf = g_malloc(buf_size);
+        /*
+         * Some versions of rbd_writesame() discards writes of buffers with
+         * all zeroes. In order to avoid this behaviour, we set the first byte
+         * to one.
+         */
+        buf[0] = 1;
+
+        ret = rbd_resize(image, offset);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to resize file");
+            goto out;
+        }
+
+#ifdef LIBRBD_SUPPORTS_WRITESAME
+        while (offset - current_offset > buf_size) {
+            bytes = MIN(offset - current_offset, writesame_max_size);
+            /*
+             * rbd_writesame() supports only request where the size of the
+             * operation is multiple of buffer size.
+             */
+            bytes -= bytes % buf_size;
+
+            bytes = rbd_writesame(image, current_offset, bytes, buf, buf_size,
+                                  0);
+            if (bytes < 0) {
+                ret = bytes;
+                error_setg_errno(errp, -ret,
+                                 "Failed to write for preallocation");
+                goto out;
+            }
+
+            current_offset += bytes;
+        }
+#endif /* LIBRBD_SUPPORTS_WRITESAME */
+
+        while (current_offset < offset) {
+            bytes = rbd_write(image, current_offset,
+                              MIN(offset - current_offset, buf_size), buf);
+            if (bytes < 0) {
+                ret = bytes;
+                error_setg_errno(errp, -ret,
+                                 "Failed to write for preallocation");
+                goto out;
+            }
+
+            current_offset += bytes;
+        }
+
+        ret = rbd_flush(image);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to flush the file");
+            goto out;
+        }
+
+        break;
+    }
+    case PREALLOC_MODE_OFF:
+        ret = rbd_resize(image, offset);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to resize file");
+            goto out;
+        }
+        break;
+    default:
+        error_setg(errp, "Unsupported preallocation mode: %s",
+                   PreallocMode_str(prealloc));
+        ret = -ENOTSUP;
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    g_free(buf);
+    return ret;
+}
+
 static QemuOptsList runtime_opts = {
     .name = "rbd",
     .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
@@ -376,6 +519,7 @@  static int qemu_rbd_do_create(BlockdevCreateOptions *options,
     BlockdevCreateOptionsRbd *opts = &options->u.rbd;
     rados_t cluster;
     rados_ioctx_t io_ctx;
+    rbd_image_t image;
     int obj_order = 0;
     int ret;
 
@@ -404,13 +548,22 @@  static int qemu_rbd_do_create(BlockdevCreateOptions *options,
         return ret;
     }
 
-    ret = rbd_create(io_ctx, opts->location->image, opts->size, &obj_order);
+    ret = rbd_create(io_ctx, opts->location->image, 0, &obj_order);
     if (ret < 0) {
         error_setg_errno(errp, -ret, "error rbd create");
         goto out;
     }
 
-    ret = 0;
+    ret = rbd_open(io_ctx, opts->location->image, &image, NULL);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "error rbd open");
+        goto out;
+    }
+
+    ret = qemu_rbd_do_truncate(cluster, image, opts->size, opts->preallocation,
+                               errp);
+
+    rbd_close(image);
 out:
     rados_ioctx_destroy(io_ctx);
     rados_shutdown(cluster);
@@ -431,6 +584,7 @@  static int coroutine_fn qemu_rbd_co_create_opts(const char *filename,
     BlockdevOptionsRbd *loc;
     Error *local_err = NULL;
     const char *keypairs, *password_secret;
+    char *prealloc;
     QDict *options = NULL;
     int ret = 0;
 
@@ -449,6 +603,16 @@  static int coroutine_fn qemu_rbd_co_create_opts(const char *filename,
                                                    BLOCK_OPT_CLUSTER_SIZE, 0);
     rbd_opts->has_cluster_size = (rbd_opts->cluster_size != 0);
 
+    prealloc = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
+    rbd_opts->preallocation = qapi_enum_parse(&PreallocMode_lookup, prealloc,
+                                              PREALLOC_MODE_OFF, &local_err);
+    g_free(prealloc);
+    if (local_err) {
+        ret = -EINVAL;
+        error_propagate(errp, local_err);
+        goto exit;
+    }
+
     options = qdict_new();
     qemu_rbd_parse_filename(filename, options, &local_err);
     if (local_err) {
@@ -1052,21 +1216,8 @@  static int coroutine_fn qemu_rbd_co_truncate(BlockDriverState *bs,
                                              Error **errp)
 {
     BDRVRBDState *s = bs->opaque;
-    int r;
 
-    if (prealloc != PREALLOC_MODE_OFF) {
-        error_setg(errp, "Unsupported preallocation mode '%s'",
-                   PreallocMode_str(prealloc));
-        return -ENOTSUP;
-    }
-
-    r = rbd_resize(s->image, offset);
-    if (r < 0) {
-        error_setg_errno(errp, -r, "Failed to resize file");
-        return r;
-    }
-
-    return 0;
+    return qemu_rbd_do_truncate(s->cluster, s->image, offset, prealloc, errp);
 }
 
 static int qemu_rbd_snap_create(BlockDriverState *bs,
@@ -1219,6 +1370,11 @@  static QemuOptsList qemu_rbd_create_opts = {
             .type = QEMU_OPT_SIZE,
             .help = "RBD object size"
         },
+        {
+            .name = BLOCK_OPT_PREALLOC,
+            .type = QEMU_OPT_STRING,
+            .help = "Preallocation mode (allowed values: off, full)"
+        },
         {
             .name = "password-secret",
             .type = QEMU_OPT_STRING,
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 7ccbfff9d0..db25a4065b 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -4277,13 +4277,15 @@ 
 #                   point to a snapshot.
 # @size             Size of the virtual disk in bytes
 # @cluster-size     RBD object size
+# @preallocation    Preallocation mode (allowed values: off, full)
 #
 # Since: 2.12
 ##
 { 'struct': 'BlockdevCreateOptionsRbd',
   'data': { 'location':         'BlockdevOptionsRbd',
             'size':             'size',
-            '*cluster-size' :   'size' } }
+            '*cluster-size' :   'size',
+            '*preallocation':   'PreallocMode' } }
 
 ##
 # @BlockdevVmdkSubformat: