diff mbox series

block/rbd: add preallocation support

Message ID 20190427113625.46594-1-sgarzare@redhat.com (mailing list archive)
State New, archived
Headers show
Series block/rbd: add preallocation support | expand

Commit Message

Stefano Garzarella April 27, 2019, 11:36 a.m. UTC
This patch adds the support of preallocation (off/full) for the RBD
block driver.
If available, we use rbd_writesame() to quickly fill the image when
full preallocation is required.

Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
---
 block/rbd.c          | 149 ++++++++++++++++++++++++++++++++++++++-----
 qapi/block-core.json |   4 +-
 2 files changed, 136 insertions(+), 17 deletions(-)

Comments

Jason Dillaman April 27, 2019, 12:43 p.m. UTC | #1
On Sat, Apr 27, 2019 at 7:37 AM Stefano Garzarella <sgarzare@redhat.com> wrote:
>
> This patch adds the support of preallocation (off/full) for the RBD
> block driver.
> If available, we use rbd_writesame() to quickly fill the image when
> full preallocation is required.
>
> Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> ---
>  block/rbd.c          | 149 ++++++++++++++++++++++++++++++++++++++-----
>  qapi/block-core.json |   4 +-
>  2 files changed, 136 insertions(+), 17 deletions(-)
>
> diff --git a/block/rbd.c b/block/rbd.c
> index 0c549c9935..29dd1bb040 100644
> --- a/block/rbd.c
> +++ b/block/rbd.c
> @@ -13,6 +13,7 @@
>
>  #include "qemu/osdep.h"
>
> +#include "qemu/units.h"
>  #include <rbd/librbd.h>
>  #include "qapi/error.h"
>  #include "qemu/error-report.h"
> @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
>      }
>  }
>
> +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset,
> +                                PreallocMode prealloc, Error **errp)
> +{
> +    uint64_t current_length;
> +    char *buf = NULL;
> +    int ret;
> +
> +    ret = rbd_get_size(image, &current_length);
> +    if (ret < 0) {
> +        error_setg_errno(errp, -ret, "Failed to get file length");
> +        goto out;
> +    }
> +
> +    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
> +        error_setg(errp, "Cannot use preallocation for shrinking files");
> +        ret = -ENOTSUP;
> +        goto out;
> +    }
> +
> +    switch (prealloc) {
> +    case PREALLOC_MODE_FULL: {
> +        uint64_t current_offset = current_length;
> +        int buf_size = 64 * KiB;

This should probably be 512B or 4KiB (which is the smallest striped
unit size). Not only will this avoid sending unnecessary zeroes to the
backing cluster, writesame silently turns into a standard write if
your buffer isn't properly aligned with the min(object size, stripe
unit size).

> +        ssize_t bytes;
> +
> +        buf = g_malloc(buf_size);
> +        /*
> +         * Some versions of rbd_writesame() discards writes of buffers with
> +         * all zeroes. In order to avoid this behaviour, we set the first byte
> +         * to one.
> +         */
> +        buf[0] = 1;

You could also use "rados_conf_set(cluster,
"rbd_discard_on_zeroed_write_same", "false").

> +        ret = rbd_resize(image, offset);
> +        if (ret < 0) {
> +            error_setg_errno(errp, -ret, "Failed to resize file");
> +            goto out;
> +        }
> +
> +#ifdef LIBRBD_SUPPORTS_WRITESAME
> +        while (offset - current_offset > buf_size) {
> +            /*
> +             * rbd_writesame() supports only request where the size of the
> +             * operation is multiple of buffer size and it must be less or
> +             * equal to INT_MAX.
> +             */
> +            bytes = MIN(offset - current_offset, INT_MAX);
> +            bytes -= bytes % buf_size;

Using the default object size of 4MiB, this write size would result in
up to 512 concurrent ops to the backing cluster. Perhaps the size
should be bounded such that only a dozen or so concurrent requests are
issued per write, always rounded next largest object / stripe period
size. librbd and the rbd CLI usually try to bound themselves to the
value in the "rbd_concurrent_management_ops" configuration setting
(currently defaults to 10).

> +            bytes = rbd_writesame(image, current_offset, bytes, buf, buf_size,
> +                                  0);
> +            if (bytes < 0) {
> +                ret = bytes;
> +                error_setg_errno(errp, -ret,
> +                                 "Failed to write for preallocation");
> +                goto out;
> +            }
> +
> +            current_offset += bytes;
> +        }
> +#endif /* LIBRBD_SUPPORTS_WRITESAME */
> +
> +        while (current_offset < offset) {
> +            bytes = rbd_write(image, current_offset,
> +                              MIN(offset - current_offset, buf_size), buf);
> +            if (bytes < 0) {
> +                ret = bytes;
> +                error_setg_errno(errp, -ret,
> +                                 "Failed to write for preallocation");
> +                goto out;
> +            }
> +
> +            current_offset += bytes;
> +        }
> +
> +        ret = rbd_flush(image);
> +        if (ret < 0) {
> +            error_setg_errno(errp, -ret, "Failed to flush the file");
> +            goto out;
> +        }
> +
> +        break;
> +    }
> +    case PREALLOC_MODE_OFF:
> +        ret = rbd_resize(image, offset);
> +        if (ret < 0) {
> +            error_setg_errno(errp, -ret, "Failed to resize file");
> +            goto out;
> +        }
> +        break;
> +    default:
> +        error_setg(errp, "Unsupported preallocation mode: %s",
> +                   PreallocMode_str(prealloc));
> +        ret = -ENOTSUP;
> +        goto out;
> +    }
> +
> +    ret = 0;
> +
> +out:
> +    g_free(buf);
> +    return ret;
> +}
> +
>  static QemuOptsList runtime_opts = {
>      .name = "rbd",
>      .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
> @@ -376,6 +481,7 @@ static int qemu_rbd_do_create(BlockdevCreateOptions *options,
>      BlockdevCreateOptionsRbd *opts = &options->u.rbd;
>      rados_t cluster;
>      rados_ioctx_t io_ctx;
> +    rbd_image_t image;
>      int obj_order = 0;
>      int ret;
>
> @@ -404,13 +510,21 @@ static int qemu_rbd_do_create(BlockdevCreateOptions *options,
>          return ret;
>      }
>
> -    ret = rbd_create(io_ctx, opts->location->image, opts->size, &obj_order);
> +    ret = rbd_create(io_ctx, opts->location->image, 0, &obj_order);
>      if (ret < 0) {
>          error_setg_errno(errp, -ret, "error rbd create");
>          goto out;
>      }
>
> -    ret = 0;
> +    ret = rbd_open(io_ctx, opts->location->image, &image, NULL);
> +    if (ret < 0) {
> +        error_setg_errno(errp, -ret, "error rbd open");
> +        goto out;
> +    }
> +
> +    ret = qemu_rbd_do_truncate(image, opts->size, opts->preallocation, errp);
> +
> +    rbd_close(image);
>  out:
>      rados_ioctx_destroy(io_ctx);
>      rados_shutdown(cluster);
> @@ -431,6 +545,7 @@ static int coroutine_fn qemu_rbd_co_create_opts(const char *filename,
>      BlockdevOptionsRbd *loc;
>      Error *local_err = NULL;
>      const char *keypairs, *password_secret;
> +    char *prealloc;
>      QDict *options = NULL;
>      int ret = 0;
>
> @@ -449,6 +564,16 @@ static int coroutine_fn qemu_rbd_co_create_opts(const char *filename,
>                                                     BLOCK_OPT_CLUSTER_SIZE, 0);
>      rbd_opts->has_cluster_size = (rbd_opts->cluster_size != 0);
>
> +    prealloc = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
> +    rbd_opts->preallocation = qapi_enum_parse(&PreallocMode_lookup, prealloc,
> +                                              PREALLOC_MODE_OFF, &local_err);
> +    g_free(prealloc);
> +    if (local_err) {
> +        ret = -EINVAL;
> +        error_propagate(errp, local_err);
> +        goto exit;
> +    }
> +
>      options = qdict_new();
>      qemu_rbd_parse_filename(filename, options, &local_err);
>      if (local_err) {
> @@ -1052,21 +1177,8 @@ static int coroutine_fn qemu_rbd_co_truncate(BlockDriverState *bs,
>                                               Error **errp)
>  {
>      BDRVRBDState *s = bs->opaque;
> -    int r;
> -
> -    if (prealloc != PREALLOC_MODE_OFF) {
> -        error_setg(errp, "Unsupported preallocation mode '%s'",
> -                   PreallocMode_str(prealloc));
> -        return -ENOTSUP;
> -    }
> -
> -    r = rbd_resize(s->image, offset);
> -    if (r < 0) {
> -        error_setg_errno(errp, -r, "Failed to resize file");
> -        return r;
> -    }
>
> -    return 0;
> +    return qemu_rbd_do_truncate(s->image, offset, prealloc, errp);
>  }
>
>  static int qemu_rbd_snap_create(BlockDriverState *bs,
> @@ -1219,6 +1331,11 @@ static QemuOptsList qemu_rbd_create_opts = {
>              .type = QEMU_OPT_SIZE,
>              .help = "RBD object size"
>          },
> +        {
> +            .name = BLOCK_OPT_PREALLOC,
> +            .type = QEMU_OPT_STRING,
> +            .help = "Preallocation mode (allowed values: off, full)"
> +        },
>          {
>              .name = "password-secret",
>              .type = QEMU_OPT_STRING,
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index 7ccbfff9d0..db25a4065b 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -4277,13 +4277,15 @@
>  #                   point to a snapshot.
>  # @size             Size of the virtual disk in bytes
>  # @cluster-size     RBD object size
> +# @preallocation    Preallocation mode (allowed values: off, full)
>  #
>  # Since: 2.12
>  ##
>  { 'struct': 'BlockdevCreateOptionsRbd',
>    'data': { 'location':         'BlockdevOptionsRbd',
>              'size':             'size',
> -            '*cluster-size' :   'size' } }
> +            '*cluster-size' :   'size',
> +            '*preallocation':   'PreallocMode' } }
>
>  ##
>  # @BlockdevVmdkSubformat:
> --
> 2.20.1
>
>
Stefano Garzarella April 29, 2019, 12:47 p.m. UTC | #2
On Sat, Apr 27, 2019 at 08:43:26AM -0400, Jason Dillaman wrote:
> On Sat, Apr 27, 2019 at 7:37 AM Stefano Garzarella <sgarzare@redhat.com> wrote:
> >
> > This patch adds the support of preallocation (off/full) for the RBD
> > block driver.
> > If available, we use rbd_writesame() to quickly fill the image when
> > full preallocation is required.
> >
> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > ---
> >  block/rbd.c          | 149 ++++++++++++++++++++++++++++++++++++++-----
> >  qapi/block-core.json |   4 +-
> >  2 files changed, 136 insertions(+), 17 deletions(-)
> >
> > diff --git a/block/rbd.c b/block/rbd.c
> > index 0c549c9935..29dd1bb040 100644
> > --- a/block/rbd.c
> > +++ b/block/rbd.c
> > @@ -13,6 +13,7 @@
> >
> >  #include "qemu/osdep.h"
> >
> > +#include "qemu/units.h"
> >  #include <rbd/librbd.h>
> >  #include "qapi/error.h"
> >  #include "qemu/error-report.h"
> > @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
> >      }
> >  }
> >
> > +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset,
> > +                                PreallocMode prealloc, Error **errp)
> > +{
> > +    uint64_t current_length;
> > +    char *buf = NULL;
> > +    int ret;
> > +
> > +    ret = rbd_get_size(image, &current_length);
> > +    if (ret < 0) {
> > +        error_setg_errno(errp, -ret, "Failed to get file length");
> > +        goto out;
> > +    }
> > +
> > +    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
> > +        error_setg(errp, "Cannot use preallocation for shrinking files");
> > +        ret = -ENOTSUP;
> > +        goto out;
> > +    }
> > +
> > +    switch (prealloc) {
> > +    case PREALLOC_MODE_FULL: {
> > +        uint64_t current_offset = current_length;
> > +        int buf_size = 64 * KiB;
> 
> This should probably be 512B or 4KiB (which is the smallest striped
> unit size). Not only will this avoid sending unnecessary zeroes to the
> backing cluster, writesame silently turns into a standard write if
> your buffer isn't properly aligned with the min(object size, stripe
> unit size).
> 

Okay, I'll change it on v2.
Should we query about the "stripe_unit" size or we simply use the
smallest allowed?

> > +        ssize_t bytes;
> > +
> > +        buf = g_malloc(buf_size);
> > +        /*
> > +         * Some versions of rbd_writesame() discards writes of buffers with
> > +         * all zeroes. In order to avoid this behaviour, we set the first byte
> > +         * to one.
> > +         */
> > +        buf[0] = 1;
> 
> You could also use "rados_conf_set(cluster,
> "rbd_discard_on_zeroed_write_same", "false").
> 

I tried it, but it is not supported on all versions. (eg. I have Ceph
v12.2.11 on my Fedora 29 and it is not supported, but rbd_writesame() is
available)

Maybe we can use both: "rbd_discard_on_zeroed_write_same = false" and
"buf[0] = 1"

> > +        ret = rbd_resize(image, offset);
> > +        if (ret < 0) {
> > +            error_setg_errno(errp, -ret, "Failed to resize file");
> > +            goto out;
> > +        }
> > +
> > +#ifdef LIBRBD_SUPPORTS_WRITESAME
> > +        while (offset - current_offset > buf_size) {
> > +            /*
> > +             * rbd_writesame() supports only request where the size of the
> > +             * operation is multiple of buffer size and it must be less or
> > +             * equal to INT_MAX.
> > +             */
> > +            bytes = MIN(offset - current_offset, INT_MAX);
> > +            bytes -= bytes % buf_size;
> 
> Using the default object size of 4MiB, this write size would result in
> up to 512 concurrent ops to the backing cluster. Perhaps the size
> should be bounded such that only a dozen or so concurrent requests are
> issued per write, always rounded next largest object / stripe period
> size. librbd and the rbd CLI usually try to bound themselves to the
> value in the "rbd_concurrent_management_ops" configuration setting
> (currently defaults to 10).
> 

Do you suggest to use "rbd_concurrent_management_ops" to limit
concurrent requests or use a new QEMU parameters for the RBD driver?

Thanks for your comments,
Stefano
Jason Dillaman April 29, 2019, 1 p.m. UTC | #3
On Mon, Apr 29, 2019 at 8:47 AM Stefano Garzarella <sgarzare@redhat.com> wrote:
>
> On Sat, Apr 27, 2019 at 08:43:26AM -0400, Jason Dillaman wrote:
> > On Sat, Apr 27, 2019 at 7:37 AM Stefano Garzarella <sgarzare@redhat.com> wrote:
> > >
> > > This patch adds the support of preallocation (off/full) for the RBD
> > > block driver.
> > > If available, we use rbd_writesame() to quickly fill the image when
> > > full preallocation is required.
> > >
> > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > > ---
> > >  block/rbd.c          | 149 ++++++++++++++++++++++++++++++++++++++-----
> > >  qapi/block-core.json |   4 +-
> > >  2 files changed, 136 insertions(+), 17 deletions(-)
> > >
> > > diff --git a/block/rbd.c b/block/rbd.c
> > > index 0c549c9935..29dd1bb040 100644
> > > --- a/block/rbd.c
> > > +++ b/block/rbd.c
> > > @@ -13,6 +13,7 @@
> > >
> > >  #include "qemu/osdep.h"
> > >
> > > +#include "qemu/units.h"
> > >  #include <rbd/librbd.h>
> > >  #include "qapi/error.h"
> > >  #include "qemu/error-report.h"
> > > @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
> > >      }
> > >  }
> > >
> > > +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset,
> > > +                                PreallocMode prealloc, Error **errp)
> > > +{
> > > +    uint64_t current_length;
> > > +    char *buf = NULL;
> > > +    int ret;
> > > +
> > > +    ret = rbd_get_size(image, &current_length);
> > > +    if (ret < 0) {
> > > +        error_setg_errno(errp, -ret, "Failed to get file length");
> > > +        goto out;
> > > +    }
> > > +
> > > +    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
> > > +        error_setg(errp, "Cannot use preallocation for shrinking files");
> > > +        ret = -ENOTSUP;
> > > +        goto out;
> > > +    }
> > > +
> > > +    switch (prealloc) {
> > > +    case PREALLOC_MODE_FULL: {
> > > +        uint64_t current_offset = current_length;
> > > +        int buf_size = 64 * KiB;
> >
> > This should probably be 512B or 4KiB (which is the smallest striped
> > unit size). Not only will this avoid sending unnecessary zeroes to the
> > backing cluster, writesame silently turns into a standard write if
> > your buffer isn't properly aligned with the min(object size, stripe
> > unit size).
> >
>
> Okay, I'll change it on v2.
> Should we query about the "stripe_unit" size or we simply use the
> smallest allowed?

Technically we don't prevent a user from choosing terrible stripe unit
sizes (e.g. 1 byte), so you are probably safe to just use 4KiB.

> > > +        ssize_t bytes;
> > > +
> > > +        buf = g_malloc(buf_size);
> > > +        /*
> > > +         * Some versions of rbd_writesame() discards writes of buffers with
> > > +         * all zeroes. In order to avoid this behaviour, we set the first byte
> > > +         * to one.
> > > +         */
> > > +        buf[0] = 1;
> >
> > You could also use "rados_conf_set(cluster,
> > "rbd_discard_on_zeroed_write_same", "false").
> >
>
> I tried it, but it is not supported on all versions. (eg. I have Ceph
> v12.2.11 on my Fedora 29 and it is not supported, but rbd_writesame() is
> available)
>
> Maybe we can use both: "rbd_discard_on_zeroed_write_same = false" and
> "buf[0] = 1"

Probably not worth the effort if it's not supported across all releases.

> > > +        ret = rbd_resize(image, offset);
> > > +        if (ret < 0) {
> > > +            error_setg_errno(errp, -ret, "Failed to resize file");
> > > +            goto out;
> > > +        }
> > > +
> > > +#ifdef LIBRBD_SUPPORTS_WRITESAME
> > > +        while (offset - current_offset > buf_size) {
> > > +            /*
> > > +             * rbd_writesame() supports only request where the size of the
> > > +             * operation is multiple of buffer size and it must be less or
> > > +             * equal to INT_MAX.
> > > +             */
> > > +            bytes = MIN(offset - current_offset, INT_MAX);
> > > +            bytes -= bytes % buf_size;
> >
> > Using the default object size of 4MiB, this write size would result in
> > up to 512 concurrent ops to the backing cluster. Perhaps the size
> > should be bounded such that only a dozen or so concurrent requests are
> > issued per write, always rounded next largest object / stripe period
> > size. librbd and the rbd CLI usually try to bound themselves to the
> > value in the "rbd_concurrent_management_ops" configuration setting
> > (currently defaults to 10).
> >
>
> Do you suggest to use "rbd_concurrent_management_ops" to limit
> concurrent requests or use a new QEMU parameters for the RBD driver?

I think it would be nicer to just query the
"rbd_concurrent_management_ops" limit to derive your writesame size
since the Ceph cluster admin can globally set that option to match the
available parallelism of the cluster.

> Thanks for your comments,
> Stefano
Stefano Garzarella April 29, 2019, 2:08 p.m. UTC | #4
On Mon, Apr 29, 2019 at 09:00:26AM -0400, Jason Dillaman wrote:
> On Mon, Apr 29, 2019 at 8:47 AM Stefano Garzarella <sgarzare@redhat.com> wrote:
> >
> > On Sat, Apr 27, 2019 at 08:43:26AM -0400, Jason Dillaman wrote:
> > > On Sat, Apr 27, 2019 at 7:37 AM Stefano Garzarella <sgarzare@redhat.com> wrote:
> > > >
> > > > This patch adds the support of preallocation (off/full) for the RBD
> > > > block driver.
> > > > If available, we use rbd_writesame() to quickly fill the image when
> > > > full preallocation is required.
> > > >
> > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > > > ---
> > > >  block/rbd.c          | 149 ++++++++++++++++++++++++++++++++++++++-----
> > > >  qapi/block-core.json |   4 +-
> > > >  2 files changed, 136 insertions(+), 17 deletions(-)
> > > >
> > > > diff --git a/block/rbd.c b/block/rbd.c
> > > > index 0c549c9935..29dd1bb040 100644
> > > > --- a/block/rbd.c
> > > > +++ b/block/rbd.c
> > > > @@ -13,6 +13,7 @@
> > > >
> > > >  #include "qemu/osdep.h"
> > > >
> > > > +#include "qemu/units.h"
> > > >  #include <rbd/librbd.h>
> > > >  #include "qapi/error.h"
> > > >  #include "qemu/error-report.h"
> > > > @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
> > > >      }
> > > >  }
> > > >
> > > > +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset,
> > > > +                                PreallocMode prealloc, Error **errp)
> > > > +{
> > > > +    uint64_t current_length;
> > > > +    char *buf = NULL;
> > > > +    int ret;
> > > > +
> > > > +    ret = rbd_get_size(image, &current_length);
> > > > +    if (ret < 0) {
> > > > +        error_setg_errno(errp, -ret, "Failed to get file length");
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
> > > > +        error_setg(errp, "Cannot use preallocation for shrinking files");
> > > > +        ret = -ENOTSUP;
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    switch (prealloc) {
> > > > +    case PREALLOC_MODE_FULL: {
> > > > +        uint64_t current_offset = current_length;
> > > > +        int buf_size = 64 * KiB;
> > >
> > > This should probably be 512B or 4KiB (which is the smallest striped
> > > unit size). Not only will this avoid sending unnecessary zeroes to the
> > > backing cluster, writesame silently turns into a standard write if
> > > your buffer isn't properly aligned with the min(object size, stripe
> > > unit size).
> > >
> >
> > Okay, I'll change it on v2.
> > Should we query about the "stripe_unit" size or we simply use the
> > smallest allowed?
> 
> Technically we don't prevent a user from choosing terrible stripe unit
> sizes (e.g. 1 byte), so you are probably safe to just use 4KiB.
> 
> > > > +        ssize_t bytes;
> > > > +
> > > > +        buf = g_malloc(buf_size);
> > > > +        /*
> > > > +         * Some versions of rbd_writesame() discards writes of buffers with
> > > > +         * all zeroes. In order to avoid this behaviour, we set the first byte
> > > > +         * to one.
> > > > +         */
> > > > +        buf[0] = 1;
> > >
> > > You could also use "rados_conf_set(cluster,
> > > "rbd_discard_on_zeroed_write_same", "false").
> > >
> >
> > I tried it, but it is not supported on all versions. (eg. I have Ceph
> > v12.2.11 on my Fedora 29 and it is not supported, but rbd_writesame() is
> > available)
> >
> > Maybe we can use both: "rbd_discard_on_zeroed_write_same = false" and
> > "buf[0] = 1"
> 
> Probably not worth the effort if it's not supported across all releases.
> 
> > > > +        ret = rbd_resize(image, offset);
> > > > +        if (ret < 0) {
> > > > +            error_setg_errno(errp, -ret, "Failed to resize file");
> > > > +            goto out;
> > > > +        }
> > > > +
> > > > +#ifdef LIBRBD_SUPPORTS_WRITESAME
> > > > +        while (offset - current_offset > buf_size) {
> > > > +            /*
> > > > +             * rbd_writesame() supports only request where the size of the
> > > > +             * operation is multiple of buffer size and it must be less or
> > > > +             * equal to INT_MAX.
> > > > +             */
> > > > +            bytes = MIN(offset - current_offset, INT_MAX);
> > > > +            bytes -= bytes % buf_size;
> > >
> > > Using the default object size of 4MiB, this write size would result in
> > > up to 512 concurrent ops to the backing cluster. Perhaps the size
> > > should be bounded such that only a dozen or so concurrent requests are
> > > issued per write, always rounded next largest object / stripe period
> > > size. librbd and the rbd CLI usually try to bound themselves to the
> > > value in the "rbd_concurrent_management_ops" configuration setting
> > > (currently defaults to 10).
> > >
> >
> > Do you suggest to use "rbd_concurrent_management_ops" to limit
> > concurrent requests or use a new QEMU parameters for the RBD driver?
> 
> I think it would be nicer to just query the
> "rbd_concurrent_management_ops" limit to derive your writesame size
> since the Ceph cluster admin can globally set that option to match the
> available parallelism of the cluster.
> 

Thanks for the details, I'll send a v2 following yuor comments!

Stefano
Markus Armbruster May 7, 2019, 6:34 a.m. UTC | #5
Cc: Peter for a libvirt perspective.

Stefano Garzarella <sgarzare@redhat.com> writes:

> This patch adds the support of preallocation (off/full) for the RBD
> block driver.
> If available, we use rbd_writesame() to quickly fill the image when
> full preallocation is required.
>
> Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> ---
>  block/rbd.c          | 149 ++++++++++++++++++++++++++++++++++++++-----
>  qapi/block-core.json |   4 +-
>  2 files changed, 136 insertions(+), 17 deletions(-)
>
> diff --git a/block/rbd.c b/block/rbd.c
> index 0c549c9935..29dd1bb040 100644
> --- a/block/rbd.c
> +++ b/block/rbd.c
> @@ -13,6 +13,7 @@
>  
>  #include "qemu/osdep.h"
>  
> +#include "qemu/units.h"
>  #include <rbd/librbd.h>
>  #include "qapi/error.h"
>  #include "qemu/error-report.h"
> @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
>      }
>  }
>  
> +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset,
> +                                PreallocMode prealloc, Error **errp)
> +{
> +    uint64_t current_length;
> +    char *buf = NULL;
> +    int ret;
> +
> +    ret = rbd_get_size(image, &current_length);
> +    if (ret < 0) {
> +        error_setg_errno(errp, -ret, "Failed to get file length");
> +        goto out;
> +    }
> +
> +    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
> +        error_setg(errp, "Cannot use preallocation for shrinking files");
> +        ret = -ENOTSUP;
> +        goto out;
> +    }
> +
> +    switch (prealloc) {
> +    case PREALLOC_MODE_FULL: {
[...]
> +    case PREALLOC_MODE_OFF:
[...]
> +    default:
> +        error_setg(errp, "Unsupported preallocation mode: %s",
> +                   PreallocMode_str(prealloc));
> +        ret = -ENOTSUP;
> +        goto out;
> +    }

Other block drivers also accept only some values of PreallocMode.  Okay.

I wonder whether management applications need to know which values are
supported.

Let me review support in drivers:

* file (file-win32.c)
* iscsi
* nfs
* qed
* ssh

  - Reject all but PREALLOC_MODE_OFF

* copy-on-read
* luks (crypto.c)
* raw

  - Pass through only

* file host_cdrom host_device (file-posix.c)

  - Reject all but PREALLOC_MODE_OFF when shrinking and for non-regular
    files
  - Reject PREALLOC_MODE_FALLOC unless CONFIG_POSIX_FALLOCATE
  - Reject PREALLOC_MODE_METADATA

* gluster

  - Reject all but PREALLOC_MODE_OFF when shrinking
  - Reject PREALLOC_MODE_FALLOC unless CONFIG_GLUSTERFS_FALLOCATE
  - Reject PREALLOC_MODE_FULL unless CONFIG_GLUSTERFS_ZEROFILL
  - Reject PREALLOC_MODE_METADATA

* qcow2

  - Reject all but PREALLOC_MODE_OFF when shrinking and with a backing
    file
  
* rbd with this patch

  - Reject all but PREALLOC_MODE_OFF when shrinking
  - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC

* sheepdog

  - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC
  - Doesn't support shrinking

* vdi

  - Reject PREALLOC_MODE_FALLOC and PREALLOC_MODE_FULL
  - Doesn't support shrinking

* blkdebug
* blklogwrites
* blkverify
* bochs
* cloop
* dmg
* ftp
* ftps
* http
* https
* luks
* nbd
* null-aio
* null-co
* nvme
* parallels
* qcow
* quorum
* replication
* throttle
* vhdx
* vmdk
* vpc
* vvfat
* vxhs

  - These appear not to use PreallocMode: they don't implement
    .bdrv_co_truncate(), and either don't implement .bdrv_co_create() or
    implement it without a prealloc parameter.

Looks good to me.

> +
> +    ret = 0;
> +
> +out:
> +    g_free(buf);
> +    return ret;
> +}
> +
>  static QemuOptsList runtime_opts = {
>      .name = "rbd",
>      .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
[...]
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index 7ccbfff9d0..db25a4065b 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -4277,13 +4277,15 @@
>  #                   point to a snapshot.
>  # @size             Size of the virtual disk in bytes
>  # @cluster-size     RBD object size
> +# @preallocation    Preallocation mode (allowed values: off, full)
>  #
>  # Since: 2.12
>  ##
>  { 'struct': 'BlockdevCreateOptionsRbd',
>    'data': { 'location':         'BlockdevOptionsRbd',
>              'size':             'size',
> -            '*cluster-size' :   'size' } }
> +            '*cluster-size' :   'size',
> +            '*preallocation':   'PreallocMode' } }
>  
>  ##
>  # @BlockdevVmdkSubformat:

The non-support of values 'metadata' and 'falloc' is not visible in
introspection, only in documentation.  No reason to block this patch, as
the other block drivers have the same introspection weakness (only
sheepdog and vdi bother to document).

Should we address the introspection weakness?  Only if there's a use for
the information, I think.

Should we improve documentation for the other block drivers?
Stefano Garzarella May 7, 2019, 8:36 a.m. UTC | #6
On Tue, May 07, 2019 at 08:34:51AM +0200, Markus Armbruster wrote:
> Cc: Peter for a libvirt perspective.
> 
> Stefano Garzarella <sgarzare@redhat.com> writes:
> 
> > This patch adds the support of preallocation (off/full) for the RBD
> > block driver.
> > If available, we use rbd_writesame() to quickly fill the image when
> > full preallocation is required.
> >
> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > ---
> >  block/rbd.c          | 149 ++++++++++++++++++++++++++++++++++++++-----
> >  qapi/block-core.json |   4 +-
> >  2 files changed, 136 insertions(+), 17 deletions(-)
> >
> > diff --git a/block/rbd.c b/block/rbd.c
> > index 0c549c9935..29dd1bb040 100644
> > --- a/block/rbd.c
> > +++ b/block/rbd.c
> > @@ -13,6 +13,7 @@
> >  
> >  #include "qemu/osdep.h"
> >  
> > +#include "qemu/units.h"
> >  #include <rbd/librbd.h>
> >  #include "qapi/error.h"
> >  #include "qemu/error-report.h"
> > @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
> >      }
> >  }
> >  
> > +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset,
> > +                                PreallocMode prealloc, Error **errp)
> > +{
> > +    uint64_t current_length;
> > +    char *buf = NULL;
> > +    int ret;
> > +
> > +    ret = rbd_get_size(image, &current_length);
> > +    if (ret < 0) {
> > +        error_setg_errno(errp, -ret, "Failed to get file length");
> > +        goto out;
> > +    }
> > +
> > +    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
> > +        error_setg(errp, "Cannot use preallocation for shrinking files");
> > +        ret = -ENOTSUP;
> > +        goto out;
> > +    }
> > +
> > +    switch (prealloc) {
> > +    case PREALLOC_MODE_FULL: {
> [...]
> > +    case PREALLOC_MODE_OFF:
> [...]
> > +    default:
> > +        error_setg(errp, "Unsupported preallocation mode: %s",
> > +                   PreallocMode_str(prealloc));
> > +        ret = -ENOTSUP;
> > +        goto out;
> > +    }
> 
> Other block drivers also accept only some values of PreallocMode.  Okay.
> 
> I wonder whether management applications need to know which values are
> supported.

Good point!

> 
> Let me review support in drivers:
> 
> * file (file-win32.c)
> * iscsi
> * nfs
> * qed
> * ssh
> 
>   - Reject all but PREALLOC_MODE_OFF
> 
> * copy-on-read
> * luks (crypto.c)
> * raw
> 
>   - Pass through only
> 
> * file host_cdrom host_device (file-posix.c)
> 
>   - Reject all but PREALLOC_MODE_OFF when shrinking and for non-regular
>     files
>   - Reject PREALLOC_MODE_FALLOC unless CONFIG_POSIX_FALLOCATE
>   - Reject PREALLOC_MODE_METADATA
> 
> * gluster
> 
>   - Reject all but PREALLOC_MODE_OFF when shrinking
>   - Reject PREALLOC_MODE_FALLOC unless CONFIG_GLUSTERFS_FALLOCATE
>   - Reject PREALLOC_MODE_FULL unless CONFIG_GLUSTERFS_ZEROFILL
>   - Reject PREALLOC_MODE_METADATA
> 
> * qcow2
> 
>   - Reject all but PREALLOC_MODE_OFF when shrinking and with a backing
>     file
>   
> * rbd with this patch
> 
>   - Reject all but PREALLOC_MODE_OFF when shrinking
>   - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC
> 
> * sheepdog
> 
>   - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC
>   - Doesn't support shrinking
> 
> * vdi
> 
>   - Reject PREALLOC_MODE_FALLOC and PREALLOC_MODE_FULL
>   - Doesn't support shrinking
> 
> * blkdebug
> * blklogwrites
> * blkverify
> * bochs
> * cloop
> * dmg
> * ftp
> * ftps
> * http
> * https
> * luks
> * nbd
> * null-aio
> * null-co
> * nvme
> * parallels
> * qcow
> * quorum
> * replication
> * throttle
> * vhdx
> * vmdk
> * vpc
> * vvfat
> * vxhs
> 
>   - These appear not to use PreallocMode: they don't implement
>     .bdrv_co_truncate(), and either don't implement .bdrv_co_create() or
>     implement it without a prealloc parameter.
> 
> Looks good to me.
>

Thanks for the analysis!

> > +
> > +    ret = 0;
> > +
> > +out:
> > +    g_free(buf);
> > +    return ret;
> > +}
> > +
> >  static QemuOptsList runtime_opts = {
> >      .name = "rbd",
> >      .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
> [...]
> > diff --git a/qapi/block-core.json b/qapi/block-core.json
> > index 7ccbfff9d0..db25a4065b 100644
> > --- a/qapi/block-core.json
> > +++ b/qapi/block-core.json
> > @@ -4277,13 +4277,15 @@
> >  #                   point to a snapshot.
> >  # @size             Size of the virtual disk in bytes
> >  # @cluster-size     RBD object size
> > +# @preallocation    Preallocation mode (allowed values: off, full)
> >  #
> >  # Since: 2.12
> >  ##
> >  { 'struct': 'BlockdevCreateOptionsRbd',
> >    'data': { 'location':         'BlockdevOptionsRbd',
> >              'size':             'size',
> > -            '*cluster-size' :   'size' } }
> > +            '*cluster-size' :   'size',
> > +            '*preallocation':   'PreallocMode' } }
> >  
> >  ##
> >  # @BlockdevVmdkSubformat:
> 
> The non-support of values 'metadata' and 'falloc' is not visible in
> introspection, only in documentation.  No reason to block this patch, as
> the other block drivers have the same introspection weakness (only
> sheepdog and vdi bother to document).
> 
> Should we address the introspection weakness?  Only if there's a use for
> the information, I think.

If the management applications will use that information (or maybe also
our help pages), could be useful to have an array of 'PreallocMode'
supported per-driver.

> 
> Should we improve documentation for the other block drivers?
> 

Yes, e.g. for Gluster it is not updated.
If you agree, I can check and update the documentation of all drivers following
your analysis.

Cheers,
Stefano
Markus Armbruster May 8, 2019, 11:44 a.m. UTC | #7
Stefano Garzarella <sgarzare@redhat.com> writes:

> On Tue, May 07, 2019 at 08:34:51AM +0200, Markus Armbruster wrote:
>> Cc: Peter for a libvirt perspective.
>> 
>> Stefano Garzarella <sgarzare@redhat.com> writes:
>> 
>> > This patch adds the support of preallocation (off/full) for the RBD
>> > block driver.
>> > If available, we use rbd_writesame() to quickly fill the image when
>> > full preallocation is required.
>> >
>> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
>> > ---
>> >  block/rbd.c          | 149 ++++++++++++++++++++++++++++++++++++++-----
>> >  qapi/block-core.json |   4 +-
>> >  2 files changed, 136 insertions(+), 17 deletions(-)
>> >
>> > diff --git a/block/rbd.c b/block/rbd.c
>> > index 0c549c9935..29dd1bb040 100644
>> > --- a/block/rbd.c
>> > +++ b/block/rbd.c
>> > @@ -13,6 +13,7 @@
>> >  
>> >  #include "qemu/osdep.h"
>> >  
>> > +#include "qemu/units.h"
>> >  #include <rbd/librbd.h>
>> >  #include "qapi/error.h"
>> >  #include "qemu/error-report.h"
>> > @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
>> >      }
>> >  }
>> >  
>> > +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset,
>> > +                                PreallocMode prealloc, Error **errp)
>> > +{
>> > +    uint64_t current_length;
>> > +    char *buf = NULL;
>> > +    int ret;
>> > +
>> > +    ret = rbd_get_size(image, &current_length);
>> > +    if (ret < 0) {
>> > +        error_setg_errno(errp, -ret, "Failed to get file length");
>> > +        goto out;
>> > +    }
>> > +
>> > +    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
>> > +        error_setg(errp, "Cannot use preallocation for shrinking files");
>> > +        ret = -ENOTSUP;
>> > +        goto out;
>> > +    }
>> > +
>> > +    switch (prealloc) {
>> > +    case PREALLOC_MODE_FULL: {
>> [...]
>> > +    case PREALLOC_MODE_OFF:
>> [...]
>> > +    default:
>> > +        error_setg(errp, "Unsupported preallocation mode: %s",
>> > +                   PreallocMode_str(prealloc));
>> > +        ret = -ENOTSUP;
>> > +        goto out;
>> > +    }
>> 
>> Other block drivers also accept only some values of PreallocMode.  Okay.
>> 
>> I wonder whether management applications need to know which values are
>> supported.
>
> Good point!

We can continue to assume they don't until somebody tells us otherwise.

>> Let me review support in drivers:
>> 
>> * file (file-win32.c)
>> * iscsi
>> * nfs
>> * qed
>> * ssh
>> 
>>   - Reject all but PREALLOC_MODE_OFF
>> 
>> * copy-on-read
>> * luks (crypto.c)
>> * raw
>> 
>>   - Pass through only
>> 
>> * file host_cdrom host_device (file-posix.c)
>> 
>>   - Reject all but PREALLOC_MODE_OFF when shrinking and for non-regular
>>     files
>>   - Reject PREALLOC_MODE_FALLOC unless CONFIG_POSIX_FALLOCATE
>>   - Reject PREALLOC_MODE_METADATA
>> 
>> * gluster
>> 
>>   - Reject all but PREALLOC_MODE_OFF when shrinking
>>   - Reject PREALLOC_MODE_FALLOC unless CONFIG_GLUSTERFS_FALLOCATE
>>   - Reject PREALLOC_MODE_FULL unless CONFIG_GLUSTERFS_ZEROFILL
>>   - Reject PREALLOC_MODE_METADATA
>> 
>> * qcow2
>> 
>>   - Reject all but PREALLOC_MODE_OFF when shrinking and with a backing
>>     file
>>   
>> * rbd with this patch
>> 
>>   - Reject all but PREALLOC_MODE_OFF when shrinking
>>   - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC
>> 
>> * sheepdog
>> 
>>   - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC
>>   - Doesn't support shrinking
>> 
>> * vdi
>> 
>>   - Reject PREALLOC_MODE_FALLOC and PREALLOC_MODE_FULL
>>   - Doesn't support shrinking
>> 
>> * blkdebug
>> * blklogwrites
>> * blkverify
>> * bochs
>> * cloop
>> * dmg
>> * ftp
>> * ftps
>> * http
>> * https
>> * luks
>> * nbd
>> * null-aio
>> * null-co
>> * nvme
>> * parallels
>> * qcow
>> * quorum
>> * replication
>> * throttle
>> * vhdx
>> * vmdk
>> * vpc
>> * vvfat
>> * vxhs
>> 
>>   - These appear not to use PreallocMode: they don't implement
>>     .bdrv_co_truncate(), and either don't implement .bdrv_co_create() or
>>     implement it without a prealloc parameter.
>> 
>> Looks good to me.
>>
>
> Thanks for the analysis!
>
>> > +
>> > +    ret = 0;
>> > +
>> > +out:
>> > +    g_free(buf);
>> > +    return ret;
>> > +}
>> > +
>> >  static QemuOptsList runtime_opts = {
>> >      .name = "rbd",
>> >      .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
>> [...]
>> > diff --git a/qapi/block-core.json b/qapi/block-core.json
>> > index 7ccbfff9d0..db25a4065b 100644
>> > --- a/qapi/block-core.json
>> > +++ b/qapi/block-core.json
>> > @@ -4277,13 +4277,15 @@
>> >  #                   point to a snapshot.
>> >  # @size             Size of the virtual disk in bytes
>> >  # @cluster-size     RBD object size
>> > +# @preallocation    Preallocation mode (allowed values: off, full)
>> >  #
>> >  # Since: 2.12
>> >  ##
>> >  { 'struct': 'BlockdevCreateOptionsRbd',
>> >    'data': { 'location':         'BlockdevOptionsRbd',
>> >              'size':             'size',
>> > -            '*cluster-size' :   'size' } }
>> > +            '*cluster-size' :   'size',
>> > +            '*preallocation':   'PreallocMode' } }
>> >  
>> >  ##
>> >  # @BlockdevVmdkSubformat:
>> 
>> The non-support of values 'metadata' and 'falloc' is not visible in
>> introspection, only in documentation.  No reason to block this patch, as
>> the other block drivers have the same introspection weakness (only
>> sheepdog and vdi bother to document).
>> 
>> Should we address the introspection weakness?  Only if there's a use for
>> the information, I think.
>
> If the management applications will use that information (or maybe also
> our help pages), could be useful to have an array of 'PreallocMode'
> supported per-driver.

Ideally, query-qmp-schema would show only the supported values.

Not hard to do, just tedious: we'd get a number of sub-enums in addition
to the full one, and we'd have to map from sub-enum to the full one.

QAPI language support for sub-enums would remove most of the tedium.
Not worthwhile unless the need for sub-enums is actually common.

>> Should we improve documentation for the other block drivers?
>> 
>
> Yes, e.g. for Gluster it is not updated.
> If you agree, I can check and update the documentation of all drivers following
> your analysis.

Yes, please!
Stefano Garzarella May 9, 2019, 8:26 a.m. UTC | #8
On Wed, May 08, 2019 at 01:44:27PM +0200, Markus Armbruster wrote:
> Stefano Garzarella <sgarzare@redhat.com> writes:
> 
> > On Tue, May 07, 2019 at 08:34:51AM +0200, Markus Armbruster wrote:
> >> Cc: Peter for a libvirt perspective.
> >> 
> >> Stefano Garzarella <sgarzare@redhat.com> writes:
> >> 
> >> > This patch adds the support of preallocation (off/full) for the RBD
> >> > block driver.
> >> > If available, we use rbd_writesame() to quickly fill the image when
> >> > full preallocation is required.
> >> >
> >> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> >> > ---
> >> >  block/rbd.c          | 149 ++++++++++++++++++++++++++++++++++++++-----
> >> >  qapi/block-core.json |   4 +-
> >> >  2 files changed, 136 insertions(+), 17 deletions(-)
> >> >
> >> > diff --git a/block/rbd.c b/block/rbd.c
> >> > index 0c549c9935..29dd1bb040 100644
> >> > --- a/block/rbd.c
> >> > +++ b/block/rbd.c
> >> > @@ -13,6 +13,7 @@
> >> >  
> >> >  #include "qemu/osdep.h"
> >> >  
> >> > +#include "qemu/units.h"
> >> >  #include <rbd/librbd.h>
> >> >  #include "qapi/error.h"
> >> >  #include "qemu/error-report.h"
> >> > @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
> >> >      }
> >> >  }
> >> >  
> >> > +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset,
> >> > +                                PreallocMode prealloc, Error **errp)
> >> > +{
> >> > +    uint64_t current_length;
> >> > +    char *buf = NULL;
> >> > +    int ret;
> >> > +
> >> > +    ret = rbd_get_size(image, &current_length);
> >> > +    if (ret < 0) {
> >> > +        error_setg_errno(errp, -ret, "Failed to get file length");
> >> > +        goto out;
> >> > +    }
> >> > +
> >> > +    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
> >> > +        error_setg(errp, "Cannot use preallocation for shrinking files");
> >> > +        ret = -ENOTSUP;
> >> > +        goto out;
> >> > +    }
> >> > +
> >> > +    switch (prealloc) {
> >> > +    case PREALLOC_MODE_FULL: {
> >> [...]
> >> > +    case PREALLOC_MODE_OFF:
> >> [...]
> >> > +    default:
> >> > +        error_setg(errp, "Unsupported preallocation mode: %s",
> >> > +                   PreallocMode_str(prealloc));
> >> > +        ret = -ENOTSUP;
> >> > +        goto out;
> >> > +    }
> >> 
> >> Other block drivers also accept only some values of PreallocMode.  Okay.
> >> 
> >> I wonder whether management applications need to know which values are
> >> supported.
> >
> > Good point!
> 
> We can continue to assume they don't until somebody tells us otherwise.
> 
> >> Let me review support in drivers:
> >> 
> >> * file (file-win32.c)
> >> * iscsi
> >> * nfs
> >> * qed
> >> * ssh
> >> 
> >>   - Reject all but PREALLOC_MODE_OFF
> >> 
> >> * copy-on-read
> >> * luks (crypto.c)
> >> * raw
> >> 
> >>   - Pass through only
> >> 
> >> * file host_cdrom host_device (file-posix.c)
> >> 
> >>   - Reject all but PREALLOC_MODE_OFF when shrinking and for non-regular
> >>     files
> >>   - Reject PREALLOC_MODE_FALLOC unless CONFIG_POSIX_FALLOCATE
> >>   - Reject PREALLOC_MODE_METADATA
> >> 
> >> * gluster
> >> 
> >>   - Reject all but PREALLOC_MODE_OFF when shrinking
> >>   - Reject PREALLOC_MODE_FALLOC unless CONFIG_GLUSTERFS_FALLOCATE
> >>   - Reject PREALLOC_MODE_FULL unless CONFIG_GLUSTERFS_ZEROFILL
> >>   - Reject PREALLOC_MODE_METADATA
> >> 
> >> * qcow2
> >> 
> >>   - Reject all but PREALLOC_MODE_OFF when shrinking and with a backing
> >>     file
> >>   
> >> * rbd with this patch
> >> 
> >>   - Reject all but PREALLOC_MODE_OFF when shrinking
> >>   - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC
> >> 
> >> * sheepdog
> >> 
> >>   - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC
> >>   - Doesn't support shrinking
> >> 
> >> * vdi
> >> 
> >>   - Reject PREALLOC_MODE_FALLOC and PREALLOC_MODE_FULL
> >>   - Doesn't support shrinking
> >> 
> >> * blkdebug
> >> * blklogwrites
> >> * blkverify
> >> * bochs
> >> * cloop
> >> * dmg
> >> * ftp
> >> * ftps
> >> * http
> >> * https
> >> * luks
> >> * nbd
> >> * null-aio
> >> * null-co
> >> * nvme
> >> * parallels
> >> * qcow
> >> * quorum
> >> * replication
> >> * throttle
> >> * vhdx
> >> * vmdk
> >> * vpc
> >> * vvfat
> >> * vxhs
> >> 
> >>   - These appear not to use PreallocMode: they don't implement
> >>     .bdrv_co_truncate(), and either don't implement .bdrv_co_create() or
> >>     implement it without a prealloc parameter.
> >> 
> >> Looks good to me.
> >>
> >
> > Thanks for the analysis!
> >
> >> > +
> >> > +    ret = 0;
> >> > +
> >> > +out:
> >> > +    g_free(buf);
> >> > +    return ret;
> >> > +}
> >> > +
> >> >  static QemuOptsList runtime_opts = {
> >> >      .name = "rbd",
> >> >      .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
> >> [...]
> >> > diff --git a/qapi/block-core.json b/qapi/block-core.json
> >> > index 7ccbfff9d0..db25a4065b 100644
> >> > --- a/qapi/block-core.json
> >> > +++ b/qapi/block-core.json
> >> > @@ -4277,13 +4277,15 @@
> >> >  #                   point to a snapshot.
> >> >  # @size             Size of the virtual disk in bytes
> >> >  # @cluster-size     RBD object size
> >> > +# @preallocation    Preallocation mode (allowed values: off, full)
> >> >  #
> >> >  # Since: 2.12
> >> >  ##
> >> >  { 'struct': 'BlockdevCreateOptionsRbd',
> >> >    'data': { 'location':         'BlockdevOptionsRbd',
> >> >              'size':             'size',
> >> > -            '*cluster-size' :   'size' } }
> >> > +            '*cluster-size' :   'size',
> >> > +            '*preallocation':   'PreallocMode' } }
> >> >  
> >> >  ##
> >> >  # @BlockdevVmdkSubformat:
> >> 
> >> The non-support of values 'metadata' and 'falloc' is not visible in
> >> introspection, only in documentation.  No reason to block this patch, as
> >> the other block drivers have the same introspection weakness (only
> >> sheepdog and vdi bother to document).
> >> 
> >> Should we address the introspection weakness?  Only if there's a use for
> >> the information, I think.
> >
> > If the management applications will use that information (or maybe also
> > our help pages), could be useful to have an array of 'PreallocMode'
> > supported per-driver.
> 
> Ideally, query-qmp-schema would show only the supported values.
> 
> Not hard to do, just tedious: we'd get a number of sub-enums in addition
> to the full one, and we'd have to map from sub-enum to the full one.
> 
> QAPI language support for sub-enums would remove most of the tedium.
> Not worthwhile unless the need for sub-enums is actually common.

I should study better the QMP and QAPI to understand how to implement
the sub-enums.

If you agree, I'll put it as a background task, until somebody from
management applications tell us his interest.

> 
> >> Should we improve documentation for the other block drivers?
> >> 
> >
> > Yes, e.g. for Gluster it is not updated.
> > If you agree, I can check and update the documentation of all drivers following
> > your analysis.
> 
> Yes, please!

Okay, I'll send a patch to update it.

Thanks,
Stefano
Markus Armbruster May 9, 2019, 12:07 p.m. UTC | #9
Stefano Garzarella <sgarzare@redhat.com> writes:

> On Wed, May 08, 2019 at 01:44:27PM +0200, Markus Armbruster wrote:
>> Stefano Garzarella <sgarzare@redhat.com> writes:
>> 
>> > On Tue, May 07, 2019 at 08:34:51AM +0200, Markus Armbruster wrote:
>> >> Cc: Peter for a libvirt perspective.
>> >> 
>> >> Stefano Garzarella <sgarzare@redhat.com> writes:
>> >> 
>> >> > This patch adds the support of preallocation (off/full) for the RBD
>> >> > block driver.
>> >> > If available, we use rbd_writesame() to quickly fill the image when
>> >> > full preallocation is required.
>> >> >
>> >> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
>> >> > ---
>> >> >  block/rbd.c          | 149 ++++++++++++++++++++++++++++++++++++++-----
>> >> >  qapi/block-core.json |   4 +-
>> >> >  2 files changed, 136 insertions(+), 17 deletions(-)
>> >> >
>> >> > diff --git a/block/rbd.c b/block/rbd.c
>> >> > index 0c549c9935..29dd1bb040 100644
>> >> > --- a/block/rbd.c
>> >> > +++ b/block/rbd.c
>> >> > @@ -13,6 +13,7 @@
>> >> >  
>> >> >  #include "qemu/osdep.h"
>> >> >  
>> >> > +#include "qemu/units.h"
>> >> >  #include <rbd/librbd.h>
>> >> >  #include "qapi/error.h"
>> >> >  #include "qemu/error-report.h"
>> >> > @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
>> >> >      }
>> >> >  }
>> >> >  
>> >> > +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset,
>> >> > +                                PreallocMode prealloc, Error **errp)
>> >> > +{
>> >> > +    uint64_t current_length;
>> >> > +    char *buf = NULL;
>> >> > +    int ret;
>> >> > +
>> >> > +    ret = rbd_get_size(image, &current_length);
>> >> > +    if (ret < 0) {
>> >> > +        error_setg_errno(errp, -ret, "Failed to get file length");
>> >> > +        goto out;
>> >> > +    }
>> >> > +
>> >> > +    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
>> >> > +        error_setg(errp, "Cannot use preallocation for shrinking files");
>> >> > +        ret = -ENOTSUP;
>> >> > +        goto out;
>> >> > +    }
>> >> > +
>> >> > +    switch (prealloc) {
>> >> > +    case PREALLOC_MODE_FULL: {
>> >> [...]
>> >> > +    case PREALLOC_MODE_OFF:
>> >> [...]
>> >> > +    default:
>> >> > +        error_setg(errp, "Unsupported preallocation mode: %s",
>> >> > +                   PreallocMode_str(prealloc));
>> >> > +        ret = -ENOTSUP;
>> >> > +        goto out;
>> >> > +    }
>> >> 
>> >> Other block drivers also accept only some values of PreallocMode.  Okay.
>> >> 
>> >> I wonder whether management applications need to know which values are
>> >> supported.
>> >
>> > Good point!
>> 
>> We can continue to assume they don't until somebody tells us otherwise.
>> 
>> >> Let me review support in drivers:
>> >> 
>> >> * file (file-win32.c)
>> >> * iscsi
>> >> * nfs
>> >> * qed
>> >> * ssh
>> >> 
>> >>   - Reject all but PREALLOC_MODE_OFF
>> >> 
>> >> * copy-on-read
>> >> * luks (crypto.c)
>> >> * raw
>> >> 
>> >>   - Pass through only
>> >> 
>> >> * file host_cdrom host_device (file-posix.c)
>> >> 
>> >>   - Reject all but PREALLOC_MODE_OFF when shrinking and for non-regular
>> >>     files
>> >>   - Reject PREALLOC_MODE_FALLOC unless CONFIG_POSIX_FALLOCATE
>> >>   - Reject PREALLOC_MODE_METADATA
>> >> 
>> >> * gluster
>> >> 
>> >>   - Reject all but PREALLOC_MODE_OFF when shrinking
>> >>   - Reject PREALLOC_MODE_FALLOC unless CONFIG_GLUSTERFS_FALLOCATE
>> >>   - Reject PREALLOC_MODE_FULL unless CONFIG_GLUSTERFS_ZEROFILL
>> >>   - Reject PREALLOC_MODE_METADATA
>> >> 
>> >> * qcow2
>> >> 
>> >>   - Reject all but PREALLOC_MODE_OFF when shrinking and with a backing
>> >>     file
>> >>   
>> >> * rbd with this patch
>> >> 
>> >>   - Reject all but PREALLOC_MODE_OFF when shrinking
>> >>   - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC
>> >> 
>> >> * sheepdog
>> >> 
>> >>   - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC
>> >>   - Doesn't support shrinking
>> >> 
>> >> * vdi
>> >> 
>> >>   - Reject PREALLOC_MODE_FALLOC and PREALLOC_MODE_FULL
>> >>   - Doesn't support shrinking
>> >> 
>> >> * blkdebug
>> >> * blklogwrites
>> >> * blkverify
>> >> * bochs
>> >> * cloop
>> >> * dmg
>> >> * ftp
>> >> * ftps
>> >> * http
>> >> * https
>> >> * luks
>> >> * nbd
>> >> * null-aio
>> >> * null-co
>> >> * nvme
>> >> * parallels
>> >> * qcow
>> >> * quorum
>> >> * replication
>> >> * throttle
>> >> * vhdx
>> >> * vmdk
>> >> * vpc
>> >> * vvfat
>> >> * vxhs
>> >> 
>> >>   - These appear not to use PreallocMode: they don't implement
>> >>     .bdrv_co_truncate(), and either don't implement .bdrv_co_create() or
>> >>     implement it without a prealloc parameter.
>> >> 
>> >> Looks good to me.
>> >>
>> >
>> > Thanks for the analysis!
>> >
>> >> > +
>> >> > +    ret = 0;
>> >> > +
>> >> > +out:
>> >> > +    g_free(buf);
>> >> > +    return ret;
>> >> > +}
>> >> > +
>> >> >  static QemuOptsList runtime_opts = {
>> >> >      .name = "rbd",
>> >> >      .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
>> >> [...]
>> >> > diff --git a/qapi/block-core.json b/qapi/block-core.json
>> >> > index 7ccbfff9d0..db25a4065b 100644
>> >> > --- a/qapi/block-core.json
>> >> > +++ b/qapi/block-core.json
>> >> > @@ -4277,13 +4277,15 @@
>> >> >  #                   point to a snapshot.
>> >> >  # @size             Size of the virtual disk in bytes
>> >> >  # @cluster-size     RBD object size
>> >> > +# @preallocation    Preallocation mode (allowed values: off, full)
>> >> >  #
>> >> >  # Since: 2.12
>> >> >  ##
>> >> >  { 'struct': 'BlockdevCreateOptionsRbd',
>> >> >    'data': { 'location':         'BlockdevOptionsRbd',
>> >> >              'size':             'size',
>> >> > -            '*cluster-size' :   'size' } }
>> >> > +            '*cluster-size' :   'size',
>> >> > +            '*preallocation':   'PreallocMode' } }
>> >> >  
>> >> >  ##
>> >> >  # @BlockdevVmdkSubformat:
>> >> 
>> >> The non-support of values 'metadata' and 'falloc' is not visible in
>> >> introspection, only in documentation.  No reason to block this patch, as
>> >> the other block drivers have the same introspection weakness (only
>> >> sheepdog and vdi bother to document).
>> >> 
>> >> Should we address the introspection weakness?  Only if there's a use for
>> >> the information, I think.
>> >
>> > If the management applications will use that information (or maybe also
>> > our help pages), could be useful to have an array of 'PreallocMode'
>> > supported per-driver.
>> 
>> Ideally, query-qmp-schema would show only the supported values.
>> 
>> Not hard to do, just tedious: we'd get a number of sub-enums in addition
>> to the full one, and we'd have to map from sub-enum to the full one.
>> 
>> QAPI language support for sub-enums would remove most of the tedium.
>> Not worthwhile unless the need for sub-enums is actually common.
>
> I should study better the QMP and QAPI to understand how to implement
> the sub-enums.

Sub-enums of

    { 'enum': 'PreallocMode',
      'data': [ 'off', 'metadata', 'falloc', 'full' ] }

done the obvious way:

    { 'enum': 'PreallocModeOff',
      'data': [ 'off' ] }
    { 'enum': 'PreallocModeOffPosix',
      'data': [ 'off', 'metadata',
                 { 'name': 'falloc', 'if': 'defined(CONFIG_POSIX_FALLOCATE)' },
                 'full' ] }

and so forth.

This generates a bunch of different C enum types in addition to
PreallocMode: PreallocModeOff, PreallocModePosix, ...

Common C code continues to use just PreallocMode.  The QMP command
handlers using sub-enums will have to map between the sub-enums and
PreallocMode.

Tedious.

With QAPI language support for sub-enums, we could eliminate the
additional C enums.

> If you agree, I'll put it as a background task, until somebody from
> management applications tell us his interest.

Only act if there's a compelling use case.

>> >> Should we improve documentation for the other block drivers?
>> >> 
>> >
>> > Yes, e.g. for Gluster it is not updated.
>> > If you agree, I can check and update the documentation of all drivers following
>> > your analysis.
>> 
>> Yes, please!
>
> Okay, I'll send a patch to update it.
>
> Thanks,
> Stefano
Peter Krempa May 9, 2019, 1:29 p.m. UTC | #10
On Thu, May 09, 2019 at 10:26:46 +0200, Stefano Garzarella wrote:
> On Wed, May 08, 2019 at 01:44:27PM +0200, Markus Armbruster wrote:
> > Stefano Garzarella <sgarzare@redhat.com> writes:
> > 
> > > On Tue, May 07, 2019 at 08:34:51AM +0200, Markus Armbruster wrote:
> > >> Cc: Peter for a libvirt perspective.
> > >> 
> > >> Stefano Garzarella <sgarzare@redhat.com> writes:
> > >> 
> > >> > This patch adds the support of preallocation (off/full) for the RBD
> > >> > block driver.
> > >> > If available, we use rbd_writesame() to quickly fill the image when
> > >> > full preallocation is required.
> > >> >
> > >> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > >> > ---
> > >> >  block/rbd.c          | 149 ++++++++++++++++++++++++++++++++++++++-----
> > >> >  qapi/block-core.json |   4 +-
> > >> >  2 files changed, 136 insertions(+), 17 deletions(-)

[...]


> > >> >  ##
> > >> >  # @BlockdevVmdkSubformat:
> > >> 
> > >> The non-support of values 'metadata' and 'falloc' is not visible in
> > >> introspection, only in documentation.  No reason to block this patch, as
> > >> the other block drivers have the same introspection weakness (only
> > >> sheepdog and vdi bother to document).
> > >> 
> > >> Should we address the introspection weakness?  Only if there's a use for
> > >> the information, I think.
> > >
> > > If the management applications will use that information (or maybe also
> > > our help pages), could be useful to have an array of 'PreallocMode'
> > > supported per-driver.
> > 
> > Ideally, query-qmp-schema would show only the supported values.
> > 
> > Not hard to do, just tedious: we'd get a number of sub-enums in addition
> > to the full one, and we'd have to map from sub-enum to the full one.
> > 
> > QAPI language support for sub-enums would remove most of the tedium.
> > Not worthwhile unless the need for sub-enums is actually common.
> 
> I should study better the QMP and QAPI to understand how to implement
> the sub-enums.
> 
> If you agree, I'll put it as a background task, until somebody from
> management applications tell us his interest.

Sorry for the late response. Libvirt currently does not deal that much
with the preallocation settings. Preallocation isn't in current state
implemented at all for 'blockdev-create' and only the 'metadata' and
'falloc' modes are used in the storage driver via qemu-img.

We currently hardcode the knowledge for which formats actually support
it internally.

I'd say it's not criticall to expose this in the QMP schema but
obviously if we'll ever need to use it for a recent enough qemu it's
welcome to have a way to check.
Stefano Garzarella May 10, 2019, 8:36 a.m. UTC | #11
On Thu, May 09, 2019 at 02:07:34PM +0200, Markus Armbruster wrote:
> Stefano Garzarella <sgarzare@redhat.com> writes:
> 
> > On Wed, May 08, 2019 at 01:44:27PM +0200, Markus Armbruster wrote:
> >> Stefano Garzarella <sgarzare@redhat.com> writes:
> >> 
> >> > On Tue, May 07, 2019 at 08:34:51AM +0200, Markus Armbruster wrote:
> >> >> Cc: Peter for a libvirt perspective.
> >> >> 
> >> >> Stefano Garzarella <sgarzare@redhat.com> writes:
> >> >> 
> >> >> > This patch adds the support of preallocation (off/full) for the RBD
> >> >> > block driver.
> >> >> > If available, we use rbd_writesame() to quickly fill the image when
> >> >> > full preallocation is required.
> >> >> >
> >> >> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> >> >> > ---
> >> >> >  block/rbd.c          | 149 ++++++++++++++++++++++++++++++++++++++-----
> >> >> >  qapi/block-core.json |   4 +-
> >> >> >  2 files changed, 136 insertions(+), 17 deletions(-)
> >> >> >
> >> >> > diff --git a/block/rbd.c b/block/rbd.c
> >> >> > index 0c549c9935..29dd1bb040 100644
> >> >> > --- a/block/rbd.c
> >> >> > +++ b/block/rbd.c
> >> >> > @@ -13,6 +13,7 @@
> >> >> >  
> >> >> >  #include "qemu/osdep.h"
> >> >> >  
> >> >> > +#include "qemu/units.h"
> >> >> >  #include <rbd/librbd.h>
> >> >> >  #include "qapi/error.h"
> >> >> >  #include "qemu/error-report.h"
> >> >> > @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
> >> >> >      }
> >> >> >  }
> >> >> >  
> >> >> > +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset,
> >> >> > +                                PreallocMode prealloc, Error **errp)
> >> >> > +{
> >> >> > +    uint64_t current_length;
> >> >> > +    char *buf = NULL;
> >> >> > +    int ret;
> >> >> > +
> >> >> > +    ret = rbd_get_size(image, &current_length);
> >> >> > +    if (ret < 0) {
> >> >> > +        error_setg_errno(errp, -ret, "Failed to get file length");
> >> >> > +        goto out;
> >> >> > +    }
> >> >> > +
> >> >> > +    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
> >> >> > +        error_setg(errp, "Cannot use preallocation for shrinking files");
> >> >> > +        ret = -ENOTSUP;
> >> >> > +        goto out;
> >> >> > +    }
> >> >> > +
> >> >> > +    switch (prealloc) {
> >> >> > +    case PREALLOC_MODE_FULL: {
> >> >> [...]
> >> >> > +    case PREALLOC_MODE_OFF:
> >> >> [...]
> >> >> > +    default:
> >> >> > +        error_setg(errp, "Unsupported preallocation mode: %s",
> >> >> > +                   PreallocMode_str(prealloc));
> >> >> > +        ret = -ENOTSUP;
> >> >> > +        goto out;
> >> >> > +    }
> >> >> 
> >> >> Other block drivers also accept only some values of PreallocMode.  Okay.
> >> >> 
> >> >> I wonder whether management applications need to know which values are
> >> >> supported.
> >> >
> >> > Good point!
> >> 
> >> We can continue to assume they don't until somebody tells us otherwise.
> >> 
> >> >> Let me review support in drivers:
> >> >> 
> >> >> * file (file-win32.c)
> >> >> * iscsi
> >> >> * nfs
> >> >> * qed
> >> >> * ssh
> >> >> 
> >> >>   - Reject all but PREALLOC_MODE_OFF
> >> >> 
> >> >> * copy-on-read
> >> >> * luks (crypto.c)
> >> >> * raw
> >> >> 
> >> >>   - Pass through only
> >> >> 
> >> >> * file host_cdrom host_device (file-posix.c)
> >> >> 
> >> >>   - Reject all but PREALLOC_MODE_OFF when shrinking and for non-regular
> >> >>     files
> >> >>   - Reject PREALLOC_MODE_FALLOC unless CONFIG_POSIX_FALLOCATE
> >> >>   - Reject PREALLOC_MODE_METADATA
> >> >> 
> >> >> * gluster
> >> >> 
> >> >>   - Reject all but PREALLOC_MODE_OFF when shrinking
> >> >>   - Reject PREALLOC_MODE_FALLOC unless CONFIG_GLUSTERFS_FALLOCATE
> >> >>   - Reject PREALLOC_MODE_FULL unless CONFIG_GLUSTERFS_ZEROFILL
> >> >>   - Reject PREALLOC_MODE_METADATA
> >> >> 
> >> >> * qcow2
> >> >> 
> >> >>   - Reject all but PREALLOC_MODE_OFF when shrinking and with a backing
> >> >>     file
> >> >>   
> >> >> * rbd with this patch
> >> >> 
> >> >>   - Reject all but PREALLOC_MODE_OFF when shrinking
> >> >>   - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC
> >> >> 
> >> >> * sheepdog
> >> >> 
> >> >>   - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC
> >> >>   - Doesn't support shrinking
> >> >> 
> >> >> * vdi
> >> >> 
> >> >>   - Reject PREALLOC_MODE_FALLOC and PREALLOC_MODE_FULL
> >> >>   - Doesn't support shrinking
> >> >> 
> >> >> * blkdebug
> >> >> * blklogwrites
> >> >> * blkverify
> >> >> * bochs
> >> >> * cloop
> >> >> * dmg
> >> >> * ftp
> >> >> * ftps
> >> >> * http
> >> >> * https
> >> >> * luks
> >> >> * nbd
> >> >> * null-aio
> >> >> * null-co
> >> >> * nvme
> >> >> * parallels
> >> >> * qcow
> >> >> * quorum
> >> >> * replication
> >> >> * throttle
> >> >> * vhdx
> >> >> * vmdk
> >> >> * vpc
> >> >> * vvfat
> >> >> * vxhs
> >> >> 
> >> >>   - These appear not to use PreallocMode: they don't implement
> >> >>     .bdrv_co_truncate(), and either don't implement .bdrv_co_create() or
> >> >>     implement it without a prealloc parameter.
> >> >> 
> >> >> Looks good to me.
> >> >>
> >> >
> >> > Thanks for the analysis!
> >> >
> >> >> > +
> >> >> > +    ret = 0;
> >> >> > +
> >> >> > +out:
> >> >> > +    g_free(buf);
> >> >> > +    return ret;
> >> >> > +}
> >> >> > +
> >> >> >  static QemuOptsList runtime_opts = {
> >> >> >      .name = "rbd",
> >> >> >      .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
> >> >> [...]
> >> >> > diff --git a/qapi/block-core.json b/qapi/block-core.json
> >> >> > index 7ccbfff9d0..db25a4065b 100644
> >> >> > --- a/qapi/block-core.json
> >> >> > +++ b/qapi/block-core.json
> >> >> > @@ -4277,13 +4277,15 @@
> >> >> >  #                   point to a snapshot.
> >> >> >  # @size             Size of the virtual disk in bytes
> >> >> >  # @cluster-size     RBD object size
> >> >> > +# @preallocation    Preallocation mode (allowed values: off, full)
> >> >> >  #
> >> >> >  # Since: 2.12
> >> >> >  ##
> >> >> >  { 'struct': 'BlockdevCreateOptionsRbd',
> >> >> >    'data': { 'location':         'BlockdevOptionsRbd',
> >> >> >              'size':             'size',
> >> >> > -            '*cluster-size' :   'size' } }
> >> >> > +            '*cluster-size' :   'size',
> >> >> > +            '*preallocation':   'PreallocMode' } }
> >> >> >  
> >> >> >  ##
> >> >> >  # @BlockdevVmdkSubformat:
> >> >> 
> >> >> The non-support of values 'metadata' and 'falloc' is not visible in
> >> >> introspection, only in documentation.  No reason to block this patch, as
> >> >> the other block drivers have the same introspection weakness (only
> >> >> sheepdog and vdi bother to document).
> >> >> 
> >> >> Should we address the introspection weakness?  Only if there's a use for
> >> >> the information, I think.
> >> >
> >> > If the management applications will use that information (or maybe also
> >> > our help pages), could be useful to have an array of 'PreallocMode'
> >> > supported per-driver.
> >> 
> >> Ideally, query-qmp-schema would show only the supported values.
> >> 
> >> Not hard to do, just tedious: we'd get a number of sub-enums in addition
> >> to the full one, and we'd have to map from sub-enum to the full one.
> >> 
> >> QAPI language support for sub-enums would remove most of the tedium.
> >> Not worthwhile unless the need for sub-enums is actually common.
> >
> > I should study better the QMP and QAPI to understand how to implement
> > the sub-enums.
> 
> Sub-enums of
> 
>     { 'enum': 'PreallocMode',
>       'data': [ 'off', 'metadata', 'falloc', 'full' ] }
> 
> done the obvious way:
> 
>     { 'enum': 'PreallocModeOff',
>       'data': [ 'off' ] }
>     { 'enum': 'PreallocModeOffPosix',
>       'data': [ 'off', 'metadata',
>                  { 'name': 'falloc', 'if': 'defined(CONFIG_POSIX_FALLOCATE)' },
>                  'full' ] }
> 
> and so forth.
> 
> This generates a bunch of different C enum types in addition to
> PreallocMode: PreallocModeOff, PreallocModePosix, ...
> 
> Common C code continues to use just PreallocMode.  The QMP command
> handlers using sub-enums will have to map between the sub-enums and
> PreallocMode.
> 
> Tedious.
> 
> With QAPI language support for sub-enums, we could eliminate the
> additional C enums.
> 

Okay, I understood your idea.
Thanks for the explanation!

> > If you agree, I'll put it as a background task, until somebody from
> > management applications tell us his interest.
> 
> Only act if there's a compelling use case.

Sure.

Thanks,
Stefano
Stefano Garzarella May 10, 2019, 8:38 a.m. UTC | #12
On Thu, May 09, 2019 at 03:29:13PM +0200, Peter Krempa wrote:
> On Thu, May 09, 2019 at 10:26:46 +0200, Stefano Garzarella wrote:
> > On Wed, May 08, 2019 at 01:44:27PM +0200, Markus Armbruster wrote:
> > > Stefano Garzarella <sgarzare@redhat.com> writes:
> > > 
> > > > On Tue, May 07, 2019 at 08:34:51AM +0200, Markus Armbruster wrote:
> > > >> Cc: Peter for a libvirt perspective.
> > > >> 
> > > >> Stefano Garzarella <sgarzare@redhat.com> writes:
> > > >> 
> > > >> > This patch adds the support of preallocation (off/full) for the RBD
> > > >> > block driver.
> > > >> > If available, we use rbd_writesame() to quickly fill the image when
> > > >> > full preallocation is required.
> > > >> >
> > > >> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > > >> > ---
> > > >> >  block/rbd.c          | 149 ++++++++++++++++++++++++++++++++++++++-----
> > > >> >  qapi/block-core.json |   4 +-
> > > >> >  2 files changed, 136 insertions(+), 17 deletions(-)
> 
> [...]
> 
> 
> > > >> >  ##
> > > >> >  # @BlockdevVmdkSubformat:
> > > >> 
> > > >> The non-support of values 'metadata' and 'falloc' is not visible in
> > > >> introspection, only in documentation.  No reason to block this patch, as
> > > >> the other block drivers have the same introspection weakness (only
> > > >> sheepdog and vdi bother to document).
> > > >> 
> > > >> Should we address the introspection weakness?  Only if there's a use for
> > > >> the information, I think.
> > > >
> > > > If the management applications will use that information (or maybe also
> > > > our help pages), could be useful to have an array of 'PreallocMode'
> > > > supported per-driver.
> > > 
> > > Ideally, query-qmp-schema would show only the supported values.
> > > 
> > > Not hard to do, just tedious: we'd get a number of sub-enums in addition
> > > to the full one, and we'd have to map from sub-enum to the full one.
> > > 
> > > QAPI language support for sub-enums would remove most of the tedium.
> > > Not worthwhile unless the need for sub-enums is actually common.
> > 
> > I should study better the QMP and QAPI to understand how to implement
> > the sub-enums.
> > 
> > If you agree, I'll put it as a background task, until somebody from
> > management applications tell us his interest.
> 
> Sorry for the late response. Libvirt currently does not deal that much
> with the preallocation settings. Preallocation isn't in current state
> implemented at all for 'blockdev-create' and only the 'metadata' and
> 'falloc' modes are used in the storage driver via qemu-img.
> 
> We currently hardcode the knowledge for which formats actually support
> it internally.
> 
> I'd say it's not criticall to expose this in the QMP schema but
> obviously if we'll ever need to use it for a recent enough qemu it's
> welcome to have a way to check.

Thank you for sharing this information!

Cheers,
Stefano
Stefano Garzarella May 22, 2019, 8:57 a.m. UTC | #13
On Wed, May 8, 2019 at 1:44 PM Markus Armbruster <armbru@redhat.com> wrote:
>
> Stefano Garzarella <sgarzare@redhat.com> writes:
>
> > On Tue, May 07, 2019 at 08:34:51AM +0200, Markus Armbruster wrote:
> >> Cc: Peter for a libvirt perspective.
> >>
> >> Stefano Garzarella <sgarzare@redhat.com> writes:
> >>
> >> > This patch adds the support of preallocation (off/full) for the RBD
> >> > block driver.
> >> > If available, we use rbd_writesame() to quickly fill the image when
> >> > full preallocation is required.
> >> >
> >> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> >> > ---
> >> >  block/rbd.c          | 149 ++++++++++++++++++++++++++++++++++++++-----
> >> >  qapi/block-core.json |   4 +-
> >> >  2 files changed, 136 insertions(+), 17 deletions(-)
> >> >
> >> > diff --git a/block/rbd.c b/block/rbd.c
> >> > index 0c549c9935..29dd1bb040 100644
> >> > --- a/block/rbd.c
> >> > +++ b/block/rbd.c
> >> > @@ -13,6 +13,7 @@
> >> >
> >> >  #include "qemu/osdep.h"
> >> >
> >> > +#include "qemu/units.h"
> >> >  #include <rbd/librbd.h>
> >> >  #include "qapi/error.h"
> >> >  #include "qemu/error-report.h"
> >> > @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
> >> >      }
> >> >  }
> >> >
> >> > +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset,
> >> > +                                PreallocMode prealloc, Error **errp)
> >> > +{
> >> > +    uint64_t current_length;
> >> > +    char *buf = NULL;
> >> > +    int ret;
> >> > +
> >> > +    ret = rbd_get_size(image, &current_length);
> >> > +    if (ret < 0) {
> >> > +        error_setg_errno(errp, -ret, "Failed to get file length");
> >> > +        goto out;
> >> > +    }
> >> > +
> >> > +    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
> >> > +        error_setg(errp, "Cannot use preallocation for shrinking files");
> >> > +        ret = -ENOTSUP;
> >> > +        goto out;
> >> > +    }
> >> > +
> >> > +    switch (prealloc) {
> >> > +    case PREALLOC_MODE_FULL: {
> >> [...]
> >> > +    case PREALLOC_MODE_OFF:
> >> [...]
> >> > +    default:
> >> > +        error_setg(errp, "Unsupported preallocation mode: %s",
> >> > +                   PreallocMode_str(prealloc));
> >> > +        ret = -ENOTSUP;
> >> > +        goto out;
> >> > +    }
> >>
> >> Other block drivers also accept only some values of PreallocMode.  Okay.
> >>
> >> I wonder whether management applications need to know which values are
> >> supported.
> >
> > Good point!
>
> We can continue to assume they don't until somebody tells us otherwise.
>
> >> Let me review support in drivers:
> >>
> >> * file (file-win32.c)
> >> * iscsi
> >> * nfs
> >> * qed
> >> * ssh
> >>
> >>   - Reject all but PREALLOC_MODE_OFF
> >>
> >> * copy-on-read
> >> * luks (crypto.c)
> >> * raw
> >>
> >>   - Pass through only
> >>
> >> * file host_cdrom host_device (file-posix.c)
> >>
> >>   - Reject all but PREALLOC_MODE_OFF when shrinking and for non-regular
> >>     files
> >>   - Reject PREALLOC_MODE_FALLOC unless CONFIG_POSIX_FALLOCATE
> >>   - Reject PREALLOC_MODE_METADATA
> >>
> >> * gluster
> >>
> >>   - Reject all but PREALLOC_MODE_OFF when shrinking
> >>   - Reject PREALLOC_MODE_FALLOC unless CONFIG_GLUSTERFS_FALLOCATE
> >>   - Reject PREALLOC_MODE_FULL unless CONFIG_GLUSTERFS_ZEROFILL
> >>   - Reject PREALLOC_MODE_METADATA
> >>
> >> * qcow2
> >>
> >>   - Reject all but PREALLOC_MODE_OFF when shrinking and with a backing
> >>     file
> >>
> >> * rbd with this patch
> >>
> >>   - Reject all but PREALLOC_MODE_OFF when shrinking
> >>   - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC
> >>
> >> * sheepdog
> >>
> >>   - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC
> >>   - Doesn't support shrinking
> >>
> >> * vdi
> >>
> >>   - Reject PREALLOC_MODE_FALLOC and PREALLOC_MODE_FULL
> >>   - Doesn't support shrinking
> >>
> >> * blkdebug
> >> * blklogwrites
> >> * blkverify
> >> * bochs
> >> * cloop
> >> * dmg
> >> * ftp
> >> * ftps
> >> * http
> >> * https
> >> * luks
> >> * nbd
> >> * null-aio
> >> * null-co
> >> * nvme
> >> * parallels
> >> * qcow
> >> * quorum
> >> * replication
> >> * throttle
> >> * vhdx
> >> * vmdk
> >> * vpc
> >> * vvfat
> >> * vxhs
> >>
> >>   - These appear not to use PreallocMode: they don't implement
> >>     .bdrv_co_truncate(), and either don't implement .bdrv_co_create() or
> >>     implement it without a prealloc parameter.
> >>
> >> Looks good to me.
> >>
> >
> > Thanks for the analysis!
> >
> >> > +
> >> > +    ret = 0;
> >> > +
> >> > +out:
> >> > +    g_free(buf);
> >> > +    return ret;
> >> > +}
> >> > +
> >> >  static QemuOptsList runtime_opts = {
> >> >      .name = "rbd",
> >> >      .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
> >> [...]
> >> > diff --git a/qapi/block-core.json b/qapi/block-core.json
> >> > index 7ccbfff9d0..db25a4065b 100644
> >> > --- a/qapi/block-core.json
> >> > +++ b/qapi/block-core.json
> >> > @@ -4277,13 +4277,15 @@
> >> >  #                   point to a snapshot.
> >> >  # @size             Size of the virtual disk in bytes
> >> >  # @cluster-size     RBD object size
> >> > +# @preallocation    Preallocation mode (allowed values: off, full)
> >> >  #
> >> >  # Since: 2.12
> >> >  ##
> >> >  { 'struct': 'BlockdevCreateOptionsRbd',
> >> >    'data': { 'location':         'BlockdevOptionsRbd',
> >> >              'size':             'size',
> >> > -            '*cluster-size' :   'size' } }
> >> > +            '*cluster-size' :   'size',
> >> > +            '*preallocation':   'PreallocMode' } }
> >> >
> >> >  ##
> >> >  # @BlockdevVmdkSubformat:
> >>
> >> The non-support of values 'metadata' and 'falloc' is not visible in
> >> introspection, only in documentation.  No reason to block this patch, as
> >> the other block drivers have the same introspection weakness (only
> >> sheepdog and vdi bother to document).
> >>
> >> Should we address the introspection weakness?  Only if there's a use for
> >> the information, I think.
> >
> > If the management applications will use that information (or maybe also
> > our help pages), could be useful to have an array of 'PreallocMode'
> > supported per-driver.
>
> Ideally, query-qmp-schema would show only the supported values.
>
> Not hard to do, just tedious: we'd get a number of sub-enums in addition
> to the full one, and we'd have to map from sub-enum to the full one.
>
> QAPI language support for sub-enums would remove most of the tedium.
> Not worthwhile unless the need for sub-enums is actually common.
>
> >> Should we improve documentation for the other block drivers?
> >>
> >
> > Yes, e.g. for Gluster it is not updated.
> > If you agree, I can check and update the documentation of all drivers following
> > your analysis.
>
> Yes, please!


Hi Markus,
I'm finally updating the documentation of preallocation modes
supported by block drivers and protocols in qapi/block-core.json.
As sheepdog and vdi I'm adding the supported values for each driver or
protocol that supports 'preallocation' parameter during the creation,
I'm also updating the '.help' in the QemuOptsList.

My doubt is: where is better to put the documentation about
preallocation modes supported during the resize? (e.g. some drivers
support only PREALLOC_MODE_OFF when shrinking)

Thanks,
Stefano
Markus Armbruster May 22, 2019, 4:25 p.m. UTC | #14
Stefano Garzarella <sgarzare@redhat.com> writes:

> On Wed, May 8, 2019 at 1:44 PM Markus Armbruster <armbru@redhat.com> wrote:
>>
>> Stefano Garzarella <sgarzare@redhat.com> writes:
>>
>> > On Tue, May 07, 2019 at 08:34:51AM +0200, Markus Armbruster wrote:
[...]
>> >> Let me review support in drivers:
>> >>
>> >> * file (file-win32.c)
>> >> * iscsi
>> >> * nfs
>> >> * qed
>> >> * ssh
>> >>
>> >>   - Reject all but PREALLOC_MODE_OFF
>> >>
>> >> * copy-on-read
>> >> * luks (crypto.c)
>> >> * raw
>> >>
>> >>   - Pass through only
>> >>
>> >> * file host_cdrom host_device (file-posix.c)
>> >>
>> >>   - Reject all but PREALLOC_MODE_OFF when shrinking and for non-regular
>> >>     files
>> >>   - Reject PREALLOC_MODE_FALLOC unless CONFIG_POSIX_FALLOCATE
>> >>   - Reject PREALLOC_MODE_METADATA
>> >>
>> >> * gluster
>> >>
>> >>   - Reject all but PREALLOC_MODE_OFF when shrinking
>> >>   - Reject PREALLOC_MODE_FALLOC unless CONFIG_GLUSTERFS_FALLOCATE
>> >>   - Reject PREALLOC_MODE_FULL unless CONFIG_GLUSTERFS_ZEROFILL
>> >>   - Reject PREALLOC_MODE_METADATA
>> >>
>> >> * qcow2
>> >>
>> >>   - Reject all but PREALLOC_MODE_OFF when shrinking and with a backing
>> >>     file
>> >>
>> >> * rbd with this patch
>> >>
>> >>   - Reject all but PREALLOC_MODE_OFF when shrinking
>> >>   - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC
>> >>
>> >> * sheepdog
>> >>
>> >>   - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC
>> >>   - Doesn't support shrinking
>> >>
>> >> * vdi
>> >>
>> >>   - Reject PREALLOC_MODE_FALLOC and PREALLOC_MODE_FULL
>> >>   - Doesn't support shrinking
>> >>
>> >> * blkdebug
>> >> * blklogwrites
>> >> * blkverify
>> >> * bochs
>> >> * cloop
>> >> * dmg
>> >> * ftp
>> >> * ftps
>> >> * http
>> >> * https
>> >> * luks
>> >> * nbd
>> >> * null-aio
>> >> * null-co
>> >> * nvme
>> >> * parallels
>> >> * qcow
>> >> * quorum
>> >> * replication
>> >> * throttle
>> >> * vhdx
>> >> * vmdk
>> >> * vpc
>> >> * vvfat
>> >> * vxhs
>> >>
>> >>   - These appear not to use PreallocMode: they don't implement
>> >>     .bdrv_co_truncate(), and either don't implement .bdrv_co_create() or
>> >>     implement it without a prealloc parameter.
>> >>
>> >> Looks good to me.
>> >>
>> >
>> > Thanks for the analysis!
[...]
>> > If you agree, I can check and update the documentation of all drivers following
>> > your analysis.
>>
>> Yes, please!
>
>
> Hi Markus,
> I'm finally updating the documentation of preallocation modes
> supported by block drivers and protocols in qapi/block-core.json.
> As sheepdog and vdi I'm adding the supported values for each driver or
> protocol that supports 'preallocation' parameter during the creation,
> I'm also updating the '.help' in the QemuOptsList.
>
> My doubt is: where is better to put the documentation about
> preallocation modes supported during the resize? (e.g. some drivers
> support only PREALLOC_MODE_OFF when shrinking)

As far as I can tell, no driver supports anything but PREALLOC_MODE_OFF
when shrinking.  Suggest to ignore the shrinking case for now when
documenting.

I'm not sure I fully answered your question.  Don't hesitate to ask for
more advice.
Stefano Garzarella May 23, 2019, 1:39 p.m. UTC | #15
On Wed, May 22, 2019 at 06:25:17PM +0200, Markus Armbruster wrote:
> Stefano Garzarella <sgarzare@redhat.com> writes:
> 
> > On Wed, May 8, 2019 at 1:44 PM Markus Armbruster <armbru@redhat.com> wrote:
> >>
> >> Stefano Garzarella <sgarzare@redhat.com> writes:
> >>
> >> > On Tue, May 07, 2019 at 08:34:51AM +0200, Markus Armbruster wrote:
> [...]
> >> >> Let me review support in drivers:
> >> >>
> >> >> * file (file-win32.c)
> >> >> * iscsi
> >> >> * nfs
> >> >> * qed
> >> >> * ssh
> >> >>
> >> >>   - Reject all but PREALLOC_MODE_OFF
> >> >>
> >> >> * copy-on-read
> >> >> * luks (crypto.c)
> >> >> * raw
> >> >>
> >> >>   - Pass through only
> >> >>
> >> >> * file host_cdrom host_device (file-posix.c)
> >> >>
> >> >>   - Reject all but PREALLOC_MODE_OFF when shrinking and for non-regular
> >> >>     files
> >> >>   - Reject PREALLOC_MODE_FALLOC unless CONFIG_POSIX_FALLOCATE
> >> >>   - Reject PREALLOC_MODE_METADATA
> >> >>
> >> >> * gluster
> >> >>
> >> >>   - Reject all but PREALLOC_MODE_OFF when shrinking
> >> >>   - Reject PREALLOC_MODE_FALLOC unless CONFIG_GLUSTERFS_FALLOCATE
> >> >>   - Reject PREALLOC_MODE_FULL unless CONFIG_GLUSTERFS_ZEROFILL
> >> >>   - Reject PREALLOC_MODE_METADATA
> >> >>
> >> >> * qcow2
> >> >>
> >> >>   - Reject all but PREALLOC_MODE_OFF when shrinking and with a backing
> >> >>     file
> >> >>
> >> >> * rbd with this patch
> >> >>
> >> >>   - Reject all but PREALLOC_MODE_OFF when shrinking
> >> >>   - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC
> >> >>
> >> >> * sheepdog
> >> >>
> >> >>   - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC
> >> >>   - Doesn't support shrinking
> >> >>
> >> >> * vdi
> >> >>
> >> >>   - Reject PREALLOC_MODE_FALLOC and PREALLOC_MODE_FULL
> >> >>   - Doesn't support shrinking
> >> >>
> >> >> * blkdebug
> >> >> * blklogwrites
> >> >> * blkverify
> >> >> * bochs
> >> >> * cloop
> >> >> * dmg
> >> >> * ftp
> >> >> * ftps
> >> >> * http
> >> >> * https
> >> >> * luks
> >> >> * nbd
> >> >> * null-aio
> >> >> * null-co
> >> >> * nvme
> >> >> * parallels
> >> >> * qcow
> >> >> * quorum
> >> >> * replication
> >> >> * throttle
> >> >> * vhdx
> >> >> * vmdk
> >> >> * vpc
> >> >> * vvfat
> >> >> * vxhs
> >> >>
> >> >>   - These appear not to use PreallocMode: they don't implement
> >> >>     .bdrv_co_truncate(), and either don't implement .bdrv_co_create() or
> >> >>     implement it without a prealloc parameter.
> >> >>
> >> >> Looks good to me.
> >> >>
> >> >
> >> > Thanks for the analysis!
> [...]
> >> > If you agree, I can check and update the documentation of all drivers following
> >> > your analysis.
> >>
> >> Yes, please!
> >
> >
> > Hi Markus,
> > I'm finally updating the documentation of preallocation modes
> > supported by block drivers and protocols in qapi/block-core.json.
> > As sheepdog and vdi I'm adding the supported values for each driver or
> > protocol that supports 'preallocation' parameter during the creation,
> > I'm also updating the '.help' in the QemuOptsList.
> >
> > My doubt is: where is better to put the documentation about
> > preallocation modes supported during the resize? (e.g. some drivers
> > support only PREALLOC_MODE_OFF when shrinking)
> 
> As far as I can tell, no driver supports anything but PREALLOC_MODE_OFF
> when shrinking.  Suggest to ignore the shrinking case for now when
> documenting.
> 

Okay, I'll ignore it for now.

> I'm not sure I fully answered your question.  Don't hesitate to ask for
> more advice.

Yes, your answer is what I was looking for :)

Thanks,
Stefano
diff mbox series

Patch

diff --git a/block/rbd.c b/block/rbd.c
index 0c549c9935..29dd1bb040 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -13,6 +13,7 @@ 
 
 #include "qemu/osdep.h"
 
+#include "qemu/units.h"
 #include <rbd/librbd.h>
 #include "qapi/error.h"
 #include "qemu/error-report.h"
@@ -331,6 +332,110 @@  static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
     }
 }
 
+static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset,
+                                PreallocMode prealloc, Error **errp)
+{
+    uint64_t current_length;
+    char *buf = NULL;
+    int ret;
+
+    ret = rbd_get_size(image, &current_length);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Failed to get file length");
+        goto out;
+    }
+
+    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
+        error_setg(errp, "Cannot use preallocation for shrinking files");
+        ret = -ENOTSUP;
+        goto out;
+    }
+
+    switch (prealloc) {
+    case PREALLOC_MODE_FULL: {
+        uint64_t current_offset = current_length;
+        int buf_size = 64 * KiB;
+        ssize_t bytes;
+
+        buf = g_malloc(buf_size);
+        /*
+         * Some versions of rbd_writesame() discards writes of buffers with
+         * all zeroes. In order to avoid this behaviour, we set the first byte
+         * to one.
+         */
+        buf[0] = 1;
+
+        ret = rbd_resize(image, offset);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to resize file");
+            goto out;
+        }
+
+#ifdef LIBRBD_SUPPORTS_WRITESAME
+        while (offset - current_offset > buf_size) {
+            /*
+             * rbd_writesame() supports only request where the size of the
+             * operation is multiple of buffer size and it must be less or
+             * equal to INT_MAX.
+             */
+            bytes = MIN(offset - current_offset, INT_MAX);
+            bytes -= bytes % buf_size;
+
+            bytes = rbd_writesame(image, current_offset, bytes, buf, buf_size,
+                                  0);
+            if (bytes < 0) {
+                ret = bytes;
+                error_setg_errno(errp, -ret,
+                                 "Failed to write for preallocation");
+                goto out;
+            }
+
+            current_offset += bytes;
+        }
+#endif /* LIBRBD_SUPPORTS_WRITESAME */
+
+        while (current_offset < offset) {
+            bytes = rbd_write(image, current_offset,
+                              MIN(offset - current_offset, buf_size), buf);
+            if (bytes < 0) {
+                ret = bytes;
+                error_setg_errno(errp, -ret,
+                                 "Failed to write for preallocation");
+                goto out;
+            }
+
+            current_offset += bytes;
+        }
+
+        ret = rbd_flush(image);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to flush the file");
+            goto out;
+        }
+
+        break;
+    }
+    case PREALLOC_MODE_OFF:
+        ret = rbd_resize(image, offset);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to resize file");
+            goto out;
+        }
+        break;
+    default:
+        error_setg(errp, "Unsupported preallocation mode: %s",
+                   PreallocMode_str(prealloc));
+        ret = -ENOTSUP;
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    g_free(buf);
+    return ret;
+}
+
 static QemuOptsList runtime_opts = {
     .name = "rbd",
     .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
@@ -376,6 +481,7 @@  static int qemu_rbd_do_create(BlockdevCreateOptions *options,
     BlockdevCreateOptionsRbd *opts = &options->u.rbd;
     rados_t cluster;
     rados_ioctx_t io_ctx;
+    rbd_image_t image;
     int obj_order = 0;
     int ret;
 
@@ -404,13 +510,21 @@  static int qemu_rbd_do_create(BlockdevCreateOptions *options,
         return ret;
     }
 
-    ret = rbd_create(io_ctx, opts->location->image, opts->size, &obj_order);
+    ret = rbd_create(io_ctx, opts->location->image, 0, &obj_order);
     if (ret < 0) {
         error_setg_errno(errp, -ret, "error rbd create");
         goto out;
     }
 
-    ret = 0;
+    ret = rbd_open(io_ctx, opts->location->image, &image, NULL);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "error rbd open");
+        goto out;
+    }
+
+    ret = qemu_rbd_do_truncate(image, opts->size, opts->preallocation, errp);
+
+    rbd_close(image);
 out:
     rados_ioctx_destroy(io_ctx);
     rados_shutdown(cluster);
@@ -431,6 +545,7 @@  static int coroutine_fn qemu_rbd_co_create_opts(const char *filename,
     BlockdevOptionsRbd *loc;
     Error *local_err = NULL;
     const char *keypairs, *password_secret;
+    char *prealloc;
     QDict *options = NULL;
     int ret = 0;
 
@@ -449,6 +564,16 @@  static int coroutine_fn qemu_rbd_co_create_opts(const char *filename,
                                                    BLOCK_OPT_CLUSTER_SIZE, 0);
     rbd_opts->has_cluster_size = (rbd_opts->cluster_size != 0);
 
+    prealloc = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
+    rbd_opts->preallocation = qapi_enum_parse(&PreallocMode_lookup, prealloc,
+                                              PREALLOC_MODE_OFF, &local_err);
+    g_free(prealloc);
+    if (local_err) {
+        ret = -EINVAL;
+        error_propagate(errp, local_err);
+        goto exit;
+    }
+
     options = qdict_new();
     qemu_rbd_parse_filename(filename, options, &local_err);
     if (local_err) {
@@ -1052,21 +1177,8 @@  static int coroutine_fn qemu_rbd_co_truncate(BlockDriverState *bs,
                                              Error **errp)
 {
     BDRVRBDState *s = bs->opaque;
-    int r;
-
-    if (prealloc != PREALLOC_MODE_OFF) {
-        error_setg(errp, "Unsupported preallocation mode '%s'",
-                   PreallocMode_str(prealloc));
-        return -ENOTSUP;
-    }
-
-    r = rbd_resize(s->image, offset);
-    if (r < 0) {
-        error_setg_errno(errp, -r, "Failed to resize file");
-        return r;
-    }
 
-    return 0;
+    return qemu_rbd_do_truncate(s->image, offset, prealloc, errp);
 }
 
 static int qemu_rbd_snap_create(BlockDriverState *bs,
@@ -1219,6 +1331,11 @@  static QemuOptsList qemu_rbd_create_opts = {
             .type = QEMU_OPT_SIZE,
             .help = "RBD object size"
         },
+        {
+            .name = BLOCK_OPT_PREALLOC,
+            .type = QEMU_OPT_STRING,
+            .help = "Preallocation mode (allowed values: off, full)"
+        },
         {
             .name = "password-secret",
             .type = QEMU_OPT_STRING,
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 7ccbfff9d0..db25a4065b 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -4277,13 +4277,15 @@ 
 #                   point to a snapshot.
 # @size             Size of the virtual disk in bytes
 # @cluster-size     RBD object size
+# @preallocation    Preallocation mode (allowed values: off, full)
 #
 # Since: 2.12
 ##
 { 'struct': 'BlockdevCreateOptionsRbd',
   'data': { 'location':         'BlockdevOptionsRbd',
             'size':             'size',
-            '*cluster-size' :   'size' } }
+            '*cluster-size' :   'size',
+            '*preallocation':   'PreallocMode' } }
 
 ##
 # @BlockdevVmdkSubformat: