[5/5] block/io: improve savevm performance
diff mbox series

Message ID 20200616162035.29857-6-den@openvz.org
State New
Headers show
Series
  • block: seriously improve savevm performance
Related show

Commit Message

Denis V. Lunev June 16, 2020, 4:20 p.m. UTC
This patch does 2 standard basic things:
- it creates intermediate buffer for all writes from QEMU migration code
  to block driver,
- this buffer is sent to disk asynchronously, allowing several writes to
  run in parallel.

Thus bdrv_vmstate_write() is becoming asynchronous. All pending operations
completion are performed in newly invented bdrv_flush_vmstate().

In general, migration code is fantastically inefficient (by observation),
buffers are not aligned and sent in arbitrary pieces, a lot of the time
less than 100 bytes at a chunk, which results in read-modify-write
operations if target file descriptor is opened with O_DIRECT. It should
also be noted that all operations are performed into unallocated image
blocks, which also suffer due to partial writes to such new clusters
even on cached file descriptors.

Snapshot creation time (2 GB Fedora-31 VM running over NVME storage):
                original     fixed
cached:          1.79s       1.27s
non-cached:      3.29s       0.81s

The difference over HDD would be more significant :)

Signed-off-by: Denis V. Lunev <den@openvz.org>
CC: Kevin Wolf <kwolf@redhat.com>
CC: Max Reitz <mreitz@redhat.com>
CC: Stefan Hajnoczi <stefanha@redhat.com>
CC: Fam Zheng <fam@euphon.net>
CC: Juan Quintela <quintela@redhat.com>
CC: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
CC: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
CC: Denis Plotnikov <dplotnikov@virtuozzo.com>
---
 block/io.c                | 123 +++++++++++++++++++++++++++++++++++++-
 include/block/block_int.h |   8 +++
 2 files changed, 129 insertions(+), 2 deletions(-)

Comments

Vladimir Sementsov-Ogievskiy June 18, 2020, 10:56 a.m. UTC | #1
16.06.2020 19:20, Denis V. Lunev wrote:
> This patch does 2 standard basic things:
> - it creates intermediate buffer for all writes from QEMU migration code
>    to block driver,
> - this buffer is sent to disk asynchronously, allowing several writes to
>    run in parallel.
> 
> Thus bdrv_vmstate_write() is becoming asynchronous. All pending operations
> completion are performed in newly invented bdrv_flush_vmstate().
> 
> In general, migration code is fantastically inefficent (by observation),
> buffers are not aligned and sent with arbitrary pieces, a lot of time
> less than 100 bytes at a chunk, which results in read-modify-write
> operations if target file descriptor is opened with O_DIRECT. It should
> also be noted that all operations are performed into unallocated image
> blocks, which also suffer due to partial writes to such new clusters
> even on cached file descriptors.
> 
> Snapshot creation time (2 GB Fedora-31 VM running over NVME storage):
>                  original     fixed
> cached:          1.79s       1.27s
> non-cached:      3.29s       0.81s
> 
> The difference over HDD would be more significant :)
> 
> Signed-off-by: Denis V. Lunev <den@openvz.org>
> CC: Kevin Wolf <kwolf@redhat.com>
> CC: Max Reitz <mreitz@redhat.com>
> CC: Stefan Hajnoczi <stefanha@redhat.com>
> CC: Fam Zheng <fam@euphon.net>
> CC: Juan Quintela <quintela@redhat.com>
> CC: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> CC: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> CC: Denis Plotnikov <dplotnikov@virtuozzo.com>
> ---
>   block/io.c                | 123 +++++++++++++++++++++++++++++++++++++-
>   include/block/block_int.h |   8 +++
>   2 files changed, 129 insertions(+), 2 deletions(-)
> 
> diff --git a/block/io.c b/block/io.c
> index 8718df4ea8..1979098c02 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -26,6 +26,7 @@
>   #include "trace.h"
>   #include "sysemu/block-backend.h"
>   #include "block/aio-wait.h"
> +#include "block/aio_task.h"
>   #include "block/blockjob.h"
>   #include "block/blockjob_int.h"
>   #include "block/block_int.h"
> @@ -33,6 +34,7 @@
>   #include "qapi/error.h"
>   #include "qemu/error-report.h"
>   #include "qemu/main-loop.h"
> +#include "qemu/units.h"
>   #include "sysemu/replay.h"
>   
>   /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
> @@ -2640,6 +2642,100 @@ typedef struct BdrvVmstateCo {
>       bool                is_read;
>   } BdrvVmstateCo;
>   
> +typedef struct BdrvVMStateTask {
> +    AioTask task;
> +
> +    BlockDriverState *bs;
> +    int64_t offset;
> +    void *buf;
> +    size_t bytes;
> +} BdrvVMStateTask;
> +
> +typedef struct BdrvSaveVMState {
> +    AioTaskPool *pool;
> +    BdrvVMStateTask *t;
> +} BdrvSaveVMState;
> +
> +
> +static coroutine_fn int bdrv_co_vmstate_save_task_entry(AioTask *task)
> +{
> +    int err = 0;
> +    BdrvVMStateTask *t = container_of(task, BdrvVMStateTask, task);
> +
> +    if (t->bytes != 0) {
> +        QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, t->buf, t->bytes);
> +
> +        bdrv_inc_in_flight(t->bs);
> +        err = t->bs->drv->bdrv_save_vmstate(t->bs, &qiov, t->offset);
> +        bdrv_dec_in_flight(t->bs);
> +    }
> +
> +    qemu_vfree(t->buf);
> +    return err;
> +}
> +
> +static BdrvVMStateTask *bdrv_vmstate_task_create(BlockDriverState *bs,
> +                                                 int64_t pos, size_t size)
> +{
> +    BdrvVMStateTask *t = g_new(BdrvVMStateTask, 1);
> +
> +    *t = (BdrvVMStateTask) {
> +        .task.func = bdrv_co_vmstate_save_task_entry,
> +        .buf = qemu_blockalign(bs, size),
> +        .offset = pos,
> +        .bs = bs,
> +    };
> +
> +    return t;
> +}
> +
> +static int bdrv_co_do_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
> +                                   int64_t pos)
> +{
> +    BdrvSaveVMState *state = bs->savevm_state;
> +    BdrvVMStateTask *t;
> +    size_t buf_size = MAX(bdrv_get_cluster_size(bs), 1 * MiB);
> +    size_t to_copy, off;
> +
> +    if (state == NULL) {
> +        state = g_new(BdrvSaveVMState, 1);
> +        *state = (BdrvSaveVMState) {
> +            .pool = aio_task_pool_new(BDRV_VMSTATE_WORKERS_MAX),
> +            .t = bdrv_vmstate_task_create(bs, pos, buf_size),
> +        };
> +
> +        bs->savevm_state = state;
> +    }
> +
> +    if (aio_task_pool_status(state->pool) < 0) {
> +        /* Caller is responsible for cleanup. We should block all further
> +         * save operations for this exact state */
> +        return aio_task_pool_status(state->pool);
> +    }
> +
> +    t = state->t;
> +    if (t->offset + t->bytes != pos) {
> +        /* Normally this branch is not reachable from migration */
> +        return bs->drv->bdrv_save_vmstate(bs, qiov, pos);
> +    }
> +
> +    off = 0;
> +    while (1) {

"while (aio_task_pool_status(state->pool) == 0)" + "return aio_task_pool_status(state->pool)" after loop will improve interactivity on failure path, but shouldn't be necessary.

> +        to_copy = MIN(qiov->size - off, buf_size - t->bytes);
> +        qemu_iovec_to_buf(qiov, off, t->buf + t->bytes, to_copy);
> +        t->bytes += to_copy;
> +        if (t->bytes < buf_size) {
> +            return qiov->size;

Interesting: we are substituting .bdrv_save_vmstate with this function. There are two existing implementations now:

qcow2_save_vmstate, which doesn't care to return qiov->size, and sd_save_vmstate, which does.

So, it seems safe to return qiov->size now, but I'm sure that it's actually unused and should be
refactored to return 0 on success everywhere.

> +        }
> +
> +        aio_task_pool_start_task(state->pool, &t->task);
> +
> +        pos += to_copy;
> +        off += to_copy;
> +        state->t = t = bdrv_vmstate_task_create(bs, pos, buf_size);
> +    }
> +}
> +
>   static int coroutine_fn
>   bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
>                      bool is_read)
> @@ -2655,7 +2751,7 @@ bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
>           if (is_read) {
>               ret = drv->bdrv_load_vmstate(bs, qiov, pos);
>           } else {
> -            ret = drv->bdrv_save_vmstate(bs, qiov, pos);
> +            ret = bdrv_co_do_save_vmstate(bs, qiov, pos);
>           }
>       } else if (bs->file) {
>           ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
> @@ -2726,7 +2822,30 @@ int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
>   
>   static int coroutine_fn bdrv_co_flush_vmstate(BlockDriverState *bs)
>   {
> -    return 0;
> +    int err;

nitpicking: you use "err" to store return codes, but everywhere in this file (and mostly in block/*) "ret" is used for it.

> +    BdrvSaveVMState *state = bs->savevm_state;
> +
> +    if (bs->drv->bdrv_save_vmstate == NULL && bs->file != NULL) {
> +        return bdrv_co_flush_vmstate(bs->file->bs);
> +    }
> +    if (state == NULL) {
> +        return 0;
> +    }
> +
> +    if (aio_task_pool_status(state->pool) >= 0) {
> +        /* We are on success path, commit last chunk if possible */
> +        aio_task_pool_start_task(state->pool, &state->t->task);
> +    }
> +
> +    aio_task_pool_wait_all(state->pool);
> +    err = aio_task_pool_status(state->pool);
> +
> +    aio_task_pool_free(state->pool);
> +    g_free(state);
> +
> +    bs->savevm_state = NULL;
> +
> +    return err;
>   }
>   
>   static int coroutine_fn bdrv_flush_vmstate_co_entry(void *opaque)
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index 791de6a59c..f90f0e8b6a 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -61,6 +61,8 @@
>   
>   #define BLOCK_PROBE_BUF_SIZE        512
>   
> +#define BDRV_VMSTATE_WORKERS_MAX    8
> +
>   enum BdrvTrackedRequestType {
>       BDRV_TRACKED_READ,
>       BDRV_TRACKED_WRITE,
> @@ -784,6 +786,9 @@ struct BdrvChild {
>       QLIST_ENTRY(BdrvChild) next_parent;
>   };
>   
> +
> +typedef struct BdrvSaveVMState BdrvSaveVMState;
> +
>   /*
>    * Note: the function bdrv_append() copies and swaps contents of
>    * BlockDriverStates, so if you add new fields to this struct, please
> @@ -947,6 +952,9 @@ struct BlockDriverState {
>   
>       /* BdrvChild links to this node may never be frozen */
>       bool never_freeze;
> +
> +    /* Intermediate buffer for VM state saving from snapshot creation code */
> +    BdrvSaveVMState *savevm_state;
>   };
>   
>   struct BlockBackendRootState {
> 

Minor improvements and further refactoring may be done in separate, I'm OK with it as is:
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Denis V. Lunev June 18, 2020, 11:07 a.m. UTC | #2
On 6/18/20 1:56 PM, Vladimir Sementsov-Ogievskiy wrote:
> 16.06.2020 19:20, Denis V. Lunev wrote:
>> This patch does 2 standard basic things:
>> - it creates intermediate buffer for all writes from QEMU migration code
>>    to block driver,
>> - this buffer is sent to disk asynchronously, allowing several writes to
>>    run in parallel.
>>
>> Thus bdrv_vmstate_write() is becoming asynchronous. All pending
>> operations
>> completion are performed in newly invented bdrv_flush_vmstate().
>>
>> In general, migration code is fantastically inefficent (by observation),
>> buffers are not aligned and sent with arbitrary pieces, a lot of time
>> less than 100 bytes at a chunk, which results in read-modify-write
>> operations if target file descriptor is opened with O_DIRECT. It should
>> also be noted that all operations are performed into unallocated image
>> blocks, which also suffer due to partial writes to such new clusters
>> even on cached file descriptors.
>>
>> Snapshot creation time (2 GB Fedora-31 VM running over NVME storage):
>>                  original     fixed
>> cached:          1.79s       1.27s
>> non-cached:      3.29s       0.81s
>>
>> The difference over HDD would be more significant :)
>>
>> Signed-off-by: Denis V. Lunev <den@openvz.org>
>> CC: Kevin Wolf <kwolf@redhat.com>
>> CC: Max Reitz <mreitz@redhat.com>
>> CC: Stefan Hajnoczi <stefanha@redhat.com>
>> CC: Fam Zheng <fam@euphon.net>
>> CC: Juan Quintela <quintela@redhat.com>
>> CC: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
>> CC: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> CC: Denis Plotnikov <dplotnikov@virtuozzo.com>
>> ---
>>   block/io.c                | 123 +++++++++++++++++++++++++++++++++++++-
>>   include/block/block_int.h |   8 +++
>>   2 files changed, 129 insertions(+), 2 deletions(-)
>>
>> diff --git a/block/io.c b/block/io.c
>> index 8718df4ea8..1979098c02 100644
>> --- a/block/io.c
>> +++ b/block/io.c
>> @@ -26,6 +26,7 @@
>>   #include "trace.h"
>>   #include "sysemu/block-backend.h"
>>   #include "block/aio-wait.h"
>> +#include "block/aio_task.h"
>>   #include "block/blockjob.h"
>>   #include "block/blockjob_int.h"
>>   #include "block/block_int.h"
>> @@ -33,6 +34,7 @@
>>   #include "qapi/error.h"
>>   #include "qemu/error-report.h"
>>   #include "qemu/main-loop.h"
>> +#include "qemu/units.h"
>>   #include "sysemu/replay.h"
>>     /* Maximum bounce buffer for copy-on-read and write zeroes, in
>> bytes */
>> @@ -2640,6 +2642,100 @@ typedef struct BdrvVmstateCo {
>>       bool                is_read;
>>   } BdrvVmstateCo;
>>   +typedef struct BdrvVMStateTask {
>> +    AioTask task;
>> +
>> +    BlockDriverState *bs;
>> +    int64_t offset;
>> +    void *buf;
>> +    size_t bytes;
>> +} BdrvVMStateTask;
>> +
>> +typedef struct BdrvSaveVMState {
>> +    AioTaskPool *pool;
>> +    BdrvVMStateTask *t;
>> +} BdrvSaveVMState;
>> +
>> +
>> +static coroutine_fn int bdrv_co_vmstate_save_task_entry(AioTask *task)
>> +{
>> +    int err = 0;
>> +    BdrvVMStateTask *t = container_of(task, BdrvVMStateTask, task);
>> +
>> +    if (t->bytes != 0) {
>> +        QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, t->buf,
>> t->bytes);
>> +
>> +        bdrv_inc_in_flight(t->bs);
>> +        err = t->bs->drv->bdrv_save_vmstate(t->bs, &qiov, t->offset);
>> +        bdrv_dec_in_flight(t->bs);
>> +    }
>> +
>> +    qemu_vfree(t->buf);
>> +    return err;
>> +}
>> +
>> +static BdrvVMStateTask *bdrv_vmstate_task_create(BlockDriverState *bs,
>> +                                                 int64_t pos, size_t
>> size)
>> +{
>> +    BdrvVMStateTask *t = g_new(BdrvVMStateTask, 1);
>> +
>> +    *t = (BdrvVMStateTask) {
>> +        .task.func = bdrv_co_vmstate_save_task_entry,
>> +        .buf = qemu_blockalign(bs, size),
>> +        .offset = pos,
>> +        .bs = bs,
>> +    };
>> +
>> +    return t;
>> +}
>> +
>> +static int bdrv_co_do_save_vmstate(BlockDriverState *bs,
>> QEMUIOVector *qiov,
>> +                                   int64_t pos)
>> +{
>> +    BdrvSaveVMState *state = bs->savevm_state;
>> +    BdrvVMStateTask *t;
>> +    size_t buf_size = MAX(bdrv_get_cluster_size(bs), 1 * MiB);
>> +    size_t to_copy, off;
>> +
>> +    if (state == NULL) {
>> +        state = g_new(BdrvSaveVMState, 1);
>> +        *state = (BdrvSaveVMState) {
>> +            .pool = aio_task_pool_new(BDRV_VMSTATE_WORKERS_MAX),
>> +            .t = bdrv_vmstate_task_create(bs, pos, buf_size),
>> +        };
>> +
>> +        bs->savevm_state = state;
>> +    }
>> +
>> +    if (aio_task_pool_status(state->pool) < 0) {
>> +        /* Caller is responsible for cleanup. We should block all
>> further
>> +         * save operations for this exact state */
>> +        return aio_task_pool_status(state->pool);
>> +    }
>> +
>> +    t = state->t;
>> +    if (t->offset + t->bytes != pos) {
>> +        /* Normally this branch is not reachable from migration */
>> +        return bs->drv->bdrv_save_vmstate(bs, qiov, pos);
>> +    }
>> +
>> +    off = 0;
>> +    while (1) {
>
> "while (aio_task_pool_status(state->pool) == 0)" + "return
> aio_task_pool_status(state->pool)" after loop will improve
> interactivity on failure path, but shouldn't be necessary.
>
>> +        to_copy = MIN(qiov->size - off, buf_size - t->bytes);
>> +        qemu_iovec_to_buf(qiov, off, t->buf + t->bytes, to_copy);
>> +        t->bytes += to_copy;
>> +        if (t->bytes < buf_size) {
>> +            return qiov->size;
>
> Intersting: we are substituting .bdrv_save_vmstate by this function.
> There are two existing now:
>
> qcow2_save_vmstate, which doesn't care to return qiov->size and
> sd_save_vmstate which does it.
>
> So, it seems safe to return qiov->size now, but I'm sure that it's
> actually unused and should be
> refactored to return 0 on success everywhere.

This looks like my mistake now. I have spent some time
with error codes working with Eric's suggestions and
believe that 0 should be returned now.

Will fix in next revision.

Den

Patch
diff mbox series

diff --git a/block/io.c b/block/io.c
index 8718df4ea8..1979098c02 100644
--- a/block/io.c
+++ b/block/io.c
@@ -26,6 +26,7 @@ 
 #include "trace.h"
 #include "sysemu/block-backend.h"
 #include "block/aio-wait.h"
+#include "block/aio_task.h"
 #include "block/blockjob.h"
 #include "block/blockjob_int.h"
 #include "block/block_int.h"
@@ -33,6 +34,7 @@ 
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "qemu/main-loop.h"
+#include "qemu/units.h"
 #include "sysemu/replay.h"
 
 /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
@@ -2640,6 +2642,100 @@  typedef struct BdrvVmstateCo {
     bool                is_read;
 } BdrvVmstateCo;
 
+typedef struct BdrvVMStateTask {
+    AioTask task;
+
+    BlockDriverState *bs;
+    int64_t offset;
+    void *buf;
+    size_t bytes;
+} BdrvVMStateTask;
+
+typedef struct BdrvSaveVMState {
+    AioTaskPool *pool;
+    BdrvVMStateTask *t;
+} BdrvSaveVMState;
+
+
+static coroutine_fn int bdrv_co_vmstate_save_task_entry(AioTask *task)
+{
+    int err = 0;
+    BdrvVMStateTask *t = container_of(task, BdrvVMStateTask, task);
+
+    if (t->bytes != 0) {
+        QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, t->buf, t->bytes);
+
+        bdrv_inc_in_flight(t->bs);
+        err = t->bs->drv->bdrv_save_vmstate(t->bs, &qiov, t->offset);
+        bdrv_dec_in_flight(t->bs);
+    }
+
+    qemu_vfree(t->buf);
+    return err;
+}
+
+static BdrvVMStateTask *bdrv_vmstate_task_create(BlockDriverState *bs,
+                                                 int64_t pos, size_t size)
+{
+    BdrvVMStateTask *t = g_new(BdrvVMStateTask, 1);
+
+    *t = (BdrvVMStateTask) {
+        .task.func = bdrv_co_vmstate_save_task_entry,
+        .buf = qemu_blockalign(bs, size),
+        .offset = pos,
+        .bs = bs,
+    };
+
+    return t;
+}
+
+static int bdrv_co_do_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
+                                   int64_t pos)
+{
+    BdrvSaveVMState *state = bs->savevm_state;
+    BdrvVMStateTask *t;
+    size_t buf_size = MAX(bdrv_get_cluster_size(bs), 1 * MiB);
+    size_t to_copy, off;
+
+    if (state == NULL) {
+        state = g_new(BdrvSaveVMState, 1);
+        *state = (BdrvSaveVMState) {
+            .pool = aio_task_pool_new(BDRV_VMSTATE_WORKERS_MAX),
+            .t = bdrv_vmstate_task_create(bs, pos, buf_size),
+        };
+
+        bs->savevm_state = state;
+    }
+
+    if (aio_task_pool_status(state->pool) < 0) {
+        /* Caller is responsible for cleanup. We should block all further
+         * save operations for this exact state */
+        return aio_task_pool_status(state->pool);
+    }
+
+    t = state->t;
+    if (t->offset + t->bytes != pos) {
+        /* Normally this branch is not reachable from migration */
+        return bs->drv->bdrv_save_vmstate(bs, qiov, pos);
+    }
+
+    off = 0;
+    while (1) {
+        to_copy = MIN(qiov->size - off, buf_size - t->bytes);
+        qemu_iovec_to_buf(qiov, off, t->buf + t->bytes, to_copy);
+        t->bytes += to_copy;
+        if (t->bytes < buf_size) {
+            return qiov->size;
+        }
+
+        aio_task_pool_start_task(state->pool, &t->task);
+
+        pos += to_copy;
+        off += to_copy;
+        state->t = t = bdrv_vmstate_task_create(bs, pos, buf_size);
+    }
+}
+
 static int coroutine_fn
 bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                    bool is_read)
@@ -2655,7 +2751,7 @@  bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
         if (is_read) {
             ret = drv->bdrv_load_vmstate(bs, qiov, pos);
         } else {
-            ret = drv->bdrv_save_vmstate(bs, qiov, pos);
+            ret = bdrv_co_do_save_vmstate(bs, qiov, pos);
         }
     } else if (bs->file) {
         ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
@@ -2726,7 +2822,30 @@  int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
 
 static int coroutine_fn bdrv_co_flush_vmstate(BlockDriverState *bs)
 {
-    return 0;
+    int err;
+    BdrvSaveVMState *state = bs->savevm_state;
+
+    if (bs->drv->bdrv_save_vmstate == NULL && bs->file != NULL) {
+        return bdrv_co_flush_vmstate(bs->file->bs);
+    }
+    if (state == NULL) {
+        return 0;
+    }
+
+    if (aio_task_pool_status(state->pool) >= 0) {
+        /* We are on success path, commit last chunk if possible */
+        aio_task_pool_start_task(state->pool, &state->t->task);
+    }
+
+    aio_task_pool_wait_all(state->pool);
+    err = aio_task_pool_status(state->pool);
+
+    aio_task_pool_free(state->pool);
+    g_free(state);
+
+    bs->savevm_state = NULL;
+
+    return err;
 }
 
 static int coroutine_fn bdrv_flush_vmstate_co_entry(void *opaque)
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 791de6a59c..f90f0e8b6a 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -61,6 +61,8 @@ 
 
 #define BLOCK_PROBE_BUF_SIZE        512
 
+#define BDRV_VMSTATE_WORKERS_MAX    8
+
 enum BdrvTrackedRequestType {
     BDRV_TRACKED_READ,
     BDRV_TRACKED_WRITE,
@@ -784,6 +786,9 @@  struct BdrvChild {
     QLIST_ENTRY(BdrvChild) next_parent;
 };
 
+
+typedef struct BdrvSaveVMState BdrvSaveVMState;
+
 /*
  * Note: the function bdrv_append() copies and swaps contents of
  * BlockDriverStates, so if you add new fields to this struct, please
@@ -947,6 +952,9 @@  struct BlockDriverState {
 
     /* BdrvChild links to this node may never be frozen */
     bool never_freeze;
+
+    /* Intermediate buffer for VM state saving from snapshot creation code */
+    BdrvSaveVMState *savevm_state;
 };
 
 struct BlockBackendRootState {