[7/7] block/io: improve loadvm performance
diff mbox series

Message ID 20200703173538.29296-8-den@openvz.org
State New
Headers show
Series
  • block: seriously improve savevm/loadvm performance
Related show

Commit Message

Denis V. Lunev July 3, 2020, 5:35 p.m. UTC
This patch creates intermediate buffer for reading from block driver
state and performs read-ahead to this buffer. Snapshot code performs
reads sequentially and thus we know what offsets will be required
and when they will become not needed.

Results are fantastic. Switch to snapshot times of 2GB Fedora 31 VM
over NVME storage are the following:
                original     fixed
cached:          1.84s       1.16s
non-cached:     12.74s       1.27s

The difference over HDD would be more significant :)

Signed-off-by: Denis V. Lunev <den@openvz.org>
CC: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
CC: Kevin Wolf <kwolf@redhat.com>
CC: Max Reitz <mreitz@redhat.com>
CC: Stefan Hajnoczi <stefanha@redhat.com>
CC: Fam Zheng <fam@euphon.net>
CC: Juan Quintela <quintela@redhat.com>
CC: Denis Plotnikov <dplotnikov@virtuozzo.com>
---
 block/block-backend.c     |  12 +-
 block/io.c                | 254 +++++++++++++++++++++++++++++++++++++-
 include/block/block_int.h |   3 +
 3 files changed, 265 insertions(+), 4 deletions(-)

Comments

Vladimir Sementsov-Ogievskiy July 8, 2020, 6:38 p.m. UTC | #1
03.07.2020 20:35, Denis V. Lunev wrote:
> This patch creates intermediate buffer for reading from block driver
> state and performs read-ahead to this buffer. Snapshot code performs
> reads sequentially and thus we know what offsets will be required
> and when they will become not needed.
> 
> Results are fantastic. Switch to snapshot times of 2GB Fedora 31 VM
> over NVME storage are the following:
>                  original     fixed
> cached:          1.84s       1.16s
> non-cached:     12.74s       1.27s
> 
> The difference over HDD would be more significant :)
> 

[..]

> +static coroutine_fn int bdrv_co_vmstate_load_task_entry(AioTask *task)
> +{
> +    int err = 0;
> +    BdrvLoadVMStateTask *t = container_of(task, BdrvLoadVMStateTask, task);
> +    BdrvLoadVMChunk *c = t->chunk;
> +    BdrvLoadVMState *state = t->bs->loadvm_state;
> +    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, c->buf, c->bytes);
> +
> +    bdrv_inc_in_flight(t->bs);
> +    err = t->bs->drv->bdrv_load_vmstate(t->bs, &qiov, c->offset);
> +    bdrv_dec_in_flight(t->bs);
> +
> +    qemu_co_mutex_lock(&state->lock);

In my understanding, we don't need this mutex. We need coroutine mutex, when there
is a critical section with an yield inside, like accessing metadata in qcow2 code.

If all the critical sections doesn't have an yield inside they should not intersect
anyway, as all these functions should be executed in coroutines in the aio context
of this bs, i.e. not in parallel iothreads simultaneously.

> +    QLIST_REMOVE(c, list);
> +    if (err == 0) {
> +        QLIST_INSERT_HEAD(&state->chunks, c, list);
> +    } else {
> +        bdrv_free_loadvm_chunk(c);
> +    }
> +    qemu_co_mutex_unlock(&state->lock);
> +    qemu_co_queue_restart_all(&state->waiters);
> +
> +    return err;
> +}

[..]

> +static int bdrv_co_do_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
> +                                   int64_t pos)
> +{
> +    BdrvLoadVMState *state = bs->loadvm_state;
> +    BdrvLoadVMChunk *c;
> +    size_t off;
> +    int64_t start_pos = pos;
> +
> +    if (state == NULL) {
> +        if (pos != 0) {
> +            goto slow_path;
> +        }
> +
> +        state = g_new(BdrvLoadVMState, 1);
> +        *state = (BdrvLoadVMState) {
> +            .pool = aio_task_pool_new(BDRV_VMSTATE_WORKERS_MAX),
> +            .chunks = QLIST_HEAD_INITIALIZER(state->chunks),
> +            .loading = QLIST_HEAD_INITIALIZER(state->loading),
> +        };
> +        qemu_co_mutex_init(&state->lock);
> +        qemu_co_queue_init(&state->waiters);
> +
> +        bs->loadvm_state = state;
> +
> +        bdrv_co_loadvmstate_start(bs);
> +    }
> +
> +    if (state->offset != pos) {
> +        goto slow_path;
> +    }
> +
> +    off = 0;
> +
> +    qemu_co_mutex_lock(&state->lock);
> +    while (off < qiov->size && aio_task_pool_status(state->pool) == 0) {
> +        c = bdrv_co_find_loadvmstate_chunk(pos, QLIST_FIRST(&state->chunks));
> +        if (c != NULL) {
> +            ssize_t chunk_off = pos - c->offset;
> +            ssize_t to_copy = MIN(qiov->size - off, c->bytes - chunk_off);
> +
> +            qemu_iovec_from_buf(qiov, off, c->buf + chunk_off, to_copy);
> +
> +            off += to_copy;
> +            pos += to_copy;
> +
> +            if (pos == c->offset + c->bytes) {
> +                state->chunk_count--;
> +                /* End of buffer, discard it from the list */
> +                QLIST_REMOVE(c, list);
> +
> +                /*
> +                 * Start loading next chunk. The slot in the pool should
> +                 * always be free for this purpose at the moment.
> +                 *
> +                 * There is a problem with the end of the stream. This code
> +                 * starts to read the data beyond the end of the saved state.
> +                 * The code in low level should be ready to such behavior but
> +                 * we will have unnecessary BDRV_VMSTATE_WORKERS_MAX chunks
> +                 * fully zeroed. This is not good, but acceptable.

As I understand, we don't know where is the end of state, yes? So, we can't avoid it anyway. Should not be a problem.

> +                 */
> +                bdrv_co_loadvmstate_next(bs, c);
> +            }
> +
> +            state->offset += to_copy;
> +            continue;
> +        }
> +
> +        c = bdrv_co_find_loadvmstate_chunk(pos, QLIST_FIRST(&state->loading));
> +        if (c != NULL) {
> +            qemu_co_queue_wait(&state->waiters, &state->lock);
> +            continue;
> +        }
> +
> +        /*
> +         * This should not happen normally. This point could be reached only
> +         * if we have had some parallel requests. Fallback to slow load.
> +         */
> +        qemu_co_mutex_unlock(&state->lock);
> +
> +slow_path:

A kind of taste, but I'd prefer to keep it in the end of the function, avoiding gotos leading to loop body.

> +        return bs->drv->bdrv_load_vmstate(bs, qiov, start_pos);
> +    }
> +    qemu_co_mutex_unlock(&state->lock);
> +
> +    return aio_task_pool_status(state->pool);
> +}
> +
>   static int coroutine_fn
>   bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
>                      bool is_read)
> @@ -2752,7 +2955,7 @@ bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
>           ret = -ENOMEDIUM;
>       } else if (drv->bdrv_load_vmstate) {
>           if (is_read) {
> -            ret = drv->bdrv_load_vmstate(bs, qiov, pos);
> +            ret = bdrv_co_do_load_vmstate(bs, qiov, pos);
>           } else {
>               ret = bdrv_co_do_save_vmstate(bs, qiov, pos);
>           }
> @@ -2823,13 +3026,13 @@ int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
>       return bdrv_rw_vmstate(bs, qiov, pos, true);
>   }
>   
> -static int coroutine_fn bdrv_co_finalize_vmstate(BlockDriverState *bs)
> +static int coroutine_fn bdrv_co_finalize_save_vmstate(BlockDriverState *bs)
>   {
>       int err;
>       BdrvSaveVMState *state = bs->savevm_state;
>   
>       if (bs->drv->bdrv_save_vmstate == NULL && bs->file != NULL) {
> -        return bdrv_co_finalize_vmstate(bs->file->bs);
> +        return bdrv_co_finalize_save_vmstate(bs->file->bs);
>       }
>       if (state == NULL) {
>           return 0;
> @@ -2851,6 +3054,51 @@ static int coroutine_fn bdrv_co_finalize_vmstate(BlockDriverState *bs)
>       return err;
>   }
>   
> +static int coroutine_fn bdrv_co_finalize_load_vmstate(BlockDriverState *bs)
> +{
> +    int err;
> +    BdrvLoadVMState *state = bs->loadvm_state;
> +    BdrvLoadVMChunk *c, *tmp;
> +
> +    if (bs->drv->bdrv_load_vmstate == NULL && bs->file != NULL) {
> +        return bdrv_co_finalize_load_vmstate(bs->file->bs);
> +    }
> +    if (state == NULL) {
> +        return 0;
> +    }
> +
> +    aio_task_pool_wait_all(state->pool);
> +    err = aio_task_pool_status(state->pool);
> +    aio_task_pool_free(state->pool);
> +
> +    QLIST_FOREACH(c, &state->loading, list) {
> +        assert(1);  /* this list must be empty as all tasks are committed */
> +    }

assert(QLIST_EMPTY(&state->loading));

> +    QLIST_FOREACH_SAFE(c, &state->chunks, list, tmp) {
> +        QLIST_REMOVE(c, list);
> +        bdrv_free_loadvm_chunk(c);
> +    }
> +
> +    g_free(state);
> +
> +    bs->loadvm_state = NULL;
> +
> +    return err;
> +}
> +
> +static int coroutine_fn bdrv_co_finalize_vmstate(BlockDriverState *bs)
> +{
> +    int err1 = bdrv_co_finalize_save_vmstate(bs);
> +    int err2 = bdrv_co_finalize_load_vmstate(bs);
> +    if (err1 < 0) {
> +        return err1;
> +    }
> +    if (err2 < 0) {
> +        return err2;
> +    }
> +    return 0;
> +}
> +
>   static int coroutine_fn bdrv_finalize_vmstate_co_entry(void *opaque)
>   {
>       return bdrv_co_finalize_vmstate(opaque);
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index f90f0e8b6a..0942578a74 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -788,6 +788,7 @@ struct BdrvChild {
>   
>   
>   typedef struct BdrvSaveVMState BdrvSaveVMState;
> +typedef struct BdrvLoadVMState BdrvLoadVMState;
>   
>   /*
>    * Note: the function bdrv_append() copies and swaps contents of
> @@ -955,6 +956,8 @@ struct BlockDriverState {
>   
>       /* Intermediate buffer for VM state saving from snapshot creation code */
>       BdrvSaveVMState *savevm_state;
> +    /* Intermediate buffer for VM state loading */
> +    BdrvLoadVMState *loadvm_state;
>   };
>   
>   struct BlockBackendRootState {
>

Patch
diff mbox series

diff --git a/block/block-backend.c b/block/block-backend.c
index 5bb11c8e01..09773b3e37 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -2201,11 +2201,21 @@  int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
 
 int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size)
 {
+    int ret, ret2;
+
     if (!blk_is_available(blk)) {
         return -ENOMEDIUM;
     }
 
-    return bdrv_load_vmstate(blk_bs(blk), buf, pos, size);
+    ret = bdrv_load_vmstate(blk_bs(blk), buf, pos, size);
+    ret2 = bdrv_finalize_vmstate(blk_bs(blk));
+    if (ret < 0) {
+        return ret;
+    }
+    if (ret2 < 0) {
+        return ret2;
+    }
+    return ret;
 }
 
 int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz)
diff --git a/block/io.c b/block/io.c
index 71a696deb7..decb850f1a 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2739,6 +2739,209 @@  static int bdrv_co_do_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
     }
 }
 
+
+typedef struct BdrvLoadVMChunk {
+    void *buf;
+    uint64_t offset;
+    ssize_t bytes;
+
+    QLIST_ENTRY(BdrvLoadVMChunk) list;
+} BdrvLoadVMChunk;
+
+typedef struct BdrvLoadVMState {
+    AioTaskPool *pool;
+
+    int64_t offset;
+    int64_t last_loaded;
+
+    int chunk_count;
+    QLIST_HEAD(, BdrvLoadVMChunk) chunks;
+    QLIST_HEAD(, BdrvLoadVMChunk) loading;
+    CoMutex lock;
+    CoQueue waiters;
+} BdrvLoadVMState;
+
+typedef struct BdrvLoadVMStateTask {
+    AioTask task;
+
+    BlockDriverState *bs;
+    BdrvLoadVMChunk *chunk;
+} BdrvLoadVMStateTask;
+
+static BdrvLoadVMChunk *bdrv_co_find_loadvmstate_chunk(int64_t pos,
+                                                       BdrvLoadVMChunk *c)
+{
+    for (; c != NULL; c = QLIST_NEXT(c, list)) {
+        if (c->offset <= pos && c->offset + c->bytes > pos) {
+            return c;
+        }
+    }
+
+    return NULL;
+}
+
+static void bdrv_free_loadvm_chunk(BdrvLoadVMChunk *c)
+{
+    qemu_vfree(c->buf);
+    g_free(c);
+}
+
+static coroutine_fn int bdrv_co_vmstate_load_task_entry(AioTask *task)
+{
+    int err = 0;
+    BdrvLoadVMStateTask *t = container_of(task, BdrvLoadVMStateTask, task);
+    BdrvLoadVMChunk *c = t->chunk;
+    BdrvLoadVMState *state = t->bs->loadvm_state;
+    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, c->buf, c->bytes);
+
+    bdrv_inc_in_flight(t->bs);
+    err = t->bs->drv->bdrv_load_vmstate(t->bs, &qiov, c->offset);
+    bdrv_dec_in_flight(t->bs);
+
+    qemu_co_mutex_lock(&state->lock);
+    QLIST_REMOVE(c, list);
+    if (err == 0) {
+        QLIST_INSERT_HEAD(&state->chunks, c, list);
+    } else {
+        bdrv_free_loadvm_chunk(c);
+    }
+    qemu_co_mutex_unlock(&state->lock);
+    qemu_co_queue_restart_all(&state->waiters);
+
+    return err;
+}
+
+
+static void bdrv_co_loadvmstate_next(BlockDriverState *bs, BdrvLoadVMChunk *c)
+{
+    BdrvLoadVMStateTask *t = g_new(BdrvLoadVMStateTask, 1);
+    BdrvLoadVMState *state = bs->loadvm_state;
+
+    qemu_co_mutex_assert_locked(&state->lock);
+
+    c->offset = state->last_loaded;
+
+    *t = (BdrvLoadVMStateTask) {
+        .task.func = bdrv_co_vmstate_load_task_entry,
+        .bs = bs,
+        .chunk = c,
+    };
+
+    QLIST_INSERT_HEAD(&state->loading, t->chunk, list);
+    state->chunk_count++;
+    state->last_loaded += c->bytes;
+
+    qemu_co_mutex_unlock(&state->lock);
+    aio_task_pool_start_task(state->pool, &t->task);
+    qemu_co_mutex_lock(&state->lock);
+}
+
+
+static void bdrv_co_loadvmstate_start(BlockDriverState *bs)
+{
+    int i;
+    size_t buf_size = MAX(bdrv_get_cluster_size(bs), 1 * MiB);
+    BdrvLoadVMState *state = bs->loadvm_state;
+
+    qemu_co_mutex_lock(&state->lock);
+    for (i = 0; i < BDRV_VMSTATE_WORKERS_MAX; i++) {
+        BdrvLoadVMChunk *c = g_new0(BdrvLoadVMChunk, 1);
+
+        c->buf = qemu_blockalign(bs, buf_size);
+        c->bytes = buf_size;
+
+        bdrv_co_loadvmstate_next(bs, c);
+    }
+    qemu_co_mutex_unlock(&state->lock);
+}
+
+static int bdrv_co_do_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
+                                   int64_t pos)
+{
+    BdrvLoadVMState *state = bs->loadvm_state;
+    BdrvLoadVMChunk *c;
+    size_t off;
+    int64_t start_pos = pos;
+
+    if (state == NULL) {
+        if (pos != 0) {
+            goto slow_path;
+        }
+
+        state = g_new(BdrvLoadVMState, 1);
+        *state = (BdrvLoadVMState) {
+            .pool = aio_task_pool_new(BDRV_VMSTATE_WORKERS_MAX),
+            .chunks = QLIST_HEAD_INITIALIZER(state->chunks),
+            .loading = QLIST_HEAD_INITIALIZER(state->loading),
+        };
+        qemu_co_mutex_init(&state->lock);
+        qemu_co_queue_init(&state->waiters);
+
+        bs->loadvm_state = state;
+
+        bdrv_co_loadvmstate_start(bs);
+    }
+
+    if (state->offset != pos) {
+        goto slow_path;
+    }
+
+    off = 0;
+
+    qemu_co_mutex_lock(&state->lock);
+    while (off < qiov->size && aio_task_pool_status(state->pool) == 0) {
+        c = bdrv_co_find_loadvmstate_chunk(pos, QLIST_FIRST(&state->chunks));
+        if (c != NULL) {
+            ssize_t chunk_off = pos - c->offset;
+            ssize_t to_copy = MIN(qiov->size - off, c->bytes - chunk_off);
+
+            qemu_iovec_from_buf(qiov, off, c->buf + chunk_off, to_copy);
+
+            off += to_copy;
+            pos += to_copy;
+
+            if (pos == c->offset + c->bytes) {
+                state->chunk_count--;
+                /* End of buffer, discard it from the list */
+                QLIST_REMOVE(c, list);
+
+                /*
+                 * Start loading next chunk. The slot in the pool should
+                 * always be free for this purpose at the moment.
+                 *
+                 * There is a problem with the end of the stream. This code
+                 * starts to read the data beyond the end of the saved state.
+                 * The code in low level should be ready to such behavior but
+                 * we will have unnecessary BDRV_VMSTATE_WORKERS_MAX chunks
+                 * fully zeroed. This is not good, but acceptable.
+                 */
+                bdrv_co_loadvmstate_next(bs, c);
+            }
+
+            state->offset += to_copy;
+            continue;
+        }
+
+        c = bdrv_co_find_loadvmstate_chunk(pos, QLIST_FIRST(&state->loading));
+        if (c != NULL) {
+            qemu_co_queue_wait(&state->waiters, &state->lock);
+            continue;
+        }
+
+        /*
+         * This should not happen normally. This point could be reached only
+         * if we have had some parallel requests. Fallback to slow load.
+         */
+        qemu_co_mutex_unlock(&state->lock);
+
+slow_path:
+        return bs->drv->bdrv_load_vmstate(bs, qiov, start_pos);
+    }
+    qemu_co_mutex_unlock(&state->lock);
+
+    return aio_task_pool_status(state->pool);
+}
+
 static int coroutine_fn
 bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                    bool is_read)
@@ -2752,7 +2955,7 @@  bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
         ret = -ENOMEDIUM;
     } else if (drv->bdrv_load_vmstate) {
         if (is_read) {
-            ret = drv->bdrv_load_vmstate(bs, qiov, pos);
+            ret = bdrv_co_do_load_vmstate(bs, qiov, pos);
         } else {
             ret = bdrv_co_do_save_vmstate(bs, qiov, pos);
         }
@@ -2823,13 +3026,13 @@  int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
     return bdrv_rw_vmstate(bs, qiov, pos, true);
 }
 
-static int coroutine_fn bdrv_co_finalize_vmstate(BlockDriverState *bs)
+static int coroutine_fn bdrv_co_finalize_save_vmstate(BlockDriverState *bs)
 {
     int err;
     BdrvSaveVMState *state = bs->savevm_state;
 
     if (bs->drv->bdrv_save_vmstate == NULL && bs->file != NULL) {
-        return bdrv_co_finalize_vmstate(bs->file->bs);
+        return bdrv_co_finalize_save_vmstate(bs->file->bs);
     }
     if (state == NULL) {
         return 0;
@@ -2851,6 +3054,51 @@  static int coroutine_fn bdrv_co_finalize_vmstate(BlockDriverState *bs)
     return err;
 }
 
+static int coroutine_fn bdrv_co_finalize_load_vmstate(BlockDriverState *bs)
+{
+    int err;
+    BdrvLoadVMState *state = bs->loadvm_state;
+    BdrvLoadVMChunk *c, *tmp;
+
+    if (bs->drv->bdrv_load_vmstate == NULL && bs->file != NULL) {
+        return bdrv_co_finalize_load_vmstate(bs->file->bs);
+    }
+    if (state == NULL) {
+        return 0;
+    }
+
+    aio_task_pool_wait_all(state->pool);
+    err = aio_task_pool_status(state->pool);
+    aio_task_pool_free(state->pool);
+
+    QLIST_FOREACH(c, &state->loading, list) {
+        assert(1);  /* this list must be empty as all tasks are committed */
+    }
+    QLIST_FOREACH_SAFE(c, &state->chunks, list, tmp) {
+        QLIST_REMOVE(c, list);
+        bdrv_free_loadvm_chunk(c);
+    }
+
+    g_free(state);
+
+    bs->loadvm_state = NULL;
+
+    return err;
+}
+
+static int coroutine_fn bdrv_co_finalize_vmstate(BlockDriverState *bs)
+{
+    int err1 = bdrv_co_finalize_save_vmstate(bs);
+    int err2 = bdrv_co_finalize_load_vmstate(bs);
+    if (err1 < 0) {
+        return err1;
+    }
+    if (err2 < 0) {
+        return err2;
+    }
+    return 0;
+}
+
 static int coroutine_fn bdrv_finalize_vmstate_co_entry(void *opaque)
 {
     return bdrv_co_finalize_vmstate(opaque);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index f90f0e8b6a..0942578a74 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -788,6 +788,7 @@  struct BdrvChild {
 
 
 typedef struct BdrvSaveVMState BdrvSaveVMState;
+typedef struct BdrvLoadVMState BdrvLoadVMState;
 
 /*
  * Note: the function bdrv_append() copies and swaps contents of
@@ -955,6 +956,8 @@  struct BlockDriverState {
 
     /* Intermediate buffer for VM state saving from snapshot creation code */
     BdrvSaveVMState *savevm_state;
+    /* Intermediate buffer for VM state loading */
+    BdrvLoadVMState *loadvm_state;
 };
 
 struct BlockBackendRootState {