diff mbox series

[v4,3/3] block/qcow2: introduce discard_rw_lock: fix discarding host clusters

Message ID 20210319100811.342464-4-vsementsov@virtuozzo.com (mailing list archive)
State New, archived
Headers show
Series qcow2: fix parallel rewrite and discard (rw-lock) | expand

Commit Message

Vladimir Sementsov-Ogievskiy March 19, 2021, 10:08 a.m. UTC
There is a bug in qcow2: host cluster can be discarded (refcount
becomes 0) and reused during data write. In this case data write may
pollute another data cluster (recently allocated) or even metadata.

To fix the issue introduce rw-lock: hold read-lock during data writing
take write-lock when refcount becomes 0.

Enable qcow2-discard-during-rewrite as it is fixed.

Suggested-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
 block/qcow2.h                                 | 20 +++++
 block/qcow2-refcount.c                        | 22 ++++++
 block/qcow2.c                                 | 73 +++++++++++++++++--
 .../tests/qcow2-discard-during-rewrite        |  2 +-
 4 files changed, 109 insertions(+), 8 deletions(-)
diff mbox series

Patch

diff --git a/block/qcow2.h b/block/qcow2.h
index 0fe5f74ed3..7d387fe39d 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -420,6 +420,26 @@  typedef struct BDRVQcow2State {
      * is to convert the image with the desired compression type set.
      */
     Qcow2CompressionType compression_type;
+
+    /*
+     * discard_rw_lock prevents discarding host cluster when there are in-flight
+     * writes into it. So, in-flight writes are "readers" and the code reducing
+     * cluster refcount to zero in update_refcount() is "writer".
+     *
+     * Data-writes should works as follows:
+     * 1. rd-lock should be acquired in the same s->lock critical section where
+     *    corresponding host clusters were allocated or somehow got from the
+     *    metadata. Otherwise we risk miss discarding the cluster on s->lock
+     *    unlocking (unlock should only schedule another coroutine, not enter
+     *    it, but better be absolutely safe).
+     *
+     * 2. rd-lock should be released after data writing finish.
+     *
+     * For reducing refcount to zero (and therefore allowing reallocating the
+     * host cluster for other needs) it's enough to take rw-lock (to wait for
+     * all in-flight writes) and immediately release it (see update_refcount()).
+     */
+    CoRwlock discard_rw_lock;
 } BDRVQcow2State;
 
 typedef struct Qcow2COWRegion {
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 8e649b008e..bb6842cd8f 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -878,6 +878,28 @@  static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
         } else {
             refcount += addend;
         }
+
+        if (qemu_in_coroutine() && refcount == 0) {
+            /*
+             * Refcount becomes zero, but there are still may be in-fligth
+             * writes, that writing data to the cluster (that's done without
+             * qcow2 mutext held).
+             *
+             * Data writing protected by rd-locked discard_rw_lock. So here
+             * it's enough to take and immediately release wr-lock on it.
+             * We can immediately release it, because new write requests can't
+             * came to cluster which refcount becomes 0 (There should not be any
+             * links from L2 tables to it).
+             *
+             * We can't do it if we are not in coroutine. But if we are not in
+             * coroutine, that also means that we are modifying metadata not
+             * taking qcow2 s->lock mutex, which is wrong as well.. So, let's
+             * hope for a bright future.
+             */
+            qemu_co_rwlock_wrlock(&s->discard_rw_lock);
+            qemu_co_rwlock_unlock(&s->discard_rw_lock);
+        }
+
         if (refcount == 0 && cluster_index < s->free_cluster_index) {
             s->free_cluster_index = cluster_index;
         }
diff --git a/block/qcow2.c b/block/qcow2.c
index 0db1227ac9..aea7aea334 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1899,6 +1899,7 @@  static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
 
     /* Initialise locks */
     qemu_co_mutex_init(&s->lock);
+    qemu_co_rwlock_init(&s->discard_rw_lock);
 
     if (qemu_in_coroutine()) {
         /* From bdrv_co_create.  */
@@ -2182,9 +2183,17 @@  typedef struct Qcow2AioTask {
     QEMUIOVector *qiov;
     uint64_t qiov_offset;
     QCowL2Meta *l2meta; /* only for write */
+    bool rdlock; /* only for write */
 } Qcow2AioTask;
 
 static coroutine_fn int qcow2_co_preadv_task_entry(AioTask *task);
+
+/*
+ * @rdlock: If true, it means that qcow2_add_task is called with discard_rw_lock
+ * rd-locked, and this rd-lock must be transaferred to the task. The task itself
+ * will release the lock. The caller expects that after qcow2_add_task() call
+ * the lock is already released.
+ */
 static coroutine_fn int qcow2_add_task(BlockDriverState *bs,
                                        AioTaskPool *pool,
                                        AioTaskFunc func,
@@ -2194,8 +2203,10 @@  static coroutine_fn int qcow2_add_task(BlockDriverState *bs,
                                        uint64_t bytes,
                                        QEMUIOVector *qiov,
                                        size_t qiov_offset,
-                                       QCowL2Meta *l2meta)
+                                       QCowL2Meta *l2meta,
+                                       bool rdlock)
 {
+    BDRVQcow2State *s = bs->opaque;
     Qcow2AioTask local_task;
     Qcow2AioTask *task = pool ? g_new(Qcow2AioTask, 1) : &local_task;
 
@@ -2209,6 +2220,7 @@  static coroutine_fn int qcow2_add_task(BlockDriverState *bs,
         .bytes = bytes,
         .qiov_offset = qiov_offset,
         .l2meta = l2meta,
+        .rdlock = rdlock,
     };
 
     trace_qcow2_add_task(qemu_coroutine_self(), bs, pool,
@@ -2217,10 +2229,24 @@  static coroutine_fn int qcow2_add_task(BlockDriverState *bs,
                          qiov, qiov_offset);
 
     if (!pool) {
+        /*
+         * func will release rd-lock if needed and caller's expectation would be
+         * satisfied, so we should not care.
+         */
         return func(&task->task);
     }
 
+    /*
+     * We are going to run task in a different coroutine. We can't acquire lock
+     * in one coroutine and release in another. So, the new coroutine should
+     * take it's own rd-lock, and we should release ours one.
+     */
+    task->rdlock = false;
     aio_task_pool_start_task(pool, &task->task);
+    if (rdlock) {
+        assert(task->rdlock); /* caller took ownership */
+        qemu_co_rwlock_unlock(&s->discard_rw_lock);
+    }
 
     return 0;
 }
@@ -2274,6 +2300,7 @@  static coroutine_fn int qcow2_co_preadv_task_entry(AioTask *task)
     Qcow2AioTask *t = container_of(task, Qcow2AioTask, task);
 
     assert(!t->l2meta);
+    assert(!t->rdlock);
 
     return qcow2_co_preadv_task(t->bs, t->subcluster_type,
                                 t->host_offset, t->offset, t->bytes,
@@ -2320,7 +2347,7 @@  static coroutine_fn int qcow2_co_preadv_part(BlockDriverState *bs,
             }
             ret = qcow2_add_task(bs, aio, qcow2_co_preadv_task_entry, type,
                                  host_offset, offset, cur_bytes,
-                                 qiov, qiov_offset, NULL);
+                                 qiov, qiov_offset, NULL, false);
             if (ret < 0) {
                 goto out;
             }
@@ -2483,19 +2510,32 @@  static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
  * Called with s->lock unlocked
  * l2meta  - if not NULL, qcow2_co_pwritev_task() will consume it. Caller must
  *           not use it somehow after qcow2_co_pwritev_task() call
+ *
+ * @rdlock must be non-NULL.
+ * If *@rdlock is true it means that discard_rw_lock is already taken. We should
+ * not reacquire it, but caller expects that we release it.
+ * If *@rdlock is false, we should take it ourselves (and still release in the
+ * end). When rd-lock is taken, we should set *@rdlock to true, so that parent
+ * coroutine can check it.
  */
 static coroutine_fn int qcow2_co_pwritev_task(BlockDriverState *bs,
                                               uint64_t host_offset,
                                               uint64_t offset, uint64_t bytes,
                                               QEMUIOVector *qiov,
                                               uint64_t qiov_offset,
-                                              QCowL2Meta *l2meta)
+                                              QCowL2Meta *l2meta,
+                                              bool *rdlock)
 {
     int ret;
     BDRVQcow2State *s = bs->opaque;
     void *crypt_buf = NULL;
     QEMUIOVector encrypted_qiov;
 
+    if (!*rdlock) {
+        qemu_co_rwlock_rdlock(&s->discard_rw_lock);
+        *rdlock = true;
+    }
+
     if (bs->encrypted) {
         assert(s->crypto);
         assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
@@ -2538,12 +2578,14 @@  static coroutine_fn int qcow2_co_pwritev_task(BlockDriverState *bs,
         }
     }
 
+    qemu_co_rwlock_unlock(&s->discard_rw_lock);
     qemu_co_mutex_lock(&s->lock);
 
     ret = qcow2_handle_l2meta(bs, &l2meta, true);
     goto out_locked;
 
 out_unlocked:
+    qemu_co_rwlock_unlock(&s->discard_rw_lock);
     qemu_co_mutex_lock(&s->lock);
 
 out_locked:
@@ -2563,7 +2605,7 @@  static coroutine_fn int qcow2_co_pwritev_task_entry(AioTask *task)
 
     return qcow2_co_pwritev_task(t->bs, t->host_offset,
                                  t->offset, t->bytes, t->qiov, t->qiov_offset,
-                                 t->l2meta);
+                                 t->l2meta, &t->rdlock);
 }
 
 static coroutine_fn int qcow2_co_pwritev_part(
@@ -2607,6 +2649,8 @@  static coroutine_fn int qcow2_co_pwritev_part(
             goto out_locked;
         }
 
+        qemu_co_rwlock_rdlock(&s->discard_rw_lock);
+
         qemu_co_mutex_unlock(&s->lock);
 
         if (!aio && cur_bytes != bytes) {
@@ -2614,7 +2658,10 @@  static coroutine_fn int qcow2_co_pwritev_part(
         }
         ret = qcow2_add_task(bs, aio, qcow2_co_pwritev_task_entry, 0,
                              host_offset, offset,
-                             cur_bytes, qiov, qiov_offset, l2meta);
+                             cur_bytes, qiov, qiov_offset, l2meta, true);
+        /*
+         * now discard_rw_lock is released and we are safe to take s->lock again
+         */
         l2meta = NULL; /* l2meta is consumed by qcow2_co_pwritev_task() */
         if (ret < 0) {
             goto fail_nometa;
@@ -4094,10 +4141,15 @@  qcow2_co_copy_range_to(BlockDriverState *bs,
             goto fail;
         }
 
+        qemu_co_rwlock_rdlock(&s->discard_rw_lock);
         qemu_co_mutex_unlock(&s->lock);
+
         ret = bdrv_co_copy_range_to(src, src_offset, s->data_file, host_offset,
                                     cur_bytes, read_flags, write_flags);
+
+        qemu_co_rwlock_unlock(&s->discard_rw_lock);
         qemu_co_mutex_lock(&s->lock);
+
         if (ret < 0) {
             goto fail;
         }
@@ -4533,13 +4585,19 @@  qcow2_co_pwritev_compressed_task(BlockDriverState *bs,
     }
 
     ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len, true);
-    qemu_co_mutex_unlock(&s->lock);
     if (ret < 0) {
+        qemu_co_mutex_unlock(&s->lock);
         goto fail;
     }
 
+    qemu_co_rwlock_rdlock(&s->discard_rw_lock);
+    qemu_co_mutex_unlock(&s->lock);
+
     BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
     ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
+
+    qemu_co_rwlock_unlock(&s->discard_rw_lock);
+
     if (ret < 0) {
         goto fail;
     }
@@ -4608,7 +4666,8 @@  qcow2_co_pwritev_compressed_part(BlockDriverState *bs,
         }
 
         ret = qcow2_add_task(bs, aio, qcow2_co_pwritev_compressed_task_entry,
-                             0, 0, offset, chunk_size, qiov, qiov_offset, NULL);
+                             0, 0, offset, chunk_size, qiov, qiov_offset, NULL,
+                             false);
         if (ret < 0) {
             break;
         }
diff --git a/tests/qemu-iotests/tests/qcow2-discard-during-rewrite b/tests/qemu-iotests/tests/qcow2-discard-during-rewrite
index dd9964ef3f..5df0048757 100755
--- a/tests/qemu-iotests/tests/qcow2-discard-during-rewrite
+++ b/tests/qemu-iotests/tests/qcow2-discard-during-rewrite
@@ -1,5 +1,5 @@ 
 #!/usr/bin/env bash
-# group: quick disabled
+# group: quick
 #
 # Test discarding (and reusing) host cluster during writing data to it.
 #