diff mbox series

[v6,11/12] qcow2: protect data writing by host range reference

Message ID 20210422163046.442932-12-vsementsov@virtuozzo.com (mailing list archive)
State New, archived
Headers show
Series qcow2: fix parallel rewrite and discard (lockless) | expand

Commit Message

Vladimir Sementsov-Ogievskiy April 22, 2021, 4:30 p.m. UTC
We have the following bug:

1. Start write to qcow2. Assume guest cluster G and corresponding host
   cluster is H.

2. The write requests come to the point of data writing to .file. The
   write to .file is started and qcow2 mutex is unlocked.

3. At this time refcount of H becomes 0. For example, it may be due to
   discard operation on qcow2 node, or rewriting compressed data by
   normal write, or some operation with snapshots..

4. Next, some operations occurs and leads to allocation of H for some
   other needs: it may be another write-to-qcow2-node operation, or
   allocation of L2 table or some other data or metadata cluster
   allocation.

5. So, at this point H is used for something other. Assume, L2 table is
   written into H.

6. And now, our write from [2] finishes. And pollutes L2 table in H.
   That's a bug.

To fix the bug we now have host-range-refs, which work in a
way that cluster is not "free" (and therefore will not be reused
and we don't fall into use-after-free described above) until both
refcount and host-range-ref are zero for this cluster.

Let's call qcow2_host_range_ref() in cluster allocation functions:
qcow2_alloc_host_offset() and qcow2_alloc_compressed_cluster_offset()
used on when writing host clusters. So that now these functions returns
"referenced" range, which caller should finally unref.

Iotest qcow2-discard-during-rewrite is enabled, as it works now.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
 block/qcow2-cluster.c                            | 13 +++++++++++++
 block/qcow2.c                                    | 16 ++++++++++++++++
 .../tests/qcow2-discard-during-rewrite           |  2 +-
 3 files changed, 30 insertions(+), 1 deletion(-)
diff mbox series

Patch

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 6105d4e7e0..999a739024 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -809,6 +809,10 @@  static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
  * already allocated at the offset, return an error.
  *
  * Return 0 on success and -errno in error cases
+ *
+ * On success the host range [*host_offset, *host_offset + compressed_size) is
+ * referenced. Caller is responsible to unref it by qcow2_host_range_unref()
+ * after finishing IO operation with this range.
  */
 int qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
                                           uint64_t offset,
@@ -866,6 +870,9 @@  int qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
 
     *host_offset = cluster_offset & s->cluster_offset_mask;
+
+    qcow2_host_range_ref(bs, *host_offset, compressed_size);
+
     return 0;
 }
 
@@ -1738,6 +1745,10 @@  out:
  * is queued and will be reentered when the dependency has completed.
  *
  * Return 0 on success and -errno in error cases
+ *
+ * On success the host range [*host_offset, *host_offset + *bytes) is
+ * referenced. Caller is responsible to unref it by qcow2_host_range_unref()
+ * after finishing IO operation with this range.
  */
 int qcow2_alloc_host_offset(BlockDriverState *bs, uint64_t offset,
                             unsigned int *bytes, uint64_t *host_offset,
@@ -1848,6 +1859,8 @@  again:
     assert(offset_into_cluster(s, *host_offset) ==
            offset_into_cluster(s, offset));
 
+    qcow2_host_range_ref(bs, *host_offset, *bytes);
+
     return 0;
 }
 
diff --git a/block/qcow2.c b/block/qcow2.c
index aa298c9e42..d0d2eaa914 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2489,6 +2489,8 @@  static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
  * Called with s->lock unlocked
  * l2meta  - if not NULL, qcow2_co_pwritev_task() will consume it. Caller must
  *           not use it somehow after qcow2_co_pwritev_task() call
+ *
+ * Function consumes range reference both on success and failure.
  */
 static coroutine_fn int qcow2_co_pwritev_task(BlockDriverState *bs,
                                               uint64_t host_offset,
@@ -2554,6 +2556,9 @@  out_unlocked:
 
 out_locked:
     qcow2_handle_l2meta(bs, &l2meta, false);
+
+    qcow2_host_range_unref(bs, host_offset, bytes);
+
     qemu_co_mutex_unlock(&s->lock);
 
     qemu_vfree(crypt_buf);
@@ -2610,6 +2615,7 @@  static coroutine_fn int qcow2_co_pwritev_part(
         ret = qcow2_pre_write_overlap_check(bs, 0, host_offset,
                                             cur_bytes, true);
         if (ret < 0) {
+            qcow2_host_range_unref(bs, host_offset, cur_bytes);
             goto out_locked;
         }
 
@@ -3151,6 +3157,9 @@  static int coroutine_fn preallocate_co(BlockDriverState *bs, uint64_t offset,
             goto out;
         }
 
+        /* We do truncate under mutex, don't bother with host range refs */
+        qcow2_host_range_unref(bs, host_offset, cur_bytes);
+
         for (m = meta; m != NULL; m = m->next) {
             m->prealloc = true;
         }
@@ -4122,12 +4131,14 @@  qcow2_co_copy_range_to(BlockDriverState *bs,
         ret = qcow2_pre_write_overlap_check(bs, 0, host_offset, cur_bytes,
                                             true);
         if (ret < 0) {
+            qcow2_host_range_unref(bs, host_offset, cur_bytes);
             goto fail;
         }
 
         qemu_co_mutex_unlock(&s->lock);
         ret = bdrv_co_copy_range_to(src, src_offset, s->data_file, host_offset,
                                     cur_bytes, read_flags, write_flags);
+        qcow2_host_range_unref(bs, host_offset, cur_bytes);
         qemu_co_mutex_lock(&s->lock);
         if (ret < 0) {
             goto fail;
@@ -4540,6 +4551,7 @@  qcow2_co_pwritev_compressed_task(BlockDriverState *bs,
     ssize_t out_len;
     uint8_t *buf, *out_buf;
     uint64_t cluster_offset;
+    bool unref_range = false;
 
     assert(bytes == s->cluster_size || (bytes < s->cluster_size &&
            (offset + bytes == bs->total_sectors << BDRV_SECTOR_BITS)));
@@ -4574,6 +4586,7 @@  qcow2_co_pwritev_compressed_task(BlockDriverState *bs,
         qemu_co_mutex_unlock(&s->lock);
         goto fail;
     }
+    unref_range = true;
 
     ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len, true);
     qemu_co_mutex_unlock(&s->lock);
@@ -4589,6 +4602,9 @@  qcow2_co_pwritev_compressed_task(BlockDriverState *bs,
 success:
     ret = 0;
 fail:
+    if (unref_range) {
+        qcow2_host_range_unref(bs, cluster_offset, out_len);
+    }
     qemu_vfree(buf);
     g_free(out_buf);
     return ret;
diff --git a/tests/qemu-iotests/tests/qcow2-discard-during-rewrite b/tests/qemu-iotests/tests/qcow2-discard-during-rewrite
index 7f0d8a107a..2e2e0d2cb0 100755
--- a/tests/qemu-iotests/tests/qcow2-discard-during-rewrite
+++ b/tests/qemu-iotests/tests/qcow2-discard-during-rewrite
@@ -1,5 +1,5 @@ 
 #!/usr/bin/env bash
-# group: quick disabled
+# group: quick
 #
 # Test discarding (and reusing) host cluster during writing data to it.
 #