diff mbox series

[v4,17/30] qcow2: Add subcluster support to calculate_l2_meta()

Message ID 6f179204ed9ab6274a9d30b6aa9a63865a16035a.1584468723.git.berto@igalia.com (mailing list archive)
State New, archived
Headers show
Series Add subcluster allocation to qcow2 | expand

Commit Message

Alberto Garcia March 17, 2020, 6:16 p.m. UTC
If an image has subclusters then there are more copy-on-write
scenarios that we need to consider. Let's say we have a write request
from the middle of subcluster #3 until the end of the cluster:

   - If the cluster is new, then subclusters #0 to #3 from the old
     cluster must be copied into the new one.

   - If the cluster is new but the old cluster was unallocated, then
     only subcluster #3 needs copy-on-write. #0 to #2 are marked as
     unallocated in the bitmap of the new L2 entry.

   - If we are overwriting an old cluster and subcluster #3 is
     unallocated or has the all-zeroes bit set then we need
     copy-on-write on subcluster #3.

   - If we are overwriting an old cluster and subcluster #3 was
     allocated then there is no need to copy-on-write.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 140 +++++++++++++++++++++++++++++++++---------
 1 file changed, 110 insertions(+), 30 deletions(-)

Comments

Vladimir Sementsov-Ogievskiy April 15, 2020, 8:39 a.m. UTC | #1
17.03.2020 21:16, Alberto Garcia wrote:
> If an image has subclusters then there are more copy-on-write
> scenarios that we need to consider. Let's say we have a write request
> from the middle of subcluster #3 until the end of the cluster:
> 
>     - If the cluster is new, then subclusters #0 to #3 from the old
>       cluster must be copied into the new one.
> 
>     - If the cluster is new but the old cluster was unallocated, then
>       only subcluster #3 needs copy-on-write. #0 to #2 are marked as
>       unallocated in the bitmap of the new L2 entry.
> 
>     - If we are overwriting an old cluster and subcluster #3 is
>       unallocated or has the all-zeroes bit set then we need
>       copy-on-write on subcluster #3.
> 
>     - If we are overwriting an old cluster and subcluster #3 was
>       allocated then there is no need to copy-on-write.
> 
> Signed-off-by: Alberto Garcia <berto@igalia.com>
> Reviewed-by: Max Reitz <mreitz@redhat.com>
> ---
>   block/qcow2-cluster.c | 140 +++++++++++++++++++++++++++++++++---------
>   1 file changed, 110 insertions(+), 30 deletions(-)
> 
> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> index 8cdf8a23b6..c6f3cc9237 100644
> --- a/block/qcow2-cluster.c
> +++ b/block/qcow2-cluster.c
> @@ -1061,56 +1061,128 @@ void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m)
>    * If @keep_old is true it means that the clusters were already
>    * allocated and will be overwritten. If false then the clusters are
>    * new and we have to decrease the reference count of the old ones.
> + *
> + * Returns 1 on success, -errno on failure (in order to match the
> + * return value of handle_copied() and handle_alloc()).

Hmm, honestly, I don't like this idea. handle_copied() and handle_alloc() have special return-code semantics. Here there is no reason for special semantics, just classic error/success. Introducing new semantics (I think there are no similar functions in qcow2-cluster.c, and maybe none in the whole qcow2 subsystem) just because the function is used only on the return-1 paths of its callers, to save several lines of code, doesn't seem like a good reason to me.

Or maybe the reason will appear in the following patches? I'll see.

>    */
> -static void calculate_l2_meta(BlockDriverState *bs,
> -                              uint64_t host_cluster_offset,
> -                              uint64_t guest_offset, unsigned bytes,
> -                              uint64_t *l2_slice, QCowL2Meta **m, bool keep_old)
> +static int calculate_l2_meta(BlockDriverState *bs, uint64_t host_cluster_offset,
> +                             uint64_t guest_offset, unsigned bytes,
> +                             uint64_t *l2_slice, QCowL2Meta **m, bool keep_old)
>   {
>       BDRVQcow2State *s = bs->opaque;
> -    int l2_index = offset_to_l2_slice_index(s, guest_offset);
> -    uint64_t l2_entry;
> +    int sc_index, l2_index = offset_to_l2_slice_index(s, guest_offset);
> +    uint64_t l2_entry, l2_bitmap;
>       unsigned cow_start_from, cow_end_to;
>       unsigned cow_start_to = offset_into_cluster(s, guest_offset);
>       unsigned cow_end_from = cow_start_to + bytes;
>       unsigned nb_clusters = size_to_clusters(s, cow_end_from);
>       QCowL2Meta *old_m = *m;
> -    QCow2ClusterType type;
> +    QCow2SubclusterType type;
>   
>       assert(nb_clusters <= s->l2_slice_size - l2_index);
>   
> -    /* Return if there's no COW (all clusters are normal and we keep them) */
> +    /* Return if there's no COW (all subclusters are normal and we are
> +     * keeping the clusters) */
>       if (keep_old) {
> +        unsigned first_sc = cow_start_to / s->subcluster_size;
> +        unsigned last_sc = (cow_end_from - 1) / s->subcluster_size;
>           int i;
> -        for (i = 0; i < nb_clusters; i++) {
> -            l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
> -            if (qcow2_get_cluster_type(bs, l2_entry) != QCOW2_CLUSTER_NORMAL) {
> +        for (i = first_sc; i <= last_sc; i++) {
> +            unsigned c = i / s->subclusters_per_cluster;
> +            unsigned sc = i % s->subclusters_per_cluster;
> +            l2_entry = get_l2_entry(s, l2_slice, l2_index + c);
> +            l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + c);
> +            type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc);
> +            if (type == QCOW2_SUBCLUSTER_INVALID) {
> +                l2_index += c; /* Point to the invalid entry */
> +                goto fail;
> +            }
> +            if (type != QCOW2_SUBCLUSTER_NORMAL) {
>                   break;
>               }
>           }
> -        if (i == nb_clusters) {
> -            return;
> +        if (i == last_sc + 1) {
> +            return 1;
>           }
>       }
>   
>       /* Get the L2 entry of the first cluster */
>       l2_entry = get_l2_entry(s, l2_slice, l2_index);
> -    type = qcow2_get_cluster_type(bs, l2_entry);
> +    l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);
> +    sc_index = offset_to_sc_index(s, guest_offset);
> +    type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index);
>   
> -    if (type == QCOW2_CLUSTER_NORMAL && keep_old) {
> -        cow_start_from = cow_start_to;
> +    if (type == QCOW2_SUBCLUSTER_INVALID) {
> +        goto fail;
> +    }
> +
> +    if (!keep_old) {
> +        switch (type) {
> +        case QCOW2_SUBCLUSTER_NORMAL:
> +        case QCOW2_SUBCLUSTER_COMPRESSED:
> +        case QCOW2_SUBCLUSTER_ZERO_ALLOC:
> +        case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
> +            cow_start_from = 0;
> +            break;
> +        case QCOW2_SUBCLUSTER_ZERO_PLAIN:
> +        case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
> +            cow_start_from = sc_index << s->subcluster_bits;
> +            break;
> +        default:
> +            g_assert_not_reached();
> +        }
>       } else {
> -        cow_start_from = 0;
> +        switch (type) {
> +        case QCOW2_SUBCLUSTER_NORMAL:
> +            cow_start_from = cow_start_to;
> +            break;
> +        case QCOW2_SUBCLUSTER_ZERO_ALLOC:
> +        case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
> +            cow_start_from = sc_index << s->subcluster_bits;
> +            break;
> +        default:
> +            g_assert_not_reached();
> +        }
>       }
>   
>       /* Get the L2 entry of the last cluster */
> -    l2_entry = get_l2_entry(s, l2_slice, l2_index + nb_clusters - 1);
> -    type = qcow2_get_cluster_type(bs, l2_entry);
> +    l2_index += nb_clusters - 1;
> +    l2_entry = get_l2_entry(s, l2_slice, l2_index);
> +    l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);
> +    sc_index = offset_to_sc_index(s, guest_offset + bytes - 1);
> +    type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index);
>   
> -    if (type == QCOW2_CLUSTER_NORMAL && keep_old) {
> -        cow_end_to = cow_end_from;
> +    if (type == QCOW2_SUBCLUSTER_INVALID) {
> +        goto fail;
> +    }
> +
> +    if (!keep_old) {
> +        switch (type) {

Hmm, a big part of this code is mostly copied from the handling of the first subcluster. But I'm not sure that it's worth refactoring now; maybe later.

> +        case QCOW2_SUBCLUSTER_NORMAL:
> +        case QCOW2_SUBCLUSTER_COMPRESSED:
> +        case QCOW2_SUBCLUSTER_ZERO_ALLOC:
> +        case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
> +            cow_end_to = ROUND_UP(cow_end_from, s->cluster_size);

Hmm. Interesting: actually, we don't need to COW QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC subclusters in the COW area. But this would need more modifications to the COW handling.

> +            break;
> +        case QCOW2_SUBCLUSTER_ZERO_PLAIN:
> +        case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
> +            cow_end_to = ROUND_UP(cow_end_from, s->subcluster_size);


This is because in the new cluster we can make the previous subclusters unallocated and not copy them from the backing file.
Hmm, actually, we should not just make them unallocated, but copy part of the bitmap from the original L2 entry. I need to keep that in mind for the next patches.

> +            break;
> +        default:
> +            g_assert_not_reached();
> +        }
>       } else {
> -        cow_end_to = ROUND_UP(cow_end_from, s->cluster_size);
> +        switch (type) {
> +        case QCOW2_SUBCLUSTER_NORMAL:
> +            cow_end_to = cow_end_from;
> +            break;
> +        case QCOW2_SUBCLUSTER_ZERO_ALLOC:
> +        case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
> +            cow_end_to = ROUND_UP(cow_end_from, s->subcluster_size);
> +            break;
> +        default:
> +            g_assert_not_reached();
> +        }
>       }
>   
>       *m = g_malloc0(sizeof(**m));
> @@ -1135,6 +1207,18 @@ static void calculate_l2_meta(BlockDriverState *bs,
>   
>       qemu_co_queue_init(&(*m)->dependent_requests);
>       QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
> +
> +fail:

maybe, s/fail/out/

> +    if (type == QCOW2_SUBCLUSTER_INVALID) {
> +        uint64_t l1_index = offset_to_l1_index(s, guest_offset);
> +        uint64_t l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
> +        qcow2_signal_corruption(bs, true, -1, -1, "Invalid cluster entry found "
> +                                " (L2 offset: %#" PRIx64 ", L2 index: %#x)",
> +                                l2_offset, l2_index);
> +        return -EIO;
> +    }
> +
> +    return 1;
>   }
>   
>   /*
> @@ -1352,10 +1436,8 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
>                    - offset_into_cluster(s, guest_offset));
>           assert(*bytes != 0);
>   
> -        calculate_l2_meta(bs, cluster_offset & L2E_OFFSET_MASK, guest_offset,
> -                          *bytes, l2_slice, m, true);
> -
> -        ret = 1;
> +        ret = calculate_l2_meta(bs, cluster_offset & L2E_OFFSET_MASK,
> +                                guest_offset, *bytes, l2_slice, m, true);
>       } else {
>           ret = 0;
>       }
> @@ -1530,10 +1612,8 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
>       *bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset));
>       assert(*bytes != 0);
>   
> -    calculate_l2_meta(bs, alloc_cluster_offset, guest_offset, *bytes, l2_slice,
> -                      m, false);
> -
> -    ret = 1;
> +    ret = calculate_l2_meta(bs, alloc_cluster_offset, guest_offset, *bytes,
> +                            l2_slice, m, false);
>   
>   out:
>       qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
> 

Anyway, the patch should work as intended, so if you want to keep it as is:
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Alberto Garcia April 16, 2020, 8:01 p.m. UTC | #2
On Wed 15 Apr 2020 10:39:26 AM CEST, Vladimir Sementsov-Ogievskiy wrote:
>> + * Returns 1 on success, -errno on failure (in order to match the
>> + * return value of handle_copied() and handle_alloc()).
>
> Hmm, honestly, I don't like this idea. handle_copied and handle_alloc
> has special return code semantics. Here no reason for special
> semantics, just classic error/success.

Right, the only reason is to avoid adding something like this after all
callers:

        if (ret == 0) {
            ret = 1;
        }

But you have a point, maybe I'll change it after all.

>> +        case QCOW2_SUBCLUSTER_NORMAL:
>> +        case QCOW2_SUBCLUSTER_COMPRESSED:
>> +        case QCOW2_SUBCLUSTER_ZERO_ALLOC:
>> +        case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
>> +            cow_end_to = ROUND_UP(cow_end_from, s->cluster_size);
>
> Hmm. Interesting, actually, we don't need to COW
> QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC subclusters in cow-area.. But this
> need more modifications to cow-handling.

True, if there are more unallocated subclusters in the cow area we could
make the copy operation smaller. I'm not sure if it's worth adding extra
code for this, but maybe I can leave a comment.

>> +            break;
>> +        case QCOW2_SUBCLUSTER_ZERO_PLAIN:
>> +        case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
>> +            cow_end_to = ROUND_UP(cow_end_from, s->subcluster_size);
>
>
> This is because in new cluster we can made previous subclusters
> unallocated, and don't copy from backing.
> Hmm, actually, we should not just make them unallocated, but copy part
> of bitmap from original l2-entry.. I need to keep it in mind for next
> patches.

The bitmap is always copied from the original L2 entry, you can see it
in the patch "qcow2: Update L2 bitmap in qcow2_alloc_cluster_link_l2()"

Berto
diff mbox series

Patch

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 8cdf8a23b6..c6f3cc9237 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1061,56 +1061,128 @@  void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m)
  * If @keep_old is true it means that the clusters were already
  * allocated and will be overwritten. If false then the clusters are
  * new and we have to decrease the reference count of the old ones.
+ *
+ * Returns 1 on success, -errno on failure (in order to match the
+ * return value of handle_copied() and handle_alloc()).
  */
-static void calculate_l2_meta(BlockDriverState *bs,
-                              uint64_t host_cluster_offset,
-                              uint64_t guest_offset, unsigned bytes,
-                              uint64_t *l2_slice, QCowL2Meta **m, bool keep_old)
+static int calculate_l2_meta(BlockDriverState *bs, uint64_t host_cluster_offset,
+                             uint64_t guest_offset, unsigned bytes,
+                             uint64_t *l2_slice, QCowL2Meta **m, bool keep_old)
 {
     BDRVQcow2State *s = bs->opaque;
-    int l2_index = offset_to_l2_slice_index(s, guest_offset);
-    uint64_t l2_entry;
+    int sc_index, l2_index = offset_to_l2_slice_index(s, guest_offset);
+    uint64_t l2_entry, l2_bitmap;
     unsigned cow_start_from, cow_end_to;
     unsigned cow_start_to = offset_into_cluster(s, guest_offset);
     unsigned cow_end_from = cow_start_to + bytes;
     unsigned nb_clusters = size_to_clusters(s, cow_end_from);
     QCowL2Meta *old_m = *m;
-    QCow2ClusterType type;
+    QCow2SubclusterType type;
 
     assert(nb_clusters <= s->l2_slice_size - l2_index);
 
-    /* Return if there's no COW (all clusters are normal and we keep them) */
+    /* Return if there's no COW (all subclusters are normal and we are
+     * keeping the clusters) */
     if (keep_old) {
+        unsigned first_sc = cow_start_to / s->subcluster_size;
+        unsigned last_sc = (cow_end_from - 1) / s->subcluster_size;
         int i;
-        for (i = 0; i < nb_clusters; i++) {
-            l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
-            if (qcow2_get_cluster_type(bs, l2_entry) != QCOW2_CLUSTER_NORMAL) {
+        for (i = first_sc; i <= last_sc; i++) {
+            unsigned c = i / s->subclusters_per_cluster;
+            unsigned sc = i % s->subclusters_per_cluster;
+            l2_entry = get_l2_entry(s, l2_slice, l2_index + c);
+            l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + c);
+            type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc);
+            if (type == QCOW2_SUBCLUSTER_INVALID) {
+                l2_index += c; /* Point to the invalid entry */
+                goto fail;
+            }
+            if (type != QCOW2_SUBCLUSTER_NORMAL) {
                 break;
             }
         }
-        if (i == nb_clusters) {
-            return;
+        if (i == last_sc + 1) {
+            return 1;
         }
     }
 
     /* Get the L2 entry of the first cluster */
     l2_entry = get_l2_entry(s, l2_slice, l2_index);
-    type = qcow2_get_cluster_type(bs, l2_entry);
+    l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);
+    sc_index = offset_to_sc_index(s, guest_offset);
+    type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index);
 
-    if (type == QCOW2_CLUSTER_NORMAL && keep_old) {
-        cow_start_from = cow_start_to;
+    if (type == QCOW2_SUBCLUSTER_INVALID) {
+        goto fail;
+    }
+
+    if (!keep_old) {
+        switch (type) {
+        case QCOW2_SUBCLUSTER_NORMAL:
+        case QCOW2_SUBCLUSTER_COMPRESSED:
+        case QCOW2_SUBCLUSTER_ZERO_ALLOC:
+        case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
+            cow_start_from = 0;
+            break;
+        case QCOW2_SUBCLUSTER_ZERO_PLAIN:
+        case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
+            cow_start_from = sc_index << s->subcluster_bits;
+            break;
+        default:
+            g_assert_not_reached();
+        }
     } else {
-        cow_start_from = 0;
+        switch (type) {
+        case QCOW2_SUBCLUSTER_NORMAL:
+            cow_start_from = cow_start_to;
+            break;
+        case QCOW2_SUBCLUSTER_ZERO_ALLOC:
+        case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
+            cow_start_from = sc_index << s->subcluster_bits;
+            break;
+        default:
+            g_assert_not_reached();
+        }
     }
 
     /* Get the L2 entry of the last cluster */
-    l2_entry = get_l2_entry(s, l2_slice, l2_index + nb_clusters - 1);
-    type = qcow2_get_cluster_type(bs, l2_entry);
+    l2_index += nb_clusters - 1;
+    l2_entry = get_l2_entry(s, l2_slice, l2_index);
+    l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);
+    sc_index = offset_to_sc_index(s, guest_offset + bytes - 1);
+    type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index);
 
-    if (type == QCOW2_CLUSTER_NORMAL && keep_old) {
-        cow_end_to = cow_end_from;
+    if (type == QCOW2_SUBCLUSTER_INVALID) {
+        goto fail;
+    }
+
+    if (!keep_old) {
+        switch (type) {
+        case QCOW2_SUBCLUSTER_NORMAL:
+        case QCOW2_SUBCLUSTER_COMPRESSED:
+        case QCOW2_SUBCLUSTER_ZERO_ALLOC:
+        case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
+            cow_end_to = ROUND_UP(cow_end_from, s->cluster_size);
+            break;
+        case QCOW2_SUBCLUSTER_ZERO_PLAIN:
+        case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
+            cow_end_to = ROUND_UP(cow_end_from, s->subcluster_size);
+            break;
+        default:
+            g_assert_not_reached();
+        }
     } else {
-        cow_end_to = ROUND_UP(cow_end_from, s->cluster_size);
+        switch (type) {
+        case QCOW2_SUBCLUSTER_NORMAL:
+            cow_end_to = cow_end_from;
+            break;
+        case QCOW2_SUBCLUSTER_ZERO_ALLOC:
+        case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
+            cow_end_to = ROUND_UP(cow_end_from, s->subcluster_size);
+            break;
+        default:
+            g_assert_not_reached();
+        }
     }
 
     *m = g_malloc0(sizeof(**m));
@@ -1135,6 +1207,18 @@  static void calculate_l2_meta(BlockDriverState *bs,
 
     qemu_co_queue_init(&(*m)->dependent_requests);
     QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
+
+fail:
+    if (type == QCOW2_SUBCLUSTER_INVALID) {
+        uint64_t l1_index = offset_to_l1_index(s, guest_offset);
+        uint64_t l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
+        qcow2_signal_corruption(bs, true, -1, -1, "Invalid cluster entry found "
+                                " (L2 offset: %#" PRIx64 ", L2 index: %#x)",
+                                l2_offset, l2_index);
+        return -EIO;
+    }
+
+    return 1;
 }
 
 /*
@@ -1352,10 +1436,8 @@  static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
                  - offset_into_cluster(s, guest_offset));
         assert(*bytes != 0);
 
-        calculate_l2_meta(bs, cluster_offset & L2E_OFFSET_MASK, guest_offset,
-                          *bytes, l2_slice, m, true);
-
-        ret = 1;
+        ret = calculate_l2_meta(bs, cluster_offset & L2E_OFFSET_MASK,
+                                guest_offset, *bytes, l2_slice, m, true);
     } else {
         ret = 0;
     }
@@ -1530,10 +1612,8 @@  static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
     *bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset));
     assert(*bytes != 0);
 
-    calculate_l2_meta(bs, alloc_cluster_offset, guest_offset, *bytes, l2_slice,
-                      m, false);
-
-    ret = 1;
+    ret = calculate_l2_meta(bs, alloc_cluster_offset, guest_offset, *bytes,
+                            l2_slice, m, false);
 
 out:
     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);