Message ID | 42c52037c0c975e2d1cd23b470e7b61cbd0b3fa3.1584468723.git.berto@igalia.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Add subcluster allocation to qcow2 | expand |
I'm sorry that I'm joining only now and may ask questions already discussed in previous versions :( 17.03.2020 21:16, Alberto Garcia wrote: > When writing to a qcow2 file there are two functions that take a > virtual offset and return a host offset, possibly allocating new > clusters if necessary: > > - handle_copied() looks for normal data clusters that are already > allocated and have a reference count of 1. In those clusters we > can simply write the data and there is no need to perform any > copy-on-write. > > - handle_alloc() looks for clusters that do need copy-on-write, > either because they haven't been allocated yet, because their > reference count is != 1 or because they are ZERO_ALLOC clusters. > > The ZERO_ALLOC case is a bit special because those are clusters that > are already allocated and they could perfectly be dealt with in > handle_copied() (as long as copy-on-write is performed when required). > > In fact, there is extra code specifically for them in handle_alloc() > that tries to reuse the existing allocation if possible and frees them > otherwise. > > This patch changes the handling of ZERO_ALLOC clusters so the > semantics of these two functions are now like this: > > - handle_copied() looks for clusters that are already allocated and > which we can overwrite (NORMAL and ZERO_ALLOC clusters with a > reference count of 1). > > - handle_alloc() looks for clusters for which we need a new > allocation (all other cases). > > One important difference after this change is that clusters found > in handle_copied() may now require copy-on-write, but this will be > necessary anyway once we add support for subclusters. 
> > Signed-off-by: Alberto Garcia <berto@igalia.com> > Reviewed-by: Eric Blake <eblake@redhat.com> > Reviewed-by: Max Reitz <mreitz@redhat.com> > --- > block/qcow2-cluster.c | 230 ++++++++++++++++++++++++------------------ > 1 file changed, 130 insertions(+), 100 deletions(-) > > diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c > index e251d00890..5c81046c34 100644 > --- a/block/qcow2-cluster.c > +++ b/block/qcow2-cluster.c > @@ -1041,13 +1041,18 @@ void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m) > > /* > * For a given write request, create a new QCowL2Meta structure, add > - * it to @m and the BDRVQcow2State.cluster_allocs list. > + * it to @m and the BDRVQcow2State.cluster_allocs list. If the write > + * request does not need copy-on-write or changes to the L2 metadata > + * then this function does nothing. > * > * @host_cluster_offset points to the beginning of the first cluster. > * > * @guest_offset and @bytes indicate the offset and length of the > * request. > * > + * @l2_slice contains the L2 entries of all clusters involved in this > + * write request. > + * > * If @keep_old is true it means that the clusters were already > * allocated and will be overwritten. If false then the clusters are > * new and we have to decrease the reference count of the old ones. 
> @@ -1055,15 +1060,53 @@ void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m) > static void calculate_l2_meta(BlockDriverState *bs, > uint64_t host_cluster_offset, > uint64_t guest_offset, unsigned bytes, > - QCowL2Meta **m, bool keep_old) > + uint64_t *l2_slice, QCowL2Meta **m, bool keep_old) > { > BDRVQcow2State *s = bs->opaque; > - unsigned cow_start_from = 0; > + int l2_index = offset_to_l2_slice_index(s, guest_offset); > + uint64_t l2_entry; > + unsigned cow_start_from, cow_end_to; > unsigned cow_start_to = offset_into_cluster(s, guest_offset); > unsigned cow_end_from = cow_start_to + bytes; > - unsigned cow_end_to = ROUND_UP(cow_end_from, s->cluster_size); > unsigned nb_clusters = size_to_clusters(s, cow_end_from); > QCowL2Meta *old_m = *m; > + QCow2ClusterType type; > + > + assert(nb_clusters <= s->l2_slice_size - l2_index); > + > + /* Return if there's no COW (all clusters are normal and we keep them) */ > + if (keep_old) { > + int i; > + for (i = 0; i < nb_clusters; i++) { > + l2_entry = be64_to_cpu(l2_slice[l2_index + i]); > + if (qcow2_get_cluster_type(bs, l2_entry) != QCOW2_CLUSTER_NORMAL) { Could we also allow full ZERO_ALLOC clusters here? > + break; > + } > + } > + if (i == nb_clusters) { > + return; > + } > + } > + > + /* Get the L2 entry of the first cluster */ > + l2_entry = be64_to_cpu(l2_slice[l2_index]); > + type = qcow2_get_cluster_type(bs, l2_entry); > + > + if (type == QCOW2_CLUSTER_NORMAL && keep_old) { > + cow_start_from = cow_start_to; > + } else { > + cow_start_from = 0; > + } > + > + /* Get the L2 entry of the last cluster */ > + l2_entry = be64_to_cpu(l2_slice[l2_index + nb_clusters - 1]); > + type = qcow2_get_cluster_type(bs, l2_entry); > + > + if (type == QCOW2_CLUSTER_NORMAL && keep_old) { > + cow_end_to = cow_end_from; > + } else { > + cow_end_to = ROUND_UP(cow_end_from, s->cluster_size); > + } These two ifs may be moved into if (keep_old), and drop "&& keep_old" from conditions. 
This would also allow dropping the extra calculations, moving the new variables into the if (keep_old) {} block, and passing l2_slice=NULL together with keep_old=false. > > *m = g_malloc0(sizeof(**m)); > **m = (QCowL2Meta) { > @@ -1089,18 +1132,22 @@ static void calculate_l2_meta(BlockDriverState *bs, > QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight); > } > > -/* Returns true if writing to a cluster requires COW */ > -static bool cluster_needs_cow(BlockDriverState *bs, uint64_t l2_entry) > +/* > + * Returns true if writing to the cluster pointed to by @l2_entry > + * requires a new allocation (that is, if the cluster is unallocated > + * or has refcount > 1 and therefore cannot be written in-place). > + */ > +static bool cluster_needs_new_alloc(BlockDriverState *bs, uint64_t l2_entry) > { > switch (qcow2_get_cluster_type(bs, l2_entry)) { > case QCOW2_CLUSTER_NORMAL: > + case QCOW2_CLUSTER_ZERO_ALLOC: > if (l2_entry & QCOW_OFLAG_COPIED) { > return false; > } > case QCOW2_CLUSTER_UNALLOCATED: > case QCOW2_CLUSTER_COMPRESSED: > case QCOW2_CLUSTER_ZERO_PLAIN: > - case QCOW2_CLUSTER_ZERO_ALLOC: > return true; > default: > abort(); > @@ -1108,20 +1155,38 @@ static bool cluster_needs_cow(BlockDriverState *bs, uint64_t l2_entry) > } > > /* > - * Returns the number of contiguous clusters that can be used for an allocating > - * write, but require COW to be performed (this includes yet unallocated space, > - * which must copy from the backing file) > + * Returns the number of contiguous clusters that can be written to > + * using one single write request, starting from @l2_index. > + * At most @nb_clusters are checked. > + * > + * If @new_alloc is true this counts clusters that are either > + * unallocated, or allocated but with refcount > 1 (so they need to be > + * newly allocated and COWed). > + * > + * If @new_alloc is false this counts clusters that are already > + * allocated and can be overwritten in-place (this includes clusters > + * of type QCOW2_CLUSTER_ZERO_ALLOC). 
> */ > -static int count_cow_clusters(BlockDriverState *bs, int nb_clusters, > - uint64_t *l2_slice, int l2_index) > +static int count_single_write_clusters(BlockDriverState *bs, int nb_clusters, > + uint64_t *l2_slice, int l2_index, > + bool new_alloc) > { > + BDRVQcow2State *s = bs->opaque; > + uint64_t l2_entry = be64_to_cpu(l2_slice[l2_index]); > + uint64_t expected_offset = l2_entry & L2E_OFFSET_MASK; > int i; > > for (i = 0; i < nb_clusters; i++) { > - uint64_t l2_entry = be64_to_cpu(l2_slice[l2_index + i]); > - if (!cluster_needs_cow(bs, l2_entry)) { > + l2_entry = be64_to_cpu(l2_slice[l2_index + i]); > + if (cluster_needs_new_alloc(bs, l2_entry) != new_alloc) { > break; > } > + if (!new_alloc) { > + if (expected_offset != (l2_entry & L2E_OFFSET_MASK)) { > + break; > + } > + expected_offset += s->cluster_size; > + } > } > > assert(i <= nb_clusters); > @@ -1192,10 +1257,10 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, > } > > /* > - * Checks how many already allocated clusters that don't require a copy on > - * write there are at the given guest_offset (up to *bytes). If *host_offset is > - * not INV_OFFSET, only physically contiguous clusters beginning at this host > - * offset are counted. > + * Checks how many already allocated clusters that don't require a new > + * allocation there are at the given guest_offset (up to *bytes). > + * If *host_offset is not INV_OFFSET, only physically contiguous clusters > + * beginning at this host offset are counted. > * > * Note that guest_offset may not be cluster aligned. In this case, the > * returned *host_offset points to exact byte referenced by guest_offset and > @@ -1204,12 +1269,12 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, > * Returns: > * 0: if no allocated clusters are available at the given offset. > * *bytes is normally unchanged. 
It is set to 0 if the cluster > - * is allocated and doesn't need COW, but doesn't have the right > - * physical offset. > + * is allocated and can be overwritten in-place but doesn't have > + * the right physical offset. > * > - * 1: if allocated clusters that don't require a COW are available at > - * the requested offset. *bytes may have decreased and describes > - * the length of the area that can be written to. > + * 1: if allocated clusters that can be overwritten in place are > + * available at the requested offset. *bytes may have decreased > + * and describes the length of the area that can be written to. > * > * -errno: in error cases > */ > @@ -1239,7 +1304,8 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, > > l2_index = offset_to_l2_slice_index(s, guest_offset); > nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index); > - assert(nb_clusters <= INT_MAX); > + /* Limit total byte count to BDRV_REQUEST_MAX_BYTES */ > + nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits); > > /* Find L2 entry for the first involved cluster */ > ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index); > @@ -1249,18 +1315,17 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, > > cluster_offset = be64_to_cpu(l2_slice[l2_index]); It would be good to s/cluster_offset/l2_entry/ And, "cluster_offset & L2E_OFFSET_MASK" is used so many times, so, I'd not substitute, but keep both variables: l2_entry and cluster_offset.. 
> > - /* Check how many clusters are already allocated and don't need COW */ > - if (qcow2_get_cluster_type(bs, cluster_offset) == QCOW2_CLUSTER_NORMAL > - && (cluster_offset & QCOW_OFLAG_COPIED)) > - { > + if (!cluster_needs_new_alloc(bs, cluster_offset)) { > /* If a specific host_offset is required, check it */ > bool offset_matches = > (cluster_offset & L2E_OFFSET_MASK) == *host_offset; > > if (offset_into_cluster(s, cluster_offset & L2E_OFFSET_MASK)) { > - qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset " > + qcow2_signal_corruption(bs, true, -1, -1, "%s cluster offset " > "%#llx unaligned (guest offset: %#" PRIx64 > - ")", cluster_offset & L2E_OFFSET_MASK, > + ")", cluster_offset & QCOW_OFLAG_ZERO ? > + "Preallocated zero" : "Data", > + cluster_offset & L2E_OFFSET_MASK, > guest_offset); > ret = -EIO; > goto out; > @@ -1273,15 +1338,17 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, > } > > /* We keep all QCOW_OFLAG_COPIED clusters */ > - keep_clusters = > - count_contiguous_clusters(bs, nb_clusters, s->cluster_size, > - &l2_slice[l2_index], > - QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO); > + keep_clusters = count_single_write_clusters(bs, nb_clusters, l2_slice, > + l2_index, false); > assert(keep_clusters <= nb_clusters); > > *bytes = MIN(*bytes, > keep_clusters * s->cluster_size > - offset_into_cluster(s, guest_offset)); > + assert(*bytes != 0); > + > + calculate_l2_meta(bs, cluster_offset & L2E_OFFSET_MASK, guest_offset, > + *bytes, l2_slice, m, true); > > ret = 1; > } else { > @@ -1357,9 +1424,10 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, > } > > /* > - * Allocates new clusters for an area that either is yet unallocated or needs a > - * copy on write. If *host_offset is not INV_OFFSET, clusters are only > - * allocated if the new allocation can match the specified host offset. 
> + * Allocates new clusters for an area that is either still unallocated or > + * cannot be overwritten in-place. If *host_offset is not INV_OFFSET, > + * clusters are only allocated if the new allocation can match the specified > + * host offset. > * > * Note that guest_offset may not be cluster aligned. In this case, the > * returned *host_offset points to exact byte referenced by guest_offset and > @@ -1382,12 +1450,10 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, > BDRVQcow2State *s = bs->opaque; > int l2_index; > uint64_t *l2_slice; > - uint64_t entry; > uint64_t nb_clusters; > int ret; > - bool keep_old_clusters = false; > > - uint64_t alloc_cluster_offset = INV_OFFSET; > + uint64_t alloc_cluster_offset; > > trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset, > *bytes); > @@ -1402,10 +1468,8 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, > > l2_index = offset_to_l2_slice_index(s, guest_offset); > nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index); > - assert(nb_clusters <= INT_MAX); > - > - /* Limit total allocation byte count to INT_MAX */ > - nb_clusters = MIN(nb_clusters, INT_MAX >> s->cluster_bits); > + /* Limit total allocation byte count to BDRV_REQUEST_MAX_BYTES */ > + nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits); > > /* Find L2 entry for the first involved cluster */ > ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index); > @@ -1413,67 +1477,32 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, > return ret; > } > > - entry = be64_to_cpu(l2_slice[l2_index]); > - nb_clusters = count_cow_clusters(bs, nb_clusters, l2_slice, l2_index); > + nb_clusters = count_single_write_clusters(bs, nb_clusters, > + l2_slice, l2_index, true); > > /* This function is only called when there were no non-COW clusters, so if > * we can't find any unallocated or COW clusters either, something is > * wrong with our code. 
*/ > assert(nb_clusters > 0); > > - if (qcow2_get_cluster_type(bs, entry) == QCOW2_CLUSTER_ZERO_ALLOC && > - (entry & QCOW_OFLAG_COPIED) && > - (*host_offset == INV_OFFSET || > - start_of_cluster(s, *host_offset) == (entry & L2E_OFFSET_MASK))) > - { > - int preallocated_nb_clusters; > - > - if (offset_into_cluster(s, entry & L2E_OFFSET_MASK)) { > - qcow2_signal_corruption(bs, true, -1, -1, "Preallocated zero " > - "cluster offset %#llx unaligned (guest " > - "offset: %#" PRIx64 ")", > - entry & L2E_OFFSET_MASK, guest_offset); > - ret = -EIO; > - goto fail; > - } > - > - /* Try to reuse preallocated zero clusters; contiguous normal clusters > - * would be fine, too, but count_cow_clusters() above has limited > - * nb_clusters already to a range of COW clusters */ > - preallocated_nb_clusters = > - count_contiguous_clusters(bs, nb_clusters, s->cluster_size, > - &l2_slice[l2_index], QCOW_OFLAG_COPIED); > - assert(preallocated_nb_clusters > 0); > - > - nb_clusters = preallocated_nb_clusters; > - alloc_cluster_offset = entry & L2E_OFFSET_MASK; > - > - /* We want to reuse these clusters, so qcow2_alloc_cluster_link_l2() > - * should not free them. */ > - keep_old_clusters = true; > + /* Allocate at a given offset in the image file */ > + alloc_cluster_offset = *host_offset == INV_OFFSET ? INV_OFFSET : > + start_of_cluster(s, *host_offset); > + ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset, > + &nb_clusters); > + if (ret < 0) { > + goto out; > } > > - qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); Actually, we don't need l2_slice for keep_old=false in calculate_l2_meta(), so if calculate_l2_meta() were modified a bit, this change to the function tail would not be needed. Still, maybe l2_slice will be used in calculate_l2_meta() in later patches? We'll see. > - > - if (alloc_cluster_offset == INV_OFFSET) { > - /* Allocate, if necessary at a given offset in the image file */ > - alloc_cluster_offset = *host_offset == INV_OFFSET ? 
INV_OFFSET : > - start_of_cluster(s, *host_offset); > - ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset, > - &nb_clusters); > - if (ret < 0) { > - goto fail; > - } > - > - /* Can't extend contiguous allocation */ > - if (nb_clusters == 0) { > - *bytes = 0; > - return 0; > - } > - > - assert(alloc_cluster_offset != INV_OFFSET); > + /* Can't extend contiguous allocation */ > + if (nb_clusters == 0) { > + *bytes = 0; > + ret = 0; > + goto out; > } > > + assert(alloc_cluster_offset != INV_OFFSET); > + > /* > * Save info needed for meta data update. > * > @@ -1496,13 +1525,14 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, > *bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset)); > assert(*bytes != 0); > > - calculate_l2_meta(bs, alloc_cluster_offset, guest_offset, *bytes, > - m, keep_old_clusters); > + calculate_l2_meta(bs, alloc_cluster_offset, guest_offset, *bytes, l2_slice, > + m, false); > > - return 1; > + ret = 1; > > -fail: > - if (*m && (*m)->nb_clusters > 0) { > +out: > + qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); > + if (ret < 0 && *m && (*m)->nb_clusters > 0) { > QLIST_REMOVE(*m, next_in_flight); > } Hmm, unrelated to the patch, but why do we remove meta, which we didn't create?
On Thu 09 Apr 2020 12:59:30 PM CEST, Vladimir Sementsov-Ogievskiy wrote: >> static void calculate_l2_meta(BlockDriverState *bs, >> uint64_t host_cluster_offset, >> uint64_t guest_offset, unsigned bytes, >> - QCowL2Meta **m, bool keep_old) >> + uint64_t *l2_slice, QCowL2Meta **m, bool keep_old) >> { >> BDRVQcow2State *s = bs->opaque; >> - unsigned cow_start_from = 0; >> + int l2_index = offset_to_l2_slice_index(s, guest_offset); >> + uint64_t l2_entry; >> + unsigned cow_start_from, cow_end_to; >> unsigned cow_start_to = offset_into_cluster(s, guest_offset); >> unsigned cow_end_from = cow_start_to + bytes; >> - unsigned cow_end_to = ROUND_UP(cow_end_from, s->cluster_size); >> unsigned nb_clusters = size_to_clusters(s, cow_end_from); >> QCowL2Meta *old_m = *m; >> + QCow2ClusterType type; >> + >> + assert(nb_clusters <= s->l2_slice_size - l2_index); >> + >> + /* Return if there's no COW (all clusters are normal and we keep them) */ >> + if (keep_old) { >> + int i; >> + for (i = 0; i < nb_clusters; i++) { >> + l2_entry = be64_to_cpu(l2_slice[l2_index + i]); >> + if (qcow2_get_cluster_type(bs, l2_entry) != QCOW2_CLUSTER_NORMAL) { > > Could we also allow full ZERO_ALLOC clusters here? No, because the L2 entry needs to be modified (in order to remove the 'all zeroes' bit) and we need to create a QCowL2Meta entry for that (see qcow2_handle_l2meta()). 
>> + /* Get the L2 entry of the first cluster */ >> + l2_entry = be64_to_cpu(l2_slice[l2_index]); >> + type = qcow2_get_cluster_type(bs, l2_entry); >> + >> + if (type == QCOW2_CLUSTER_NORMAL && keep_old) { >> + cow_start_from = cow_start_to; >> + } else { >> + cow_start_from = 0; >> + } >> + >> + /* Get the L2 entry of the last cluster */ >> + l2_entry = be64_to_cpu(l2_slice[l2_index + nb_clusters - 1]); >> + type = qcow2_get_cluster_type(bs, l2_entry); >> + >> + if (type == QCOW2_CLUSTER_NORMAL && keep_old) { >> + cow_end_to = cow_end_from; >> + } else { >> + cow_end_to = ROUND_UP(cow_end_from, s->cluster_size); >> + } > > These two ifs may be moved into if (keep_old), and drop "&& keep_old" > from conditions. This also will allow to drop extra calculations, move > new variables to if (keep_old) {} block and allow to pass > l2_slice=NULL together with keep_old=false. In subsequent patches we're going to have more cases than just QCOW2_CLUSTER_NORMAL so I don't think it makes sense to move the keep_old check around. >> @@ -1239,7 +1304,8 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, >> >> l2_index = offset_to_l2_slice_index(s, guest_offset); >> nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index); >> - assert(nb_clusters <= INT_MAX); >> + /* Limit total byte count to BDRV_REQUEST_MAX_BYTES */ >> + nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits); >> >> /* Find L2 entry for the first involved cluster */ >> ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index); >> @@ -1249,18 +1315,17 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, >> >> cluster_offset = be64_to_cpu(l2_slice[l2_index]); > > It would be good to s/cluster_offset/l2_entry/ > > And, "cluster_offset & L2E_OFFSET_MASK" is used so many times, so, I'd > not substitute, but keep both variables: l2_entry and cluster_offset. Sounds good, I can change that. 
>> + /* Allocate at a given offset in the image file */ >> + alloc_cluster_offset = *host_offset == INV_OFFSET ? INV_OFFSET : >> + start_of_cluster(s, *host_offset); >> + ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset, >> + &nb_clusters); >> + if (ret < 0) { >> + goto out; >> } >> >> - qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); > > actually we don't need l2_slice for keep_old=false in > calculate_l2_meta, so if calculate_l2_meta modified a bit, change of > function tail is not needed.. > > Still, may be l2_slice will be used in calculate_l2_meta() in further > patches? Will see.. We'll need it in a later patch. >> -fail: >> - if (*m && (*m)->nb_clusters > 0) { >> +out: >> + qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); >> + if (ret < 0 && *m && (*m)->nb_clusters > 0) { >> QLIST_REMOVE(*m, next_in_flight); >> } > > Hmm, unrelated to the patch, but why do we remove meta, which we > didn't create? Not sure actually, I would need to check further... Berto
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index e251d00890..5c81046c34 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1041,13 +1041,18 @@ void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m) /* * For a given write request, create a new QCowL2Meta structure, add - * it to @m and the BDRVQcow2State.cluster_allocs list. + * it to @m and the BDRVQcow2State.cluster_allocs list. If the write + * request does not need copy-on-write or changes to the L2 metadata + * then this function does nothing. * * @host_cluster_offset points to the beginning of the first cluster. * * @guest_offset and @bytes indicate the offset and length of the * request. * + * @l2_slice contains the L2 entries of all clusters involved in this + * write request. + * * If @keep_old is true it means that the clusters were already * allocated and will be overwritten. If false then the clusters are * new and we have to decrease the reference count of the old ones. @@ -1055,15 +1060,53 @@ void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m) static void calculate_l2_meta(BlockDriverState *bs, uint64_t host_cluster_offset, uint64_t guest_offset, unsigned bytes, - QCowL2Meta **m, bool keep_old) + uint64_t *l2_slice, QCowL2Meta **m, bool keep_old) { BDRVQcow2State *s = bs->opaque; - unsigned cow_start_from = 0; + int l2_index = offset_to_l2_slice_index(s, guest_offset); + uint64_t l2_entry; + unsigned cow_start_from, cow_end_to; unsigned cow_start_to = offset_into_cluster(s, guest_offset); unsigned cow_end_from = cow_start_to + bytes; - unsigned cow_end_to = ROUND_UP(cow_end_from, s->cluster_size); unsigned nb_clusters = size_to_clusters(s, cow_end_from); QCowL2Meta *old_m = *m; + QCow2ClusterType type; + + assert(nb_clusters <= s->l2_slice_size - l2_index); + + /* Return if there's no COW (all clusters are normal and we keep them) */ + if (keep_old) { + int i; + for (i = 0; i < nb_clusters; i++) { + l2_entry = be64_to_cpu(l2_slice[l2_index + 
i]); + if (qcow2_get_cluster_type(bs, l2_entry) != QCOW2_CLUSTER_NORMAL) { + break; + } + } + if (i == nb_clusters) { + return; + } + } + + /* Get the L2 entry of the first cluster */ + l2_entry = be64_to_cpu(l2_slice[l2_index]); + type = qcow2_get_cluster_type(bs, l2_entry); + + if (type == QCOW2_CLUSTER_NORMAL && keep_old) { + cow_start_from = cow_start_to; + } else { + cow_start_from = 0; + } + + /* Get the L2 entry of the last cluster */ + l2_entry = be64_to_cpu(l2_slice[l2_index + nb_clusters - 1]); + type = qcow2_get_cluster_type(bs, l2_entry); + + if (type == QCOW2_CLUSTER_NORMAL && keep_old) { + cow_end_to = cow_end_from; + } else { + cow_end_to = ROUND_UP(cow_end_from, s->cluster_size); + } *m = g_malloc0(sizeof(**m)); **m = (QCowL2Meta) { @@ -1089,18 +1132,22 @@ static void calculate_l2_meta(BlockDriverState *bs, QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight); } -/* Returns true if writing to a cluster requires COW */ -static bool cluster_needs_cow(BlockDriverState *bs, uint64_t l2_entry) +/* + * Returns true if writing to the cluster pointed to by @l2_entry + * requires a new allocation (that is, if the cluster is unallocated + * or has refcount > 1 and therefore cannot be written in-place). 
+ */ +static bool cluster_needs_new_alloc(BlockDriverState *bs, uint64_t l2_entry) { switch (qcow2_get_cluster_type(bs, l2_entry)) { case QCOW2_CLUSTER_NORMAL: + case QCOW2_CLUSTER_ZERO_ALLOC: if (l2_entry & QCOW_OFLAG_COPIED) { return false; } case QCOW2_CLUSTER_UNALLOCATED: case QCOW2_CLUSTER_COMPRESSED: case QCOW2_CLUSTER_ZERO_PLAIN: - case QCOW2_CLUSTER_ZERO_ALLOC: return true; default: abort(); @@ -1108,20 +1155,38 @@ static bool cluster_needs_cow(BlockDriverState *bs, uint64_t l2_entry) } /* - * Returns the number of contiguous clusters that can be used for an allocating - * write, but require COW to be performed (this includes yet unallocated space, - * which must copy from the backing file) + * Returns the number of contiguous clusters that can be written to + * using one single write request, starting from @l2_index. + * At most @nb_clusters are checked. + * + * If @new_alloc is true this counts clusters that are either + * unallocated, or allocated but with refcount > 1 (so they need to be + * newly allocated and COWed). + * + * If @new_alloc is false this counts clusters that are already + * allocated and can be overwritten in-place (this includes clusters + * of type QCOW2_CLUSTER_ZERO_ALLOC). 
*/ -static int count_cow_clusters(BlockDriverState *bs, int nb_clusters, - uint64_t *l2_slice, int l2_index) +static int count_single_write_clusters(BlockDriverState *bs, int nb_clusters, + uint64_t *l2_slice, int l2_index, + bool new_alloc) { + BDRVQcow2State *s = bs->opaque; + uint64_t l2_entry = be64_to_cpu(l2_slice[l2_index]); + uint64_t expected_offset = l2_entry & L2E_OFFSET_MASK; int i; for (i = 0; i < nb_clusters; i++) { - uint64_t l2_entry = be64_to_cpu(l2_slice[l2_index + i]); - if (!cluster_needs_cow(bs, l2_entry)) { + l2_entry = be64_to_cpu(l2_slice[l2_index + i]); + if (cluster_needs_new_alloc(bs, l2_entry) != new_alloc) { break; } + if (!new_alloc) { + if (expected_offset != (l2_entry & L2E_OFFSET_MASK)) { + break; + } + expected_offset += s->cluster_size; + } } assert(i <= nb_clusters); @@ -1192,10 +1257,10 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, } /* - * Checks how many already allocated clusters that don't require a copy on - * write there are at the given guest_offset (up to *bytes). If *host_offset is - * not INV_OFFSET, only physically contiguous clusters beginning at this host - * offset are counted. + * Checks how many already allocated clusters that don't require a new + * allocation there are at the given guest_offset (up to *bytes). + * If *host_offset is not INV_OFFSET, only physically contiguous clusters + * beginning at this host offset are counted. * * Note that guest_offset may not be cluster aligned. In this case, the * returned *host_offset points to exact byte referenced by guest_offset and @@ -1204,12 +1269,12 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, * Returns: * 0: if no allocated clusters are available at the given offset. * *bytes is normally unchanged. It is set to 0 if the cluster - * is allocated and doesn't need COW, but doesn't have the right - * physical offset. 
+ * is allocated and can be overwritten in-place but doesn't have + * the right physical offset. * - * 1: if allocated clusters that don't require a COW are available at - * the requested offset. *bytes may have decreased and describes - * the length of the area that can be written to. + * 1: if allocated clusters that can be overwritten in place are + * available at the requested offset. *bytes may have decreased + * and describes the length of the area that can be written to. * * -errno: in error cases */ @@ -1239,7 +1304,8 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, l2_index = offset_to_l2_slice_index(s, guest_offset); nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index); - assert(nb_clusters <= INT_MAX); + /* Limit total byte count to BDRV_REQUEST_MAX_BYTES */ + nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits); /* Find L2 entry for the first involved cluster */ ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index); @@ -1249,18 +1315,17 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, cluster_offset = be64_to_cpu(l2_slice[l2_index]); - /* Check how many clusters are already allocated and don't need COW */ - if (qcow2_get_cluster_type(bs, cluster_offset) == QCOW2_CLUSTER_NORMAL - && (cluster_offset & QCOW_OFLAG_COPIED)) - { + if (!cluster_needs_new_alloc(bs, cluster_offset)) { /* If a specific host_offset is required, check it */ bool offset_matches = (cluster_offset & L2E_OFFSET_MASK) == *host_offset; if (offset_into_cluster(s, cluster_offset & L2E_OFFSET_MASK)) { - qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset " + qcow2_signal_corruption(bs, true, -1, -1, "%s cluster offset " "%#llx unaligned (guest offset: %#" PRIx64 - ")", cluster_offset & L2E_OFFSET_MASK, + ")", cluster_offset & QCOW_OFLAG_ZERO ? 
+ "Preallocated zero" : "Data", + cluster_offset & L2E_OFFSET_MASK, guest_offset); ret = -EIO; goto out; @@ -1273,15 +1338,17 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, } /* We keep all QCOW_OFLAG_COPIED clusters */ - keep_clusters = - count_contiguous_clusters(bs, nb_clusters, s->cluster_size, - &l2_slice[l2_index], - QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO); + keep_clusters = count_single_write_clusters(bs, nb_clusters, l2_slice, + l2_index, false); assert(keep_clusters <= nb_clusters); *bytes = MIN(*bytes, keep_clusters * s->cluster_size - offset_into_cluster(s, guest_offset)); + assert(*bytes != 0); + + calculate_l2_meta(bs, cluster_offset & L2E_OFFSET_MASK, guest_offset, + *bytes, l2_slice, m, true); ret = 1; } else { @@ -1357,9 +1424,10 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, } /* - * Allocates new clusters for an area that either is yet unallocated or needs a - * copy on write. If *host_offset is not INV_OFFSET, clusters are only - * allocated if the new allocation can match the specified host offset. + * Allocates new clusters for an area that is either still unallocated or + * cannot be overwritten in-place. If *host_offset is not INV_OFFSET, + * clusters are only allocated if the new allocation can match the specified + * host offset. * * Note that guest_offset may not be cluster aligned. 
In this case, the * returned *host_offset points to exact byte referenced by guest_offset and @@ -1382,12 +1450,10 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, BDRVQcow2State *s = bs->opaque; int l2_index; uint64_t *l2_slice; - uint64_t entry; uint64_t nb_clusters; int ret; - bool keep_old_clusters = false; - uint64_t alloc_cluster_offset = INV_OFFSET; + uint64_t alloc_cluster_offset; trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset, *bytes); @@ -1402,10 +1468,8 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, l2_index = offset_to_l2_slice_index(s, guest_offset); nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index); - assert(nb_clusters <= INT_MAX); - - /* Limit total allocation byte count to INT_MAX */ - nb_clusters = MIN(nb_clusters, INT_MAX >> s->cluster_bits); + /* Limit total allocation byte count to BDRV_REQUEST_MAX_BYTES */ + nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits); /* Find L2 entry for the first involved cluster */ ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index); @@ -1413,67 +1477,32 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, return ret; } - entry = be64_to_cpu(l2_slice[l2_index]); - nb_clusters = count_cow_clusters(bs, nb_clusters, l2_slice, l2_index); + nb_clusters = count_single_write_clusters(bs, nb_clusters, + l2_slice, l2_index, true); /* This function is only called when there were no non-COW clusters, so if * we can't find any unallocated or COW clusters either, something is * wrong with our code. 
*/ assert(nb_clusters > 0); - if (qcow2_get_cluster_type(bs, entry) == QCOW2_CLUSTER_ZERO_ALLOC && - (entry & QCOW_OFLAG_COPIED) && - (*host_offset == INV_OFFSET || - start_of_cluster(s, *host_offset) == (entry & L2E_OFFSET_MASK))) - { - int preallocated_nb_clusters; - - if (offset_into_cluster(s, entry & L2E_OFFSET_MASK)) { - qcow2_signal_corruption(bs, true, -1, -1, "Preallocated zero " - "cluster offset %#llx unaligned (guest " - "offset: %#" PRIx64 ")", - entry & L2E_OFFSET_MASK, guest_offset); - ret = -EIO; - goto fail; - } - - /* Try to reuse preallocated zero clusters; contiguous normal clusters - * would be fine, too, but count_cow_clusters() above has limited - * nb_clusters already to a range of COW clusters */ - preallocated_nb_clusters = - count_contiguous_clusters(bs, nb_clusters, s->cluster_size, - &l2_slice[l2_index], QCOW_OFLAG_COPIED); - assert(preallocated_nb_clusters > 0); - - nb_clusters = preallocated_nb_clusters; - alloc_cluster_offset = entry & L2E_OFFSET_MASK; - - /* We want to reuse these clusters, so qcow2_alloc_cluster_link_l2() - * should not free them. */ - keep_old_clusters = true; + /* Allocate at a given offset in the image file */ + alloc_cluster_offset = *host_offset == INV_OFFSET ? INV_OFFSET : + start_of_cluster(s, *host_offset); + ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset, + &nb_clusters); + if (ret < 0) { + goto out; } - qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); - - if (alloc_cluster_offset == INV_OFFSET) { - /* Allocate, if necessary at a given offset in the image file */ - alloc_cluster_offset = *host_offset == INV_OFFSET ? 
INV_OFFSET : - start_of_cluster(s, *host_offset); - ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset, - &nb_clusters); - if (ret < 0) { - goto fail; - } - - /* Can't extend contiguous allocation */ - if (nb_clusters == 0) { - *bytes = 0; - return 0; - } - - assert(alloc_cluster_offset != INV_OFFSET); + /* Can't extend contiguous allocation */ + if (nb_clusters == 0) { + *bytes = 0; + ret = 0; + goto out; } + assert(alloc_cluster_offset != INV_OFFSET); + /* * Save info needed for meta data update. * @@ -1496,13 +1525,14 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, *bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset)); assert(*bytes != 0); - calculate_l2_meta(bs, alloc_cluster_offset, guest_offset, *bytes, - m, keep_old_clusters); + calculate_l2_meta(bs, alloc_cluster_offset, guest_offset, *bytes, l2_slice, + m, false); - return 1; + ret = 1; -fail: - if (*m && (*m)->nb_clusters > 0) { +out: + qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); + if (ret < 0 && *m && (*m)->nb_clusters > 0) { QLIST_REMOVE(*m, next_in_flight); } return ret;