diff mbox

[v3,5/5] qcow2: Catch more unaligned write_zero into zero cluster

Message ID 1464234529-13018-6-git-send-email-eblake@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Eric Blake May 26, 2016, 3:48 a.m. UTC
is_zero_cluster() and is_zero_cluster_top_locked() are used only
by qcow2_co_write_zeroes().  The former is too broad (we don't
care if the sectors we are about to overwrite are non-zero, only
that all other sectors in the cluster are zero), so it needs to
be called up to twice but with smaller limits - rename it along
with adding the neeeded parameter.  The latter can be inlined for
more compact code.

The testsuite change shows that we now have a sparser top file
when an unaligned write_zeroes overwrites the only portion of
the backing file with data.

Based on a patch proposal by Denis V. Lunev.

CC: Denis V. Lunev <den@openvz.org>
Signed-off-by: Eric Blake <eblake@redhat.com>
---
 block/qcow2.c              | 47 +++++++++++++++++++++++-----------------------
 tests/qemu-iotests/154.out |  8 ++++----
 2 files changed, 27 insertions(+), 28 deletions(-)

Comments

Denis V. Lunev May 26, 2016, 1:41 p.m. UTC | #1
On 05/26/2016 06:48 AM, Eric Blake wrote:
> is_zero_cluster() and is_zero_cluster_top_locked() are used only
> by qcow2_co_write_zeroes().  The former is too broad (we don't
> care if the sectors we are about to overwrite are non-zero, only
> that all other sectors in the cluster are zero), so it needs to
> be called up to twice but with smaller limits - rename it along
> with adding the neeeded parameter.  The latter can be inlined for
> more compact code.
>
> The testsuite change shows that we now have a sparser top file
> when an unaligned write_zeroes overwrites the only portion of
> the backing file with data.
>
> Based on a patch proposal by Denis V. Lunev.
>
> CC: Denis V. Lunev <den@openvz.org>
> Signed-off-by: Eric Blake <eblake@redhat.com>
> ---
>   block/qcow2.c              | 47 +++++++++++++++++++++++-----------------------
>   tests/qemu-iotests/154.out |  8 ++++----
>   2 files changed, 27 insertions(+), 28 deletions(-)
>
> diff --git a/block/qcow2.c b/block/qcow2.c
> index 105fd5e..ecac399 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -2406,26 +2406,19 @@ finish:
>   }
>
>
> -static bool is_zero_cluster(BlockDriverState *bs, int64_t start)
> +static bool is_zero_sectors(BlockDriverState *bs, int64_t start,
> +                            uint32_t count)
>   {
> -    BDRVQcow2State *s = bs->opaque;
>       int nr;
>       BlockDriverState *file;
> -    int64_t res = bdrv_get_block_status_above(bs, NULL, start,
> -                                              s->cluster_sectors, &nr, &file);
> -    return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == s->cluster_sectors;
> -}
> +    int64_t res;
>
> -static bool is_zero_cluster_top_locked(BlockDriverState *bs, int64_t start)
> -{
> -    BDRVQcow2State *s = bs->opaque;
> -    int nr = s->cluster_sectors;
> -    uint64_t off;
> -    int ret;
> -
> -    ret = qcow2_get_cluster_offset(bs, start << BDRV_SECTOR_BITS, &nr, &off);
> -    assert(nr == s->cluster_sectors);
> -    return ret == QCOW2_CLUSTER_UNALLOCATED || ret == QCOW2_CLUSTER_ZERO;
> +    if (!count) {
> +        return true;
> +    }
> +    res = bdrv_get_block_status_above(bs, NULL, start, count,
> +                                      &nr, &file);
> +    return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == count;
>   }
>
>   static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
> @@ -2434,27 +2427,33 @@ static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
>       int ret;
>       BDRVQcow2State *s = bs->opaque;
>
> -    int head = sector_num % s->cluster_sectors;
> -    int tail = (sector_num + nb_sectors) % s->cluster_sectors;
> +    uint32_t head = sector_num % s->cluster_sectors;
> +    uint32_t tail = (sector_num + nb_sectors) % s->cluster_sectors;
>
>       trace_qcow2_write_zeroes_start_req(qemu_coroutine_self(), sector_num,
>                                          nb_sectors);
>
> -    if (head != 0 || tail != 0) {
> +    if (head || tail) {
>           int64_t cl_start = sector_num - head;
> +        uint64_t off;
> +        int nr;
>
>           assert(cl_start + s->cluster_sectors >= sector_num + nb_sectors);
>
> -        sector_num = cl_start;
> -        nb_sectors = s->cluster_sectors;
> -
> -        if (!is_zero_cluster(bs, sector_num)) {
> +        /* check whether remainder of cluster already reads as zero */
> +        if (!(is_zero_sectors(bs, cl_start, head) &&
> +              is_zero_sectors(bs, sector_num + nb_sectors,
> +                              -tail & (s->cluster_sectors - 1)))) {

can we have cluster_sectors != 2^n?
In this case this bits logic will be broken.
Eric Blake May 26, 2016, 2:35 p.m. UTC | #2
On 05/26/2016 07:41 AM, Denis V. Lunev wrote:
> On 05/26/2016 06:48 AM, Eric Blake wrote:
>> is_zero_cluster() and is_zero_cluster_top_locked() are used only
>> by qcow2_co_write_zeroes().  The former is too broad (we don't
>> care if the sectors we are about to overwrite are non-zero, only
>> that all other sectors in the cluster are zero), so it needs to
>> be called up to twice but with smaller limits - rename it along
>> with adding the neeeded parameter.  The latter can be inlined for
>> more compact code.
>>
>> The testsuite change shows that we now have a sparser top file
>> when an unaligned write_zeroes overwrites the only portion of
>> the backing file with data.
>>
>> Based on a patch proposal by Denis V. Lunev.
>>

>> -
>> -        if (!is_zero_cluster(bs, sector_num)) {
>> +        /* check whether remainder of cluster already reads as zero */
>> +        if (!(is_zero_sectors(bs, cl_start, head) &&
>> +              is_zero_sectors(bs, sector_num + nb_sectors,
>> +                              -tail & (s->cluster_sectors - 1)))) {
> 
> can we have cluster_sectors != 2^n?

No. cluster_sectors is an inherent property of the qcow2 file format:


         20 - 23:   cluster_bits
                    Number of bits that are used for addressing an offset
                    within a cluster (1 << cluster_bits is the cluster
size).
                    Must not be less than 9 (i.e. 512 byte clusters).


As the file format uses a bit shift value, you are guaranteed to have a
power of two amount of sectors within a cluster.

If you prefer, I could have written '-tail % s->cluster_sectors', but as
% on a negative signed integer gives different results than what you get
for an unsigned number, I felt that & was nicer than % for making it
more obvious that I'm grabbing particular bits.

If you can think of any cleaner expression that represents the number of
sectors occurring after the tail until the next cluster boundary, I'm
game; the hardest part is that when tail is 0, we want the number passed
to is_zero_sectors() to also be 0, not s->cluster_sectors (so the naive
's->cluster_sectors - tail' is wrong).
Denis V. Lunev May 26, 2016, 2:56 p.m. UTC | #3
On 05/26/2016 06:48 AM, Eric Blake wrote:
> is_zero_cluster() and is_zero_cluster_top_locked() are used only
> by qcow2_co_write_zeroes().  The former is too broad (we don't
> care if the sectors we are about to overwrite are non-zero, only
> that all other sectors in the cluster are zero), so it needs to
> be called up to twice but with smaller limits - rename it along
> with adding the neeeded parameter.  The latter can be inlined for
> more compact code.
>
> The testsuite change shows that we now have a sparser top file
> when an unaligned write_zeroes overwrites the only portion of
> the backing file with data.
>
> Based on a patch proposal by Denis V. Lunev.
>
> CC: Denis V. Lunev <den@openvz.org>
> Signed-off-by: Eric Blake <eblake@redhat.com>
> ---
>   block/qcow2.c              | 47 +++++++++++++++++++++++-----------------------
>   tests/qemu-iotests/154.out |  8 ++++----
>   2 files changed, 27 insertions(+), 28 deletions(-)
>
> diff --git a/block/qcow2.c b/block/qcow2.c
> index 105fd5e..ecac399 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -2406,26 +2406,19 @@ finish:
>   }
>
>
> -static bool is_zero_cluster(BlockDriverState *bs, int64_t start)
> +static bool is_zero_sectors(BlockDriverState *bs, int64_t start,
> +                            uint32_t count)
>   {
> -    BDRVQcow2State *s = bs->opaque;
>       int nr;
>       BlockDriverState *file;
> -    int64_t res = bdrv_get_block_status_above(bs, NULL, start,
> -                                              s->cluster_sectors, &nr, &file);
> -    return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == s->cluster_sectors;
> -}
> +    int64_t res;
>
> -static bool is_zero_cluster_top_locked(BlockDriverState *bs, int64_t start)
> -{
> -    BDRVQcow2State *s = bs->opaque;
> -    int nr = s->cluster_sectors;
> -    uint64_t off;
> -    int ret;
> -
> -    ret = qcow2_get_cluster_offset(bs, start << BDRV_SECTOR_BITS, &nr, &off);
> -    assert(nr == s->cluster_sectors);
> -    return ret == QCOW2_CLUSTER_UNALLOCATED || ret == QCOW2_CLUSTER_ZERO;
> +    if (!count) {
> +        return true;
> +    }
> +    res = bdrv_get_block_status_above(bs, NULL, start, count,
> +                                      &nr, &file);
> +    return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == count;
>   }
>
>   static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
> @@ -2434,27 +2427,33 @@ static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
>       int ret;
>       BDRVQcow2State *s = bs->opaque;
>
> -    int head = sector_num % s->cluster_sectors;
> -    int tail = (sector_num + nb_sectors) % s->cluster_sectors;
> +    uint32_t head = sector_num % s->cluster_sectors;
> +    uint32_t tail = (sector_num + nb_sectors) % s->cluster_sectors;
>
>       trace_qcow2_write_zeroes_start_req(qemu_coroutine_self(), sector_num,
>                                          nb_sectors);
>
> -    if (head != 0 || tail != 0) {
> +    if (head || tail) {
>           int64_t cl_start = sector_num - head;
> +        uint64_t off;
> +        int nr;
>
>           assert(cl_start + s->cluster_sectors >= sector_num + nb_sectors);
>
> -        sector_num = cl_start;
> -        nb_sectors = s->cluster_sectors;
> -
> -        if (!is_zero_cluster(bs, sector_num)) {
> +        /* check whether remainder of cluster already reads as zero */
> +        if (!(is_zero_sectors(bs, cl_start, head) &&
> +              is_zero_sectors(bs, sector_num + nb_sectors,
> +                              -tail & (s->cluster_sectors - 1)))) {
>               return -ENOTSUP;
>           }
>
>           qemu_co_mutex_lock(&s->lock);
>           /* We can have new write after previous check */
> -        if (!is_zero_cluster_top_locked(bs, sector_num)) {
> +        sector_num = cl_start;
> +        nb_sectors = nr = s->cluster_sectors;
> +        ret = qcow2_get_cluster_offset(bs, cl_start << BDRV_SECTOR_BITS,
> +                                       &nr, &off);
> +        if (ret != QCOW2_CLUSTER_UNALLOCATED && ret != QCOW2_CLUSTER_ZERO) {
>               qemu_co_mutex_unlock(&s->lock);
>               return -ENOTSUP;
>           }
> diff --git a/tests/qemu-iotests/154.out b/tests/qemu-iotests/154.out
> index 531fd8c..da9eabd 100644
> --- a/tests/qemu-iotests/154.out
> +++ b/tests/qemu-iotests/154.out
> @@ -102,13 +102,13 @@ wrote 2048/2048 bytes at offset 29696
>   read 4096/4096 bytes at offset 28672
>   4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
>   [{ "start": 0, "length": 4096, "depth": 1, "zero": true, "data": false},
> -{ "start": 4096, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 20480},
> +{ "start": 4096, "length": 4096, "depth": 0, "zero": true, "data": false},
>   { "start": 8192, "length": 4096, "depth": 1, "zero": true, "data": false},
> -{ "start": 12288, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 24576},
> +{ "start": 12288, "length": 4096, "depth": 0, "zero": true, "data": false},
>   { "start": 16384, "length": 4096, "depth": 1, "zero": true, "data": false},
> -{ "start": 20480, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 28672},
> +{ "start": 20480, "length": 4096, "depth": 0, "zero": true, "data": false},
>   { "start": 24576, "length": 4096, "depth": 1, "zero": true, "data": false},
> -{ "start": 28672, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 32768},
> +{ "start": 28672, "length": 4096, "depth": 0, "zero": true, "data": false},
>   { "start": 32768, "length": 134184960, "depth": 1, "zero": true, "data": false}]
>
>   == spanning two clusters, non-zero before request ==
Reviewed-by: Denis V. Lunev <den@openvz.org>

Thank you very much for pushing this!
Kevin Wolf June 2, 2016, 10:14 a.m. UTC | #4
Am 26.05.2016 um 16:35 hat Eric Blake geschrieben:
> On 05/26/2016 07:41 AM, Denis V. Lunev wrote:
> > On 05/26/2016 06:48 AM, Eric Blake wrote:
> >> is_zero_cluster() and is_zero_cluster_top_locked() are used only
> >> by qcow2_co_write_zeroes().  The former is too broad (we don't
> >> care if the sectors we are about to overwrite are non-zero, only
> >> that all other sectors in the cluster are zero), so it needs to
> >> be called up to twice but with smaller limits - rename it along
> >> with adding the neeeded parameter.  The latter can be inlined for
> >> more compact code.
> >>
> >> The testsuite change shows that we now have a sparser top file
> >> when an unaligned write_zeroes overwrites the only portion of
> >> the backing file with data.
> >>
> >> Based on a patch proposal by Denis V. Lunev.
> >>
> 
> >> -
> >> -        if (!is_zero_cluster(bs, sector_num)) {
> >> +        /* check whether remainder of cluster already reads as zero */
> >> +        if (!(is_zero_sectors(bs, cl_start, head) &&
> >> +              is_zero_sectors(bs, sector_num + nb_sectors,
> >> +                              -tail & (s->cluster_sectors - 1)))) {
> > 
> > can we have cluster_sectors != 2^n?
> 
> No. cluster_sectors is an inherent property of the qcow2 file format:
> 
> 
>          20 - 23:   cluster_bits
>                     Number of bits that are used for addressing an offset
>                     within a cluster (1 << cluster_bits is the cluster
> size).
>                     Must not be less than 9 (i.e. 512 byte clusters).
> 
> 
> As the file format uses a bit shift value, you are guaranteed to have a
> power of two amount of sectors within a cluster.
> 
> If you prefer, I could have written '-tail % s->cluster_sectors', but as
> % on a negative signed integer gives different results than what you get
> for an unsigned number, I felt that & was nicer than % for making it
> more obvious that I'm grabbing particular bits.
> 
> If you can think of any cleaner expression that represents the number of
> sectors occurring after the tail until the next cluster boundary, I'm
> game; the hardest part is that when tail is 0, we want the number passed
> to is_zero_sectors() to also be 0, not s->cluster_sectors (so the naive
> 's->cluster_sectors - tail' is wrong).

The obvious one would be translating your English into C:

    tail ? s->cluster_sectors - tail : 0

Another option which doesn't require an unsigned type would be
(s->cluster_sectors - tail) % s->cluster_sectors.

I'm okay with merging the "more interesting" version, though I must
admit that I had to read it twice.

Kevin
Eric Blake June 2, 2016, 12:33 p.m. UTC | #5
On 06/02/2016 04:14 AM, Kevin Wolf wrote:

>> If you prefer, I could have written '-tail % s->cluster_sectors', but as
>> % on a negative signed integer gives different results than what you get
>> for an unsigned number, I felt that & was nicer than % for making it
>> more obvious that I'm grabbing particular bits.
>>
>> If you can think of any cleaner expression that represents the number of
>> sectors occurring after the tail until the next cluster boundary, I'm
>> game; the hardest part is that when tail is 0, we want the number passed
>> to is_zero_sectors() to also be 0, not s->cluster_sectors (so the naive
>> 's->cluster_sectors - tail' is wrong).
> 
> The obvious one would be translating your English into C:
> 
>     tail ? s->cluster_sectors - tail : 0

Would gcc optimize this into a bit operation rather than a branch?  If
not, that's a missed optimization bug that we should probably report
(that is, if gcc has enough information elsewhere that
s->cluster_sectors is a power of 2, since the bit operation optimization
depends on that fact).

> 
> Another option which doesn't require an unsigned type would be
> (s->cluster_sectors - tail) % s->cluster_sectors.

That's the same thing as '-tail % s->cluster_sectors', since the
distributive rule applies to modulo arithmetic, and since 'a % a' is 0
for non-zero 'a'.  So you can argue I was just saving typing :)

> 
> I'm okay with merging the "more interesting" version, though I must
> admit that I had to read it twice.

That's certainly your call as maintainer :)
diff mbox

Patch

diff --git a/block/qcow2.c b/block/qcow2.c
index 105fd5e..ecac399 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2406,26 +2406,19 @@  finish:
 }


-static bool is_zero_cluster(BlockDriverState *bs, int64_t start)
+static bool is_zero_sectors(BlockDriverState *bs, int64_t start,
+                            uint32_t count)
 {
-    BDRVQcow2State *s = bs->opaque;
     int nr;
     BlockDriverState *file;
-    int64_t res = bdrv_get_block_status_above(bs, NULL, start,
-                                              s->cluster_sectors, &nr, &file);
-    return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == s->cluster_sectors;
-}
+    int64_t res;

-static bool is_zero_cluster_top_locked(BlockDriverState *bs, int64_t start)
-{
-    BDRVQcow2State *s = bs->opaque;
-    int nr = s->cluster_sectors;
-    uint64_t off;
-    int ret;
-
-    ret = qcow2_get_cluster_offset(bs, start << BDRV_SECTOR_BITS, &nr, &off);
-    assert(nr == s->cluster_sectors);
-    return ret == QCOW2_CLUSTER_UNALLOCATED || ret == QCOW2_CLUSTER_ZERO;
+    if (!count) {
+        return true;
+    }
+    res = bdrv_get_block_status_above(bs, NULL, start, count,
+                                      &nr, &file);
+    return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == count;
 }

 static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
@@ -2434,27 +2427,33 @@  static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
     int ret;
     BDRVQcow2State *s = bs->opaque;

-    int head = sector_num % s->cluster_sectors;
-    int tail = (sector_num + nb_sectors) % s->cluster_sectors;
+    uint32_t head = sector_num % s->cluster_sectors;
+    uint32_t tail = (sector_num + nb_sectors) % s->cluster_sectors;

     trace_qcow2_write_zeroes_start_req(qemu_coroutine_self(), sector_num,
                                        nb_sectors);

-    if (head != 0 || tail != 0) {
+    if (head || tail) {
         int64_t cl_start = sector_num - head;
+        uint64_t off;
+        int nr;

         assert(cl_start + s->cluster_sectors >= sector_num + nb_sectors);

-        sector_num = cl_start;
-        nb_sectors = s->cluster_sectors;
-
-        if (!is_zero_cluster(bs, sector_num)) {
+        /* check whether remainder of cluster already reads as zero */
+        if (!(is_zero_sectors(bs, cl_start, head) &&
+              is_zero_sectors(bs, sector_num + nb_sectors,
+                              -tail & (s->cluster_sectors - 1)))) {
             return -ENOTSUP;
         }

         qemu_co_mutex_lock(&s->lock);
         /* We can have new write after previous check */
-        if (!is_zero_cluster_top_locked(bs, sector_num)) {
+        sector_num = cl_start;
+        nb_sectors = nr = s->cluster_sectors;
+        ret = qcow2_get_cluster_offset(bs, cl_start << BDRV_SECTOR_BITS,
+                                       &nr, &off);
+        if (ret != QCOW2_CLUSTER_UNALLOCATED && ret != QCOW2_CLUSTER_ZERO) {
             qemu_co_mutex_unlock(&s->lock);
             return -ENOTSUP;
         }
diff --git a/tests/qemu-iotests/154.out b/tests/qemu-iotests/154.out
index 531fd8c..da9eabd 100644
--- a/tests/qemu-iotests/154.out
+++ b/tests/qemu-iotests/154.out
@@ -102,13 +102,13 @@  wrote 2048/2048 bytes at offset 29696
 read 4096/4096 bytes at offset 28672
 4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 [{ "start": 0, "length": 4096, "depth": 1, "zero": true, "data": false},
-{ "start": 4096, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 20480},
+{ "start": 4096, "length": 4096, "depth": 0, "zero": true, "data": false},
 { "start": 8192, "length": 4096, "depth": 1, "zero": true, "data": false},
-{ "start": 12288, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 24576},
+{ "start": 12288, "length": 4096, "depth": 0, "zero": true, "data": false},
 { "start": 16384, "length": 4096, "depth": 1, "zero": true, "data": false},
-{ "start": 20480, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 28672},
+{ "start": 20480, "length": 4096, "depth": 0, "zero": true, "data": false},
 { "start": 24576, "length": 4096, "depth": 1, "zero": true, "data": false},
-{ "start": 28672, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": 32768},
+{ "start": 28672, "length": 4096, "depth": 0, "zero": true, "data": false},
 { "start": 32768, "length": 134184960, "depth": 1, "zero": true, "data": false}]

 == spanning two clusters, non-zero before request ==