diff mbox series

[v2] Consider discard option when writing zeros

Message ID 20240619174022.1298578-1-nsoffer@redhat.com (mailing list archive)
State New, archived
Headers show
Series [v2] Consider discard option when writing zeros | expand

Commit Message

Nir Soffer June 19, 2024, 5:40 p.m. UTC
When opening an image with discard=off, we punch hole in the image when
writing zeroes, making the image sparse. This breaks users that want to
ensure that writes cannot fail with ENOSPACE by using fully allocated
images.

bdrv_co_pwrite_zeroes() correctly disable BDRV_REQ_MAY_UNMAP if we
opened the child without discard=unmap or discard=on. But we don't go
through this function when accessing the top node. Move the check down
to bdrv_co_do_pwrite_zeroes() which seems to be used in all code paths.

Issues:
- We don't punch hole by default, so images are kept allocated. Before
  this change we punched holes by default. I'm not sure this is a good
  change in behavior.
- Need to run all block tests
- Not sure that we have tests covering unmapping, we may need new tests
- We may need new tests to cover this change

Signed-off-by: Nir Soffer <nsoffer@redhat.com>
---

Changes since v1:
- Replace the incorrect has_discard change with the right fix

v1 was here:
https://lists.nongnu.org/archive/html/qemu-block/2024-06/msg00198.html

 block/io.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

Comments

Nir Soffer June 19, 2024, 5:43 p.m. UTC | #1
Tested using:

$ cat test-unmap.sh
#!/bin/sh

qemu=${1:?Usage: $0 qemu-executable}
img=/tmp/test.raw

echo
echo "defaults - write zeroes"
fallocate -l 1m $img
echo -e 'qemu-io none0 "write -z 0 1m"\nquit' | $qemu -monitor stdio \
    -drive if=none,file=$img,format=raw >/dev/null
du -sh $img

echo
echo "defaults - write zeroes unmap"
fallocate -l 1m $img
echo -e 'qemu-io none0 "write -zu 0 1m"\nquit' | $qemu -monitor stdio \
    -drive if=none,file=$img,format=raw >/dev/null
du -sh $img

echo
echo "defaults - write actual zeros"
fallocate -l 1m $img
echo -e 'qemu-io none0 "write -P 0 0 1m"\nquit' | $qemu -monitor stdio \
    -drive if=none,file=$img,format=raw >/dev/null
du -sh $img

echo
echo "discard=off - write zeroes unmap"
fallocate -l 1m $img
echo -e 'qemu-io none0 "write -zu 0 1m"\nquit' | $qemu -monitor stdio \
    -drive if=none,file=$img,format=raw,discard=off >/dev/null
du -sh $img

echo
echo "detect-zeros=on - write actual zeros"
fallocate -l 1m $img
echo -e 'qemu-io none0 "write -P 0 0 1m"\nquit' | $qemu -monitor stdio \
    -drive if=none,file=$img,format=raw,detect-zeroes=on >/dev/null
du -sh $img

echo
echo "detect-zeros=unmap,discard=unmap - write actual zeros"
fallocate -l 1m $img
echo -e 'qemu-io none0 "write -P 0 0 1m"\nquit' |  $qemu -monitor stdio \
    -drive if=none,file=$img,format=raw,detect-zeroes=unmap,discard=unmap
>/dev/null
du -sh $img

echo
echo "discard=unmap - write zeroes"
fallocate -l 1m $img
echo -e 'qemu-io none0 "write -z 0 1m"\nquit' | $qemu -monitor stdio \
    -drive if=none,file=$img,format=raw,discard=unmap >/dev/null
du -sh $img

echo
echo "discard=unmap - write zeroes unmap"
fallocate -l 1m $img
echo -e 'qemu-io none0 "write -zu 0 1m"\nquit' | $qemu -monitor stdio \
    -drive if=none,file=$img,format=raw,discard=unmap >/dev/null
du -sh $img

rm $img


Before this change:

$ cat before.out

defaults - write zeroes
1.0M /tmp/test.raw

defaults - write zeroes unmap
0 /tmp/test.raw

defaults - write actual zeros
1.0M /tmp/test.raw

discard=off - write zeroes unmap
0 /tmp/test.raw

detect-zeros=on - write actual zeros
1.0M /tmp/test.raw

detect-zeros=unmap,discard=unmap - write actual zeros
0 /tmp/test.raw

discard=unmap - write zeroes
1.0M /tmp/test.raw

discard=unmap - write zeroes unmap
0 /tmp/test.raw
[nsoffer build (consider-discard-option)]$


After this change:

$ cat after.out

defaults - write zeroes
1.0M /tmp/test.raw

defaults - write zeroes unmap
1.0M /tmp/test.raw

defaults - write actual zeros
1.0M /tmp/test.raw

discard=off - write zeroes unmap
1.0M /tmp/test.raw

detect-zeros=on - write actual zeros
1.0M /tmp/test.raw

detect-zeros=unmap,discard=unmap - write actual zeros
0 /tmp/test.raw

discard=unmap - write zeroes
1.0M /tmp/test.raw

discard=unmap - write zeroes unmap
0 /tmp/test.raw


Differences:

$ diff -u before.out after.out
--- before.out 2024-06-19 20:24:09.234083713 +0300
+++ after.out 2024-06-19 20:24:20.526165573 +0300
@@ -3,13 +3,13 @@
 1.0M /tmp/test.raw

 defaults - write zeroes unmap
-0 /tmp/test.raw
+1.0M /tmp/test.raw

 defaults - write actual zeros
 1.0M /tmp/test.raw

 discard=off - write zeroes unmap
-0 /tmp/test.raw
+1.0M /tmp/test.raw

On Wed, Jun 19, 2024 at 8:40 PM Nir Soffer <nsoffer@redhat.com> wrote:

> When opening an image with discard=off, we punch hole in the image when
> writing zeroes, making the image sparse. This breaks users that want to
> ensure that writes cannot fail with ENOSPACE by using fully allocated
> images.
>
> bdrv_co_pwrite_zeroes() correctly disable BDRV_REQ_MAY_UNMAP if we
> opened the child without discard=unmap or discard=on. But we don't go
> through this function when accessing the top node. Move the check down
> to bdrv_co_do_pwrite_zeroes() which seems to be used in all code paths.
>
> Issues:
> - We don't punch hole by default, so images are kept allocated. Before
>   this change we punched holes by default. I'm not sure this is a good
>   change in behavior.
> - Need to run all block tests
> - Not sure that we have tests covering unmapping, we may need new tests
> - We may need new tests to cover this change
>
> Signed-off-by: Nir Soffer <nsoffer@redhat.com>
> ---
>
> Changes since v1:
> - Replace the incorrect has_discard change with the right fix
>
> v1 was here:
> https://lists.nongnu.org/archive/html/qemu-block/2024-06/msg00198.html
>
>  block/io.c | 9 +++++----
>  1 file changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/block/io.c b/block/io.c
> index 7217cf811b..301514c880 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -1860,10 +1860,15 @@ bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
> int64_t offset, int64_t bytes,
>      /* By definition there is no user buffer so this flag doesn't make
> sense */
>      if (flags & BDRV_REQ_REGISTERED_BUF) {
>          return -EINVAL;
>      }
>
> +    /* If opened with discard=off we should never unmap. */
> +    if (!(bs->open_flags & BDRV_O_UNMAP)) {
> +        flags &= ~BDRV_REQ_MAY_UNMAP;
> +    }
> +
>      /* Invalidate the cached block-status data range if this write
> overlaps */
>      bdrv_bsc_invalidate_range(bs, offset, bytes);
>
>      assert(alignment % bs->bl.request_alignment == 0);
>      head = offset % alignment;
> @@ -2313,14 +2318,10 @@ int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild
> *child, int64_t offset,
>  {
>      IO_CODE();
>      trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
>      assert_bdrv_graph_readable();
>
> -    if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
> -        flags &= ~BDRV_REQ_MAY_UNMAP;
> -    }
> -
>      return bdrv_co_pwritev(child, offset, bytes, NULL,
>                             BDRV_REQ_ZERO_WRITE | flags);
>  }
>
>  /*
> --
> 2.45.1
>
>
Nir Soffer June 19, 2024, 5:45 p.m. UTC | #2
On Wed, Jun 19, 2024 at 8:40 PM Nir Soffer <nsoffer@redhat.com> wrote:

> - Need to run all block tests
>

Stale note, make check pass
Stefan Hajnoczi June 24, 2024, 3:23 p.m. UTC | #3
On Wed, Jun 19, 2024 at 08:43:25PM +0300, Nir Soffer wrote:
> Tested using:

Hi Nir,
This looks like a good candidate for the qemu-iotests test suite. Adding
it to the automated tests will protect against future regressions.

Please add the script and the expected output to
tests/qemu-iotests/test/write-zeroes-unmap and run it using
`(cd build && tests/qemu-iotests/check write-zeroes-unmap)`.

See the existing test cases in tests/qemu-iotests/ and
tests/qemu-iotests/tests/ for examples. Some are shell scripts and
others are Python. I think shell makes sense for this test case. You can
copy the test framework boilerplate from an existing test case.

Thanks,
Stefan

> 
> $ cat test-unmap.sh
> #!/bin/sh
> 
> qemu=${1:?Usage: $0 qemu-executable}
> img=/tmp/test.raw
> 
> echo
> echo "defaults - write zeroes"
> fallocate -l 1m $img
> echo -e 'qemu-io none0 "write -z 0 1m"\nquit' | $qemu -monitor stdio \
>     -drive if=none,file=$img,format=raw >/dev/null
> du -sh $img
> 
> echo
> echo "defaults - write zeroes unmap"
> fallocate -l 1m $img
> echo -e 'qemu-io none0 "write -zu 0 1m"\nquit' | $qemu -monitor stdio \
>     -drive if=none,file=$img,format=raw >/dev/null
> du -sh $img
> 
> echo
> echo "defaults - write actual zeros"
> fallocate -l 1m $img
> echo -e 'qemu-io none0 "write -P 0 0 1m"\nquit' | $qemu -monitor stdio \
>     -drive if=none,file=$img,format=raw >/dev/null
> du -sh $img
> 
> echo
> echo "discard=off - write zeroes unmap"
> fallocate -l 1m $img
> echo -e 'qemu-io none0 "write -zu 0 1m"\nquit' | $qemu -monitor stdio \
>     -drive if=none,file=$img,format=raw,discard=off >/dev/null
> du -sh $img
> 
> echo
> echo "detect-zeros=on - write actual zeros"
> fallocate -l 1m $img
> echo -e 'qemu-io none0 "write -P 0 0 1m"\nquit' | $qemu -monitor stdio \
>     -drive if=none,file=$img,format=raw,detect-zeroes=on >/dev/null
> du -sh $img
> 
> echo
> echo "detect-zeros=unmap,discard=unmap - write actual zeros"
> fallocate -l 1m $img
> echo -e 'qemu-io none0 "write -P 0 0 1m"\nquit' |  $qemu -monitor stdio \
>     -drive if=none,file=$img,format=raw,detect-zeroes=unmap,discard=unmap
> >/dev/null
> du -sh $img
> 
> echo
> echo "discard=unmap - write zeroes"
> fallocate -l 1m $img
> echo -e 'qemu-io none0 "write -z 0 1m"\nquit' | $qemu -monitor stdio \
>     -drive if=none,file=$img,format=raw,discard=unmap >/dev/null
> du -sh $img
> 
> echo
> echo "discard=unmap - write zeroes unmap"
> fallocate -l 1m $img
> echo -e 'qemu-io none0 "write -zu 0 1m"\nquit' | $qemu -monitor stdio \
>     -drive if=none,file=$img,format=raw,discard=unmap >/dev/null
> du -sh $img
> 
> rm $img
> 
> 
> Before this change:
> 
> $ cat before.out
> 
> defaults - write zeroes
> 1.0M /tmp/test.raw
> 
> defaults - write zeroes unmap
> 0 /tmp/test.raw
> 
> defaults - write actual zeros
> 1.0M /tmp/test.raw
> 
> discard=off - write zeroes unmap
> 0 /tmp/test.raw
> 
> detect-zeros=on - write actual zeros
> 1.0M /tmp/test.raw
> 
> detect-zeros=unmap,discard=unmap - write actual zeros
> 0 /tmp/test.raw
> 
> discard=unmap - write zeroes
> 1.0M /tmp/test.raw
> 
> discard=unmap - write zeroes unmap
> 0 /tmp/test.raw
> [nsoffer build (consider-discard-option)]$
> 
> 
> After this change:
> 
> $ cat after.out
> 
> defaults - write zeroes
> 1.0M /tmp/test.raw
> 
> defaults - write zeroes unmap
> 1.0M /tmp/test.raw
> 
> defaults - write actual zeros
> 1.0M /tmp/test.raw
> 
> discard=off - write zeroes unmap
> 1.0M /tmp/test.raw
> 
> detect-zeros=on - write actual zeros
> 1.0M /tmp/test.raw
> 
> detect-zeros=unmap,discard=unmap - write actual zeros
> 0 /tmp/test.raw
> 
> discard=unmap - write zeroes
> 1.0M /tmp/test.raw
> 
> discard=unmap - write zeroes unmap
> 0 /tmp/test.raw
> 
> 
> Differences:
> 
> $ diff -u before.out after.out
> --- before.out 2024-06-19 20:24:09.234083713 +0300
> +++ after.out 2024-06-19 20:24:20.526165573 +0300
> @@ -3,13 +3,13 @@
>  1.0M /tmp/test.raw
> 
>  defaults - write zeroes unmap
> -0 /tmp/test.raw
> +1.0M /tmp/test.raw
> 
>  defaults - write actual zeros
>  1.0M /tmp/test.raw
> 
>  discard=off - write zeroes unmap
> -0 /tmp/test.raw
> +1.0M /tmp/test.raw
> 
> On Wed, Jun 19, 2024 at 8:40 PM Nir Soffer <nsoffer@redhat.com> wrote:
> 
> > When opening an image with discard=off, we punch hole in the image when
> > writing zeroes, making the image sparse. This breaks users that want to
> > ensure that writes cannot fail with ENOSPACE by using fully allocated
> > images.
> >
> > bdrv_co_pwrite_zeroes() correctly disable BDRV_REQ_MAY_UNMAP if we
> > opened the child without discard=unmap or discard=on. But we don't go
> > through this function when accessing the top node. Move the check down
> > to bdrv_co_do_pwrite_zeroes() which seems to be used in all code paths.
> >
> > Issues:
> > - We don't punch hole by default, so images are kept allocated. Before
> >   this change we punched holes by default. I'm not sure this is a good
> >   change in behavior.
> > - Need to run all block tests
> > - Not sure that we have tests covering unmapping, we may need new tests
> > - We may need new tests to cover this change
> >
> > Signed-off-by: Nir Soffer <nsoffer@redhat.com>
> > ---
> >
> > Changes since v1:
> > - Replace the incorrect has_discard change with the right fix
> >
> > v1 was here:
> > https://lists.nongnu.org/archive/html/qemu-block/2024-06/msg00198.html
> >
> >  block/io.c | 9 +++++----
> >  1 file changed, 5 insertions(+), 4 deletions(-)
> >
> > diff --git a/block/io.c b/block/io.c
> > index 7217cf811b..301514c880 100644
> > --- a/block/io.c
> > +++ b/block/io.c
> > @@ -1860,10 +1860,15 @@ bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
> > int64_t offset, int64_t bytes,
> >      /* By definition there is no user buffer so this flag doesn't make
> > sense */
> >      if (flags & BDRV_REQ_REGISTERED_BUF) {
> >          return -EINVAL;
> >      }
> >
> > +    /* If opened with discard=off we should never unmap. */
> > +    if (!(bs->open_flags & BDRV_O_UNMAP)) {
> > +        flags &= ~BDRV_REQ_MAY_UNMAP;
> > +    }
> > +
> >      /* Invalidate the cached block-status data range if this write
> > overlaps */
> >      bdrv_bsc_invalidate_range(bs, offset, bytes);
> >
> >      assert(alignment % bs->bl.request_alignment == 0);
> >      head = offset % alignment;
> > @@ -2313,14 +2318,10 @@ int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild
> > *child, int64_t offset,
> >  {
> >      IO_CODE();
> >      trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
> >      assert_bdrv_graph_readable();
> >
> > -    if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
> > -        flags &= ~BDRV_REQ_MAY_UNMAP;
> > -    }
> > -
> >      return bdrv_co_pwritev(child, offset, bytes, NULL,
> >                             BDRV_REQ_ZERO_WRITE | flags);
> >  }
> >
> >  /*
> > --
> > 2.45.1
> >
> >
Kevin Wolf June 24, 2024, 4:08 p.m. UTC | #4
Am 24.06.2024 um 17:23 hat Stefan Hajnoczi geschrieben:
> On Wed, Jun 19, 2024 at 08:43:25PM +0300, Nir Soffer wrote:
> > Tested using:
> 
> Hi Nir,
> This looks like a good candidate for the qemu-iotests test suite. Adding
> it to the automated tests will protect against future regressions.
> 
> Please add the script and the expected output to
> tests/qemu-iotests/test/write-zeroes-unmap and run it using
> `(cd build && tests/qemu-iotests/check write-zeroes-unmap)`.
> 
> See the existing test cases in tests/qemu-iotests/ and
> tests/qemu-iotests/tests/ for examples. Some are shell scripts and
> others are Python. I think shell makes sense for this test case. You
> can copy the test framework boilerplate from an existing test case.

'du' can't be used like this in qemu-iotests because it makes
assumptions that depend on the filesystem. A test case replicating what
Nir did manually would likely fail on XFS with its preallocation.

Maybe we could operate on a file exposed by the FUSE export that is
backed by qcow2, and then you can use 'qemu-img map' on that qcow2 image
to verify the allocation status. Somewhat complicated, but I think it
could work.

Kevin
Nir Soffer June 24, 2024, 9:12 p.m. UTC | #5
On Mon, Jun 24, 2024 at 7:08 PM Kevin Wolf <kwolf@redhat.com> wrote:

> Am 24.06.2024 um 17:23 hat Stefan Hajnoczi geschrieben:
> > On Wed, Jun 19, 2024 at 08:43:25PM +0300, Nir Soffer wrote:
> > > Tested using:
> >
> > Hi Nir,
> > This looks like a good candidate for the qemu-iotests test suite. Adding
> > it to the automated tests will protect against future regressions.
> >
> > Please add the script and the expected output to
> > tests/qemu-iotests/test/write-zeroes-unmap and run it using
> > `(cd build && tests/qemu-iotests/check write-zeroes-unmap)`.
> >
> > See the existing test cases in tests/qemu-iotests/ and
> > tests/qemu-iotests/tests/ for examples. Some are shell scripts and
> > others are Python. I think shell makes sense for this test case. You
> > can copy the test framework boilerplate from an existing test case.
>
> 'du' can't be used like this in qemu-iotests because it makes
> assumptions that depend on the filesystem. A test case replicating what
> Nir did manually would likely fail on XFS with its preallocation.
>

This is why I did not try to add a new qemu-iotest yet.


> Maybe we could operate on a file exposed by the FUSE export that is
> backed by qcow2, and then you can use 'qemu-img map' on that qcow2 image
> to verify the allocation status. Somewhat complicated, but I think it
> could work.
>

Do we have examples of using the FUSE export? It sounds complicated but
being able to test on any file system is awesome. The complexity can be
hidden behind simple test helpers.

Another option is to use a specific file system created for the tests, for
example
on a loop device. We used userstorage[1] in ovirt to test on specific file
systems
with known sector size.

But more important, are you ok with the change?

I'm not sure about not creating sparse images by default - this is not
consistent
with qemu-img convert and qemu-nbd, which do sparsify by default. The old
behavior seems better.

[1] https://github.com/nirs/userstorage

Nir
Kevin Wolf June 26, 2024, 8:42 a.m. UTC | #6
Am 24.06.2024 um 23:12 hat Nir Soffer geschrieben:
> On Mon, Jun 24, 2024 at 7:08 PM Kevin Wolf <kwolf@redhat.com> wrote:
> 
> > Am 24.06.2024 um 17:23 hat Stefan Hajnoczi geschrieben:
> > > On Wed, Jun 19, 2024 at 08:43:25PM +0300, Nir Soffer wrote:
> > > > Tested using:
> > >
> > > Hi Nir,
> > > This looks like a good candidate for the qemu-iotests test suite. Adding
> > > it to the automated tests will protect against future regressions.
> > >
> > > Please add the script and the expected output to
> > > tests/qemu-iotests/test/write-zeroes-unmap and run it using
> > > `(cd build && tests/qemu-iotests/check write-zeroes-unmap)`.
> > >
> > > See the existing test cases in tests/qemu-iotests/ and
> > > tests/qemu-iotests/tests/ for examples. Some are shell scripts and
> > > others are Python. I think shell makes sense for this test case. You
> > > can copy the test framework boilerplate from an existing test case.
> >
> > 'du' can't be used like this in qemu-iotests because it makes
> > assumptions that depend on the filesystem. A test case replicating what
> > Nir did manually would likely fail on XFS with its preallocation.
> 
> This is why I did not try to add a new qemu-iotest yet.
> 
> > Maybe we could operate on a file exposed by the FUSE export that is
> > backed by qcow2, and then you can use 'qemu-img map' on that qcow2 image
> > to verify the allocation status. Somewhat complicated, but I think it
> > could work.
> 
> Do we have examples of using the FUSE export? It sounds complicated but
> being able to test on any file system is awesome. The complexity can be
> hidden behind simple test helpers.

We seem to have a few tests that use it, and then the fuse protocol
implementation, too. 308 and file-io-error look relevant.

> Another option is to use a specific file system created for the tests,
> for example on a loop device. We used userstorage[1] in ovirt to test
> on specific file systems with known sector size.

Creating loop devices requires root privileges. If I understand
correctly, userstorage solved that by having a setup phase as root
before running the tests as a normal user? We don't really have that in
qemu-iotests.

Some tests require passwordless sudo and are skipped otherwise, but this
means that in practice they are almost always skipped.

> But more important, are you ok with the change?
> 
> I'm not sure about not creating sparse images by default - this is not
> consistent with qemu-img convert and qemu-nbd, which do sparsify by
> default. The old behavior seems better.

Well, your patches make it do what we always claimed it would do, so
that consistency is certainly a good thing. Unmapping on write_zeroes
and ignoring truncate is a weird combination anyway that doesn't really
make any sense to me, so I don't think it's worth preserving. The other
way around could have been more defensible, but that's not how our bug
works.

Now, if ignoring all discard requests is a good default these days is a
separate question and I'm not sure really. Maybe discard=unmap should
be the default (and apply to both discard are write_zeroes, of course).

Kevin
Daniel P. Berrangé June 26, 2024, 9:17 a.m. UTC | #7
On Mon, Jun 24, 2024 at 06:08:26PM +0200, Kevin Wolf wrote:
> Am 24.06.2024 um 17:23 hat Stefan Hajnoczi geschrieben:
> > On Wed, Jun 19, 2024 at 08:43:25PM +0300, Nir Soffer wrote:
> > > Tested using:
> > 
> > Hi Nir,
> > This looks like a good candidate for the qemu-iotests test suite. Adding
> > it to the automated tests will protect against future regressions.
> > 
> > Please add the script and the expected output to
> > tests/qemu-iotests/test/write-zeroes-unmap and run it using
> > `(cd build && tests/qemu-iotests/check write-zeroes-unmap)`.
> > 
> > See the existing test cases in tests/qemu-iotests/ and
> > tests/qemu-iotests/tests/ for examples. Some are shell scripts and
> > others are Python. I think shell makes sense for this test case. You
> > can copy the test framework boilerplate from an existing test case.
> 
> 'du' can't be used like this in qemu-iotests because it makes
> assumptions that depend on the filesystem. A test case replicating what
> Nir did manually would likely fail on XFS with its preallocation.
> 
> Maybe we could operate on a file exposed by the FUSE export that is
> backed by qcow2, and then you can use 'qemu-img map' on that qcow2 image
> to verify the allocation status. Somewhat complicated, but I think it
> could work.

A simpler option would be to use 'du' but with a fuzzy range test,
rather than an exact equality test.

For the tests which write 1 MB, check the 'du' usage is "at least 1MB",
for the tests which expect to unmap blocks, check that the 'du' usage
is "less than 256kb". This should be within bounds of xfs speculative
allocation.

With regards,
Daniel
Nir Soffer June 26, 2024, 4:25 p.m. UTC | #8
On Wed, Jun 26, 2024 at 11:42 AM Kevin Wolf <kwolf@redhat.com> wrote:

> Am 24.06.2024 um 23:12 hat Nir Soffer geschrieben:
> > On Mon, Jun 24, 2024 at 7:08 PM Kevin Wolf <kwolf@redhat.com> wrote:
> >
> > > Am 24.06.2024 um 17:23 hat Stefan Hajnoczi geschrieben:
> > > > On Wed, Jun 19, 2024 at 08:43:25PM +0300, Nir Soffer wrote:
> > > > > Tested using:
> > > >
> > > > Hi Nir,
> > > > This looks like a good candidate for the qemu-iotests test suite.
> Adding
> > > > it to the automated tests will protect against future regressions.
> > > >
> > > > Please add the script and the expected output to
> > > > tests/qemu-iotests/test/write-zeroes-unmap and run it using
> > > > `(cd build && tests/qemu-iotests/check write-zeroes-unmap)`.
> > > >
> > > > See the existing test cases in tests/qemu-iotests/ and
> > > > tests/qemu-iotests/tests/ for examples. Some are shell scripts and
> > > > others are Python. I think shell makes sense for this test case. You
> > > > can copy the test framework boilerplate from an existing test case.
> > >
> > > 'du' can't be used like this in qemu-iotests because it makes
> > > assumptions that depend on the filesystem. A test case replicating what
> > > Nir did manually would likely fail on XFS with its preallocation.
> >
> > This is why I did not try to add a new qemu-iotest yet.
> >
> > > Maybe we could operate on a file exposed by the FUSE export that is
> > > backed by qcow2, and then you can use 'qemu-img map' on that qcow2
> image
> > > to verify the allocation status. Somewhat complicated, but I think it
> > > could work.
> >
> > Do we have examples of using the FUSE export? It sounds complicated but
> > being able to test on any file system is awesome. The complexity can be
> > hidden behind simple test helpers.
>
> We seem to have a few tests that use it, and then the fuse protocol
> implementation, too. 308 and file-io-error look relevant.
>
> > Another option is to use a specific file system created for the tests,
> > for example on a loop device. We used userstorage[1] in ovirt to test
> > on specific file systems with known sector size.
>
> Creating loop devices requires root privileges. If I understand
> correctly, userstorage solved that by having a setup phase as root
> before running the tests as a normal user? We don't really have that in
> qemu-iotests.
>
> Some tests require passwordless sudo and are skipped otherwise, but this
> means that in practice they are almost always skipped.
>

Yes, this is the assumption the storage is being created before running the
tests,
for example when setting up a development or CI environment, and the tests
can run with unprivileged user.

> But more important, are you ok with the change?
> >
> > I'm not sure about not creating sparse images by default - this is not
> > consistent with qemu-img convert and qemu-nbd, which do sparsify by
> > default. The old behavior seems better.
>
> Well, your patches make it do what we always claimed it would do, so
> that consistency is certainly a good thing. Unmapping on write_zeroes
> and ignoring truncate is a weird combination anyway that doesn't really
> make any sense to me, so I don't think it's worth preserving. The other
> way around could have been more defensible, but that's not how our bug
> works.
>
> Now, if ignoring all discard requests is a good default these days is a
> separate question and I'm not sure really. Maybe discard=unmap should
> be the default (and apply to both discard are write_zeroes, of course).
>

OK, lets limit the scope to fix the code to match the current docs. We can
tweak
the defaults later.
Nir Soffer June 26, 2024, 4:27 p.m. UTC | #9
On Wed, Jun 26, 2024 at 12:17 PM Daniel P. Berrangé <berrange@redhat.com>
wrote:

> On Mon, Jun 24, 2024 at 06:08:26PM +0200, Kevin Wolf wrote:
> > Am 24.06.2024 um 17:23 hat Stefan Hajnoczi geschrieben:
> > > On Wed, Jun 19, 2024 at 08:43:25PM +0300, Nir Soffer wrote:
> > > > Tested using:
> > >
> > > Hi Nir,
> > > This looks like a good candidate for the qemu-iotests test suite.
> Adding
> > > it to the automated tests will protect against future regressions.
> > >
> > > Please add the script and the expected output to
> > > tests/qemu-iotests/test/write-zeroes-unmap and run it using
> > > `(cd build && tests/qemu-iotests/check write-zeroes-unmap)`.
> > >
> > > See the existing test cases in tests/qemu-iotests/ and
> > > tests/qemu-iotests/tests/ for examples. Some are shell scripts and
> > > others are Python. I think shell makes sense for this test case. You
> > > can copy the test framework boilerplate from an existing test case.
> >
> > 'du' can't be used like this in qemu-iotests because it makes
> > assumptions that depend on the filesystem. A test case replicating what
> > Nir did manually would likely fail on XFS with its preallocation.
> >
> > Maybe we could operate on a file exposed by the FUSE export that is
> > backed by qcow2, and then you can use 'qemu-img map' on that qcow2 image
> > to verify the allocation status. Somewhat complicated, but I think it
> > could work.
>
> A simpler option would be to use 'du' but with a fuzzy range test,
> rather than an exact equality test.
>
> For the tests which write 1 MB, check the 'du' usage is "at least 1MB",
> for the tests which expect to unmap blocks, check that the 'du' usage
> is "less than 256kb". This should be within bounds of xfs speculative
> allocation.
>

This should work, I'll start with this approach.
Kevin Wolf June 27, 2024, 11:42 a.m. UTC | #10
Am 26.06.2024 um 18:27 hat Nir Soffer geschrieben:
> On Wed, Jun 26, 2024 at 12:17 PM Daniel P. Berrangé <berrange@redhat.com>
> wrote:
> 
> > On Mon, Jun 24, 2024 at 06:08:26PM +0200, Kevin Wolf wrote:
> > > Am 24.06.2024 um 17:23 hat Stefan Hajnoczi geschrieben:
> > > > On Wed, Jun 19, 2024 at 08:43:25PM +0300, Nir Soffer wrote:
> > > > > Tested using:
> > > >
> > > > Hi Nir,
> > > > This looks like a good candidate for the qemu-iotests test suite.
> > Adding
> > > > it to the automated tests will protect against future regressions.
> > > >
> > > > Please add the script and the expected output to
> > > > tests/qemu-iotests/test/write-zeroes-unmap and run it using
> > > > `(cd build && tests/qemu-iotests/check write-zeroes-unmap)`.
> > > >
> > > > See the existing test cases in tests/qemu-iotests/ and
> > > > tests/qemu-iotests/tests/ for examples. Some are shell scripts and
> > > > others are Python. I think shell makes sense for this test case. You
> > > > can copy the test framework boilerplate from an existing test case.
> > >
> > > 'du' can't be used like this in qemu-iotests because it makes
> > > assumptions that depend on the filesystem. A test case replicating what
> > > Nir did manually would likely fail on XFS with its preallocation.
> > >
> > > Maybe we could operate on a file exposed by the FUSE export that is
> > > backed by qcow2, and then you can use 'qemu-img map' on that qcow2 image
> > > to verify the allocation status. Somewhat complicated, but I think it
> > > could work.
> >
> > A simpler option would be to use 'du' but with a fuzzy range test,
> > rather than an exact equality test.
> >
> > For the tests which write 1 MB, check the 'du' usage is "at least 1MB",
> > for the tests which expect to unmap blocks, check that the 'du' usage
> > is "less than 256kb". This should be within bounds of xfs speculative
> > allocation.
> 
> This should work, I'll start with this approach.

If we're okay with accepting tests that depend on filesystem behaviour,
then 'qemu-img map -f raw --output=json' should be the less risky
approach than checking 'du'.

Kevin
Nir Soffer June 28, 2024, 7:51 p.m. UTC | #11
On Thu, Jun 27, 2024 at 2:42 PM Kevin Wolf <kwolf@redhat.com> wrote:

> Am 26.06.2024 um 18:27 hat Nir Soffer geschrieben:
> > On Wed, Jun 26, 2024 at 12:17 PM Daniel P. Berrangé <berrange@redhat.com
> >
> > wrote:
> >
> > > On Mon, Jun 24, 2024 at 06:08:26PM +0200, Kevin Wolf wrote:
> > > > Am 24.06.2024 um 17:23 hat Stefan Hajnoczi geschrieben:
> > > > > On Wed, Jun 19, 2024 at 08:43:25PM +0300, Nir Soffer wrote:
> > > > > > Tested using:
> > > > >
> > > > > Hi Nir,
> > > > > This looks like a good candidate for the qemu-iotests test suite.
> > > Adding
> > > > > it to the automated tests will protect against future regressions.
> > > > >
> > > > > Please add the script and the expected output to
> > > > > tests/qemu-iotests/test/write-zeroes-unmap and run it using
> > > > > `(cd build && tests/qemu-iotests/check write-zeroes-unmap)`.
> > > > >
> > > > > See the existing test cases in tests/qemu-iotests/ and
> > > > > tests/qemu-iotests/tests/ for examples. Some are shell scripts and
> > > > > others are Python. I think shell makes sense for this test case.
> You
> > > > > can copy the test framework boilerplate from an existing test case.
> > > >
> > > > 'du' can't be used like this in qemu-iotests because it makes
> > > > assumptions that depend on the filesystem. A test case replicating
> what
> > > > Nir did manually would likely fail on XFS with its preallocation.
> > > >
> > > > Maybe we could operate on a file exposed by the FUSE export that is
> > > > backed by qcow2, and then you can use 'qemu-img map' on that qcow2
> image
> > > > to verify the allocation status. Somewhat complicated, but I think it
> > > > could work.
> > >
> > > A simpler option would be to use 'du' but with a fuzzy range test,
> > > rather than an exact equality test.
> > >
> > > For the tests which write 1 MB, check the 'du' usage is "at least 1MB",
> > > for the tests which expect to unmap blocks, check that the 'du' usage
> > > is "less than 256kb". This should be within bounds of xfs speculative
> > > allocation.
> >
> > This should work, I'll start with this approach.
>
> If we're okay with accepting tests that depend on filesystem behaviour,
> then 'qemu-img map -f raw --output=json' should be the less risky
> approach than checking 'du'.
>

Unfortunately it does not work since qemu-img map and qemu-nbd reports the
allocated
area as zero area with no data.

I tried this:

$ cat test-print-allocation.sh
#!/bin/sh

qemu=${1:?Usage: $0 qemu-executable}
img=/tmp/qemu-test-unmap.img

echo
echo "discard=unmap - write zeroes"
fallocate -l 1m $img
echo -e 'qemu-io none0 "write -z 0 1m"\nquit' | $qemu -monitor stdio \
    -drive if=none,file=$img,format=raw,discard=unmap >/dev/null

echo "du:"
du -sh $img
echo "qemu-img map:"
qemu-img map -f raw --output json $img
echo "nbdinfo --map:"
nbdinfo --map -- [ qemu-nbd -r -f raw $img ]

echo
echo "discard=unmap - write zeroes unmap"
fallocate -l 1m $img
echo -e 'qemu-io none0 "write -zu 0 1m"\nquit' | $qemu -monitor stdio \
    -drive if=none,file=$img,format=raw,discard=unmap >/dev/null

echo "du:"
du -sh $img
echo "qemu-img map:"
qemu-img map -f raw --output json $img
echo "nbdinfo --map:"
nbdinfo --map -- [ qemu-nbd -r -f raw $img ]

rm $img


$ ./test-print-allocation.sh ./qemu-system-x86_64

discard=unmap - write zeroes
du:
1.0M /tmp/qemu-test-unmap.img
qemu-img map:
[{ "start": 0, "length": 1048576, "depth": 0, "present": true, "zero":
true, "data": false, "offset": 0}]
nbdinfo --map:
         0     1048576    3  hole,zero

discard=unmap - write zeroes unmap
du:
0 /tmp/qemu-test-unmap.img
qemu-img map:
[{ "start": 0, "length": 1048576, "depth": 0, "present": true, "zero":
true, "data": false, "offset": 0}]
nbdinfo --map:
         0     1048576    3  hole,zero
diff mbox series

Patch

diff --git a/block/io.c b/block/io.c
index 7217cf811b..301514c880 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1860,10 +1860,15 @@  bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
     /* By definition there is no user buffer so this flag doesn't make sense */
     if (flags & BDRV_REQ_REGISTERED_BUF) {
         return -EINVAL;
     }
 
+    /* If opened with discard=off we should never unmap. */
+    if (!(bs->open_flags & BDRV_O_UNMAP)) {
+        flags &= ~BDRV_REQ_MAY_UNMAP;
+    }
+
     /* Invalidate the cached block-status data range if this write overlaps */
     bdrv_bsc_invalidate_range(bs, offset, bytes);
 
     assert(alignment % bs->bl.request_alignment == 0);
     head = offset % alignment;
@@ -2313,14 +2318,10 @@  int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
 {
     IO_CODE();
     trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
     assert_bdrv_graph_readable();
 
-    if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
-        flags &= ~BDRV_REQ_MAY_UNMAP;
-    }
-
     return bdrv_co_pwritev(child, offset, bytes, NULL,
                            BDRV_REQ_ZERO_WRITE | flags);
 }
 
 /*