
[for,2.7,1/1] qcow2: improve qcow2_co_write_zeroes()

Message ID: 1461413127-2594-1-git-send-email-den@openvz.org

Commit Message

Denis V. Lunev April 23, 2016, 12:05 p.m. UTC
Unfortunately the Linux kernel can send non-aligned requests to qemu-nbd
if the caller uses O_DIRECT and does not align its in-memory data to the
page size. qemu-nbd then calls the block layer with non-aligned requests.

qcow2_co_write_zeroes() forcibly asks the caller to supply block-aligned
requests; otherwise it rejects them with ENOTSUP, which is properly
handled at the upper level. The problem is that this grows the image.

This could be optimized a bit:
- a particular request could be split into a block-aligned part and a
  head/tail, which could be handled separately
- writes could be omitted when we know that the image already contains
  zeroes at the offsets being written

Signed-off-by: Denis V. Lunev <den@openvz.org>
CC: Kevin Wolf <kwolf@redhat.com>
CC: Max Reitz <mreitz@redhat.com>
---
 block/qcow2.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 53 insertions(+), 5 deletions(-)

Comments

Kevin Wolf April 25, 2016, 9:05 a.m. UTC | #1
Am 23.04.2016 um 14:05 hat Denis V. Lunev geschrieben:
> Unfortunately Linux kernel could send non-aligned requests to qemu-nbd
> if the caller is using O_DIRECT and does not align in-memory data to
> page. Thus qemu-nbd will call block layer with non-aligned requests.
> 
> qcow2_co_write_zeroes forcibly asks the caller to supply block-aligned
> data. In the other case it rejects with ENOTSUP which is properly
> handled on the upper level. The problem is that this grows the image.
> 
> This could be optimized a bit:
> - particular request could be split to block aligned part and head/tail,
>   which could be handled separately

In fact, this is what bdrv_co_do_write_zeroes() is already supposed to
do. qcow2 exposes its cluster size as bs->bl.write_zeroes_alignment, so
block/io.c should split the request in three.

If you see something different happening, we may have a bug there.
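
For illustration, the kind of splitting bdrv_co_do_write_zeroes() is expected
to do can be sketched as below (simplified standalone code, not the actual
block/io.c implementation; "alignment" plays the role of
bs->bl.write_zeroes_alignment):

#include <stdint.h>

/* Illustrative only: split a zero-write into an unaligned head, an
 * aligned body and an unaligned tail, calling do_zero() for each part. */
typedef int (*zero_fn)(uint64_t sector, int nb_sectors, int aligned);

static int split_write_zeroes(uint64_t sector, int nb_sectors,
                              int alignment, zero_fn do_zero)
{
    int ret;

    if (sector % alignment) {                        /* unaligned head */
        int head = alignment - sector % alignment;
        if (head > nb_sectors) {
            head = nb_sectors;
        }
        if ((ret = do_zero(sector, head, 0)) < 0) {
            return ret;
        }
        sector += head;
        nb_sectors -= head;
    }

    int body = nb_sectors - nb_sectors % alignment;  /* aligned middle */
    if (body > 0 && (ret = do_zero(sector, body, 1)) < 0) {
        return ret;
    }
    sector += body;
    nb_sectors -= body;

    if (nb_sectors > 0) {                            /* unaligned tail */
        return do_zero(sector, nb_sectors, 0);
    }
    return 0;
}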

> - writes could be omitted when we do know that the image already contains
>   zeroes at the offsets being written

I don't think this is a valid shortcut. The semantics of a write_zeroes
operation is that the zeros (literal or as flags) are stored in this
layer and that the backing file isn't involved any more for the given
sectors. For example, a streaming operation or qemu-img rebase may
involve write_zeroes operations, and relying on the backing file would
cause corruption there (because the whole point of the operation is that
the backing file can be removed).

And to be honest, writing zero flags in memory to the cached L2 table is
an operation so quick that calling bdrv_get_block_status_above() might
actually end up slower in most cases than just setting the flag.
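
(For context, in the qcow2 v3 format bit 0 of an L2 table entry marks a
cluster that reads as all zeroes, so "setting the flag" boils down to a
single in-memory update of the cached entry, roughly as sketched below.
This is an illustration, not the actual qcow2 code.)

#include <stdint.h>

#define QCOW_OFLAG_ZERO (1ULL << 0)   /* "cluster reads as zeroes" */

/* Illustrative only: mark one cached L2 entry as a zero cluster.
 * Endianness handling and marking the L2 table dirty are omitted. */
static void mark_cluster_zero(uint64_t *l2_table, int l2_index)
{
    l2_table[l2_index] |= QCOW_OFLAG_ZERO;
}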

Kevin

> Signed-off-by: Denis V. Lunev <den@openvz.org>
> CC: Kevin Wolf <kwolf@redhat.com>
> CC: Max Reitz <mreitz@redhat.com>
> ---
>  block/qcow2.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 53 insertions(+), 5 deletions(-)
> 
> diff --git a/block/qcow2.c b/block/qcow2.c
> index 470734b..9bdaa15 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -2411,21 +2411,69 @@ finish:
>      return ret;
>  }
>  
> +static int write_zeroes_chunk(BlockDriverState *bs, int64_t sector_num, int nr)
> +{
> +    int ret, count;
> +    BlockDriverState *file;
> +    uint8_t *buf;
> +    struct iovec iov;
> +    QEMUIOVector local_qiov;
> +
> +    ret = bdrv_get_block_status_above(bs, NULL, sector_num, nr, &count, &file);
> +    if (ret > 0 && (ret & BDRV_BLOCK_ZERO) && count == nr) {
> +        /* Nothing to do. The area is zeroed already.
> +           Worth to check to avoid image expansion for non-aligned reqs. */
> +        return 0;
> +    }
> +
> +    buf = qemu_blockalign0(bs, nr << BDRV_SECTOR_BITS);
> +    iov = (struct iovec) {
> +        .iov_base   = buf,
> +        .iov_len    = nr << BDRV_SECTOR_BITS,
> +    };
> +    qemu_iovec_init_external(&local_qiov, &iov, 1);
> +
> +    return qcow2_co_writev(bs, sector_num, nr, &local_qiov);
> +}
> +
>  static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
>      int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
>  {
>      int ret;
>      BDRVQcow2State *s = bs->opaque;
>  
> -    /* Emulate misaligned zero writes */
> -    if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) {
> -        return -ENOTSUP;
> +    int nr = sector_num % s->cluster_sectors;
> +    if (nr != 0) {
> +        nr = MIN(s->cluster_sectors - nr, nb_sectors);
> +
> +        ret = write_zeroes_chunk(bs, sector_num, nr);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        sector_num += nr;
> +        nb_sectors -= nr;
> +        if (nb_sectors == 0) {
> +            return 0;
> +        }
> +    }
> +
> +    nr = nb_sectors % s->cluster_sectors;
> +    if (nr != 0) {
> +        ret = write_zeroes_chunk(bs, sector_num + nb_sectors - nr, nr);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        nb_sectors -= nr;
> +        if (nb_sectors == 0) {
> +            return 0;
> +        }
>      }
>  
>      /* Whatever is left can use real zero clusters */
>      qemu_co_mutex_lock(&s->lock);
> -    ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS,
> -        nb_sectors);
> +    ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, nb_sectors);
>      qemu_co_mutex_unlock(&s->lock);
>  
>      return ret;
> -- 
> 2.5.0
>
Denis V. Lunev April 25, 2016, 10:20 a.m. UTC | #2
On 04/25/2016 12:05 PM, Kevin Wolf wrote:
> Am 23.04.2016 um 14:05 hat Denis V. Lunev geschrieben:
>> Unfortunately Linux kernel could send non-aligned requests to qemu-nbd
>> if the caller is using O_DIRECT and does not align in-memory data to
>> page. Thus qemu-nbd will call block layer with non-aligned requests.
>>
>> qcow2_co_write_zeroes forcibly asks the caller to supply block-aligned
>> data. In the other case it rejects with ENOTSUP which is properly
>> handled on the upper level. The problem is that this grows the image.
>>
>> This could be optimized a bit:
>> - particular request could be split to block aligned part and head/tail,
>>    which could be handled separately
> In fact, this is what bdrv_co_do_write_zeroes() is already supposed to
> do. qcow2 exposes its cluster size as bs->bl.write_zeroes_alignment, so
> block/io.c should split the request in three.
>
> If you see something different happening, we may have a bug there.
>
Pls look to the commit

commit 459b4e66129d091a11e9886ecc15a8bf9f7f3d92
Author: Denis V. Lunev <den@openvz.org>
Date:   Tue May 12 17:30:56 2015 +0300

     block: align bounce buffers to page

The situation is exactly like the one described there. The user
of /dev/nbd0 writes with O_DIRECT and has buffers that are not
aligned to the page size. Thus the real operations at the qemu-nbd
layer become unaligned to the block size.

Thus bdrv_co_do_write_zeroes is helpless here, unfortunately.
We hit this problem with 3rd-party software performing
restoration from a backup.

The problem is 100% reproducible. The following sequence
is performed:

#define _GNU_SOURCE

#include <stdio.h>
#include <malloc.h>
#include <string.h>
#include <sys/fcntl.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
     char *buf;
     int fd;

     if (argc != 2) {
         return -1;
     }

     fd = open(argv[1], O_WRONLY | O_DIRECT);

     /* retry until the buffer is 512-byte aligned but not page aligned */
     do {
         buf = memalign(512, 1024 * 1024);
     } while (!((unsigned long)buf & (4096 - 1)));
     memset(buf, 0, 1024 * 1024);
     write(fd, buf, 1024 * 1024);
     return 0;
}

This program is compiled as a.out.

Before the patch:
hades ~/src/qemu $ qemu-img create -f qcow2 test.qcow2 64G
Formatting 'test.qcow2', fmt=qcow2 size=68719476736 encryption=off 
cluster_size=65536 lazy_refcounts=off refcount_bits=16
hades ~/src/qemu $ sudo ./qemu-nbd  --connect=/dev/nbd3 test.qcow2 
--detect-zeroes=on --aio=native --cache=none
hades ~/src/qemu $ sudo ./a.out /dev/nbd3
hades ~/src/qemu $ ls -als test.qcow2
772 -rw-r--r-- 1 den den 851968 Apr 25 12:48 test.qcow2
hades ~/src/qemu $

After the patch:
hades ~/src/qemu $ qemu-img create -f qcow2 test.qcow2 64G
Formatting 'test.qcow2', fmt=qcow2 size=68719476736 encryption=off 
cluster_size=65536 lazy_refcounts=off refcount_bits=16
hades ~/src/qemu $ sudo ./qemu-nbd  --connect=/dev/nbd3 test.qcow2 
--detect-zeroes=on --aio=native --cache=none
hades ~/src/qemu $ sudo ./a.out /dev/nbd3
hades ~/src/qemu $ ls -als test.qcow2
260 -rw-r--r-- 1 den den 327680 Apr 25 12:50 test.qcow2
hades ~/src/qemu $




>> - writes could be omitted when we do know that the image already contains
>>    zeroes at the offsets being written
> I don't think this is a valid shortcut. The semantics of a write_zeroes
> operation is that the zeros (literal or as flags) are stored in this
> layer and that the backing file isn't involved any more for the given
> sectors. For example, a streaming operation or qemu-img rebase may
> involve write_zeroes operations, and relying on the backing file would
> cause corruption there (because the whole point of the operation is that
> the backing file can be removed).
This is not a problem. The block will be absent and thus it will be
read as zeroes.


> And to be honest, writing zero flags in memory to the cached L2 table is
> an operation so quick that calling bdrv_get_block_status_above() might
> actually end up slower in most cases than just setting the flag.
The main fast path is not touched. bdrv_get_block_status_above() is
called only for the non-aligned parts of the operation.

Den
Eric Blake April 25, 2016, 7:35 p.m. UTC | #3
On 04/25/2016 04:20 AM, Denis V. Lunev wrote:
> On 04/25/2016 12:05 PM, Kevin Wolf wrote:
>> Am 23.04.2016 um 14:05 hat Denis V. Lunev geschrieben:
>>> Unfortunately Linux kernel could send non-aligned requests to qemu-nbd
>>> if the caller is using O_DIRECT and does not align in-memory data to
>>> page. Thus qemu-nbd will call block layer with non-aligned requests.

At first glance, I'd argue that any caller using O_DIRECT without
obeying memory alignment restrictions is broken; why should qemu have to
work around such broken callers?


> The program is 100% reproducible. The following sequence
> is performed:
> 
> #define _GNU_SOURCE
> 
> #include <stdio.h>
> #include <malloc.h>
> #include <string.h>
> #include <sys/fcntl.h>
> #include <unistd.h>
> 
> int main(int argc, char *argv[])
> {
>     char *buf;
>     int fd;
> 
>     if (argc != 2) {
>         return -1;
>     }
> 
>     fd = open(argv[1], O_WRONLY | O_DIRECT);
> 
>     do {
>         buf = memalign(512, 1024 * 1024);
>     } while (!(unsigned long)buf & (4096 - 1));

In other words, you are INTENTIONALLY grabbing an unaligned buffer for
use with an O_DIRECT fd, when O_DIRECT demands that you should be using
at least page alignment (4096 or greater).  Arguably, the bug is in your
program, not qemu.

That said, teaching qemu to split up a write_zeroes request into head,
tail, and aligned body, so at least the aligned part might benefit from
optimizations, seems like it might be worthwhile, particularly since my
pending NBD series changed from blk_write_zeroes (cluster-aligned) to
blk_pwrite_zeroes (byte-aligned), and it is conceivable that we  can
encounter a situation without O_DIRECT in the picture at all, where our
NBD server is connected to a client that specifically asks for the new
NBD_CMD_WRITE_ZEROES on any arbitrary byte alignment.

>     memset(buf, 0, 1024 * 1024);
>     write(fd, buf, 1024 * 1024);
>     return 0;
> }
> 
> This program is compiled as a.out.
> 
> Before the patch:
> hades ~/src/qemu $ qemu-img create -f qcow2 test.qcow2 64G
> Formatting 'test.qcow2', fmt=qcow2 size=68719476736 encryption=off
> cluster_size=65536 lazy_refcounts=off refcount_bits=16
> hades ~/src/qemu $ sudo ./qemu-nbd  --connect=/dev/nbd3 test.qcow2
> --detect-zeroes=on --aio=native --cache=none
> hades ~/src/qemu $ sudo ./a.out /dev/nbd3

Here, I'd argue that the kernel NBD module is buggy, for allowing a
user-space app to pass an unaligned buffer to an O_DIRECT fd.  But
that's outside the realm of qemu code.

But again, per my argument that you don't have to involve the kernel nor
O_DIRECT to be able to write a client that can attempt to cause an NBD
server to do unaligned operations, we can use this kernel bug as an
easier way to test any proposed fix to the qemu side of things, whether
or not the kernel module gets tightened in behavior down the road.
Denis V. Lunev April 25, 2016, 9 p.m. UTC | #4
On 04/25/2016 10:35 PM, Eric Blake wrote:
> On 04/25/2016 04:20 AM, Denis V. Lunev wrote:
>> On 04/25/2016 12:05 PM, Kevin Wolf wrote:
>>> Am 23.04.2016 um 14:05 hat Denis V. Lunev geschrieben:
>>>> Unfortunately Linux kernel could send non-aligned requests to qemu-nbd
>>>> if the caller is using O_DIRECT and does not align in-memory data to
>>>> page. Thus qemu-nbd will call block layer with non-aligned requests.
> At first glance, I'd argue that any caller using O_DIRECT without
> obeying memory alignment restrictions is broken; why should qemu have to
> work around such broken callers?
>
The memory alignment requirements are followed. The minimal requirement
fixed in the kernel API is 512 bytes. The broken part here is the kernel,
as we agreed a year ago, which splits requests in a
wrong way.

Unfortunately the 3.10 RHEL7-based kernel misbehaves this
way.

You may remember the situation a year ago: QEMU behaved in
exactly the same way. Usually people start probing memory
alignment for O_DIRECT writes at 512 bytes and the kernel
accepts that. This situation is VERY common :(

>> The program is 100% reproducible. The following sequence
>> is performed:
>>
>> #define _GNU_SOURCE
>>
>> #include <stdio.h>
>> #include <malloc.h>
>> #include <string.h>
>> #include <sys/fcntl.h>
>> #include <unistd.h>
>>
>> int main(int argc, char *argv[])
>> {
>>      char *buf;
>>      int fd;
>>
>>      if (argc != 2) {
>>          return -1;
>>      }
>>
>>      fd = open(argv[1], O_WRONLY | O_DIRECT);
>>
>>      do {
>>          buf = memalign(512, 1024 * 1024);
>>      } while (!(unsigned long)buf & (4096 - 1));
> In other words, you are INTENTIONALLY grabbing an unaligned buffer for
> use with an O_DIRECT fd, when O_DIRECT demands that you should be using
> at least page alignment (4096 or greater).  Arguably, the bug is in your
> program, not qemu.
Yes, as THIS IS THE TEST to reproduce the problem
observed in the 3rd party software. I have spent
some time narrowing it down to this.


> That said, teaching qemu to split up a write_zeroes request into head,
> tail, and aligned body, so at least the aligned part might benefit from
> optimizations, seems like it might be worthwhile, particularly since my
> pending NBD series changed from blk_write_zeroes (cluster-aligned) to
> blk_pwrite_zeroes (byte-aligned), and it is conceivable that we  can
> encounter a situation without O_DIRECT in the picture at all, where our
> NBD server is connected to a client that specifically asks for the new
> NBD_CMD_WRITE_ZEROES on any arbitrary byte alignment.
Actually the NBD server is just an innocent party which transparently
processes the requests coming to it.

What happens if we write 4k into the middle of a zeroed
block with the old code? The system will allocate 64k and write it
out. This code will skip that write entirely. Thus the optimization
has some merit on its own.


>>      memset(buf, 0, 1024 * 1024);
>>      write(fd, buf, 1024 * 1024);
>>      return 0;
>> }
>>
>> This program is compiled as a.out.
>>
>> Before the patch:
>> hades ~/src/qemu $ qemu-img create -f qcow2 test.qcow2 64G
>> Formatting 'test.qcow2', fmt=qcow2 size=68719476736 encryption=off
>> cluster_size=65536 lazy_refcounts=off refcount_bits=16
>> hades ~/src/qemu $ sudo ./qemu-nbd  --connect=/dev/nbd3 test.qcow2
>> --detect-zeroes=on --aio=native --cache=none
>> hades ~/src/qemu $ sudo ./a.out /dev/nbd3
> Here, I'd argue that the kernel NBD module is buggy, for allowing a
> user-space app to pass an unaligned buffer to an O_DIRECT fd.  But
> that's outside the realm of qemu code.
No, the victim is the GENERIC block layer code in the kernel.
You can look at the original report in commit 459b4e661:

"The patch changes default bounce buffer optimal alignment to
     MAX(page size, 4k). 4k is chosen as maximal known sector size on real
     HDD.

     The justification of the performance improve is quite interesting.
     From the kernel point of view each request to the disk was split
     by two. This could be seen by blktrace like this:
       9,0   11  1     0.000000000 11151  Q  WS 312737792 + 1023 [qemu-img]
       9,0   11  2     0.000007938 11151  Q  WS 312738815 + 8 [qemu-img]
       9,0   11  3     0.000030735 11151  Q  WS 312738823 + 1016 [qemu-img]
       9,0   11  4     0.000032482 11151  Q  WS 312739839 + 8 [qemu-img]
       9,0   11  5     0.000041379 11151  Q  WS 312739847 + 1016 [qemu-img]
       9,0   11  6     0.000042818 11151  Q  WS 312740863 + 8 [qemu-img]
       9,0   11  7     0.000051236 11151  Q  WS 312740871 + 1017 [qemu-img]
       9,0    5  1     0.169071519 11151  Q  WS 312741888 + 1023 [qemu-img]
     After the patch the pattern becomes normal:
       9,0    6  1     0.000000000 12422  Q  WS 314834944 + 1024 [qemu-img]
       9,0    6  2     0.000038527 12422  Q  WS 314835968 + 1024 [qemu-img]
       9,0    6  3     0.000072849 12422  Q  WS 314836992 + 1024 [qemu-img]
       9,0    6  4     0.000106276 12422  Q  WS 314838016 + 1024 [qemu-img]"

The report was made against the ext4 and xfs filesystems. Once again,
the problem is in the block layer of the kernel itself.


> But again, per my argument that you don't have to involve the kernel nor
> O_DIRECT to be able to write a client that can attempt to cause an NBD
> server to do unaligned operations, we can use this kernel bug as an
> easier way to test any proposed fix to the qemu side of things, whether
> or not the kernel module gets tightened in behavior down the road.
>
Den
Kevin Wolf April 26, 2016, 8:23 a.m. UTC | #5
Am 25.04.2016 um 12:20 hat Denis V. Lunev geschrieben:
> On 04/25/2016 12:05 PM, Kevin Wolf wrote:
> >Am 23.04.2016 um 14:05 hat Denis V. Lunev geschrieben:
> >>Unfortunately Linux kernel could send non-aligned requests to qemu-nbd
> >>if the caller is using O_DIRECT and does not align in-memory data to
> >>page. Thus qemu-nbd will call block layer with non-aligned requests.
> >>
> >>qcow2_co_write_zeroes forcibly asks the caller to supply block-aligned
> >>data. In the other case it rejects with ENOTSUP which is properly
> >>handled on the upper level. The problem is that this grows the image.
> >>
> >>This could be optimized a bit:
> >>- particular request could be split to block aligned part and head/tail,
> >>   which could be handled separately
> >In fact, this is what bdrv_co_do_write_zeroes() is already supposed to
> >do. qcow2 exposes its cluster size as bs->bl.write_zeroes_alignment, so
> >block/io.c should split the request in three.
> >
> >If you see something different happening, we may have a bug there.
> >
> Pls look to the commit
> 
> commit 459b4e66129d091a11e9886ecc15a8bf9f7f3d92
> Author: Denis V. Lunev <den@openvz.org>
> Date:   Tue May 12 17:30:56 2015 +0300
> 
>     block: align bounce buffers to page
> 
> The situation is exactly like the described there. The user
> of the /dev/nbd0 writes with O_DIRECT and has unaligned
> to page buffers. Thus real operations on qemu-nbd
> layer becomes unaligned to block size.

I don't understand the connection to this patch. Unaligned buffers on
the NBD client shouldn't even be visible in the server, unless they
already result in the client requesting different things. If so, what is
the difference in the NBD requests? And can we reproduce the same
locally with qemu-io and no NBD involved?

> Thus bdrv_co_do_write_zeroes is helpless here unfortunately.

How can qcow2 fix something that bdrv_co_do_write_zeroes() can't
possibly fix? In particular, why does splitting the request in head,
tail and aligned part help when done by qcow2, but the same thing
doesn't help when done by bdrv_co_do_write_zeroes()?

I'd actually be interested in both parts of the answer, because I'm not
sure how _memory_ alignment on the client can possibly be fixed in
qcow2; but if it's about _disk_ alignment, I don't understand why it
can't be fixed in bdrv_co_do_write_zeroes().

> >>- writes could be omitted when we do know that the image already contains
> >>   zeroes at the offsets being written
> >I don't think this is a valid shortcut. The semantics of a write_zeroes
> >operation is that the zeros (literal or as flags) are stored in this
> >layer and that the backing file isn't involved any more for the given
> >sectors. For example, a streaming operation or qemu-img rebase may
> >involve write_zeroes operations, and relying on the backing file would
> >cause corruption there (because the whole point of the operation is that
> >the backing file can be removed).
> this is not a problem. The block will be abscent and thus it will be
> read as zeroes.

Removing a backing file doesn't mean that there won't still be another
backing file. You may have only removed one node in the backing file
chain, or in the case of rebase, you switch to another backing file.

Kevin
Denis V. Lunev April 26, 2016, 9:35 a.m. UTC | #6
On 04/26/2016 11:23 AM, Kevin Wolf wrote:
> Am 25.04.2016 um 12:20 hat Denis V. Lunev geschrieben:
>> On 04/25/2016 12:05 PM, Kevin Wolf wrote:
>>> Am 23.04.2016 um 14:05 hat Denis V. Lunev geschrieben:
>>>> Unfortunately Linux kernel could send non-aligned requests to qemu-nbd
>>>> if the caller is using O_DIRECT and does not align in-memory data to
>>>> page. Thus qemu-nbd will call block layer with non-aligned requests.
>>>>
>>>> qcow2_co_write_zeroes forcibly asks the caller to supply block-aligned
>>>> data. In the other case it rejects with ENOTSUP which is properly
>>>> handled on the upper level. The problem is that this grows the image.
>>>>
>>>> This could be optimized a bit:
>>>> - particular request could be split to block aligned part and head/tail,
>>>>    which could be handled separately
>>> In fact, this is what bdrv_co_do_write_zeroes() is already supposed to
>>> do. qcow2 exposes its cluster size as bs->bl.write_zeroes_alignment, so
>>> block/io.c should split the request in three.
>>>
>>> If you see something different happening, we may have a bug there.
>>>
>> Pls look to the commit
>>
>> commit 459b4e66129d091a11e9886ecc15a8bf9f7f3d92
>> Author: Denis V. Lunev<den@openvz.org>
>> Date:   Tue May 12 17:30:56 2015 +0300
>>
>>      block: align bounce buffers to page
>>
>> The situation is exactly like the described there. The user
>> of the /dev/nbd0 writes with O_DIRECT and has unaligned
>> to page buffers. Thus real operations on qemu-nbd
>> layer becomes unaligned to block size.
> I don't understand the connection to this patch. Unaligned buffers on
> the NBD client shouldn't even be visible in the server, unless they
> already result in the client requesting different things. If so, what is
> the difference in the NBD requests? And can we reproduce the same
> locally with qemu-io and no NBD involved?

The NBD device is mapped by the kernel as /dev/nbd0.
The program opens the descriptor to /dev/nbd0 with
O_DIRECT and performs a write of 1MB at offset 0 of
the device.

There are 2 cases:
(1) the program's buffer is aligned to 512 bytes
(2) the program's buffer is aligned to 4096 bytes

The kernel splits the writes DIFFERENTLY for the above cases
before passing them to the elevator.

In case (2) the request is split into 256KB chunks, so we
get 4 requests of 256KB each at offsets 0, 256KB, 512KB
and 768KB. In this case the NBD and QCOW2 drivers behave
fine.

In case (1) the kernel splits the requests in a very lame
way. For each 256KB chunk several requests are actually
generated, like this:

       9,0   11  1     0.000000000 11151  Q  WS 312737792 + 1023 [qemu-img]
       9,0   11  2     0.000007938 11151  Q  WS 312738815 + 8 [qemu-img]
       9,0   11  3     0.000030735 11151  Q  WS 312738823 + 1016 [qemu-img]
       9,0   11  4     0.000032482 11151  Q  WS 312739839 + 8 [qemu-img]
       9,0   11  5     0.000041379 11151  Q  WS 312739847 + 1016 [qemu-img]
       9,0   11  6     0.000042818 11151  Q  WS 312740863 + 8 [qemu-img]
       9,0   11  7     0.000051236 11151  Q  WS 312740871 + 1017 [qemu-img]
       9,0    5  1     0.169071519 11151  Q  WS 312741888 + 1023 [qemu-img]

These requests are passed from the kernel VFS to the kernel
NBD client. Thus we get requests like this in the NBD client
and subsequently in QEMU (offsets in bytes):
      0..261632 (256k - 512)
      261632..261632
      etc
This is how request splitting works in the VFS :( and this is a
problem which cannot be fixed easily.

Locally with qemu-io the reproduction is simple. We can repeat the above
requests or simply do the following:
     qemu-io -c "write 0xff 32k 1M" 1.img
The code without the patch will allocate 2 blocks, for guest offsets
0-64k and 1M-(1M+64k), and perform writes there. The code with
the patch will skip the creation of these blocks if possible.
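
(As a purely illustrative aside: the two clusters touched by the unaligned
head and tail of that request can be computed with the small standalone
program below; it is made up for this example and is not QEMU code.)

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main(void)
{
    const uint64_t cluster = 64 * 1024;           /* qcow2 cluster size */
    uint64_t off = 32 * 1024, len = 1024 * 1024;  /* the request above  */
    uint64_t end = off + len;

    uint64_t head = (cluster - off % cluster) % cluster;  /* unaligned head */
    uint64_t tail = end % cluster;                        /* unaligned tail */

    printf("head: %" PRIu64 " bytes in the cluster at %" PRIu64 "\n",
           head, off - off % cluster);
    printf("aligned body: [%" PRIu64 ", %" PRIu64 ")\n",
           off + head, end - tail);
    printf("tail: %" PRIu64 " bytes in the cluster at %" PRIu64 "\n",
           tail, end - tail);
    return 0;
}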

I have recorded the parameters passed to qcow2_co_write_zeroes for
reference (1 MB is written, memory is not aligned, as in the program
sent in the letter above, [sudo ./a.out /dev/nbd3]):

qcow2_co_write_zeroes off=0 size=10000
qcow2_co_write_zeroes off=1fe00 size=200
qcow2_co_write_zeroes off=3fe00 size=200
qcow2_co_write_zeroes off=5fe00 size=200
qcow2_co_write_zeroes off=7fe00 size=200
qcow2_co_write_zeroes off=9fe00 size=200
qcow2_co_write_zeroes off=bfe00 size=200
qcow2_co_write_zeroes off=dfe00 size=200
qcow2_co_write_zeroes off=ffe00 size=200
qcow2_co_write_zeroes off=10000 size=fe00
qcow2_co_write_zeroes off=20000 size=10000
qcow2_co_write_zeroes off=30000 size=fe00
qcow2_co_write_zeroes off=60000 size=10000
qcow2_co_write_zeroes off=70000 size=fe00
qcow2_co_write_zeroes off=c0000 size=10000
qcow2_co_write_zeroes off=d0000 size=fe00
qcow2_co_write_zeroes off=e0000 size=10000
qcow2_co_write_zeroes off=f0000 size=fe00
qcow2_co_write_zeroes off=80000 size=10000
qcow2_co_write_zeroes off=90000 size=fe00
qcow2_co_write_zeroes off=a0000 size=10000
qcow2_co_write_zeroes off=b0000 size=fe00
qcow2_co_write_zeroes off=40000 size=10000
qcow2_co_write_zeroes off=50000 size=fe00


>> Thus bdrv_co_do_write_zeroes is helpless here unfortunately.
> How can qcow2 fix something that bdrv_co_do_write_zeroes() can't
> possibly fix?
Yes. We are writing zeroes. If the block is not allocated, we can
skip the operation entirely, as long as there is no backing file or
there is no block at this guest offset in the entire backing chain.

>   In particular, why does splitting the request in head,
> tail and aligned part help when done by qcow2, but the same thing
> doesn't help when done by bdrv_co_do_write_zeroes()?
The operation is skipped, as you can see. Maybe we could
just return -ENOTSUP if the block is allocated, to allow the upper
level to do the work. Something like:

+static int write_zeroes_chunk(BlockDriverState *bs, int64_t sector_num, int nr)
+{
+    int ret, count;
+    BlockDriverState *file;
+
+    ret = bdrv_get_block_status_above(bs, NULL, sector_num, nr, &count, &file);
+    if (ret > 0 && (ret & BDRV_BLOCK_ZERO) && count == nr) {
+        /* Nothing to do. The area is zeroed already.
+           Worth to check to avoid image expansion for non-aligned reqs. */
+        return 0;
+    }
+    return -ENOTSUP;
+}



> I'd actually be interested in both parts of the answer, because I'm not
> sure how _memory_ alignment on the client can possibly be fixed in
> qcow2; but if it's about _disk_ alignment, I don't understand why it
> can't be fixed in bdrv_co_do_write_zeroes().
The question is: why write zeroes if we know that we will read
zeroes on the next read? We could skip this write. This is the
idea, see above.

>>>> - writes could be omitted when we do know that the image already contains
>>>>    zeroes at the offsets being written
>>> I don't think this is a valid shortcut. The semantics of a write_zeroes
>>> operation is that the zeros (literal or as flags) are stored in this
>>> layer and that the backing file isn't involved any more for the given
>>> sectors. For example, a streaming operation or qemu-img rebase may
>>> involve write_zeroes operations, and relying on the backing file would
>>> cause corruption there (because the whole point of the operation is that
>>> the backing file can be removed).
>> this is not a problem. The block will be abscent and thus it will be
>> read as zeroes.
> Removing a backing file doesn't mean that there won't still be another
> backing file. You may have only removed one node in the backing file
> chain, or in the case of rebase, you switch to another backing file.
Hmmm... we are on tricky ground. We read zeroes now but may not
read zeroes on the next attempt, especially if the backing chain is changed.

Den
Kevin Wolf April 26, 2016, 10:19 a.m. UTC | #7
Am 26.04.2016 um 11:35 hat Denis V. Lunev geschrieben:
> On 04/26/2016 11:23 AM, Kevin Wolf wrote:
> >Am 25.04.2016 um 12:20 hat Denis V. Lunev geschrieben:
> >>On 04/25/2016 12:05 PM, Kevin Wolf wrote:
> >>>Am 23.04.2016 um 14:05 hat Denis V. Lunev geschrieben:
> >>>>Unfortunately Linux kernel could send non-aligned requests to qemu-nbd
> >>>>if the caller is using O_DIRECT and does not align in-memory data to
> >>>>page. Thus qemu-nbd will call block layer with non-aligned requests.
> >>>>
> >>>>qcow2_co_write_zeroes forcibly asks the caller to supply block-aligned
> >>>>data. In the other case it rejects with ENOTSUP which is properly
> >>>>handled on the upper level. The problem is that this grows the image.
> >>>>
> >>>>This could be optimized a bit:
> >>>>- particular request could be split to block aligned part and head/tail,
> >>>>   which could be handled separately
> >>>In fact, this is what bdrv_co_do_write_zeroes() is already supposed to
> >>>do. qcow2 exposes its cluster size as bs->bl.write_zeroes_alignment, so
> >>>block/io.c should split the request in three.
> >>>
> >>>If you see something different happening, we may have a bug there.
> >>>
> >>Pls look to the commit
> >>
> >>commit 459b4e66129d091a11e9886ecc15a8bf9f7f3d92
> >>Author: Denis V. Lunev<den@openvz.org>
> >>Date:   Tue May 12 17:30:56 2015 +0300
> >>
> >>     block: align bounce buffers to page
> >>
> >>The situation is exactly like the described there. The user
> >>of the /dev/nbd0 writes with O_DIRECT and has unaligned
> >>to page buffers. Thus real operations on qemu-nbd
> >>layer becomes unaligned to block size.
> >I don't understand the connection to this patch. Unaligned buffers on
> >the NBD client shouldn't even be visible in the server, unless they
> >already result in the client requesting different things. If so, what is
> >the difference in the NBD requests? And can we reproduce the same
> >locally with qemu-io and no NBD involved?
> 
> NBD device is mapped by the kernel as /dev/nbd0
> The descriptor to /dev/nbd0 is opened with O_DIRECT
> by the program. The program performs write
> of 1MB at the offset 0 of the device.
> 
> There are 2 cases:
> (1) the program has buffer aligned to 512 bytes
> (2) the program has buffer aligned to 4096
> 
> The kernel splits writes before passing it to elevator
> DIFFERENTLY for above cases.
> 
> In the case (2) the request is split by 256 KB chunks .
> For the case we will have 4 requests 256Kb each with
> offsets 0, 256Kb, 512Kb, 768Kb. In this case NBD and
> QCOW2 driver behaves fine.
> 
> In the case (1) the kernel split packets in a very lame
> way. For each 256Kb chunk actually several requests
> are generated like this:
> 
>       9,0   11  1     0.000000000 11151  Q  WS 312737792 + 1023 [qemu-img]
>       9,0   11  2     0.000007938 11151  Q  WS 312738815 + 8 [qemu-img]
>       9,0   11  3     0.000030735 11151  Q  WS 312738823 + 1016 [qemu-img]
>       9,0   11  4     0.000032482 11151  Q  WS 312739839 + 8 [qemu-img]
>       9,0   11  5     0.000041379 11151  Q  WS 312739847 + 1016 [qemu-img]
>       9,0   11  6     0.000042818 11151  Q  WS 312740863 + 8 [qemu-img]
>       9,0   11  7     0.000051236 11151  Q  WS 312740871 + 1017 [qemu-img]
>       9,0    5  1     0.169071519 11151  Q  WS 312741888 + 1023 [qemu-img]
> 
> These requests will be passed from kernel VFS to kernel
> NBD client. Thus we will have requests like this in NBD client
> and subsequently in QEMU (offsets in bytes)
>      0..261632 (256k - 512)
>      261632..261632
>      etc
> This is how requests splitting is working in the VFS :( and this is the
> problem which can not be fixed easily.

Did you ever talk to the kernel people?

We can try to make the best out of suboptimal requests in qemu, but it
looks to me as if the real fix was in the kernel, and if we don't get it
fixed there, we'll see more and more of this kind of problems. I think
this is relevant not only for VMs, but probably on real hardware as
well.

> Locally with QEMU-IO the reproduction is simple. We can repeat above
> requests or could simple do the following:
>     qemu-io -c "write 0xff 32k 1M" 1.img

I assume you mean "-z" instead of "0xff"?

> The code without the patch will allocate 2 blocks for guest offsets
> 0-64k and 1M-(1M+64k) and performs writes there. The code with
> the patch will skip creation of blocks if possible.

Okay, that's the second of the optimisations you mentioned in your
commit message. I can see how this adds something that the generic block
layer can't easily add, if it can be made safe (I think it can, even
though your patch doesn't get it completely right yet, see below).

> I have recorded parameters in qcow2_co_do_write_zeroes for the
> reference (1 MB is written, memory is not aligned as sent in the
> letter above, [sudo ./a.out /dev/nbd3]):
> 
> qcow2_co_write_zeroes off=0 size=10000
> qcow2_co_write_zeroes off=1fe00 size=200
> qcow2_co_write_zeroes off=3fe00 size=200
> qcow2_co_write_zeroes off=5fe00 size=200
> qcow2_co_write_zeroes off=7fe00 size=200
> qcow2_co_write_zeroes off=9fe00 size=200
> qcow2_co_write_zeroes off=bfe00 size=200
> qcow2_co_write_zeroes off=dfe00 size=200
> qcow2_co_write_zeroes off=ffe00 size=200
> qcow2_co_write_zeroes off=10000 size=fe00
> qcow2_co_write_zeroes off=20000 size=10000
> qcow2_co_write_zeroes off=30000 size=fe00
> qcow2_co_write_zeroes off=60000 size=10000
> qcow2_co_write_zeroes off=70000 size=fe00
> qcow2_co_write_zeroes off=c0000 size=10000
> qcow2_co_write_zeroes off=d0000 size=fe00
> qcow2_co_write_zeroes off=e0000 size=10000
> qcow2_co_write_zeroes off=f0000 size=fe00
> qcow2_co_write_zeroes off=80000 size=10000
> qcow2_co_write_zeroes off=90000 size=fe00
> qcow2_co_write_zeroes off=a0000 size=10000
> qcow2_co_write_zeroes off=b0000 size=fe00
> qcow2_co_write_zeroes off=40000 size=10000
> qcow2_co_write_zeroes off=50000 size=fe00

I don't see any requests here where your code actually ends up splitting
the request into head, aligned part and tail. Which is expected because
bdrv_co_do_write_zeroes() already does that.

What I can't see here is whether this actually happened (10000 + fe00
could be a split request) or whether it already came in this way over
NBD.

> >>Thus bdrv_co_do_write_zeroes is helpless here unfortunately.
> >How can qcow2 fix something that bdrv_co_do_write_zeroes() can't
> >possibly fix?
> Yes. We are writing zeroes. If the block is not allocated - we could
> skip the operation entirely as soon as there is no backing file or
> there is no block at this guest offset in entire backing chain.
> 
> >  In particular, why does splitting the request in head,
> >tail and aligned part help when done by qcow2, but the same thing
> >doesn't help when done by bdrv_co_do_write_zeroes()?
> The operation is skipped as far as you could see. May be we could
> just return ENOTSUP if the block is allocated to allow upper
> level to work. Something like
> 
> +static int write_zeroes_chunk(BlockDriverState *bs, int64_t sector_num, int nr)
> +{
> +    int ret, count;
> +    BlockDriverState *file;
> +
> +    ret = bdrv_get_block_status_above(bs, NULL, sector_num, nr, &count, &file);
> +    if (ret > 0 && (ret & BDRV_BLOCK_ZERO) && count == nr) {
> +        /* Nothing to do. The area is zeroed already.
> +           Worth to check to avoid image expansion for non-aligned reqs. */
> +        return 0;
> +    }
> +    return -ENOTSUP;
> +}

This is getting closer, but still has the same problems I mentioned.

> >I'd actually be interested in both parts of the answer, because I'm not
> >sure how _memory_ alignment on the client can possibly be fixed in
> >qcow2; but if it's about _disk_ alignment, I don't understand why it
> >can't be fixed in bdrv_co_do_write_zeroes().
> The question is "why to write zeroes if we know that we will read
> zeroes on the next attempt?" We could skip this write. This is the
> idea, see above.
> 
> >>>>- writes could be omitted when we do know that the image already contains
> >>>>   zeroes at the offsets being written
> >>>I don't think this is a valid shortcut. The semantics of a write_zeroes
> >>>operation is that the zeros (literal or as flags) are stored in this
> >>>layer and that the backing file isn't involved any more for the given
> >>>sectors. For example, a streaming operation or qemu-img rebase may
> >>>involve write_zeroes operations, and relying on the backing file would
> >>>cause corruption there (because the whole point of the operation is that
> >>>the backing file can be removed).
> >>this is not a problem. The block will be abscent and thus it will be
> >>read as zeroes.
> >Removing a backing file doesn't mean that there won't still be another
> >backing file. You may have only removed one node in the backing file
> >chain, or in the case of rebase, you switch to another backing file.
> hmmm.... We are on a tricky ground. We have read zeroes but can not
> read zeroes on a next attempt especially if backing chain is changed.

Here is the trick that I think will save us:

On a misaligned call, we call bdrv_get_block_status_above() for the
whole cluster that we're in. We know that it's only a single cluster
because bdrv_co_do_write_zeroes() splits things this way; only aligned
requests can be longer than a sector (we can even assert this).

If the result is that the cluster already reads as zero, instead of
doing nothing and possibly breaking backing chain manipulations, we
simply extend the write zeroes operation to the whole cluster and
continue as normal with an aligned request. This way we end up with a
zero cluster instead of an unallocated one, and that should be safe.

If the result is that the cluster isn't completed zeroed, return
-ENOTSUP as you did in the snippet above.

That approach should probably result in an (even) simpler patch, too.
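
A rough sketch of that idea, reusing the bdrv_get_block_status_above() call
from the patch above (illustrative only, not code from this thread):

/* Sketch: does the single cluster containing a misaligned request
 * already read as zeroes through the whole backing chain? */
static bool is_zero_cluster(BlockDriverState *bs, int64_t start)
{
    BDRVQcow2State *s = bs->opaque;
    int nr;
    BlockDriverState *file;
    int64_t res = bdrv_get_block_status_above(bs, NULL, start,
                                              s->cluster_sectors, &nr, &file);

    return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == s->cluster_sectors;
}

/* In qcow2_co_write_zeroes(), with head = sector_num % s->cluster_sectors,
 * a misaligned request (which the generic layer guarantees fits into a
 * single cluster) would then either be widened to that whole cluster and
 * passed on to qcow2_zero_clusters(), or be rejected with -ENOTSUP so
 * that the generic layer emulates it:
 *
 *     if (head != 0 || nb_sectors % s->cluster_sectors != 0) {
 *         if (!is_zero_cluster(bs, sector_num - head)) {
 *             return -ENOTSUP;
 *         }
 *         sector_num -= head;
 *         nb_sectors = s->cluster_sectors;
 *     }
 */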


Hm... Or actually, if we want something more complex that will help all
block drivers, extending the range of the request could even be done in
bdrv_co_do_write_zeroes(), I guess. I won't insist on it, though.

Kevin
Denis V. Lunev April 27, 2016, 7:07 a.m. UTC | #8
On 04/26/2016 01:19 PM, Kevin Wolf wrote:

[skipped]
> Did you ever talk to the kernel people?
>
> We can try to make the best out of suboptimal requests in qemu, but it
> looks to me as if the real fix was in the kernel, and if we don't get it
> fixed there, we'll see more and more of this kind of problems. I think
> this is relevant not only for VMs, but probably on real hardware as
> well.
it is difficult and can only be done partially :(


>> Locally with QEMU-IO the reproduction is simple. We can repeat above
>> requests or could simple do the following:
>>      qemu-io -c "write 0xff 32k 1M" 1.img
> I assume you mean "-z" instead of "0xff"?
sure


>> The code without the patch will allocate 2 blocks for guest offsets
>> 0-64k and 1M-(1M+64k) and performs writes there. The code with
>> the patch will skip creation of blocks if possible.
> Okay, that's the second of the optimisations you mentioned in your
> commit message. I can see how this adds something that the generic block
> layer can't easily add, if it can be made safe (I think it can, even
> though your patch doesn't get it completely right yet, see below).
>
>> I have recorded parameters in qcow2_co_do_write_zeroes for the
>> reference (1 MB is written, memory is not aligned as sent in the
>> letter above, [sudo ./a.out /dev/nbd3]):
>>
>> qcow2_co_write_zeroes off=0 size=10000
>> qcow2_co_write_zeroes off=1fe00 size=200
>> qcow2_co_write_zeroes off=3fe00 size=200
>> qcow2_co_write_zeroes off=5fe00 size=200
>> qcow2_co_write_zeroes off=7fe00 size=200
>> qcow2_co_write_zeroes off=9fe00 size=200
>> qcow2_co_write_zeroes off=bfe00 size=200
>> qcow2_co_write_zeroes off=dfe00 size=200
>> qcow2_co_write_zeroes off=ffe00 size=200
>> qcow2_co_write_zeroes off=10000 size=fe00
>> qcow2_co_write_zeroes off=20000 size=10000
>> qcow2_co_write_zeroes off=30000 size=fe00
>> qcow2_co_write_zeroes off=60000 size=10000
>> qcow2_co_write_zeroes off=70000 size=fe00
>> qcow2_co_write_zeroes off=c0000 size=10000
>> qcow2_co_write_zeroes off=d0000 size=fe00
>> qcow2_co_write_zeroes off=e0000 size=10000
>> qcow2_co_write_zeroes off=f0000 size=fe00
>> qcow2_co_write_zeroes off=80000 size=10000
>> qcow2_co_write_zeroes off=90000 size=fe00
>> qcow2_co_write_zeroes off=a0000 size=10000
>> qcow2_co_write_zeroes off=b0000 size=fe00
>> qcow2_co_write_zeroes off=40000 size=10000
>> qcow2_co_write_zeroes off=50000 size=fe00
> I don't see any requests here where your code actually ends up splitting
> the request into head, aligned part and tail. Which is expected because
> bdrv_co_do_write_zeroes() already does that.
>
> What I can't see here is whether this actually happened (10000 + fe00
> could be a split request) or whether it already came in this way over
> NBD.

bdrv_aligned_pwritev off=0 size=1fe00
qcow2_co_write_zeroes off=0 size=10000
qcow2_co_write_zeroes off=10000 size=fe00
bdrv_aligned_pwritev off=1fe00 size=20000
qcow2_co_write_zeroes off=1fe00 size=200
qcow2_co_write_zeroes off=20000 size=10000
qcow2_co_write_zeroes off=30000 size=fe00
bdrv_aligned_pwritev off=3fe00 size=20000
qcow2_co_write_zeroes off=3fe00 size=200
qcow2_co_write_zeroes off=40000 size=10000
qcow2_co_write_zeroes off=50000 size=fe00
bdrv_aligned_pwritev off=5fe00 size=20000
qcow2_co_write_zeroes off=5fe00 size=200
qcow2_co_write_zeroes off=60000 size=10000
qcow2_co_write_zeroes off=70000 size=fe00
bdrv_aligned_pwritev off=7fe00 size=20000
qcow2_co_write_zeroes off=7fe00 size=200
qcow2_co_write_zeroes off=80000 size=10000
qcow2_co_write_zeroes off=90000 size=fe00
bdrv_aligned_pwritev off=9fe00 size=20000
qcow2_co_write_zeroes off=9fe00 size=200
qcow2_co_write_zeroes off=a0000 size=10000
qcow2_co_write_zeroes off=b0000 size=fe00
bdrv_aligned_pwritev off=bfe00 size=20000
qcow2_co_write_zeroes off=bfe00 size=200
qcow2_co_write_zeroes off=c0000 size=10000
qcow2_co_write_zeroes off=d0000 size=fe00
bdrv_aligned_pwritev off=dfe00 size=20000
qcow2_co_write_zeroes off=dfe00 size=200
qcow2_co_write_zeroes off=e0000 size=10000
qcow2_co_write_zeroes off=f0000 size=fe00
bdrv_aligned_pwritev off=ffe00 size=200
qcow2_co_write_zeroes off=ffe00 size=200


[skipped]

> Here is the trick that I think will save us:
>
> On a misaligned call, we call bdrv_get_block_status_above() for the
> whole cluster that we're in. We know that it's only a single cluster
> because bdrv_co_do_write_zeroes() splits things this way; only aligned
> requests can be longer than a sector (we can even assert this).
>
> If the result is that the cluster already reads as zero, instead of
> doing nothing and possibly breaking backing chain manipulations, we
> simply extend the write zeroes operation to the whole cluster and
> continue as normal with an aligned request. This way we end up with a
> zero cluster instead of an unallocated one, and that should be safe.
>
> If the result is that the cluster isn't completed zeroed, return
> -ENOTSUP as you did in the snippet above.
>
> That approach should probably result in an (even) simpler patch, too.

Let's start with the simple thing. I'll send a patch in a couple of minutes.
It really looks MUCH better than the original one. Thank you for this cool
suggestion.

>
> Hm... Or actually, if we want something more complex that will help all
> block drivers, extending the range of the request could even be done in
> bdrv_co_do_write_zeroes(), I guess. I won't insist on it, though.
I do not think that this should be done now. This patch should go into
our own stable branch and thus it had better be as simple as possible.

Den
Kevin Wolf April 27, 2016, 8:12 a.m. UTC | #9
Am 27.04.2016 um 09:07 hat Denis V. Lunev geschrieben:
> On 04/26/2016 01:19 PM, Kevin Wolf wrote:
> 
> [skipped]
> >Did you ever talk to the kernel people?
> >
> >We can try to make the best out of suboptimal requests in qemu, but it
> >looks to me as if the real fix was in the kernel, and if we don't get it
> >fixed there, we'll see more and more of this kind of problems. I think
> >this is relevant not only for VMs, but probably on real hardware as
> >well.
> it is difficult and could be done partially only :(

Was there a discussion on some public mailing list? If so, do you have a
pointer?

> >Here is the trick that I think will save us:
> >
> >On a misaligned call, we call bdrv_get_block_status_above() for the
> >whole cluster that we're in. We know that it's only a single cluster
> >because bdrv_co_do_write_zeroes() splits things this way; only aligned
> >requests can be longer than a sector (we can even assert this).
> >
> >If the result is that the cluster already reads as zero, instead of
> >doing nothing and possibly breaking backing chain manipulations, we
> >simply extend the write zeroes operation to the whole cluster and
> >continue as normal with an aligned request. This way we end up with a
> >zero cluster instead of an unallocated one, and that should be safe.
> >
> >If the result is that the cluster isn't completed zeroed, return
> >-ENOTSUP as you did in the snippet above.
> >
> >That approach should probably result in an (even) simpler patch, too.
> 
> let's start from the simple thing. I'll send a patch in a couple of minutes.
> It really looks MUCH better than the original one. Thank you for this cool
> suggestion.
> 
> >
> >Hm... Or actually, if we want something more complex that will help all
> >block drivers, extending the range of the request could even be done in
> >bdrv_co_do_write_zeroes(), I guess. I won't insist on it, though.
> I do not think that this should be done now. This patch should go into
> our own stable and thus it would better be as simple as possible.

As I said, I'm okay with the simple patch for now, but I must point out
that "your own stable" is an invalid argument here.

What we merge in upstream qemu must make sense for upstream qemu, and
for nothing else, even if it may be more inconvenient for downstreams
and if it may mean that downstreams have to carry non-upstream patches.
I have rejected patches upstream even though they made more sense for
RHEL, and I will reject patches that suit other downstreams if they
aren't doing the right thing for the upstream project.

So on qemu-devel, as a rule of thumb always argue with the merits that a
patch has for upstream, not with advantages for downstream.

Kevin
Denis V. Lunev April 27, 2016, 8:32 a.m. UTC | #10
On 04/27/2016 11:12 AM, Kevin Wolf wrote:
> Am 27.04.2016 um 09:07 hat Denis V. Lunev geschrieben:
>> On 04/26/2016 01:19 PM, Kevin Wolf wrote:
>>
>> [skipped]
>>> Did you ever talk to the kernel people?
>>>
>>> We can try to make the best out of suboptimal requests in qemu, but it
>>> looks to me as if the real fix was in the kernel, and if we don't get it
>>> fixed there, we'll see more and more of this kind of problems. I think
>>> this is relevant not only for VMs, but probably on real hardware as
>>> well.
>> it is difficult and could be done partially only :(
> Was there a discussion on some public mailing list? If so, do you have a
> pointer?
We discussed this here a year ago, but I am unable to find traces of it
on LKML so far. It seems that the situation was resolved and patience
was lost, though there were some efforts...

Here is the thread with Paolo and Dmitry Monakhov,
who works on the kernel block layer in Virtuozzo.

http://lists.nongnu.org/archive/html/qemu-devel/2015-06/msg00182.html


>>> Here is the trick that I think will save us:
>>>
>>> On a misaligned call, we call bdrv_get_block_status_above() for the
>>> whole cluster that we're in. We know that it's only a single cluster
>>> because bdrv_co_do_write_zeroes() splits things this way; only aligned
>>> requests can be longer than a sector (we can even assert this).
>>>
>>> If the result is that the cluster already reads as zero, instead of
>>> doing nothing and possibly breaking backing chain manipulations, we
>>> simply extend the write zeroes operation to the whole cluster and
>>> continue as normal with an aligned request. This way we end up with a
>>> zero cluster instead of an unallocated one, and that should be safe.
>>>
>>> If the result is that the cluster isn't completed zeroed, return
>>> -ENOTSUP as you did in the snippet above.
>>>
>>> That approach should probably result in an (even) simpler patch, too.
>> let's start from the simple thing. I'll send a patch in a couple of minutes.
>> It really looks MUCH better than the original one. Thank you for this cool
>> suggestion.
>>
>>> Hm... Or actually, if we want something more complex that will help all
>>> block drivers, extending the range of the request could even be done in
>>> bdrv_co_do_write_zeroes(), I guess. I won't insist on it, though.
>> I do not think that this should be done now. This patch should go into
>> our own stable and thus it would better be as simple as possible.
> As I said, I'm okay with the simple patch for now, but I must point out
> that "your own stable" is an invalid argument here.
>
> What we merge in upstream qemu must make sense for upstream qemu, and
> for nothing else, even if it may be more inconvenient for downstreams
> and if it may mean that downstreams have to carry non-upstream patches.
> I have rejected patches upstream even though they made more sense for
> RHEL, and I will reject patches that suit other downstreams if they
> aren't doing the right thing for the upstream project.
>
> So on qemu-devel, as a rule of thumb always argue with the merits that a
> patch has for upstream, not with advantages for downstream.
OK, but I have not used this as an argument. You said that the simple
patch is OK; the purpose of my remark was to note that the simple patch
is better for me at the moment. Sorry if it came across with the wrong
intention.

Den

Patch

diff --git a/block/qcow2.c b/block/qcow2.c
index 470734b..9bdaa15 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2411,21 +2411,69 @@  finish:
     return ret;
 }
 
+static int write_zeroes_chunk(BlockDriverState *bs, int64_t sector_num, int nr)
+{
+    int ret, count;
+    BlockDriverState *file;
+    uint8_t *buf;
+    struct iovec iov;
+    QEMUIOVector local_qiov;
+
+    ret = bdrv_get_block_status_above(bs, NULL, sector_num, nr, &count, &file);
+    if (ret > 0 && (ret & BDRV_BLOCK_ZERO) && count == nr) {
+        /* Nothing to do. The area is zeroed already.
+           Worth to check to avoid image expansion for non-aligned reqs. */
+        return 0;
+    }
+
+    buf = qemu_blockalign0(bs, nr << BDRV_SECTOR_BITS);
+    iov = (struct iovec) {
+        .iov_base   = buf,
+        .iov_len    = nr << BDRV_SECTOR_BITS,
+    };
+    qemu_iovec_init_external(&local_qiov, &iov, 1);
+
+    return qcow2_co_writev(bs, sector_num, nr, &local_qiov);
+}
+
 static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
 {
     int ret;
     BDRVQcow2State *s = bs->opaque;
 
-    /* Emulate misaligned zero writes */
-    if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) {
-        return -ENOTSUP;
+    int nr = sector_num % s->cluster_sectors;
+    if (nr != 0) {
+        nr = MIN(s->cluster_sectors - nr, nb_sectors);
+
+        ret = write_zeroes_chunk(bs, sector_num, nr);
+        if (ret < 0) {
+            return ret;
+        }
+
+        sector_num += nr;
+        nb_sectors -= nr;
+        if (nb_sectors == 0) {
+            return 0;
+        }
+    }
+
+    nr = nb_sectors % s->cluster_sectors;
+    if (nr != 0) {
+        ret = write_zeroes_chunk(bs, sector_num + nb_sectors - nr, nr);
+        if (ret < 0) {
+            return ret;
+        }
+
+        nb_sectors -= nr;
+        if (nb_sectors == 0) {
+            return 0;
+        }
     }
 
     /* Whatever is left can use real zero clusters */
     qemu_co_mutex_lock(&s->lock);
-    ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS,
-        nb_sectors);
+    ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, nb_sectors);
     qemu_co_mutex_unlock(&s->lock);
 
     return ret;