diff mbox series

qcow2: Reduce write_zeroes size in handle_alloc_space()

Message ID 20200609140859.142230-1-kwolf@redhat.com (mailing list archive)
State New, archived
Headers show
Series qcow2: Reduce write_zeroes size in handle_alloc_space() | expand

Commit Message

Kevin Wolf June 9, 2020, 2:08 p.m. UTC
Since commit c8bb23cbdbe, handle_alloc_space() is called for newly
allocated clusters to efficiently initialise the COW areas with zeros if
necessary. It skips the whole operation if both start_cow and end_cow
are empty. However, it requests zeroing the whole request size (possibly
multiple megabytes) even if only one end of the request actually needs
this.

This patch reduces the write_zeroes request size in this case so that we
don't unnecessarily zero-initialise a region that we're going to
overwrite immediately.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

Comments

Vladimir Sementsov-Ogievskiy June 9, 2020, 2:28 p.m. UTC | #1
09.06.2020 17:08, Kevin Wolf wrote:
> Since commit c8bb23cbdbe, handle_alloc_space() is called for newly
> allocated clusters to efficiently initialise the COW areas with zeros if
> necessary. It skips the whole operation if both start_cow nor end_cow
> are empty. However, it requests zeroing the whole request size (possibly
> multiple megabytes) even if only one end of the request actually needs
> this.
> 
> This patch reduces the write_zeroes request size in this case so that we
> don't unnecessarily zero-initialise a region that we're going to
> overwrite immediately.
> 
> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> ---
>   block/qcow2.c | 16 +++++++++++-----
>   1 file changed, 11 insertions(+), 5 deletions(-)
> 
> diff --git a/block/qcow2.c b/block/qcow2.c
> index 0cd2e6757e..77742877fb 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -2403,6 +2403,8 @@ static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
>       }
>   
>       for (m = l2meta; m != NULL; m = m->next) {
> +        uint64_t start = m->alloc_offset;
> +        uint64_t len = m->nb_clusters * s->cluster_size;
>           int ret;
>   
>           if (!m->cow_start.nb_bytes && !m->cow_end.nb_bytes) {
> @@ -2413,21 +2415,25 @@ static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
>               continue;
>           }
>   
> +        if (!m->cow_start.nb_bytes) {
> +            start += m->cow_end.offset;
> +            len -= m->cow_end.offset;
> +        } else if (!m->cow_end.nb_bytes) {
> +            len = m->cow_start.nb_bytes;
> +        }
> +

Hmm, I'm afraid, that this may make things worse in some cases, as with one big write-zero request
we preallocate data-region in the protocol file, so we have better locality for the clusters we
are going to write. And, in the same time, with BDRV_REQ_NO_FALLBACK flag write-zero must be
fast anyway (especially in comparison with the following write request).

>           /*
>            * instead of writing zero COW buffers,
>            * efficiently zero out the whole clusters
>            */
>   
> -        ret = qcow2_pre_write_overlap_check(bs, 0, m->alloc_offset,
> -                                            m->nb_clusters * s->cluster_size,
> -                                            true);
> +        ret = qcow2_pre_write_overlap_check(bs, 0, start, len, true);
>           if (ret < 0) {
>               return ret;
>           }
>   
>           BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
> -        ret = bdrv_co_pwrite_zeroes(s->data_file, m->alloc_offset,
> -                                    m->nb_clusters * s->cluster_size,
> +        ret = bdrv_co_pwrite_zeroes(s->data_file, start, len,
>                                       BDRV_REQ_NO_FALLBACK);
>           if (ret < 0) {
>               if (ret != -ENOTSUP && ret != -EAGAIN) {
>
Eric Blake June 9, 2020, 2:43 p.m. UTC | #2
On 6/9/20 9:08 AM, Kevin Wolf wrote:
> Since commit c8bb23cbdbe, handle_alloc_space() is called for newly
> allocated clusters to efficiently initialise the COW areas with zeros if
> necessary. It skips the whole operation if both start_cow nor end_cow

s/nor/and/

> are empty. However, it requests zeroing the whole request size (possibly
> multiple megabytes) even if only one end of the request actually needs
> this.
> 
> This patch reduces the write_zeroes request size in this case so that we
> don't unnecessarily zero-initialise a region that we're going to
> overwrite immediately.
> 
> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> ---
>   block/qcow2.c | 16 +++++++++++-----
>   1 file changed, 11 insertions(+), 5 deletions(-)
> 
> diff --git a/block/qcow2.c b/block/qcow2.c
> index 0cd2e6757e..77742877fb 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -2403,6 +2403,8 @@ static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
>       }
>   
>       for (m = l2meta; m != NULL; m = m->next) {
> +        uint64_t start = m->alloc_offset;
> +        uint64_t len = m->nb_clusters * s->cluster_size;
>           int ret;
>   
>           if (!m->cow_start.nb_bytes && !m->cow_end.nb_bytes) {
> @@ -2413,21 +2415,25 @@ static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
>               continue;
>           }
>   
> +        if (!m->cow_start.nb_bytes) {
> +            start += m->cow_end.offset;
> +            len -= m->cow_end.offset;

So if the head was aligned, we reduce the request to only zero the tail;

> +        } else if (!m->cow_end.nb_bytes) {
> +            len = m->cow_start.nb_bytes;
> +        }

and if the tail was aligned, we reduce the request to only zero the head.

But if both ends are needed, the request still covers the entire 
request, including the clusters in the middle that will be overwritten.

> +
>           /*
>            * instead of writing zero COW buffers,
>            * efficiently zero out the whole clusters
>            */
>   
> -        ret = qcow2_pre_write_overlap_check(bs, 0, m->alloc_offset,
> -                                            m->nb_clusters * s->cluster_size,
> -                                            true);
> +        ret = qcow2_pre_write_overlap_check(bs, 0, start, len, true);
>           if (ret < 0) {
>               return ret;
>           }
>   
>           BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
> -        ret = bdrv_co_pwrite_zeroes(s->data_file, m->alloc_offset,
> -                                    m->nb_clusters * s->cluster_size,
> +        ret = bdrv_co_pwrite_zeroes(s->data_file, start, len,
>                                       BDRV_REQ_NO_FALLBACK);

If both head and tail are unaligned and need COW, but lie in different 
clusters, would we be better off using two separate 
bdrv_co_pwrite_zeroes() calls?

>           if (ret < 0) {
>               if (ret != -ENOTSUP && ret != -EAGAIN) {
>
Eric Blake June 9, 2020, 2:46 p.m. UTC | #3
On 6/9/20 9:28 AM, Vladimir Sementsov-Ogievskiy wrote:
> 09.06.2020 17:08, Kevin Wolf wrote:
>> Since commit c8bb23cbdbe, handle_alloc_space() is called for newly
>> allocated clusters to efficiently initialise the COW areas with zeros if
>> necessary. It skips the whole operation if both start_cow nor end_cow
>> are empty. However, it requests zeroing the whole request size (possibly
>> multiple megabytes) even if only one end of the request actually needs
>> this.
>>
>> This patch reduces the write_zeroes request size in this case so that we
>> don't unnecessarily zero-initialise a region that we're going to
>> overwrite immediately.
>>

> 
> Hmm, I'm afraid, that this may make things worse in some cases, as with 
> one big write-zero request
> we preallocate data-region in the protocol file, so we have better 
> locality for the clusters we
> are going to write. And, in the same time, with BDRV_REQ_NO_FALLBACK 
> flag write-zero must be
> fast anyway (especially in comparison with the following write request).
> 
>>           /*
>>            * instead of writing zero COW buffers,
>>            * efficiently zero out the whole clusters
>>            */
>> -        ret = qcow2_pre_write_overlap_check(bs, 0, m->alloc_offset,
>> -                                            m->nb_clusters * 
>> s->cluster_size,
>> -                                            true);
>> +        ret = qcow2_pre_write_overlap_check(bs, 0, start, len, true);
>>           if (ret < 0) {
>>               return ret;
>>           }
>>           BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
>> -        ret = bdrv_co_pwrite_zeroes(s->data_file, m->alloc_offset,
>> -                                    m->nb_clusters * s->cluster_size,
>> +        ret = bdrv_co_pwrite_zeroes(s->data_file, start, len,
>>                                       BDRV_REQ_NO_FALLBACK);

Good point.  If we weren't using BDRV_REQ_NO_FALLBACK, then avoiding a 
pre-zero pass over the middle is essential.  But since we are insisting 
that the pre-zero pass be fast or else immediately fail, the time spent 
in pre-zeroing should not be a concern.  Do you have benchmark numbers 
stating otherwise?
Kevin Wolf June 9, 2020, 3:18 p.m. UTC | #4
Am 09.06.2020 um 16:46 hat Eric Blake geschrieben:
> On 6/9/20 9:28 AM, Vladimir Sementsov-Ogievskiy wrote:
> > 09.06.2020 17:08, Kevin Wolf wrote:
> > > Since commit c8bb23cbdbe, handle_alloc_space() is called for newly
> > > allocated clusters to efficiently initialise the COW areas with zeros if
> > > necessary. It skips the whole operation if both start_cow nor end_cow
> > > are empty. However, it requests zeroing the whole request size (possibly
> > > multiple megabytes) even if only one end of the request actually needs
> > > this.
> > > 
> > > This patch reduces the write_zeroes request size in this case so that we
> > > don't unnecessarily zero-initialise a region that we're going to
> > > overwrite immediately.
> > > 
> 
> > 
> > Hmm, I'm afraid, that this may make things worse in some cases, as with
> > one big write-zero request
> > we preallocate data-region in the protocol file, so we have better
> > locality for the clusters we
> > are going to write. And, in the same time, with BDRV_REQ_NO_FALLBACK
> > flag write-zero must be
> > fast anyway (especially in comparison with the following write request).
> > 
> > >           /*
> > >            * instead of writing zero COW buffers,
> > >            * efficiently zero out the whole clusters
> > >            */
> > > -        ret = qcow2_pre_write_overlap_check(bs, 0, m->alloc_offset,
> > > -                                            m->nb_clusters *
> > > s->cluster_size,
> > > -                                            true);
> > > +        ret = qcow2_pre_write_overlap_check(bs, 0, start, len, true);
> > >           if (ret < 0) {
> > >               return ret;
> > >           }
> > >           BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
> > > -        ret = bdrv_co_pwrite_zeroes(s->data_file, m->alloc_offset,
> > > -                                    m->nb_clusters * s->cluster_size,
> > > +        ret = bdrv_co_pwrite_zeroes(s->data_file, start, len,
> > >                                       BDRV_REQ_NO_FALLBACK);
> 
> Good point.  If we weren't using BDRV_REQ_NO_FALLBACK, then avoiding a
> pre-zero pass over the middle is essential.  But since we are insisting that
> the pre-zero pass be fast or else immediately fail, the time spent in
> pre-zeroing should not be a concern.  Do you have benchmark numbers stating
> otherwise?

I stumbled across this behaviour (write_zeros for 2 MB, then overwrite
almost everything) in the context of a different bug, and it just didn't
make much sense to me. Is there really a file system where fragmentation
is introduced by not zeroing the area first and then overwriting it?

I'm not insisting on making this change because the behaviour is
harmless if odd, but if we think that writing twice to some blocks is an
optimisation, maybe we should actually measure and document this.


Anyway, let's talk about the reported bug that made me look at the
strace that showed this behaviour because I feel it supports my last
point. It's a bit messy, but anyway:

    https://bugzilla.redhat.com/show_bug.cgi?id=1666864

So initially, bad performance on a fragmented image file was reported.
Not much to do there, but then in comment 16, QA reported a performance
regression in this case between 4.0 and 4.2. And this change caused by
c8bb23cbdbe, i.e. the commit that introduced handle_alloc_space().

Turns out that BDRV_REQ_NO_FALLBACK doesn't always guarantee that it's
_really_ fast. fallocate(FALLOC_FL_ZERO_RANGE) causes some kind of flush
on XFS and buffered writes don't. So with the old code, qemu-img convert
to a file on a very full filesystem that will cause fragmentation, was
much faster with writing a zero buffer than with write_zeroes (because
it didn't flush the result).

I don't fully understand why this is and hope that XFS can do something
about it. I also don't really think we should revert the change in QEMU,
though I'm not completely sure. But I just wanted to share this to show
that "obvious" characteristics of certain types of requests aren't
always true and doing obscure optimisations based on what we think
filesystems may do can actually achieve the opposite in some cases.

Kevin
Vladimir Sementsov-Ogievskiy June 9, 2020, 3:29 p.m. UTC | #5
09.06.2020 18:18, Kevin Wolf wrote:
> Am 09.06.2020 um 16:46 hat Eric Blake geschrieben:
>> On 6/9/20 9:28 AM, Vladimir Sementsov-Ogievskiy wrote:
>>> 09.06.2020 17:08, Kevin Wolf wrote:
>>>> Since commit c8bb23cbdbe, handle_alloc_space() is called for newly
>>>> allocated clusters to efficiently initialise the COW areas with zeros if
>>>> necessary. It skips the whole operation if both start_cow nor end_cow
>>>> are empty. However, it requests zeroing the whole request size (possibly
>>>> multiple megabytes) even if only one end of the request actually needs
>>>> this.
>>>>
>>>> This patch reduces the write_zeroes request size in this case so that we
>>>> don't unnecessarily zero-initialise a region that we're going to
>>>> overwrite immediately.
>>>>
>>
>>>
>>> Hmm, I'm afraid, that this may make things worse in some cases, as with
>>> one big write-zero request
>>> we preallocate data-region in the protocol file, so we have better
>>> locality for the clusters we
>>> are going to write. And, in the same time, with BDRV_REQ_NO_FALLBACK
>>> flag write-zero must be
>>> fast anyway (especially in comparison with the following write request).
>>>
>>>>           /*
>>>>            * instead of writing zero COW buffers,
>>>>            * efficiently zero out the whole clusters
>>>>            */
>>>> -        ret = qcow2_pre_write_overlap_check(bs, 0, m->alloc_offset,
>>>> -                                            m->nb_clusters *
>>>> s->cluster_size,
>>>> -                                            true);
>>>> +        ret = qcow2_pre_write_overlap_check(bs, 0, start, len, true);
>>>>           if (ret < 0) {
>>>>               return ret;
>>>>           }
>>>>           BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
>>>> -        ret = bdrv_co_pwrite_zeroes(s->data_file, m->alloc_offset,
>>>> -                                    m->nb_clusters * s->cluster_size,
>>>> +        ret = bdrv_co_pwrite_zeroes(s->data_file, start, len,
>>>>                                       BDRV_REQ_NO_FALLBACK);
>>
>> Good point.  If we weren't using BDRV_REQ_NO_FALLBACK, then avoiding a
>> pre-zero pass over the middle is essential.  But since we are insisting that
>> the pre-zero pass be fast or else immediately fail, the time spent in
>> pre-zeroing should not be a concern.  Do you have benchmark numbers stating
>> otherwise?
> 
> I stumbled across this behaviour (write_zeros for 2 MB, then overwrite
> almost everything) in the context of a different bug, and it just didn't
> make much sense to me. Is there really a file system where fragmentation
> is introduced by not zeroing the area first and then overwriting it?
> 
> I'm not insisting on making this change because the behaviour is
> harmless if odd, but if we think that writing twice to some blocks is an
> optimisation, maybe we should actually measure and document this.

Not to same blocks: first we do write-zeroes to the area aligned-up to cluster bound. So it's more probable that the resulting clusters would be contiguous on file-system. With your patch it may be split into two parts. (a bit too theoretical, I'd better prove it by example)

Also, we (Virtuozzo) have to support some custom distributed fs, where allocation itself is expensive, so the additional benefit of first (larger) write-zero request is that we have one allocation request instead of two (with your patch) or three (if we decide to make two write-zero operations).

> 
> 
> Anyway, let's talk about the reported bug that made me look at the
> strace that showed this behaviour because I feel it supports my last
> point. It's a bit messy, but anyway:
> 
>      https://bugzilla.redhat.com/show_bug.cgi?id=1666864
> 
> So initially, bad performance on a fragmented image file was reported.
> Not much to do there, but then in comment 16, QA reported a performance
> regression in this case between 4.0 and 4.2. And this change caused by
> c8bb23cbdbe, i.e. the commit that introduced handle_alloc_space().
> 
> Turns out that BDRV_REQ_NO_FALLBACK doesn't always guarantee that it's
> _really_ fast. fallocate(FALLOC_FL_ZERO_RANGE) causes some kind of flush
> on XFS and buffered writes don't. So with the old code, qemu-img convert
> to a file on a very full filesystem that will cause fragmentation, was
> much faster with writing a zero buffer than with write_zeroes (because
> it didn't flush the result).
> 
> I don't fully understand why this is and hope that XFS can do something
> about it. I also don't really think we should revert the change in QEMU,
> though I'm not completely sure. But I just wanted to share this to show
> that "obvious" characteristics of certain types of requests aren't
> always true and doing obscure optimisations based on what we think
> filesystems may do can actually achieve the opposite in some cases.
> 
> Kevin
>
Eric Blake June 9, 2020, 4:19 p.m. UTC | #6
On 6/9/20 10:18 AM, Kevin Wolf wrote:

>>>> -        ret = bdrv_co_pwrite_zeroes(s->data_file, m->alloc_offset,
>>>> -                                    m->nb_clusters * s->cluster_size,
>>>> +        ret = bdrv_co_pwrite_zeroes(s->data_file, start, len,
>>>>                                        BDRV_REQ_NO_FALLBACK);
>>
>> Good point.  If we weren't using BDRV_REQ_NO_FALLBACK, then avoiding a
>> pre-zero pass over the middle is essential.  But since we are insisting that
>> the pre-zero pass be fast or else immediately fail, the time spent in
>> pre-zeroing should not be a concern.  Do you have benchmark numbers stating
>> otherwise?
> 
> I stumbled across this behaviour (write_zeros for 2 MB, then overwrite
> almost everything) in the context of a different bug, and it just didn't
> make much sense to me. Is there really a file system where fragmentation
> is introduced by not zeroing the area first and then overwriting it?
> 
> I'm not insisting on making this change because the behaviour is
> harmless if odd, but if we think that writing twice to some blocks is an
> optimisation, maybe we should actually measure and document this.
> 
> 
> Anyway, let's talk about the reported bug that made me look at the
> strace that showed this behaviour because I feel it supports my last
> point. It's a bit messy, but anyway:
> 
>      https://bugzilla.redhat.com/show_bug.cgi?id=1666864
> 
> So initially, bad performance on a fragmented image file was reported.
> Not much to do there, but then in comment 16, QA reported a performance
> regression in this case between 4.0 and 4.2. And this change caused by
> c8bb23cbdbe, i.e. the commit that introduced handle_alloc_space().
> 
> Turns out that BDRV_REQ_NO_FALLBACK doesn't always guarantee that it's
> _really_ fast. fallocate(FALLOC_FL_ZERO_RANGE) causes some kind of flush
> on XFS and buffered writes don't. So with the old code, qemu-img convert
> to a file on a very full filesystem that will cause fragmentation, was
> much faster with writing a zero buffer than with write_zeroes (because
> it didn't flush the result).

Wow. That makes it sound like we should NOT attempt 
fallocate(FALLOC_FL_ZERO_RANGE) on the fast path, because we don't have 
guarantees that it is fast.

I really wish the kernel would give us 
fallocate(FALLOC_FL_ZERO_RANGE|FALLOC_FL_NO_FALLBACK) which would fail 
fast rather than doing a flush or other slow fallback.

> 
> I don't fully understand why this is and hope that XFS can do something
> about it. I also don't really think we should revert the change in QEMU,
> though I'm not completely sure. But I just wanted to share this to show
> that "obvious" characteristics of certain types of requests aren't
> always true and doing obscure optimisations based on what we think
> filesystems may do can actually achieve the opposite in some cases.

It also goes to show us that the kernel does NOT yet give us enough 
fine-grained control over what we really want (which is: 'pre-zero this 
if it is fast, but don't waste time if it is not).  Most of the kernel 
interfaces end up being 'pre-zero this, and it might be fast, fail fast, 
or even fall back to something safe but slow, and you can't tell the 
difference short of trying'.
Vladimir Sementsov-Ogievskiy June 10, 2020, 6:50 a.m. UTC | #7
09.06.2020 19:19, Eric Blake wrote:
> On 6/9/20 10:18 AM, Kevin Wolf wrote:
> 
>>>>> -        ret = bdrv_co_pwrite_zeroes(s->data_file, m->alloc_offset,
>>>>> -                                    m->nb_clusters * s->cluster_size,
>>>>> +        ret = bdrv_co_pwrite_zeroes(s->data_file, start, len,
>>>>>                                        BDRV_REQ_NO_FALLBACK);
>>>
>>> Good point.  If we weren't using BDRV_REQ_NO_FALLBACK, then avoiding a
>>> pre-zero pass over the middle is essential.  But since we are insisting that
>>> the pre-zero pass be fast or else immediately fail, the time spent in
>>> pre-zeroing should not be a concern.  Do you have benchmark numbers stating
>>> otherwise?
>>
>> I stumbled across this behaviour (write_zeros for 2 MB, then overwrite
>> almost everything) in the context of a different bug, and it just didn't
>> make much sense to me. Is there really a file system where fragmentation
>> is introduced by not zeroing the area first and then overwriting it?
>>
>> I'm not insisting on making this change because the behaviour is
>> harmless if odd, but if we think that writing twice to some blocks is an
>> optimisation, maybe we should actually measure and document this.
>>
>>
>> Anyway, let's talk about the reported bug that made me look at the
>> strace that showed this behaviour because I feel it supports my last
>> point. It's a bit messy, but anyway:
>>
>>      https://bugzilla.redhat.com/show_bug.cgi?id=1666864
>>
>> So initially, bad performance on a fragmented image file was reported.
>> Not much to do there, but then in comment 16, QA reported a performance
>> regression in this case between 4.0 and 4.2. And this change caused by
>> c8bb23cbdbe, i.e. the commit that introduced handle_alloc_space().
>>
>> Turns out that BDRV_REQ_NO_FALLBACK doesn't always guarantee that it's
>> _really_ fast. fallocate(FALLOC_FL_ZERO_RANGE) causes some kind of flush
>> on XFS and buffered writes don't. So with the old code, qemu-img convert
>> to a file on a very full filesystem that will cause fragmentation, was
>> much faster with writing a zero buffer than with write_zeroes (because
>> it didn't flush the result).
> 
> Wow. That makes it sound like we should NOT attempt fallocate(FALLOC_FL_ZERO_RANGE) on the fast path, because we don't have guarantees that it is fast.
> 
> I really wish the kernel would give us fallocate(FALLOC_FL_ZERO_RANGE|FALLOC_FL_NO_FALLBACK) which would fail fast rather than doing a flush or other slow fallback.
> 
>>
>> I don't fully understand why this is and hope that XFS can do something
>> about it. I also don't really think we should revert the change in QEMU,
>> though I'm not completely sure. But I just wanted to share this to show
>> that "obvious" characteristics of certain types of requests aren't
>> always true and doing obscure optimisations based on what we think
>> filesystems may do can actually achieve the opposite in some cases.
> 
> It also goes to show us that the kernel does NOT yet give us enough fine-grained control over what we really want (which is: 'pre-zero this if it is fast, but don't waste time if it is not).  Most of the kernel interfaces end up being 'pre-zero this, and it might be fast, fail fast, or even fall back to something safe but slow, and you can't tell the difference short of trying'.
> 

Hmm, actually, for small cow areas (several bytes? several sectors?), I'm not surprised that direct writing zeroed buffer may work faster than any kind of WRITE_ZERO request. Especially, expanding write-request for a small amount of bytes may be faster than doing instead two requests. Possibly, we need some heuristics here. And I think, it would be good to add some benchmarks based on scripts/simplebench to have real numbers (we'll try).
Vladimir Sementsov-Ogievskiy June 10, 2020, 8:38 a.m. UTC | #8
09.06.2020 18:29, Vladimir Sementsov-Ogievskiy wrote:
> 09.06.2020 18:18, Kevin Wolf wrote:
>> Am 09.06.2020 um 16:46 hat Eric Blake geschrieben:
>>> On 6/9/20 9:28 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>> 09.06.2020 17:08, Kevin Wolf wrote:
>>>>> Since commit c8bb23cbdbe, handle_alloc_space() is called for newly
>>>>> allocated clusters to efficiently initialise the COW areas with zeros if
>>>>> necessary. It skips the whole operation if both start_cow nor end_cow
>>>>> are empty. However, it requests zeroing the whole request size (possibly
>>>>> multiple megabytes) even if only one end of the request actually needs
>>>>> this.
>>>>>
>>>>> This patch reduces the write_zeroes request size in this case so that we
>>>>> don't unnecessarily zero-initialise a region that we're going to
>>>>> overwrite immediately.
>>>>>
>>>
>>>>
>>>> Hmm, I'm afraid, that this may make things worse in some cases, as with
>>>> one big write-zero request
>>>> we preallocate data-region in the protocol file, so we have better
>>>> locality for the clusters we
>>>> are going to write. And, in the same time, with BDRV_REQ_NO_FALLBACK
>>>> flag write-zero must be
>>>> fast anyway (especially in comparison with the following write request).
>>>>
>>>>>           /*
>>>>>            * instead of writing zero COW buffers,
>>>>>            * efficiently zero out the whole clusters
>>>>>            */
>>>>> -        ret = qcow2_pre_write_overlap_check(bs, 0, m->alloc_offset,
>>>>> -                                            m->nb_clusters *
>>>>> s->cluster_size,
>>>>> -                                            true);
>>>>> +        ret = qcow2_pre_write_overlap_check(bs, 0, start, len, true);
>>>>>           if (ret < 0) {
>>>>>               return ret;
>>>>>           }
>>>>>           BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
>>>>> -        ret = bdrv_co_pwrite_zeroes(s->data_file, m->alloc_offset,
>>>>> -                                    m->nb_clusters * s->cluster_size,
>>>>> +        ret = bdrv_co_pwrite_zeroes(s->data_file, start, len,
>>>>>                                       BDRV_REQ_NO_FALLBACK);
>>>
>>> Good point.  If we weren't using BDRV_REQ_NO_FALLBACK, then avoiding a
>>> pre-zero pass over the middle is essential.  But since we are insisting that
>>> the pre-zero pass be fast or else immediately fail, the time spent in
>>> pre-zeroing should not be a concern.  Do you have benchmark numbers stating
>>> otherwise?
>>
>> I stumbled across this behaviour (write_zeros for 2 MB, then overwrite
>> almost everything) in the context of a different bug, and it just didn't
>> make much sense to me. Is there really a file system where fragmentation
>> is introduced by not zeroing the area first and then overwriting it?
>>
>> I'm not insisting on making this change because the behaviour is
>> harmless if odd, but if we think that writing twice to some blocks is an
>> optimisation, maybe we should actually measure and document this.
> 
> Not to same blocks: first we do write-zeroes to the area aligned-up to cluster bound. So it's more probable that the resulting clusters would be contigous on file-system.. With your patch it may be split into two parts. (a bit too theoretical, I'd better prove it by example)
> 
> Also, we (Virtuozzo) have to support some custom distributed fs, where allocation itself is expensive, so the additional benefit of first (larger) write-zero request is that we have one allocation request instead of two (with your patch) or three (if we decide to make two write-zero opersions).

Hmm, Denis Lunev told me that double allocation should not be a problem, as it is happening almost in the same time, fs should handle this. So probably my counter-arguments are wrong. Still, handle_alloc_space() attracts attention often, and some benchmark-tests around it will not hurt.
Kevin Wolf June 10, 2020, 11:25 a.m. UTC | #9
Am 10.06.2020 um 08:50 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 09.06.2020 19:19, Eric Blake wrote:
> > On 6/9/20 10:18 AM, Kevin Wolf wrote:
> > 
> > > > > > -        ret = bdrv_co_pwrite_zeroes(s->data_file, m->alloc_offset,
> > > > > > -                                    m->nb_clusters * s->cluster_size,
> > > > > > +        ret = bdrv_co_pwrite_zeroes(s->data_file, start, len,
> > > > > >                                        BDRV_REQ_NO_FALLBACK);
> > > > 
> > > > Good point.  If we weren't using BDRV_REQ_NO_FALLBACK, then avoiding a
> > > > pre-zero pass over the middle is essential.  But since we are insisting that
> > > > the pre-zero pass be fast or else immediately fail, the time spent in
> > > > pre-zeroing should not be a concern.  Do you have benchmark numbers stating
> > > > otherwise?
> > > 
> > > I stumbled across this behaviour (write_zeros for 2 MB, then overwrite
> > > almost everything) in the context of a different bug, and it just didn't
> > > make much sense to me. Is there really a file system where fragmentation
> > > is introduced by not zeroing the area first and then overwriting it?
> > > 
> > > I'm not insisting on making this change because the behaviour is
> > > harmless if odd, but if we think that writing twice to some blocks is an
> > > optimisation, maybe we should actually measure and document this.
> > > 
> > > 
> > > Anyway, let's talk about the reported bug that made me look at the
> > > strace that showed this behaviour because I feel it supports my last
> > > point. It's a bit messy, but anyway:
> > > 
> > >      https://bugzilla.redhat.com/show_bug.cgi?id=1666864
> > > 
> > > So initially, bad performance on a fragmented image file was reported.
> > > Not much to do there, but then in comment 16, QA reported a performance
> > > regression in this case between 4.0 and 4.2. And this change caused by
> > > c8bb23cbdbe, i.e. the commit that introduced handle_alloc_space().
> > > 
> > > Turns out that BDRV_REQ_NO_FALLBACK doesn't always guarantee that it's
> > > _really_ fast. fallocate(FALLOC_FL_ZERO_RANGE) causes some kind of flush
> > > on XFS and buffered writes don't. So with the old code, qemu-img convert
> > > to a file on a very full filesystem that will cause fragmentation, was
> > > much faster with writing a zero buffer than with write_zeroes (because
> > > it didn't flush the result).
> > 
> > Wow. That makes it sound like we should NOT attempt
> > fallocate(FALLOC_FL_ZERO_RANGE) on the fast path, because we don't
> > have guarantees that it is fast.
> > 
> > I really wish the kernel would give us
> > fallocate(FALLOC_FL_ZERO_RANGE|FALLOC_FL_NO_FALLBACK) which would
> > fail fast rather than doing a flush or other slow fallback.
> > 
> > > 
> > > I don't fully understand why this is and hope that XFS can do
> > > something about it. I also don't really think we should revert the
> > > change in QEMU, though I'm not completely sure. But I just wanted
> > > to share this to show that "obvious" characteristics of certain
> > > types of requests aren't always true and doing obscure
> > > optimisations based on what we think filesystems may do can
> > > actually achieve the opposite in some cases.
> > 
> > It also goes to show us that the kernel does NOT yet give us enough
> > fine-grained control over what we really want (which is: 'pre-zero
> > this if it is fast, but don't waste time if it is not).  Most of the
> > kernel interfaces end up being 'pre-zero this, and it might be fast,
> > fail fast, or even fall back to something safe but slow, and you
> > can't tell the difference short of trying'.
> 
> Hmm, actually, for small cow areas (several bytes? several sectors?),
> I'm not surprised that direct writing zeroed buffer may work faster
> than any kind of WRITE_ZERO request. Especially, expanding
> write-request for a small amount of bytes may be faster than doing
> intead two requests. Possibly, we need some heuristics here. And I
> think, it would be good to add some benchmarks based on
> scripts/simplebench to have real numbers (we'll try).

I'll continue the discussion in the BZ, but yes, at the moment the
recommendation of the XFS people seems to be that we avoid fallocate()
(at least without FALLOC_FL_KEEP_SIZE and on local filesystems) for
small sizes.

It's not obvious what "small sizes" means in practice, but I wouldn't be
surprised if a qcow2 cluster is always in this category, even if it's
2 MB. (The pathological qemu-img convert case does use 2 MB buffers.)

Kevin
diff mbox series

Patch

diff --git a/block/qcow2.c b/block/qcow2.c
index 0cd2e6757e..77742877fb 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2403,6 +2403,8 @@  static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
     }
 
     for (m = l2meta; m != NULL; m = m->next) {
+        uint64_t start = m->alloc_offset;
+        uint64_t len = m->nb_clusters * s->cluster_size;
         int ret;
 
         if (!m->cow_start.nb_bytes && !m->cow_end.nb_bytes) {
@@ -2413,21 +2415,25 @@  static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
             continue;
         }
 
+        if (!m->cow_start.nb_bytes) {
+            start += m->cow_end.offset;
+            len -= m->cow_end.offset;
+        } else if (!m->cow_end.nb_bytes) {
+            len = m->cow_start.nb_bytes;
+        }
+
         /*
          * instead of writing zero COW buffers,
          * efficiently zero out the whole clusters
          */
 
-        ret = qcow2_pre_write_overlap_check(bs, 0, m->alloc_offset,
-                                            m->nb_clusters * s->cluster_size,
-                                            true);
+        ret = qcow2_pre_write_overlap_check(bs, 0, start, len, true);
         if (ret < 0) {
             return ret;
         }
 
         BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
-        ret = bdrv_co_pwrite_zeroes(s->data_file, m->alloc_offset,
-                                    m->nb_clusters * s->cluster_size,
+        ret = bdrv_co_pwrite_zeroes(s->data_file, start, len,
                                     BDRV_REQ_NO_FALLBACK);
         if (ret < 0) {
             if (ret != -ENOTSUP && ret != -EAGAIN) {