diff mbox

[v5,4/4] block: ignore flush requests when storage is clean

Message ID 1467643124-29778-5-git-send-email-den@openvz.org
State New
Headers show

Commit Message

Denis V. Lunev July 4, 2016, 2:38 p.m. UTC
From: Evgeny Yakovlev <eyakovlev@virtuozzo.com>

Some guests (win2008 server for example) do a lot of unnecessary
flushing when underlying media has not changed. This adds additional
overhead on host when calling fsync/fdatasync.

This change introduces a write generation scheme in BlockDriverState.
Current write generation is checked against last flushed generation to
avoid unnecessary flushes.

The problem with excessive flushing was found by a performance test
which does parallel directory tree creation (from 2 processes).
Results improved from 0.424 loops/sec to 0.432 loops/sec.
Each loop creates 10^3 directories with 10 files in each.

Signed-off-by: Evgeny Yakovlev <eyakovlev@virtuozzo.com>
Signed-off-by: Denis V. Lunev <den@openvz.org>
CC: Kevin Wolf <kwolf@redhat.com>
CC: Max Reitz <mreitz@redhat.com>
CC: Stefan Hajnoczi <stefanha@redhat.com>
CC: Fam Zheng <famz@redhat.com>
CC: John Snow <jsnow@redhat.com>
---
 block.c                   |  3 +++
 block/io.c                | 18 ++++++++++++++++++
 include/block/block_int.h |  5 +++++
 3 files changed, 26 insertions(+)

Comments

Eric Blake July 7, 2016, 11:04 p.m. UTC | #1
On 07/04/2016 08:38 AM, Denis V. Lunev wrote:
> From: Evgeny Yakovlev <eyakovlev@virtuozzo.com>
> 
> Some guests (win2008 server for example) do a lot of unnecessary
> flushing when underlying media has not changed. This adds additional
> overhead on host when calling fsync/fdatasync.
> 
> This change introduces a write generation scheme in BlockDriverState.
> Current write generation is checked against last flushed generation to
> avoid unnessesary flushes.
> 
> The problem with excessive flushing was found by a performance test
> which does parallel directory tree creation (from 2 processes).
> Results improved from 0.424 loops/sec to 0.432 loops/sec.
> Each loop creates 10^3 directories with 10 files in each.
> 

> +++ b/block/io.c
> @@ -1294,6 +1294,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
>      }
>      bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
>  
> +    ++bs->write_gen;

Why pre-increment?  Most code uses post-increment when done as a
statement in isolation.

>      bdrv_set_dirty(bs, start_sector, end_sector - start_sector);
>  
>      if (bs->wr_highest_offset < offset + bytes) {
> @@ -2211,6 +2212,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
>  {
>      int ret;
>      BdrvTrackedRequest req;
> +    int current_gen = bs->write_gen;
>  
>      if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
>          bdrv_is_sg(bs)) {
> @@ -2219,6 +2221,12 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
>  
>      tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
>  
> +    /* Wait until any previous flushes are completed */
> +    while (bs->flush_started_gen != bs->flushed_gen) {

Should this be an inequality, as in s/!=/</, in case several flushes can
be started in parallel and where the later flush ends up finishing
before the earlier flush?

> +        qemu_co_queue_wait(&bs->flush_queue);
> +    }
> +    bs->flush_started_gen = current_gen;
> +
>      /* Write back all layers by calling one driver function */
>      if (bs->drv->bdrv_co_flush) {
>          ret = bs->drv->bdrv_co_flush(bs);
> @@ -2239,6 +2247,11 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
>          goto flush_parent;
>      }
>  
> +    /* Check if we really need to flush anything */
> +    if (bs->flushed_gen == current_gen) {

Likewise, if you are tracking generations, should this be s/==/<=/ (am I
getting the direction correct)?

> +++ b/include/block/block_int.h
> @@ -420,6 +420,11 @@ struct BlockDriverState {
>                           note this is a reference count */
>      bool probed;
>  
> +    CoQueue flush_queue;            /* Serializing flush queue */
> +    unsigned int write_gen;         /* Current data generation */
> +    unsigned int flush_started_gen; /* Generation for which flush has started */
> +    unsigned int flushed_gen;       /* Flushed write generation */

Should these be 64-bit integers to avoid risk of overflow after just
2^32 flush attempts?
Evgeny Yakovlev July 8, 2016, 3:19 p.m. UTC | #2
On 08.07.2016 02:04, Eric Blake wrote:
> On 07/04/2016 08:38 AM, Denis V. Lunev wrote:
>> From: Evgeny Yakovlev <eyakovlev@virtuozzo.com>
>>
>> Some guests (win2008 server for example) do a lot of unnecessary
>> flushing when underlying media has not changed. This adds additional
>> overhead on host when calling fsync/fdatasync.
>>
>> This change introduces a write generation scheme in BlockDriverState.
>> Current write generation is checked against last flushed generation to
>> avoid unnessesary flushes.
>>
>> The problem with excessive flushing was found by a performance test
>> which does parallel directory tree creation (from 2 processes).
>> Results improved from 0.424 loops/sec to 0.432 loops/sec.
>> Each loop creates 10^3 directories with 10 files in each.
>>
>> +++ b/block/io.c
>> @@ -1294,6 +1294,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
>>       }
>>       bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
>>   
>> +    ++bs->write_gen;
> Why pre-increment?  Most code uses post-increment when done as a
> statement in isolation.

Just a habit of mine from my C++ days, where you can never be sure whether
someone has overloaded the post-increment operator; it would then create a
temporary object, because post-increment is defined to return the previous
value. Now it's just muscle memory :)

>
>>       bdrv_set_dirty(bs, start_sector, end_sector - start_sector);
>>   
>>       if (bs->wr_highest_offset < offset + bytes) {
>> @@ -2211,6 +2212,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
>>   {
>>       int ret;
>>       BdrvTrackedRequest req;
>> +    int current_gen = bs->write_gen;
>>   
>>       if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
>>           bdrv_is_sg(bs)) {
>> @@ -2219,6 +2221,12 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
>>   
>>       tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
>>   
>> +    /* Wait until any previous flushes are completed */
>> +    while (bs->flush_started_gen != bs->flushed_gen) {
> Should this be an inequality, as in s/!=/</, in case several flushes can
> be started in parallel and where the later flush ends up finishing
> before the earlier flush?

flush_started_gen is always ahead of flushed_gen or equal to it, no
matter how many requests we have in flight. Using != allows us to skip
checking for unsigned overflow (which you also mention in this email).

>
>> +        qemu_co_queue_wait(&bs->flush_queue);
>> +    }
>> +    bs->flush_started_gen = current_gen;
>> +
>>       /* Write back all layers by calling one driver function */
>>       if (bs->drv->bdrv_co_flush) {
>>           ret = bs->drv->bdrv_co_flush(bs);
>> @@ -2239,6 +2247,11 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
>>           goto flush_parent;
>>       }
>>   
>> +    /* Check if we really need to flush anything */
>> +    if (bs->flushed_gen == current_gen) {
> Likewise, if you are tracking generations, should this be s/==/<=/ (am I
> getting the direction correct)?

Same here - '==' is so that we don't have to worry about unsigned overflow.

>
>> +++ b/include/block/block_int.h
>> @@ -420,6 +420,11 @@ struct BlockDriverState {
>>                            note this is a reference count */
>>       bool probed;
>>   
>> +    CoQueue flush_queue;            /* Serializing flush queue */
>> +    unsigned int write_gen;         /* Current data generation */
>> +    unsigned int flush_started_gen; /* Generation for which flush has started */
>> +    unsigned int flushed_gen;       /* Flushed write generation */
> Should these be 64-bit integers to avoid risk of overflow after just
> 2^32 flush attempts?
>

We don't have to care about unsigned overflow. It has well-defined
behavior, and we only check whether generations are equal or not.
John Snow July 8, 2016, 6:44 p.m. UTC | #3
On 07/04/2016 10:38 AM, Denis V. Lunev wrote:
> From: Evgeny Yakovlev <eyakovlev@virtuozzo.com>
> 
> Some guests (win2008 server for example) do a lot of unnecessary
> flushing when underlying media has not changed. This adds additional
> overhead on host when calling fsync/fdatasync.
> 
> This change introduces a write generation scheme in BlockDriverState.
> Current write generation is checked against last flushed generation to
> avoid unnessesary flushes.
> 
> The problem with excessive flushing was found by a performance test
> which does parallel directory tree creation (from 2 processes).
> Results improved from 0.424 loops/sec to 0.432 loops/sec.
> Each loop creates 10^3 directories with 10 files in each.
> 
> Signed-off-by: Evgeny Yakovlev <eyakovlev@virtuozzo.com>
> Signed-off-by: Denis V. Lunev <den@openvz.org>
> CC: Kevin Wolf <kwolf@redhat.com>
> CC: Max Reitz <mreitz@redhat.com>
> CC: Stefan Hajnoczi <stefanha@redhat.com>
> CC: Fam Zheng <famz@redhat.com>
> CC: John Snow <jsnow@redhat.com>
> ---
>  block.c                   |  3 +++
>  block/io.c                | 18 ++++++++++++++++++
>  include/block/block_int.h |  5 +++++
>  3 files changed, 26 insertions(+)
> 
> diff --git a/block.c b/block.c
> index f4648e9..366fad6 100644
> --- a/block.c
> +++ b/block.c
> @@ -234,6 +234,8 @@ BlockDriverState *bdrv_new(void)
>      bs->refcnt = 1;
>      bs->aio_context = qemu_get_aio_context();
>  
> +    qemu_co_queue_init(&bs->flush_queue);
> +
>      QTAILQ_INSERT_TAIL(&all_bdrv_states, bs, bs_list);
>  
>      return bs;
> @@ -2582,6 +2584,7 @@ int bdrv_truncate(BlockDriverState *bs, int64_t offset)
>          ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
>          bdrv_dirty_bitmap_truncate(bs);
>          bdrv_parent_cb_resize(bs);
> +        ++bs->write_gen;
>      }
>      return ret;
>  }
> diff --git a/block/io.c b/block/io.c
> index 7cf3645..a5451b6 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -1294,6 +1294,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
>      }
>      bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
>  
> +    ++bs->write_gen;
>      bdrv_set_dirty(bs, start_sector, end_sector - start_sector);
>  
>      if (bs->wr_highest_offset < offset + bytes) {
> @@ -2211,6 +2212,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
>  {
>      int ret;
>      BdrvTrackedRequest req;
> +    int current_gen = bs->write_gen;
>  
>      if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
>          bdrv_is_sg(bs)) {
> @@ -2219,6 +2221,12 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
>  
>      tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
>  
> +    /* Wait until any previous flushes are completed */
> +    while (bs->flush_started_gen != bs->flushed_gen) {
> +        qemu_co_queue_wait(&bs->flush_queue);
> +    }
> +    bs->flush_started_gen = current_gen;
> +
>      /* Write back all layers by calling one driver function */
>      if (bs->drv->bdrv_co_flush) {
>          ret = bs->drv->bdrv_co_flush(bs);
> @@ -2239,6 +2247,11 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
>          goto flush_parent;
>      }
>  
> +    /* Check if we really need to flush anything */
> +    if (bs->flushed_gen == current_gen) {
> +        goto flush_parent;
> +    }
> +
>      BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
>      if (bs->drv->bdrv_co_flush_to_disk) {
>          ret = bs->drv->bdrv_co_flush_to_disk(bs);
> @@ -2279,6 +2292,10 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
>  flush_parent:
>      ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
>  out:
> +    /* Notify any pending flushes that we have completed */
> +    bs->flushed_gen = current_gen;
> +    qemu_co_queue_restart_all(&bs->flush_queue);
> +
>      tracked_request_end(&req);
>      return ret;
>  }
> @@ -2402,6 +2419,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
>      }
>      ret = 0;
>  out:
> +    ++bs->write_gen;
>      bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS,
>                     req.bytes >> BDRV_SECTOR_BITS);
>      tracked_request_end(&req);
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index 2057156..8543daf 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -420,6 +420,11 @@ struct BlockDriverState {
>                           note this is a reference count */
>      bool probed;
>  
> +    CoQueue flush_queue;            /* Serializing flush queue */
> +    unsigned int write_gen;         /* Current data generation */
> +    unsigned int flush_started_gen; /* Generation for which flush has started */
> +    unsigned int flushed_gen;       /* Flushed write generation */
> +
>      BlockDriver *drv; /* NULL means no media */
>      void *opaque;
>  
> 

Breaks qcow2 iotests 026 089 141 144

--js
Evgeny Yakovlev July 11, 2016, 10:12 a.m. UTC | #4
On 08.07.2016 21:44, John Snow wrote:
>
> On 07/04/2016 10:38 AM, Denis V. Lunev wrote:
>> From: Evgeny Yakovlev <eyakovlev@virtuozzo.com>
>>
>> Some guests (win2008 server for example) do a lot of unnecessary
>> flushing when underlying media has not changed. This adds additional
>> overhead on host when calling fsync/fdatasync.
>>
>> This change introduces a write generation scheme in BlockDriverState.
>> Current write generation is checked against last flushed generation to
>> avoid unnessesary flushes.
>>
>> The problem with excessive flushing was found by a performance test
>> which does parallel directory tree creation (from 2 processes).
>> Results improved from 0.424 loops/sec to 0.432 loops/sec.
>> Each loop creates 10^3 directories with 10 files in each.
>>
>> Signed-off-by: Evgeny Yakovlev <eyakovlev@virtuozzo.com>
>> Signed-off-by: Denis V. Lunev <den@openvz.org>
>> CC: Kevin Wolf <kwolf@redhat.com>
>> CC: Max Reitz <mreitz@redhat.com>
>> CC: Stefan Hajnoczi <stefanha@redhat.com>
>> CC: Fam Zheng <famz@redhat.com>
>> CC: John Snow <jsnow@redhat.com>
>> ---
>>   block.c                   |  3 +++
>>   block/io.c                | 18 ++++++++++++++++++
>>   include/block/block_int.h |  5 +++++
>>   3 files changed, 26 insertions(+)
>>
>> diff --git a/block.c b/block.c
>> index f4648e9..366fad6 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -234,6 +234,8 @@ BlockDriverState *bdrv_new(void)
>>       bs->refcnt = 1;
>>       bs->aio_context = qemu_get_aio_context();
>>   
>> +    qemu_co_queue_init(&bs->flush_queue);
>> +
>>       QTAILQ_INSERT_TAIL(&all_bdrv_states, bs, bs_list);
>>   
>>       return bs;
>> @@ -2582,6 +2584,7 @@ int bdrv_truncate(BlockDriverState *bs, int64_t offset)
>>           ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
>>           bdrv_dirty_bitmap_truncate(bs);
>>           bdrv_parent_cb_resize(bs);
>> +        ++bs->write_gen;
>>       }
>>       return ret;
>>   }
>> diff --git a/block/io.c b/block/io.c
>> index 7cf3645..a5451b6 100644
>> --- a/block/io.c
>> +++ b/block/io.c
>> @@ -1294,6 +1294,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
>>       }
>>       bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
>>   
>> +    ++bs->write_gen;
>>       bdrv_set_dirty(bs, start_sector, end_sector - start_sector);
>>   
>>       if (bs->wr_highest_offset < offset + bytes) {
>> @@ -2211,6 +2212,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
>>   {
>>       int ret;
>>       BdrvTrackedRequest req;
>> +    int current_gen = bs->write_gen;
>>   
>>       if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
>>           bdrv_is_sg(bs)) {
>> @@ -2219,6 +2221,12 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
>>   
>>       tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
>>   
>> +    /* Wait until any previous flushes are completed */
>> +    while (bs->flush_started_gen != bs->flushed_gen) {
>> +        qemu_co_queue_wait(&bs->flush_queue);
>> +    }
>> +    bs->flush_started_gen = current_gen;
>> +
>>       /* Write back all layers by calling one driver function */
>>       if (bs->drv->bdrv_co_flush) {
>>           ret = bs->drv->bdrv_co_flush(bs);
>> @@ -2239,6 +2247,11 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
>>           goto flush_parent;
>>       }
>>   
>> +    /* Check if we really need to flush anything */
>> +    if (bs->flushed_gen == current_gen) {
>> +        goto flush_parent;
>> +    }
>> +
>>       BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
>>       if (bs->drv->bdrv_co_flush_to_disk) {
>>           ret = bs->drv->bdrv_co_flush_to_disk(bs);
>> @@ -2279,6 +2292,10 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
>>   flush_parent:
>>       ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
>>   out:
>> +    /* Notify any pending flushes that we have completed */
>> +    bs->flushed_gen = current_gen;
>> +    qemu_co_queue_restart_all(&bs->flush_queue);
>> +
>>       tracked_request_end(&req);
>>       return ret;
>>   }
>> @@ -2402,6 +2419,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
>>       }
>>       ret = 0;
>>   out:
>> +    ++bs->write_gen;
>>       bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS,
>>                      req.bytes >> BDRV_SECTOR_BITS);
>>       tracked_request_end(&req);
>> diff --git a/include/block/block_int.h b/include/block/block_int.h
>> index 2057156..8543daf 100644
>> --- a/include/block/block_int.h
>> +++ b/include/block/block_int.h
>> @@ -420,6 +420,11 @@ struct BlockDriverState {
>>                            note this is a reference count */
>>       bool probed;
>>   
>> +    CoQueue flush_queue;            /* Serializing flush queue */
>> +    unsigned int write_gen;         /* Current data generation */
>> +    unsigned int flush_started_gen; /* Generation for which flush has started */
>> +    unsigned int flushed_gen;       /* Flushed write generation */
>> +
>>       BlockDriver *drv; /* NULL means no media */
>>       void *opaque;
>>   
>>
> Breaks qcow2 iotests 026 089 141 144

Sorry, I didn't know those tests existed; I only ran make check previously.
Looking at 026, it looks like the same problem as in IDE and AHCI.
The test case injects blkdebug write errors which should be triggered by
flushes and expects to see them in the output. However, those flushes are now
skipped and no events are generated. Otherwise the resulting image looks
consistent; all data was flushed. I expect the same problem with the
other tests, but maybe the test cases are incorrect now?

>
> --js
John Snow July 11, 2016, 9:01 p.m. UTC | #5
On 07/11/2016 06:12 AM, Evgeny Yakovlev wrote:
> 
> 
> On 08.07.2016 21:44, John Snow wrote:
>>
>> On 07/04/2016 10:38 AM, Denis V. Lunev wrote:
>>> From: Evgeny Yakovlev <eyakovlev@virtuozzo.com>
>>>
>>> Some guests (win2008 server for example) do a lot of unnecessary
>>> flushing when underlying media has not changed. This adds additional
>>> overhead on host when calling fsync/fdatasync.
>>>
>>> This change introduces a write generation scheme in BlockDriverState.
>>> Current write generation is checked against last flushed generation to
>>> avoid unnessesary flushes.
>>>
>>> The problem with excessive flushing was found by a performance test
>>> which does parallel directory tree creation (from 2 processes).
>>> Results improved from 0.424 loops/sec to 0.432 loops/sec.
>>> Each loop creates 10^3 directories with 10 files in each.
>>>
>>> Signed-off-by: Evgeny Yakovlev <eyakovlev@virtuozzo.com>
>>> Signed-off-by: Denis V. Lunev <den@openvz.org>
>>> CC: Kevin Wolf <kwolf@redhat.com>
>>> CC: Max Reitz <mreitz@redhat.com>
>>> CC: Stefan Hajnoczi <stefanha@redhat.com>
>>> CC: Fam Zheng <famz@redhat.com>
>>> CC: John Snow <jsnow@redhat.com>
>>> ---
>>>   block.c                   |  3 +++
>>>   block/io.c                | 18 ++++++++++++++++++
>>>   include/block/block_int.h |  5 +++++
>>>   3 files changed, 26 insertions(+)
>>>
>>> diff --git a/block.c b/block.c
>>> index f4648e9..366fad6 100644
>>> --- a/block.c
>>> +++ b/block.c
>>> @@ -234,6 +234,8 @@ BlockDriverState *bdrv_new(void)
>>>       bs->refcnt = 1;
>>>       bs->aio_context = qemu_get_aio_context();
>>>   +    qemu_co_queue_init(&bs->flush_queue);
>>> +
>>>       QTAILQ_INSERT_TAIL(&all_bdrv_states, bs, bs_list);
>>>         return bs;
>>> @@ -2582,6 +2584,7 @@ int bdrv_truncate(BlockDriverState *bs, int64_t
>>> offset)
>>>           ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
>>>           bdrv_dirty_bitmap_truncate(bs);
>>>           bdrv_parent_cb_resize(bs);
>>> +        ++bs->write_gen;
>>>       }
>>>       return ret;
>>>   }
>>> diff --git a/block/io.c b/block/io.c
>>> index 7cf3645..a5451b6 100644
>>> --- a/block/io.c
>>> +++ b/block/io.c
>>> @@ -1294,6 +1294,7 @@ static int coroutine_fn
>>> bdrv_aligned_pwritev(BlockDriverState *bs,
>>>       }
>>>       bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
>>>   +    ++bs->write_gen;
>>>       bdrv_set_dirty(bs, start_sector, end_sector - start_sector);
>>>         if (bs->wr_highest_offset < offset + bytes) {
>>> @@ -2211,6 +2212,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState
>>> *bs)
>>>   {
>>>       int ret;
>>>       BdrvTrackedRequest req;
>>> +    int current_gen = bs->write_gen;
>>>         if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
>>>           bdrv_is_sg(bs)) {
>>> @@ -2219,6 +2221,12 @@ int coroutine_fn
>>> bdrv_co_flush(BlockDriverState *bs)
>>>         tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
>>>   +    /* Wait until any previous flushes are completed */
>>> +    while (bs->flush_started_gen != bs->flushed_gen) {
>>> +        qemu_co_queue_wait(&bs->flush_queue);
>>> +    }
>>> +    bs->flush_started_gen = current_gen;
>>> +
>>>       /* Write back all layers by calling one driver function */
>>>       if (bs->drv->bdrv_co_flush) {
>>>           ret = bs->drv->bdrv_co_flush(bs);
>>> @@ -2239,6 +2247,11 @@ int coroutine_fn
>>> bdrv_co_flush(BlockDriverState *bs)
>>>           goto flush_parent;
>>>       }
>>>   +    /* Check if we really need to flush anything */
>>> +    if (bs->flushed_gen == current_gen) {
>>> +        goto flush_parent;
>>> +    }
>>> +
>>>       BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
>>>       if (bs->drv->bdrv_co_flush_to_disk) {
>>>           ret = bs->drv->bdrv_co_flush_to_disk(bs);
>>> @@ -2279,6 +2292,10 @@ int coroutine_fn
>>> bdrv_co_flush(BlockDriverState *bs)
>>>   flush_parent:
>>>       ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
>>>   out:
>>> +    /* Notify any pending flushes that we have completed */
>>> +    bs->flushed_gen = current_gen;
>>> +    qemu_co_queue_restart_all(&bs->flush_queue);
>>> +
>>>       tracked_request_end(&req);
>>>       return ret;
>>>   }
>>> @@ -2402,6 +2419,7 @@ int coroutine_fn
>>> bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
>>>       }
>>>       ret = 0;
>>>   out:
>>> +    ++bs->write_gen;
>>>       bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS,
>>>                      req.bytes >> BDRV_SECTOR_BITS);
>>>       tracked_request_end(&req);
>>> diff --git a/include/block/block_int.h b/include/block/block_int.h
>>> index 2057156..8543daf 100644
>>> --- a/include/block/block_int.h
>>> +++ b/include/block/block_int.h
>>> @@ -420,6 +420,11 @@ struct BlockDriverState {
>>>                            note this is a reference count */
>>>       bool probed;
>>>   +    CoQueue flush_queue;            /* Serializing flush queue */
>>> +    unsigned int write_gen;         /* Current data generation */
>>> +    unsigned int flush_started_gen; /* Generation for which flush
>>> has started */
>>> +    unsigned int flushed_gen;       /* Flushed write generation */
>>> +
>>>       BlockDriver *drv; /* NULL means no media */
>>>       void *opaque;
>>>  
>> Breaks qcow2 iotests 026 089 141 144
> 
> Sorry, didn't knew those tests existed, only ran make check previously.
> Looking at 026, looks like it is the same problem as in IDE and AHCI.
> Test case injects blkdebug write errors which should be triggered by
> flushes and expects to see them in output. However those flushes are now
> skipped and no events are generated. Otherwise resulting image looks
> consistent, all data was flushed. Expect the same problem to be with
> other tests, but maybe test case is incorrect now?
> 

No problem -- these tests don't *always* get run before merge but I like
to enforce it for my tree.

Yes, it just looks like most of the test output has to be updated.

In the case of 026, it looks like most test cases that are testing for
failures have other error text to confirm that the error did indeed
happen, and that the flush messages are just "extraneous" failure
messages -- i.e. not the primary effect being tested for, so it should
be safe to just update the reference output.

I suspect the other tests are similar.

Thank you,
--John
diff mbox

Patch

diff --git a/block.c b/block.c
index f4648e9..366fad6 100644
--- a/block.c
+++ b/block.c
@@ -234,6 +234,8 @@  BlockDriverState *bdrv_new(void)
     bs->refcnt = 1;
     bs->aio_context = qemu_get_aio_context();
 
+    qemu_co_queue_init(&bs->flush_queue);
+
     QTAILQ_INSERT_TAIL(&all_bdrv_states, bs, bs_list);
 
     return bs;
@@ -2582,6 +2584,7 @@  int bdrv_truncate(BlockDriverState *bs, int64_t offset)
         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
         bdrv_dirty_bitmap_truncate(bs);
         bdrv_parent_cb_resize(bs);
+        ++bs->write_gen;
     }
     return ret;
 }
diff --git a/block/io.c b/block/io.c
index 7cf3645..a5451b6 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1294,6 +1294,7 @@  static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
     }
     bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
 
+    ++bs->write_gen;
     bdrv_set_dirty(bs, start_sector, end_sector - start_sector);
 
     if (bs->wr_highest_offset < offset + bytes) {
@@ -2211,6 +2212,7 @@  int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 {
     int ret;
     BdrvTrackedRequest req;
+    int current_gen = bs->write_gen;
 
     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
         bdrv_is_sg(bs)) {
@@ -2219,6 +2221,12 @@  int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 
     tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
 
+    /* Wait until any previous flushes are completed */
+    while (bs->flush_started_gen != bs->flushed_gen) {
+        qemu_co_queue_wait(&bs->flush_queue);
+    }
+    bs->flush_started_gen = current_gen;
+
     /* Write back all layers by calling one driver function */
     if (bs->drv->bdrv_co_flush) {
         ret = bs->drv->bdrv_co_flush(bs);
@@ -2239,6 +2247,11 @@  int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
         goto flush_parent;
     }
 
+    /* Check if we really need to flush anything */
+    if (bs->flushed_gen == current_gen) {
+        goto flush_parent;
+    }
+
     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
     if (bs->drv->bdrv_co_flush_to_disk) {
         ret = bs->drv->bdrv_co_flush_to_disk(bs);
@@ -2279,6 +2292,10 @@  int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 flush_parent:
     ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
 out:
+    /* Notify any pending flushes that we have completed */
+    bs->flushed_gen = current_gen;
+    qemu_co_queue_restart_all(&bs->flush_queue);
+
     tracked_request_end(&req);
     return ret;
 }
@@ -2402,6 +2419,7 @@  int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
     }
     ret = 0;
 out:
+    ++bs->write_gen;
     bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS,
                    req.bytes >> BDRV_SECTOR_BITS);
     tracked_request_end(&req);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 2057156..8543daf 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -420,6 +420,11 @@  struct BlockDriverState {
                          note this is a reference count */
     bool probed;
 
+    CoQueue flush_queue;            /* Serializing flush queue */
+    unsigned int write_gen;         /* Current data generation */
+    unsigned int flush_started_gen; /* Generation for which flush has started */
+    unsigned int flushed_gen;       /* Flushed write generation */
+
     BlockDriver *drv; /* NULL means no media */
     void *opaque;