
[V2,1/1] linux-aio: prevent submitting more than MAX_EVENTS

Message ID 1468396629-26094-1-git-send-email-roman.penyaev@profitbricks.com (mailing list archive)
State New, archived

Commit Message

Roman Pen July 13, 2016, 7:57 a.m. UTC
v1..v2:

  o comment tweaks.
  o fix QEMU coding style.

When invoking io_setup(MAX_EVENTS) we ask the kernel to create a ring buffer
with the specified number of events.  But the kernel's ring buffer allocation
logic is a bit tricky (the ring buffer is page-size aligned and some per-CPU
allocations are required), so eventually more events than requested are
allocated.

On the userspace side we have to follow that convention: either do not
io_submit() more than MAX_EVENTS requests, or change the logic that consumes
completed events accordingly.  The pitfall is in the following sequence:

    MAX_EVENTS = 128
    io_setup(MAX_EVENTS)

    io_submit(MAX_EVENTS)
    io_submit(MAX_EVENTS)

    /* now 256 events are in-flight */

    io_getevents(MAX_EVENTS) = 128

    /* only 128 events can be handled at once; to be sure that
     * nothing is left pending, io_getevents(MAX_EVENTS) must be
     * invoked once more, otherwise a hang will happen. */

To prevent the hang, or the need to call io_getevents() repeatedly, this
patch restricts the number of in-flight requests to MAX_EVENTS.
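
For illustration, here is a minimal sketch of the same pitfall in plain
libaio terms (MAX_EVENTS, the helper name and the loop are illustrative,
this is not QEMU code): once more than MAX_EVENTS requests are in flight,
a single io_getevents() call cannot drain the ring, so the consumer must
either loop or cap the number of in-flight requests as this patch does.

-------------------------------------------------------------------
#include <libaio.h>

#define MAX_EVENTS 128

static void drain_completions(io_context_t ctx)
{
    struct io_event events[MAX_EVENTS];
    struct timespec zero = { 0, 0 };
    int n;

    do {
        /* min_nr = 0 plus a zero timeout makes the call non-blocking */
        n = io_getevents(ctx, 0, MAX_EVENTS, events, &zero);
        /* ... hand each of the n events to its completion handler ... */
    } while (n == MAX_EVENTS); /* a full batch may mean more are pending */
}
-------------------------------------------------------------------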

Signed-off-by: Roman Pen <roman.penyaev@profitbricks.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: qemu-devel@nongnu.org
---
 block/linux-aio.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

Comments

Paolo Bonzini July 13, 2016, 10:31 a.m. UTC | #1
On 13/07/2016 09:57, Roman Pen wrote:
> v1..v2:
> 
>   o comment tweaks.
>   o fix QEMU coding style.
> 
> Invoking io_setup(MAX_EVENTS) we ask kernel to create ring buffer for us
> with specified number of events.  But kernel ring buffer allocation logic
> is a bit tricky (ring buffer is page size aligned + some percpu allocation
> are required) so eventually more than requested events number is allocated.
> 
> From a userspace side we have to follow the convention and should not try
> to io_submit() more or logic, which consumes completed events, should be
> changed accordingly.  The pitfall is in the following sequence:
> 
>     MAX_EVENTS = 128
>     io_setup(MAX_EVENTS)
> 
>     io_submit(MAX_EVENTS)
>     io_submit(MAX_EVENTS)
> 
>     /* now 256 events are in-flight */
> 
>     io_getevents(MAX_EVENTS) = 128
> 
>     /* we can handle only 128 events at once, to be sure
>      * that nothing is pended the io_getevents(MAX_EVENTS)
>      * call must be invoked once more or hang will happen. */
> 
> To prevent the hang or reiteration of io_getevents() call this patch
> restricts the number of in-flights, which is now limited to MAX_EVENTS.
> 
> Signed-off-by: Roman Pen <roman.penyaev@profitbricks.com>
> Reviewed-by: Fam Zheng <famz@redhat.com>
> Cc: Stefan Hajnoczi <stefanha@redhat.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: qemu-devel@nongnu.org
> ---
>  block/linux-aio.c | 26 ++++++++++++++++----------
>  1 file changed, 16 insertions(+), 10 deletions(-)
> 
> diff --git a/block/linux-aio.c b/block/linux-aio.c
> index e468960..78f4524 100644
> --- a/block/linux-aio.c
> +++ b/block/linux-aio.c
> @@ -28,8 +28,6 @@
>   */
>  #define MAX_EVENTS 128
>  
> -#define MAX_QUEUED_IO  128
> -
>  struct qemu_laiocb {
>      BlockAIOCB common;
>      Coroutine *co;
> @@ -44,7 +42,8 @@ struct qemu_laiocb {
>  
>  typedef struct {
>      int plugged;
> -    unsigned int n;
> +    unsigned int in_queue;
> +    unsigned int in_flight;
>      bool blocked;
>      QSIMPLEQ_HEAD(, qemu_laiocb) pending;
>  } LaioQueue;
> @@ -129,6 +128,7 @@ static void qemu_laio_completion_bh(void *opaque)
>              s->event_max = 0;
>              return; /* no more events */
>          }
> +        s->io_q.in_flight -= s->event_max;
>      }
>  
>      /* Reschedule so nested event loops see currently pending completions */
> @@ -190,7 +190,8 @@ static void ioq_init(LaioQueue *io_q)
>  {
>      QSIMPLEQ_INIT(&io_q->pending);
>      io_q->plugged = 0;
> -    io_q->n = 0;
> +    io_q->in_queue = 0;
> +    io_q->in_flight = 0;
>      io_q->blocked = false;
>  }
>  
> @@ -198,14 +199,17 @@ static void ioq_submit(LinuxAioState *s)
>  {
>      int ret, len;
>      struct qemu_laiocb *aiocb;
> -    struct iocb *iocbs[MAX_QUEUED_IO];
> +    struct iocb *iocbs[MAX_EVENTS];
>      QSIMPLEQ_HEAD(, qemu_laiocb) completed;
>  
>      do {
> +        if (s->io_q.in_flight >= MAX_EVENTS) {
> +            break;
> +        }
>          len = 0;
>          QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
>              iocbs[len++] = &aiocb->iocb;
> -            if (len == MAX_QUEUED_IO) {
> +            if (s->io_q.in_flight + len >= MAX_EVENTS) {
>                  break;
>              }
>          }
> @@ -218,11 +222,12 @@ static void ioq_submit(LinuxAioState *s)
>              abort();
>          }
>  
> -        s->io_q.n -= ret;
> +        s->io_q.in_flight += ret;
> +        s->io_q.in_queue  -= ret;
>          aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
>          QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
>      } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
> -    s->io_q.blocked = (s->io_q.n > 0);
> +    s->io_q.blocked = (s->io_q.in_queue > 0);
>  }
>  
>  void laio_io_plug(BlockDriverState *bs, LinuxAioState *s)
> @@ -263,9 +268,10 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
>      io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));
>  
>      QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
> -    s->io_q.n++;
> +    s->io_q.in_queue++;
>      if (!s->io_q.blocked &&
> -        (!s->io_q.plugged || s->io_q.n >= MAX_QUEUED_IO)) {
> +        (!s->io_q.plugged ||
> +         s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS)) {
>          ioq_submit(s);
>      }
>  
> 

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Roman Pen July 13, 2016, 11:33 a.m. UTC | #2
On Wed, Jul 13, 2016 at 12:31 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 13/07/2016 09:57, Roman Pen wrote:
>> v1..v2:
>>
>>   o comment tweaks.
>>   o fix QEMU coding style.
>>
>> Invoking io_setup(MAX_EVENTS) we ask kernel to create ring buffer for us
>> with specified number of events.  But kernel ring buffer allocation logic
>> is a bit tricky (ring buffer is page size aligned + some percpu allocation
>> are required) so eventually more than requested events number is allocated.
>>
>> From a userspace side we have to follow the convention and should not try
>> to io_submit() more or logic, which consumes completed events, should be
>> changed accordingly.  The pitfall is in the following sequence:
>>
>>     MAX_EVENTS = 128
>>     io_setup(MAX_EVENTS)
>>
>>     io_submit(MAX_EVENTS)
>>     io_submit(MAX_EVENTS)
>>
>>     /* now 256 events are in-flight */
>>
>>     io_getevents(MAX_EVENTS) = 128
>>
>>     /* we can handle only 128 events at once, to be sure
>>      * that nothing is pended the io_getevents(MAX_EVENTS)
>>      * call must be invoked once more or hang will happen. */
>>
>> To prevent the hang or reiteration of io_getevents() call this patch
>> restricts the number of in-flights, which is now limited to MAX_EVENTS.
>>
>> Signed-off-by: Roman Pen <roman.penyaev@profitbricks.com>
>> Reviewed-by: Fam Zheng <famz@redhat.com>
>> Cc: Stefan Hajnoczi <stefanha@redhat.com>
>> Cc: Paolo Bonzini <pbonzini@redhat.com>
>> Cc: qemu-devel@nongnu.org
>> ---
>>  block/linux-aio.c | 26 ++++++++++++++++----------
>>  1 file changed, 16 insertions(+), 10 deletions(-)
>>
>> diff --git a/block/linux-aio.c b/block/linux-aio.c
>> index e468960..78f4524 100644
>> --- a/block/linux-aio.c
>> +++ b/block/linux-aio.c
>> @@ -28,8 +28,6 @@
>>   */
>>  #define MAX_EVENTS 128
>>
>> -#define MAX_QUEUED_IO  128
>> -
>>  struct qemu_laiocb {
>>      BlockAIOCB common;
>>      Coroutine *co;
>> @@ -44,7 +42,8 @@ struct qemu_laiocb {
>>
>>  typedef struct {
>>      int plugged;
>> -    unsigned int n;
>> +    unsigned int in_queue;
>> +    unsigned int in_flight;
>>      bool blocked;
>>      QSIMPLEQ_HEAD(, qemu_laiocb) pending;
>>  } LaioQueue;
>> @@ -129,6 +128,7 @@ static void qemu_laio_completion_bh(void *opaque)
>>              s->event_max = 0;
>>              return; /* no more events */
>>          }
>> +        s->io_q.in_flight -= s->event_max;
>>      }
>>
>>      /* Reschedule so nested event loops see currently pending completions */
>> @@ -190,7 +190,8 @@ static void ioq_init(LaioQueue *io_q)
>>  {
>>      QSIMPLEQ_INIT(&io_q->pending);
>>      io_q->plugged = 0;
>> -    io_q->n = 0;
>> +    io_q->in_queue = 0;
>> +    io_q->in_flight = 0;
>>      io_q->blocked = false;
>>  }
>>
>> @@ -198,14 +199,17 @@ static void ioq_submit(LinuxAioState *s)
>>  {
>>      int ret, len;
>>      struct qemu_laiocb *aiocb;
>> -    struct iocb *iocbs[MAX_QUEUED_IO];
>> +    struct iocb *iocbs[MAX_EVENTS];
>>      QSIMPLEQ_HEAD(, qemu_laiocb) completed;
>>
>>      do {
>> +        if (s->io_q.in_flight >= MAX_EVENTS) {
>> +            break;
>> +        }
>>          len = 0;
>>          QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
>>              iocbs[len++] = &aiocb->iocb;
>> -            if (len == MAX_QUEUED_IO) {
>> +            if (s->io_q.in_flight + len >= MAX_EVENTS) {
>>                  break;
>>              }
>>          }
>> @@ -218,11 +222,12 @@ static void ioq_submit(LinuxAioState *s)
>>              abort();
>>          }
>>
>> -        s->io_q.n -= ret;
>> +        s->io_q.in_flight += ret;
>> +        s->io_q.in_queue  -= ret;
>>          aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
>>          QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
>>      } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
>> -    s->io_q.blocked = (s->io_q.n > 0);
>> +    s->io_q.blocked = (s->io_q.in_queue > 0);
>>  }
>>
>>  void laio_io_plug(BlockDriverState *bs, LinuxAioState *s)
>> @@ -263,9 +268,10 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
>>      io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));
>>
>>      QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
>> -    s->io_q.n++;
>> +    s->io_q.in_queue++;
>>      if (!s->io_q.blocked &&
>> -        (!s->io_q.plugged || s->io_q.n >= MAX_QUEUED_IO)) {
>> +        (!s->io_q.plugged ||
>> +         s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS)) {
>>          ioq_submit(s);
>>      }
>>
>>
>
> Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>

Just to be sure that we are on the same page:

1. We have this commit "linux-aio: Cancel BH if not needed" which

   a) introduces a performance regression on my fio workloads with the
      following config: "iothread=1, VCPU=8, MQ=8". Performance
      dropped from 1878MB/s to 1606MB/s with Stefan's fix, i.e.
      ~14%.

   b) reproduces an IO hang, because in-flight requests exceed MAX_EVENTS.

 So this commit should probably be reverted because of a), not b).

2. Stefan has a fix for issue 1.b) that repeats io_getevents(), which
   is clearly overkill for the generic case where MQ=1 (the queue depth
   for virtio_blk is also set to 128, i.e. equal to MAX_EVENTS on the
   QEMU side).

3. The current patch also aims to fix issue 1.b) by restricting the
   number of in-flight requests.


Reverting 1. would fix all the problems without any need to apply
2. or 3.  That is the laziest variant.

Restricting in-flight requests is also a step forward, since submitting
until -EAGAIN is possible too, but (as we already know) it leads to
IO hangs under specific loads and conditions.

But 2. and 3. are mutually exclusive and should not be applied
together.

So we have several alternatives and a choice of which to follow.

--
Roman
Kevin Wolf July 13, 2016, 11:45 a.m. UTC | #3
Am 13.07.2016 um 13:33 hat Roman Penyaev geschrieben:
> Just to be sure that we are on the same page:
> 
> 1. We have this commit "linux-aio: Cancel BH if not needed" which
> 
>    a) introduces performance regression on my fio workloads on the
>       following config: "iothread=1, VCPU=8, MQ=8". Performance
>       dropped from 1878MB/s to 1606MB/s with Stefan's fix, that is
>       ~14%.

Do we already understand why the performance regresses with the patch?
As long as we don't, everything we do is just guesswork.

Kevin

>    b) reproduces IO hang, because of in-flights > MAX_EVENTS.
> 
>  So probably this commit should be reverted because of a) not b).
> 
> 2. Stefan has fix for 1.b) issue repeating io_getevents(), which
>    is obviously an excess for generic cases where MQ=1 (queue depth
>    for virtio_blk is also set to 128 i.e. equal to MAX_EVENTS on
>    QEMU side).
> 
> 3. The current patch also aims to fix 1.b) issue restricting number
>    of in-flights.
> 
> 
> Reverting 1. will fix all the problems, without any need to apply
> 2. or 3.  The most lazy variant.
> 
> Restricting in-flights is also a step forward, since submitting
> till -EAGAIN is also possible, but leads (as we already know) to
> IO hang on specific loads and conditions.
> 
> But 2. and 3. are mutual exclusive and should not be applied
> together.
> 
> So we have several alternatives and a choice what to follow.
> 
> --
> Roman
Eric Blake July 13, 2016, 12:22 p.m. UTC | #4
On 07/13/2016 01:57 AM, Roman Pen wrote:
> v1..v2:
> 
>   o comment tweaks.
>   o fix QEMU coding style.

The above comments should be delayed...

> 
> Invoking io_setup(MAX_EVENTS) we ask kernel to create ring buffer for us
> with specified number of events.  But kernel ring buffer allocation logic
> is a bit tricky (ring buffer is page size aligned + some percpu allocation
> are required) so eventually more than requested events number is allocated.
> 

...

> Signed-off-by: Roman Pen <roman.penyaev@profitbricks.com>
> Reviewed-by: Fam Zheng <famz@redhat.com>
> Cc: Stefan Hajnoczi <stefanha@redhat.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: qemu-devel@nongnu.org
> ---

...until here, after the --- separator.  They are useful to reviewers,
but won't make much sense a year from now in qemu.git (when we don't
care what other versions were on list, only the version that got committed).

Also, if you use 'git send-email -v2' (or 'git format-patch -v2'), your
subject line will resemble most other versioned patches (which use
[PATCH v2] rather than [PATCH V2]).  We also recommend that v2 patches
be sent as top-level threads, rather than in-reply to v1.

More submission hints at http://wiki.qemu.org/Contribute/SubmitAPatch
Roman Pen July 13, 2016, 12:57 p.m. UTC | #5
On Wed, Jul 13, 2016 at 2:22 PM, Eric Blake <eblake@redhat.com> wrote:
> On 07/13/2016 01:57 AM, Roman Pen wrote:
>> v1..v2:
>>
>>   o comment tweaks.
>>   o fix QEMU coding style.
>
> The above comments should be delayed...
>
>>
>> Invoking io_setup(MAX_EVENTS) we ask kernel to create ring buffer for us
>> with specified number of events.  But kernel ring buffer allocation logic
>> is a bit tricky (ring buffer is page size aligned + some percpu allocation
>> are required) so eventually more than requested events number is allocated.
>>
>
> ...
>
>> Signed-off-by: Roman Pen <roman.penyaev@profitbricks.com>
>> Reviewed-by: Fam Zheng <famz@redhat.com>
>> Cc: Stefan Hajnoczi <stefanha@redhat.com>
>> Cc: Paolo Bonzini <pbonzini@redhat.com>
>> Cc: qemu-devel@nongnu.org
>> ---
>
> ...until here, after the --- separator.  They are useful to reviewers,
> but won't make much sense a year from now in qemu.git (when we don't
> care what other versions were on list, only the version that got committed).
>
> Also, if you use 'git send-email -v2' (or 'git format-patch -v2'), your
> subject line will resemble most other versioned patches (which use
> [PATCH v2] rather than [PATCH V2]).  We also recommend that v2 patches
> be sent as top-level threads, rather than in-reply to v1.
>
> More submission hints at http://wiki.qemu.org/Contribute/SubmitAPatch

Thanks for the tips.  Will resend as a top-level thread shortly.

--
Roman
Roman Pen July 13, 2016, 2:53 p.m. UTC | #6
On Wed, Jul 13, 2016 at 1:45 PM, Kevin Wolf <kwolf@redhat.com> wrote:
> Am 13.07.2016 um 13:33 hat Roman Penyaev geschrieben:
>> Just to be sure that we are on the same page:
>>
>> 1. We have this commit "linux-aio: Cancel BH if not needed" which
>>
>>    a) introduces performance regression on my fio workloads on the
>>       following config: "iothread=1, VCPU=8, MQ=8". Performance
>>       dropped from 1878MB/s to 1606MB/s with Stefan's fix, that is
>>       ~14%.
>
> Do we already understand why the performance regresses with the patch?

This is a very good question from the author of the patch.  Speaking for
myself, I do not understand it yet.

> As long as we don't, everything we do is just guesswork.

Kevin, did you miss the ending "with Stefan's fix"?  Since your patch
reproduces another problem, it is impossible to test it in isolation
without the IO hanging.

I tested four variants and invite you to do the same, since you are
the author of the patch in question:

1. as-is, i.e.
   + Stefan's "virtio-blk: dataplane multiqueue support"
   + yours "linux-aio: Cancel BH if not needed"

   As we already discussed - IO hangs.

2. + Stefan's "linux-aio: keep processing events if MAX_EVENTS reached"

   READ: io=48199MB, aggrb=1606.5MB/s, minb=1606.5MB/s, maxb=1606.5MB/s, mint=30003msec, maxt=30003msec
  WRITE: io=48056MB, aggrb=1601.8MB/s, minb=1601.8MB/s, maxb=1601.8MB/s, mint=30003msec, maxt=30003msec

3. - Stefan's "linux-aio: keep processing events if MAX_EVENTS reached"
   + my "linux-aio: prevent submitting more than MAX_EVENTS"

   READ: io=53294MB, aggrb=1776.3MB/s, minb=1776.3MB/s, maxb=1776.3MB/s, mint=30003msec, maxt=30003msec
  WRITE: io=53177MB, aggrb=1772.4MB/s, minb=1772.4MB/s, maxb=1772.4MB/s, mint=30003msec, maxt=30003msec

4. - my "linux-aio: prevent submitting more than MAX_EVENTS"
   - yours "linux-aio: Cancel BH if not needed"

   READ: io=56362MB, aggrb=1878.4MB/s, minb=1878.4MB/s, maxb=1878.4MB/s, mint=30007msec, maxt=30007msec
  WRITE: io=56255MB, aggrb=1874.8MB/s, minb=1874.8MB/s, maxb=1874.8MB/s, mint=30007msec, maxt=30007msec

The drop from 1878MB/s to 1776MB/s is ~5% and can probably be ignored
(I say probably because it would be nice to have some other numbers
 from you or anybody else).

The drop from 1878MB/s to 1606MB/s is ~14% and looks like a serious
degradation.

Also, to dig deeper and to avoid possible suspicions, I tested Stefan's
"linux-aio: keep processing events if MAX_EVENTS reached" patch in
isolation, without your "linux-aio: Cancel BH if not needed":

   READ: io=109970MB, aggrb=1832.8MB/s, minb=1832.8MB/s, maxb=1832.8MB/s, mint=60003msec, maxt=60003msec
  WRITE: io=109820MB, aggrb=1830.3MB/s, minb=1830.3MB/s, maxb=1830.3MB/s, mint=60003msec, maxt=60003msec

As you can see, there is no significant drop.  That means that only the following pair:

   Stefan's "linux-aio: keep processing events if MAX_EVENTS reached"
   yours "linux-aio: Cancel BH if not needed"

impacts the performance.

As I already said, we have several choices to follow and one of them
(the simplest) is to revert everything.

--
Roman

>
> Kevin
>
>>    b) reproduces IO hang, because of in-flights > MAX_EVENTS.
>>
>>  So probably this commit should be reverted because of a) not b).
>>
>> 2. Stefan has fix for 1.b) issue repeating io_getevents(), which
>>    is obviously an excess for generic cases where MQ=1 (queue depth
>>    for virtio_blk is also set to 128 i.e. equal to MAX_EVENTS on
>>    QEMU side).
>>
>> 3. The current patch also aims to fix 1.b) issue restricting number
>>    of in-flights.
>>
>>
>> Reverting 1. will fix all the problems, without any need to apply
>> 2. or 3.  The most lazy variant.
>>
>> Restricting in-flights is also a step forward, since submitting
>> till -EAGAIN is also possible, but leads (as we already know) to
>> IO hang on specific loads and conditions.
>>
>> But 2. and 3. are mutual exclusive and should not be applied
>> together.
>>
>> So we have several alternatives and a choice what to follow.
>>
>> --
>> Roman
Stefan Hajnoczi July 14, 2016, 12:18 p.m. UTC | #7
On Wed, Jul 13, 2016 at 09:57:09AM +0200, Roman Pen wrote:

Please send each new revision of a patch series as a separate email
thread.  Do not thread revisions with Reply-To:, References:.

See http://qemu-project.org/Contribute/SubmitAPatch for all the
guidelines on submitting patches.

> v1..v2:
> 
>   o comment tweaks.
>   o fix QEMU coding style.

The changelog is useful for reviewers but is no longer useful once the
patch has been merged.  Therefore it goes below the '---' so that
git-am(1) doesn't include it when merging.

> Invoking io_setup(MAX_EVENTS) we ask kernel to create ring buffer for us
> with specified number of events.  But kernel ring buffer allocation logic
> is a bit tricky (ring buffer is page size aligned + some percpu allocation
> are required) so eventually more than requested events number is allocated.
> 
> From a userspace side we have to follow the convention and should not try
> to io_submit() more or logic, which consumes completed events, should be
> changed accordingly.  The pitfall is in the following sequence:
> 
>     MAX_EVENTS = 128
>     io_setup(MAX_EVENTS)
> 
>     io_submit(MAX_EVENTS)
>     io_submit(MAX_EVENTS)
> 
>     /* now 256 events are in-flight */
> 
>     io_getevents(MAX_EVENTS) = 128
> 
>     /* we can handle only 128 events at once, to be sure
>      * that nothing is pended the io_getevents(MAX_EVENTS)
>      * call must be invoked once more or hang will happen. */
> 
> To prevent the hang or reiteration of io_getevents() call this patch
> restricts the number of in-flights, which is now limited to MAX_EVENTS.
> 
> Signed-off-by: Roman Pen <roman.penyaev@profitbricks.com>
> Reviewed-by: Fam Zheng <famz@redhat.com>
> Cc: Stefan Hajnoczi <stefanha@redhat.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: qemu-devel@nongnu.org
> ---
>  block/linux-aio.c | 26 ++++++++++++++++----------
>  1 file changed, 16 insertions(+), 10 deletions(-)
> 
> diff --git a/block/linux-aio.c b/block/linux-aio.c
> index e468960..78f4524 100644
> --- a/block/linux-aio.c
> +++ b/block/linux-aio.c
> @@ -28,8 +28,6 @@
>   */
>  #define MAX_EVENTS 128
>  
> -#define MAX_QUEUED_IO  128
> -
>  struct qemu_laiocb {
>      BlockAIOCB common;
>      Coroutine *co;
> @@ -44,7 +42,8 @@ struct qemu_laiocb {
>  
>  typedef struct {
>      int plugged;
> -    unsigned int n;
> +    unsigned int in_queue;
> +    unsigned int in_flight;
>      bool blocked;
>      QSIMPLEQ_HEAD(, qemu_laiocb) pending;
>  } LaioQueue;
> @@ -129,6 +128,7 @@ static void qemu_laio_completion_bh(void *opaque)
>              s->event_max = 0;
>              return; /* no more events */
>          }
> +        s->io_q.in_flight -= s->event_max;
>      }
>  
>      /* Reschedule so nested event loops see currently pending completions */
> @@ -190,7 +190,8 @@ static void ioq_init(LaioQueue *io_q)
>  {
>      QSIMPLEQ_INIT(&io_q->pending);
>      io_q->plugged = 0;
> -    io_q->n = 0;
> +    io_q->in_queue = 0;
> +    io_q->in_flight = 0;
>      io_q->blocked = false;
>  }
>  
> @@ -198,14 +199,17 @@ static void ioq_submit(LinuxAioState *s)
>  {
>      int ret, len;
>      struct qemu_laiocb *aiocb;
> -    struct iocb *iocbs[MAX_QUEUED_IO];
> +    struct iocb *iocbs[MAX_EVENTS];
>      QSIMPLEQ_HEAD(, qemu_laiocb) completed;
>  
>      do {
> +        if (s->io_q.in_flight >= MAX_EVENTS) {
> +            break;
> +        }
>          len = 0;
>          QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
>              iocbs[len++] = &aiocb->iocb;
> -            if (len == MAX_QUEUED_IO) {
> +            if (s->io_q.in_flight + len >= MAX_EVENTS) {
>                  break;
>              }
>          }
> @@ -218,11 +222,12 @@ static void ioq_submit(LinuxAioState *s)
>              abort();
>          }
>  
> -        s->io_q.n -= ret;
> +        s->io_q.in_flight += ret;
> +        s->io_q.in_queue  -= ret;
>          aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
>          QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
>      } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
> -    s->io_q.blocked = (s->io_q.n > 0);
> +    s->io_q.blocked = (s->io_q.in_queue > 0);
>  }
>  
>  void laio_io_plug(BlockDriverState *bs, LinuxAioState *s)
> @@ -263,9 +268,10 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
>      io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));
>  
>      QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
> -    s->io_q.n++;
> +    s->io_q.in_queue++;
>      if (!s->io_q.blocked &&
> -        (!s->io_q.plugged || s->io_q.n >= MAX_QUEUED_IO)) {
> +        (!s->io_q.plugged ||
> +         s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS)) {
>          ioq_submit(s);
>      }
>  
> -- 
> 2.8.2
> 
>
Roman Pen July 15, 2016, 9:18 a.m. UTC | #8
On Wed, Jul 13, 2016 at 1:45 PM, Kevin Wolf <kwolf@redhat.com> wrote:
> Am 13.07.2016 um 13:33 hat Roman Penyaev geschrieben:
>> Just to be sure that we are on the same page:
>>
>> 1. We have this commit "linux-aio: Cancel BH if not needed" which
>>
>>    a) introduces performance regression on my fio workloads on the
>>       following config: "iothread=1, VCPU=8, MQ=8". Performance
>>       dropped from 1878MB/s to 1606MB/s with Stefan's fix, that is
>>       ~14%.
>
> Do we already understand why the performance regresses with the patch?
> As long as we don't, everything we do is just guesswork.

Eventually the issue became clear.  I test on /dev/nullb0, which completes
all submitted bios almost immediately.  That means that after io_submit()
is called it is worth checking for completed requests instead of letting
them accumulate in flight.

That is the theory.  In practice, the following happens:

-------------------------------------------------------------------
>>> sys_poll
<<< sys_poll
>>> aio_dispatch
        >>> aio_bh_poll
        <<< aio_bh_poll
        >>> node->io_read
                !!! ioq_submit(), submitted=98
        <<< node->io_read
        >>> node->io_read
                !!! ioq_submit(), submitted=49
        <<< node->io_read
        >>> node->io_read
                !!! ioq_submit(), submitted=47
        <<< node->io_read
<<< aio_dispatch
>>> sys_poll
<<< sys_poll
>>> aio_dispatch
        >>> aio_bh_poll
        <<< aio_bh_poll
        >>> node->io_read
                !!! ioq_submit(), submitted=50
        <<< node->io_read
        >>> node->io_read
                !!! ioq_submit(), submitted=43
        <<< node->io_read
        >>> node->io_read
                !!! ioq_submit(), submitted=43
        <<< node->io_read
        >>> node->io_read
                !!! ioq_submit(), submitted=8
        <<< node->io_read
        >>> node->io_read
                ~~~ qemu_laio_completion_bh completed=338
        <<< node->io_read
<<< aio_dispatch
-------------------------------------------------------------------
        * this run gave 1461MB/s *

This is a very common chunk of the log which I see when running the fio
load with the "linux-aio: Cancel BH if not needed" patch applied.

The important thing to notice is the submission of 338 requests (almost
the whole ring buffer of the AIO context) before any completions are
consumed.

A very fast backend device completes submitted requests almost immediately,
but we only get a chance to fetch the completions some time later.

The following is a common part of the log when
"linux-aio: Cancel BH if not needed" is reverted:

-------------------------------------------------------------------
>>> sys_poll
<<< sys_poll
>>> dispatch
        >>> aio_bh_poll
                ~~~ qemu_laio_completion_bh completed=199
        <<< aio_bh_poll
        >>> node->io_read
                !!! ioq_submit(), submitted=47
        <<< node->io_read
        >>> node->io_read
                !!! ioq_submit(), submitted=49
        <<< node->io_read
        >>> node->io_read
                !!! ioq_submit(), submitted=50
        <<< node->io_read
        >>> node->io_read
                !!! ioq_submit(), submitted=43
        <<< node->io_read
        >>> node->io_read
        <<< node->io_read
<<< dispatch
>>> sys_poll
<<< sys_poll
>>> dispatch
        >>> aio_bh_poll
                ~~~ qemu_laio_completion_bh, completed=189
        <<< aio_bh_poll
        >>> node->io_read
                !!! ioq_submit(), submitted=46
        <<< node->io_read
        >>> node->io_read
                !!! ioq_submit(), submitted=46
        <<< node->io_read
        >>> node->io_read
                !!! ioq_submit(), submitted=51
        <<< node->io_read
        >>> node->io_read
                !!! ioq_submit(), submitted=51
        <<< node->io_read
<<< dispatch
-------------------------------------------------------------------
        * this run gave 1805MB/s *

According to this part of the log, completions happen frequently, i.e. we
get a chance to fetch completions more often, so the queue is constantly
"refreshed" by newly incoming requests.

To be more precise, I collected some statistics: each time
qemu_laio_completion_bh() is entered, I account the number of collected
requests in a bucket, e.g.:

   "~~~ qemu_laio_completion_bh completed=199"

       bucket[199] += 1;

   "~~~ qemu_laio_completion_bh, completed=189"

       bucket[189] += 1;

   ....

When fio finishes I have a distribution of the number of completed
requests observed in the ring buffer.
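
A sketch of that instrumentation (hypothetical, not part of the patch;
the bucket array and helper name are illustrative):

-------------------------------------------------------------------
/* the kernel ring actually holds more than MAX_EVENTS entries (see the
 * commit message), so size the histogram generously */
#define MAX_BUCKETS 512

/* bucket[i] counts how many qemu_laio_completion_bh() runs reaped
 * exactly i completed events */
static unsigned long bucket[MAX_BUCKETS];

static void account_completions(unsigned int completed)
{
    if (completed >= MAX_BUCKETS) {
        completed = MAX_BUCKETS - 1;  /* clamp, just in case */
    }
    bucket[completed]++;
}
-------------------------------------------------------------------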

Here is the sheet:

https://docs.google.com/spreadsheets/d/12CIt6EKiJLqNx0OHNqiabR-oFBrqkH0LN3mjzZ5jGeo/edit?usp=sharing

(Frankly, I could not think of anything better than sending a link to a
 Google Docs sheet; sorry if that offends anyone.)

There is a chart which shows the whole picture of distribution:

   o  X axis is a number of requests completed at once.
   o  Y axis is a number of times we observe that number of requests.

To avoid scaling problems I plotted the chart starting from 10 requests,
since low request counts do not have much impact on the result but have
huge occurrence values.

Those 3 red spikes and the blue hill are what we have to focus on.  The
blue hill at the right corner of the chart means that the ring buffer was
almost always observed as full, i.e. qemu_laio_completion_bh() did not get
a chance to reap completions very often, while completed requests sat in
the ring buffer for quite a long time, which degrades the overall
performance.

The results covered by the red line are much better, and that can be
explained by those 3 red spikes, which lie almost in the middle of the
whole distribution, i.e. qemu_laio_completion_bh() is called more often,
completed requests do not stall, and fio gets a chance to submit fresh
new requests.

The theoretical fix would be to schedule the completion BH just after a
successful io_submit(), i.e.:

---------------------------------------------------------------------
@@ -228,6 +228,8 @@ static void ioq_submit(LinuxAioState *s)
         QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
     } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
     s->io_q.blocked = (s->io_q.n > 0);
+
+    qemu_bh_schedule(s->completion_bh);
 }
---------------------------------------------------------------------

This theoretical fix works pretty well and the numbers return to the
expected ~1800MB/s.

So, believe it or not, keeping the BH scheduled (i.e. not cancelling it)
gives better results on very fast backend devices.

The other interesting observation is the following: limiting submission
(which I did in the "linux-aio: prevent submitting more than MAX_EVENTS"
patch) also fixes the issue, because before submitting more than MAX_EVENTS
we have to reap something, which obviously does not let already completed
requests stall in the queue for a long time.

--
Roman
Paolo Bonzini July 15, 2016, 9:58 a.m. UTC | #9
On 15/07/2016 11:18, Roman Penyaev wrote:
> Those 3 red spikes and a blue hill is what we have to focus on.  The
> blue hill at the right corner of the chart means that almost always the
> ring buffer was observed as full, i.e. qemu_laio_completion_bh() got
> a chance to reap completions not very often, meanwhile completed
> requests stand in the ring buffer for quite a long time which degrades
> the overall performance.
> 
> The results covered by the red line are much better and that can be
> explained by those 3 red spikes, which are almost in the middle of the
> whole distribution, i.e. qemu_laio_completion_bh() is called more often,
> completed requests do not stall, giving fio a chance to submit new fresh
> requests.
> 
> The theoretical fix would be to schedule completion BH just after
> successful io_submit, i.e.:

What about removing the qemu_bh_cancel but keeping the rest of the patch?

I'm also interested in a graph with this patch ("linux-aio: prevent
submitting more than MAX_EVENTS") on top of origin/master.

Thanks for the analysis.  Sometimes a picture _is_ worth a thousand
words, even if it's measuring "only" second-order effects (# of
completions is not what causes the slowdown, but # of completions
affects latency which causes the slowdown).

Paolo
Roman Pen July 15, 2016, 10:17 a.m. UTC | #10
On Fri, Jul 15, 2016 at 11:58 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 15/07/2016 11:18, Roman Penyaev wrote:
>> Those 3 red spikes and a blue hill is what we have to focus on.  The
>> blue hill at the right corner of the chart means that almost always the
>> ring buffer was observed as full, i.e. qemu_laio_completion_bh() got
>> a chance to reap completions not very often, meanwhile completed
>> requests stand in the ring buffer for quite a long time which degrades
>> the overall performance.
>>
>> The results covered by the red line are much better and that can be
>> explained by those 3 red spikes, which are almost in the middle of the
>> whole distribution, i.e. qemu_laio_completion_bh() is called more often,
>> completed requests do not stall, giving fio a chance to submit new fresh
>> requests.
>>
>> The theoretical fix would be to schedule completion BH just after
>> successful io_submit, i.e.:
>
> What about removing the qemu_bh_cancel but keeping the rest of the patch?

That is exactly what I did.  The numbers go from ~1600MB/s to the expected
~1800MB/s.  So basically this hunk of the patch in question:

     if (event_notifier_test_and_clear(&s->e)) {
-        qemu_bh_schedule(s->completion_bh);
+        qemu_laio_completion_bh(s);
     }

does not have any impact and can be ignored.  At least I did not notice
anything important.

>
> I'm also interested in a graph with this patch ("linux-aio: prevent
> submitting more than MAX_EVENTS") on top of origin/master.

I can plot that as well, of course.

>
> Thanks for the analysis.  Sometimes a picture _is_ worth a thousand
> words, even if it's measuring "only" second-order effects (# of
> completions is not what causes the slowdown, but # of completions
> affects latency which causes the slowdown).

Yes, you are right, latency.  With a userspace io_getevents at ~zero cost
we can peek at completions as often as we like to decrease latency on very
fast devices.  That can also bring something.  It probably makes sense to
peek at and complete something after each io_submit().
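
Roughly what I have in mind, as a sketch (the helper name and error
handling are illustrative, this is not a concrete patch): right after a
successful io_submit(), peek for completions with a zero timeout so the
submission path never blocks.

-------------------------------------------------------------------
static void submit_and_peek(io_context_t ctx, struct iocb **iocbs, int len)
{
    struct io_event events[MAX_EVENTS];
    struct timespec zero = { 0, 0 };
    int submitted, completed;

    submitted = io_submit(ctx, len, iocbs);
    if (submitted <= 0) {
        return;  /* error handling elided in this sketch */
    }

    /* cheap non-blocking peek: returns immediately even if nothing
     * has completed yet */
    completed = io_getevents(ctx, 0, MAX_EVENTS, events, &zero);
    /* ... process the 'completed' events here ... */
}
-------------------------------------------------------------------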

--
Roman
Paolo Bonzini July 15, 2016, 10:37 a.m. UTC | #11
On 15/07/2016 12:17, Roman Penyaev wrote:
> On Fri, Jul 15, 2016 at 11:58 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>
>>
>> On 15/07/2016 11:18, Roman Penyaev wrote:
>>> Those 3 red spikes and a blue hill is what we have to focus on.  The
>>> blue hill at the right corner of the chart means that almost always the
>>> ring buffer was observed as full, i.e. qemu_laio_completion_bh() got
>>> a chance to reap completions not very often, meanwhile completed
>>> requests stand in the ring buffer for quite a long time which degrades
>>> the overall performance.
>>>
>>> The results covered by the red line are much better and that can be
>>> explained by those 3 red spikes, which are almost in the middle of the
>>> whole distribution, i.e. qemu_laio_completion_bh() is called more often,
>>> completed requests do not stall, giving fio a chance to submit new fresh
>>> requests.
>>>
>>> The theoretical fix would be to schedule completion BH just after
>>> successful io_submit, i.e.:
>>
>> What about removing the qemu_bh_cancel but keeping the rest of the patch?
> 
> That exactly what I did.  Numbers go to expected from ~1600MB/s to ~1800MB/s.
> So basically this hunk of the debatable patch:
> 
>      if (event_notifier_test_and_clear(&s->e)) {
> -        qemu_bh_schedule(s->completion_bh);
> +        qemu_laio_completion_bh(s);
>      }
> 
> does not have any impact and can be ignored.  At least I did not notice
> anything important.
> 
>>
>> I'm also interested in a graph with this patch ("linux-aio: prevent
>> submitting more than MAX_EVENTS") on top of origin/master.
> 
> I can plot it also of course.
> 
>>
>> Thanks for the analysis.  Sometimes a picture _is_ worth a thousand
>> words, even if it's measuring "only" second-order effects (# of
>> completions is not what causes the slowdown, but # of completions
>> affects latency which causes the slowdown).
> 
> Yes, you are right, latency.  With userspace io_getevents ~0 costs we
> can peek requests as often as we like to decrease latency on very
> fast devices.  That can also bring something.  Probably after each
> io_submit() it makes sense to peek and complete something.

Right, especially 1) because io_getevents with a timeout of 0 is cheap (it
peeks at the ring buffer before the syscall), and 2) because we want to
replace io_getevents with userspace code anyway, through your other patch.
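
For reference, a heavily hedged sketch of what consuming events in
userspace could look like, assuming the kernel's struct aio_ring ABI
(the io_context_t returned by io_setup() is the userspace address of the
mmapped completion ring).  This only illustrates the idea and omits the
memory barriers and feature checks a real implementation needs; it is
not the patch referred to above.

-------------------------------------------------------------------
struct aio_ring {
    unsigned id;               /* kernel internal index number */
    unsigned nr;               /* number of io_events in the ring */
    unsigned head;             /* consumer index, advanced by userspace */
    unsigned tail;             /* producer index, advanced by the kernel */
    unsigned magic;
    unsigned compat_features;
    unsigned incompat_features;
    unsigned header_length;    /* size of this header */
    struct io_event io_events[];
};

static unsigned user_peek_events(io_context_t ctx, struct io_event *out,
                                 unsigned max)
{
    struct aio_ring *ring = (struct aio_ring *)ctx;
    unsigned n = 0;

    while (n < max && ring->head != ring->tail) {
        out[n++] = ring->io_events[ring->head];
        ring->head = (ring->head + 1) % ring->nr;
    }
    return n;
}
-------------------------------------------------------------------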

Paolo
Roman Pen July 15, 2016, 11:35 a.m. UTC | #12
On Fri, Jul 15, 2016 at 12:17 PM, Roman Penyaev
<roman.penyaev@profitbricks.com> wrote:
> On Fri, Jul 15, 2016 at 11:58 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>
>>
>> On 15/07/2016 11:18, Roman Penyaev wrote:
>>> Those 3 red spikes and a blue hill is what we have to focus on.  The
>>> blue hill at the right corner of the chart means that almost always the
>>> ring buffer was observed as full, i.e. qemu_laio_completion_bh() got
>>> a chance to reap completions not very often, meanwhile completed
>>> requests stand in the ring buffer for quite a long time which degrades
>>> the overall performance.
>>>
>>> The results covered by the red line are much better and that can be
>>> explained by those 3 red spikes, which are almost in the middle of the
>>> whole distribution, i.e. qemu_laio_completion_bh() is called more often,
>>> completed requests do not stall, giving fio a chance to submit new fresh
>>> requests.
>>>
>>> The theoretical fix would be to schedule completion BH just after
>>> successful io_submit, i.e.:
>>
>> What about removing the qemu_bh_cancel but keeping the rest of the patch?
>
> That exactly what I did.  Numbers go to expected from ~1600MB/s to ~1800MB/s.
> So basically this hunk of the debatable patch:
>
>      if (event_notifier_test_and_clear(&s->e)) {
> -        qemu_bh_schedule(s->completion_bh);
> +        qemu_laio_completion_bh(s);
>      }
>
> does not have any impact and can be ignored.  At least I did not notice
> anything important.
>
>>
>> I'm also interested in a graph with this patch ("linux-aio: prevent
>> submitting more than MAX_EVENTS") on top of origin/master.
>
> I can plot it also of course.

So, finally I have it.

Same link:
https://docs.google.com/spreadsheets/d/12CIt6EKiJLqNx0OHNqiabR-oFBrqkH0LN3mjzZ5jGeo/edit?usp=sharing

last sheet:
"1789MB/s"

Not that interesting: almost all the time we complete the maximum,
MAX_EVENTS requests, at once.  But of course that is expected on such a
device.  Probably other metrics should be taken into account.

--
Roman
Paolo Bonzini July 15, 2016, 12:57 p.m. UTC | #13
On 15/07/2016 13:35, Roman Penyaev wrote:
> On Fri, Jul 15, 2016 at 12:17 PM, Roman Penyaev
> <roman.penyaev@profitbricks.com> wrote:
>> On Fri, Jul 15, 2016 at 11:58 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>>
>>>
>>> On 15/07/2016 11:18, Roman Penyaev wrote:
>>>> Those 3 red spikes and a blue hill is what we have to focus on.  The
>>>> blue hill at the right corner of the chart means that almost always the
>>>> ring buffer was observed as full, i.e. qemu_laio_completion_bh() got
>>>> a chance to reap completions not very often, meanwhile completed
>>>> requests stand in the ring buffer for quite a long time which degrades
>>>> the overall performance.
>>>>
>>>> The results covered by the red line are much better and that can be
>>>> explained by those 3 red spikes, which are almost in the middle of the
>>>> whole distribution, i.e. qemu_laio_completion_bh() is called more often,
>>>> completed requests do not stall, giving fio a chance to submit new fresh
>>>> requests.
>>>>
>>>> The theoretical fix would be to schedule completion BH just after
>>>> successful io_submit, i.e.:
>>>
>>> What about removing the qemu_bh_cancel but keeping the rest of the patch?
>>
>> That exactly what I did.  Numbers go to expected from ~1600MB/s to ~1800MB/s.
>> So basically this hunk of the debatable patch:
>>
>>      if (event_notifier_test_and_clear(&s->e)) {
>> -        qemu_bh_schedule(s->completion_bh);
>> +        qemu_laio_completion_bh(s);
>>      }
>>
>> does not have any impact and can be ignored.  At least I did not notice
>> anything important.

Thanks, this means that we should either add back the other line, or
wrap qemu_laio_completion_bh in a loop.  The rationale is that an
io_getevents that doesn't find any event is extremely cheap.

>>> I'm also interested in a graph with this patch ("linux-aio: prevent
>>> submitting more than MAX_EVENTS") on top of origin/master.
>>
>> I can plot it also of course.
> 
> So, finally I have it.
> 
> Same link:
> https://docs.google.com/spreadsheets/d/12CIt6EKiJLqNx0OHNqiabR-oFBrqkH0LN3mjzZ5jGeo/edit?usp=sharing
> 
> last sheet:
> "1789MB/s"
> 
> Not that much interesting: almost all the time we complete maximum:
> MAX_LIMIT requests at once.  But of course that expected on such
> device.  Probably other good metrics should be taken into account.

And this means that we should probably raise MAX_EVENTS.

Paolo
Roman Pen July 15, 2016, 3:03 p.m. UTC | #14
On Fri, Jul 15, 2016 at 11:18 AM, Roman Penyaev
<roman.penyaev@profitbricks.com> wrote:
> On Wed, Jul 13, 2016 at 1:45 PM, Kevin Wolf <kwolf@redhat.com> wrote:
>> Am 13.07.2016 um 13:33 hat Roman Penyaev geschrieben:
>>> Just to be sure that we are on the same page:
>>>
>>> 1. We have this commit "linux-aio: Cancel BH if not needed" which
>>>
>>>    a) introduces performance regression on my fio workloads on the
>>>       following config: "iothread=1, VCPU=8, MQ=8". Performance
>>>       dropped from 1878MB/s to 1606MB/s with Stefan's fix, that is
>>>       ~14%.
>>
>> Do we already understand why the performance regresses with the patch?
>> As long as we don't, everything we do is just guesswork.
>
> Eventually the issue is clear.  I test on /dev/nullb0, which completes
> all submitted bios almost immediately.  That means, that after io_submit()
> is called it is worth trying to check completed requests and not to
> accumulate them in-flight.
>

[snip]

>
> The theoretical fix would be to schedule completion BH just after
> successful io_submit, i.e.:
>
> ---------------------------------------------------------------------
> @@ -228,6 +228,8 @@ static void ioq_submit(LinuxAioState *s)
>          QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
>      } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
>      s->io_q.blocked = (s->io_q.n > 0);
> +
> +    qemu_bh_schedule(s->completion_bh);
>  }
> ---------------------------------------------------------------------
>
> This theoretical fix works pretty fine and numbers return to expected
> ~1800MB/s.
>
> So believe me or not but BH, which was not accidentally canceled, gives
> better results on very fast backend devices.
>
> The other interesting observation is the following: submission limitation
> (which I did in the "linux-aio: prevent submitting more than MAX_EVENTS"
> patch) also fixes the issue, because before submitting more than MAX_EVENTS
> we have to reap something, which obviously do not let already completed
> requests stall in the queue for a long time.

I got expected but nevertheless interesting latencies from fio:

---------------------------------------------------------------------------
   master
   + "linux-aio: keep processing events if MAX_EVENTS reached"

read : io=47995MB, bw=1599.8MB/s, iops=157530, runt= 30002msec
    clat (usec): min=1, max=19754, avg=1223.26, stdev=358.03
    clat percentiles (usec):
     | 30.00th=[ 1080], 40.00th=[ 1160], 50.00th=[ 1224], 60.00th=[ 1288],
    lat (usec) : 750=6.55%, 1000=14.19%, 2000=75.38%


---------------------------------------------------------------------------
   master
   + "linux-aio: prevent submitting more than MAX_EVENTS"

read : io=53746MB, bw=1791.4MB/s, iops=176670, runt= 30003msec
    clat (usec): min=1, max=15902, avg=1067.67, stdev=352.40
    clat percentiles (usec):
     | 30.00th=[  932], 40.00th=[ 1004], 50.00th=[ 1064], 60.00th=[ 1128],
    lat (usec) : 750=10.68%, 1000=25.06%, 2000=59.62%


---------------------------------------------------------------------------
   master
   + "linux-aio: prevent submitting more than MAX_EVENTS"
   + schedule completion BH just after each successful io_submit()

read : io=56875MB, bw=1895.8MB/s, iops=186986, runt= 30001msec
    clat (usec): min=2, max=17288, avg=991.57, stdev=318.86
    clat percentiles (usec):
     | 30.00th=[  868], 40.00th=[  940], 50.00th=[ 1004], 60.00th=[ 1064],
    lat (usec) : 750=13.85%, 1000=30.57%, 2000=49.84%


The three examples definitely show (even without charts) that the more
often we peek at and harvest completed requests, the more performance we
gain.

Still a lot of room for optimization :)


--
Roman

Patch

diff --git a/block/linux-aio.c b/block/linux-aio.c
index e468960..78f4524 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -28,8 +28,6 @@ 
  */
 #define MAX_EVENTS 128
 
-#define MAX_QUEUED_IO  128
-
 struct qemu_laiocb {
     BlockAIOCB common;
     Coroutine *co;
@@ -44,7 +42,8 @@  struct qemu_laiocb {
 
 typedef struct {
     int plugged;
-    unsigned int n;
+    unsigned int in_queue;
+    unsigned int in_flight;
     bool blocked;
     QSIMPLEQ_HEAD(, qemu_laiocb) pending;
 } LaioQueue;
@@ -129,6 +128,7 @@  static void qemu_laio_completion_bh(void *opaque)
             s->event_max = 0;
             return; /* no more events */
         }
+        s->io_q.in_flight -= s->event_max;
     }
 
     /* Reschedule so nested event loops see currently pending completions */
@@ -190,7 +190,8 @@  static void ioq_init(LaioQueue *io_q)
 {
     QSIMPLEQ_INIT(&io_q->pending);
     io_q->plugged = 0;
-    io_q->n = 0;
+    io_q->in_queue = 0;
+    io_q->in_flight = 0;
     io_q->blocked = false;
 }
 
@@ -198,14 +199,17 @@  static void ioq_submit(LinuxAioState *s)
 {
     int ret, len;
     struct qemu_laiocb *aiocb;
-    struct iocb *iocbs[MAX_QUEUED_IO];
+    struct iocb *iocbs[MAX_EVENTS];
     QSIMPLEQ_HEAD(, qemu_laiocb) completed;
 
     do {
+        if (s->io_q.in_flight >= MAX_EVENTS) {
+            break;
+        }
         len = 0;
         QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
             iocbs[len++] = &aiocb->iocb;
-            if (len == MAX_QUEUED_IO) {
+            if (s->io_q.in_flight + len >= MAX_EVENTS) {
                 break;
             }
         }
@@ -218,11 +222,12 @@  static void ioq_submit(LinuxAioState *s)
             abort();
         }
 
-        s->io_q.n -= ret;
+        s->io_q.in_flight += ret;
+        s->io_q.in_queue  -= ret;
         aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
         QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
     } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
-    s->io_q.blocked = (s->io_q.n > 0);
+    s->io_q.blocked = (s->io_q.in_queue > 0);
 }
 
 void laio_io_plug(BlockDriverState *bs, LinuxAioState *s)
@@ -263,9 +268,10 @@  static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
     io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));
 
     QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
-    s->io_q.n++;
+    s->io_q.in_queue++;
     if (!s->io_q.blocked &&
-        (!s->io_q.plugged || s->io_q.n >= MAX_QUEUED_IO)) {
+        (!s->io_q.plugged ||
+         s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS)) {
         ioq_submit(s);
     }