Message ID | 1468396629-26094-1-git-send-email-roman.penyaev@profitbricks.com (mailing list archive) |
---|---|
State | New, archived |
On 13/07/2016 09:57, Roman Pen wrote:
> v1..v2:
>
>  o comment tweaks.
>  o fix QEMU coding style.
>
> Invoking io_setup(MAX_EVENTS) we ask the kernel to create a ring buffer
> for us with the specified number of events. But the kernel's ring buffer
> allocation logic is a bit tricky (the ring buffer is page-size aligned
> and some per-CPU allocations are required), so eventually more events
> than requested are allocated.
>
> From the userspace side we have to follow the convention and should not
> try to io_submit() more, or else the logic which consumes completed
> events has to be changed accordingly. The pitfall is in the following
> sequence:
>
>     MAX_EVENTS = 128
>     io_setup(MAX_EVENTS)
>
>     io_submit(MAX_EVENTS)
>     io_submit(MAX_EVENTS)
>
>     /* now 256 events are in-flight */
>
>     io_getevents(MAX_EVENTS) = 128
>
>     /* we can handle only 128 events at once; to be sure
>      * that nothing is pending, the io_getevents(MAX_EVENTS)
>      * call must be invoked once more, or a hang will happen. */
>
> To prevent the hang, or the reiteration of the io_getevents() call, this
> patch restricts the number of in-flight requests, which is now limited
> to MAX_EVENTS.
>
> Signed-off-by: Roman Pen <roman.penyaev@profitbricks.com>
> Reviewed-by: Fam Zheng <famz@redhat.com>
> Cc: Stefan Hajnoczi <stefanha@redhat.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: qemu-devel@nongnu.org
> ---
>  block/linux-aio.c | 26 ++++++++++++++++----------
>  1 file changed, 16 insertions(+), 10 deletions(-)
>
> [snip]

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
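As a rough standalone illustration of the pitfall described in the commit message above (this program is not part of the thread, and the device path /dev/nullb0 is only an assumption for testing), a minimal libaio sketch along these lines shows why one io_getevents(MAX_EVENTS) call cannot drain two back-to-back io_submit(MAX_EVENTS) batches; on most kernels the second batch is accepted because the ring is over-allocated:

---------------------------------------------------------------------
#define _GNU_SOURCE
#include <libaio.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_EVENTS 128

int main(void)
{
    io_context_t ctx = 0;
    struct iocb iocbs[2 * MAX_EVENTS];
    struct iocb *ptrs[2 * MAX_EVENTS];
    struct io_event events[MAX_EVENTS];
    void *buf;
    int fd, i, ret;

    if (io_setup(MAX_EVENTS, &ctx) < 0) {
        perror("io_setup");
        return 1;
    }
    fd = open("/dev/nullb0", O_RDONLY | O_DIRECT);  /* assumed test device */
    if (fd < 0 || posix_memalign(&buf, 512, 512)) {
        return 1;
    }
    for (i = 0; i < 2 * MAX_EVENTS; i++) {
        io_prep_pread(&iocbs[i], fd, buf, 512, 0);
        ptrs[i] = &iocbs[i];
    }

    /* The kernel over-allocates the ring (page alignment etc.), so both
     * batches are usually accepted although only MAX_EVENTS were asked for. */
    printf("submit 1: %d\n", io_submit(ctx, MAX_EVENTS, ptrs));
    printf("submit 2: %d\n", io_submit(ctx, MAX_EVENTS, ptrs + MAX_EVENTS));

    /* One reap of MAX_EVENTS cannot drain ~256 in-flight requests; a second
     * io_getevents() call is required, which is exactly the situation the
     * patch avoids by capping the number of in-flight requests. */
    ret = io_getevents(ctx, MAX_EVENTS, MAX_EVENTS, events, NULL);
    printf("first  io_getevents: %d\n", ret);
    ret = io_getevents(ctx, 1, MAX_EVENTS, events, NULL);
    printf("second io_getevents: %d\n", ret);

    io_destroy(ctx);
    return 0;
}
---------------------------------------------------------------------

Against a fast backend this would typically print 128 for both submits and for each of the two reaps, i.e. the "one more io_getevents() or hang" situation the patch removes.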
On Wed, Jul 13, 2016 at 12:31 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 13/07/2016 09:57, Roman Pen wrote:
>> [snip]
>
> Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>

Just to be sure that we are on the same page:

1. We have the commit "linux-aio: Cancel BH if not needed", which

   a) introduces a performance regression on my fio workloads with the
      following config: "iothread=1, VCPU=8, MQ=8". Performance dropped
      from 1878MB/s to 1606MB/s with Stefan's fix, i.e. ~14%.

   b) reproduces the IO hang, because of in-flights > MAX_EVENTS.

   So probably this commit should be reverted because of a), not b).

2. Stefan has a fix for issue 1.b) which repeats io_getevents(); that is
   overkill for the generic case where MQ=1 (the queue depth for
   virtio_blk is also set to 128, i.e. equal to MAX_EVENTS on the QEMU
   side).

3. The current patch also aims to fix issue 1.b), by restricting the
   number of in-flight requests.

Reverting 1. will fix all the problems, without any need to apply 2. or
3. That is the laziest variant.

Restricting in-flights is also a step forward, since submitting until
-EAGAIN is also possible, but it leads (as we already know) to an IO
hang on specific loads and conditions.

But 2. and 3. are mutually exclusive and should not be applied together.

So we have several alternatives and a choice of what to follow.

--
Roman
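For reference, the "submitting until -EAGAIN" alternative mentioned above would look roughly like the sketch below (a hypothetical helper, not code from the thread): keep calling io_submit() until the kernel refuses with -EAGAIN and leave the remainder queued in userspace. As noted, this is known to lead to an IO hang under some loads, which is why the patch caps in-flight requests instead.

---------------------------------------------------------------------
#include <libaio.h>
#include <errno.h>

/* Hypothetical sketch: submit as much as the kernel accepts. */
static int submit_until_eagain(io_context_t ctx,
                               struct iocb **iocbs, int nr_pending)
{
    int submitted = 0;

    while (submitted < nr_pending) {
        int ret = io_submit(ctx, nr_pending - submitted,
                            iocbs + submitted);
        if (ret == -EAGAIN) {
            /* Kernel resources exhausted: leave the rest queued in
             * userspace and retry after some completions are reaped. */
            break;
        }
        if (ret < 0) {
            return ret;        /* real error, propagate */
        }
        submitted += ret;      /* partial submission is possible */
    }
    return submitted;
}
---------------------------------------------------------------------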
Am 13.07.2016 um 13:33 hat Roman Penyaev geschrieben:
> Just to be sure that we are on the same page:
>
> 1. We have the commit "linux-aio: Cancel BH if not needed", which
>
>    a) introduces a performance regression on my fio workloads with the
>       following config: "iothread=1, VCPU=8, MQ=8". Performance dropped
>       from 1878MB/s to 1606MB/s with Stefan's fix, i.e. ~14%.

Do we already understand why the performance regresses with the patch?
As long as we don't, everything we do is just guesswork.

Kevin
On 07/13/2016 01:57 AM, Roman Pen wrote:
> v1..v2:
>
> o comment tweaks.
> o fix QEMU coding style.

The above comments should be delayed...

> Invoking io_setup(MAX_EVENTS) we ask kernel to create ring buffer for us
> with specified number of events. But kernel ring buffer allocation logic
> is a bit tricky (ring buffer is page size aligned + some percpu allocation
> are required) so eventually more than requested events number is allocated.
> ...
> Signed-off-by: Roman Pen <roman.penyaev@profitbricks.com>
> Reviewed-by: Fam Zheng <famz@redhat.com>
> Cc: Stefan Hajnoczi <stefanha@redhat.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: qemu-devel@nongnu.org
> ---

...until here, after the --- separator. They are useful to reviewers, but
won't make much sense a year from now in qemu.git (when we don't care what
other versions were on the list, only the version that got committed).

Also, if you use 'git send-email -v2' (or 'git format-patch -v2'), your
subject line will resemble most other versioned patches (which use
[PATCH v2] rather than [PATCH V2]). We also recommend that v2 patches be
sent as top-level threads, rather than in reply to v1.

More submission hints at http://wiki.qemu.org/Contribute/SubmitAPatch
On Wed, Jul 13, 2016 at 2:22 PM, Eric Blake <eblake@redhat.com> wrote:
> [snip]
>
> More submission hints at http://wiki.qemu.org/Contribute/SubmitAPatch

Thanks for the tips. Will resend as a top-level thread shortly.

--
Roman
On Wed, Jul 13, 2016 at 1:45 PM, Kevin Wolf <kwolf@redhat.com> wrote:
> Am 13.07.2016 um 13:33 hat Roman Penyaev geschrieben:
>> Just to be sure that we are on the same page:
>>
>> 1. We have the commit "linux-aio: Cancel BH if not needed", which
>>
>>    a) introduces a performance regression on my fio workloads with the
>>       following config: "iothread=1, VCPU=8, MQ=8". Performance dropped
>>       from 1878MB/s to 1606MB/s with Stefan's fix, i.e. ~14%.
>
> Do we already understand why the performance regresses with the patch?

This is a very good question from the author of the patch. Speaking for
myself, I do not understand.

> As long as we don't, everything we do is just guesswork.

Kevin, did you miss the ending "with Stefan's fix"? Since your patch
reproduces another problem, it is impossible to test it in isolation, or
IO hangs. I tested four variations and invite you to do the same, since
you are the author of the debatable patch:

1. As-is, i.e.

   + Stefan's "virtio-blk: dataplane multiqueue support"
   + yours    "linux-aio: Cancel BH if not needed"

   As we already discussed - IO hangs.

2. + Stefan's "linux-aio: keep processing events if MAX_EVENTS reached"

   READ:  io=48199MB, aggrb=1606.5MB/s, minb=1606.5MB/s, maxb=1606.5MB/s,
          mint=30003msec, maxt=30003msec
   WRITE: io=48056MB, aggrb=1601.8MB/s, minb=1601.8MB/s, maxb=1601.8MB/s,
          mint=30003msec, maxt=30003msec

3. - Stefan's "linux-aio: keep processing events if MAX_EVENTS reached"
   + my "linux-aio: prevent submitting more than MAX_EVENTS"

   READ:  io=53294MB, aggrb=1776.3MB/s, minb=1776.3MB/s, maxb=1776.3MB/s,
          mint=30003msec, maxt=30003msec
   WRITE: io=53177MB, aggrb=1772.4MB/s, minb=1772.4MB/s, maxb=1772.4MB/s,
          mint=30003msec, maxt=30003msec

4. - my "linux-aio: prevent submitting more than MAX_EVENTS"
   - yours "linux-aio: Cancel BH if not needed"

   READ:  io=56362MB, aggrb=1878.4MB/s, minb=1878.4MB/s, maxb=1878.4MB/s,
          mint=30007msec, maxt=30007msec
   WRITE: io=56255MB, aggrb=1874.8MB/s, minb=1874.8MB/s, maxb=1874.8MB/s,
          mint=30007msec, maxt=30007msec

The drop from 1878MB/s to 1776MB/s is ~5% and can probably be ignored
(I say probably because it would be nice to have some other numbers from
you or anybody else).

The drop from 1878MB/s to 1606MB/s is ~14% and seems like a serious
degradation.

Also, to go deeper and to avoid possible suspicions, I tested Stefan's
"linux-aio: keep processing events if MAX_EVENTS reached" patch in
isolation, without yours "linux-aio: Cancel BH if not needed":

   READ:  io=109970MB, aggrb=1832.8MB/s, minb=1832.8MB/s, maxb=1832.8MB/s,
          mint=60003msec, maxt=60003msec
   WRITE: io=109820MB, aggrb=1830.3MB/s, minb=1830.3MB/s, maxb=1830.3MB/s,
          mint=60003msec, maxt=60003msec

As you can see, there is no significant drop. That means that only the
following pair:

   Stefan's "linux-aio: keep processing events if MAX_EVENTS reached"
   yours    "linux-aio: Cancel BH if not needed"

impacts the performance.

As I already said, we have different choices to follow, and one of them
(the simplest) is to revert everything.

--
Roman
On Wed, Jul 13, 2016 at 09:57:09AM +0200, Roman Pen wrote:

Please send each new revision of a patch series as a separate email
thread. Do not thread revisions with Reply-To: or References:.

See http://qemu-project.org/Contribute/SubmitAPatch for all the
guidelines on submitting patches.

> v1..v2:
>
> o comment tweaks.
> o fix QEMU coding style.

The changelog is useful for reviewers but is no longer useful once the
patch has been merged. Therefore it goes below the '---' so that
git-am(1) doesn't include it when merging.

> [snip]
On Wed, Jul 13, 2016 at 1:45 PM, Kevin Wolf <kwolf@redhat.com> wrote:
> Am 13.07.2016 um 13:33 hat Roman Penyaev geschrieben:
>> Just to be sure that we are on the same page:
>>
>> 1. We have the commit "linux-aio: Cancel BH if not needed", which
>>
>>    a) introduces a performance regression on my fio workloads with the
>>       following config: "iothread=1, VCPU=8, MQ=8". Performance dropped
>>       from 1878MB/s to 1606MB/s with Stefan's fix, i.e. ~14%.
>
> Do we already understand why the performance regresses with the patch?
> As long as we don't, everything we do is just guesswork.

Finally the issue is clear. I test on /dev/nullb0, which completes all
submitted bios almost immediately. That means that after io_submit() is
called it is worth trying to check for completed requests rather than
accumulating them in flight. That is the theory. In practice the
following happens:

-------------------------------------------------------------------
>>> sys_poll
<<< sys_poll
>>> aio_dispatch
>>> aio_bh_poll
<<< aio_bh_poll
>>> node->io_read
!!! ioq_submit(), submitted=98
<<< node->io_read
>>> node->io_read
!!! ioq_submit(), submitted=49
<<< node->io_read
>>> node->io_read
!!! ioq_submit(), submitted=47
<<< node->io_read
<<< aio_dispatch
>>> sys_poll
<<< sys_poll
>>> aio_dispatch
>>> aio_bh_poll
<<< aio_bh_poll
>>> node->io_read
!!! ioq_submit(), submitted=50
<<< node->io_read
>>> node->io_read
!!! ioq_submit(), submitted=43
<<< node->io_read
>>> node->io_read
!!! ioq_submit(), submitted=43
<<< node->io_read
>>> node->io_read
!!! ioq_submit(), submitted=8
<<< node->io_read
>>> node->io_read
~~~ qemu_laio_completion_bh completed=338
<<< node->io_read
<<< aio_dispatch
-------------------------------------------------------------------
* this run gave 1461MB/s *

This is a very common hunk of the log which I see when running the fio
load with the "linux-aio: Cancel BH if not needed" patch applied. The
important thing worth paying attention to is the submission of 338
requests (almost the whole ring buffer of the AIO context) before any
completed requests are consumed. A very fast backend device completes
submitted requests almost immediately, but we get a chance to fetch the
completions only some time later.

The following is the common part of the log when "linux-aio: Cancel BH
if not needed" is reverted:

-------------------------------------------------------------------
>>> sys_poll
<<< sys_poll
>>> dispatch
>>> aio_bh_poll
~~~ qemu_laio_completion_bh completed=199
<<< aio_bh_poll
>>> node->io_read
!!! ioq_submit(), submitted=47
<<< node->io_read
>>> node->io_read
!!! ioq_submit(), submitted=49
<<< node->io_read
>>> node->io_read
!!! ioq_submit(), submitted=50
<<< node->io_read
>>> node->io_read
!!! ioq_submit(), submitted=43
<<< node->io_read
>>> node->io_read
<<< node->io_read
<<< dispatch
>>> sys_poll
<<< sys_poll
>>> dispatch
>>> aio_bh_poll
~~~ qemu_laio_completion_bh, completed=189
<<< aio_bh_poll
>>> node->io_read
!!! ioq_submit(), submitted=46
<<< node->io_read
>>> node->io_read
!!! ioq_submit(), submitted=46
<<< node->io_read
>>> node->io_read
!!! ioq_submit(), submitted=51
<<< node->io_read
>>> node->io_read
!!! ioq_submit(), submitted=51
<<< node->io_read
<<< dispatch
-------------------------------------------------------------------
* this run gave 1805MB/s *

According to this part of the log I can say that completions happen
frequently, i.e. we get a chance to fetch completions more often, thus
the queue is always "refreshed" by newly coming requests.
To be more precise, I collected some statistics: each time I enter
qemu_laio_completion_bh() I account the number of collected requests in
a bucket, e.g.:

   "~~~ qemu_laio_completion_bh completed=199"
        bucket[199] += 1;

   "~~~ qemu_laio_completion_bh, completed=189"
        bucket[189] += 1;

   ....

When fio finishes I have a distribution of the number of completed
requests which I observed in the ring buffer. Here is the sheet:

https://docs.google.com/spreadsheets/d/12CIt6EKiJLqNx0OHNqiabR-oFBrqkH0LN3mjzZ5jGeo/edit?usp=sharing

(Frankly, I could not think of anything better than to send a link to a
Google doc, sorry if that offends someone.)

There is a chart which shows the whole picture of the distribution:

 o X axis is the number of requests completed at once.
 o Y axis is the number of times we observed that number of requests.

To avoid scaling problems I plotted the chart starting from 10 requests,
since low numbers of requests do not have much impact but have huge
values.

Those 3 red spikes and the blue hill are what we have to focus on. The
blue hill at the right corner of the chart means that almost always the
ring buffer was observed as full, i.e. qemu_laio_completion_bh() got a
chance to reap completions not very often, meanwhile completed requests
sat in the ring buffer for quite a long time, which degrades the overall
performance.

The results covered by the red line are much better, and that can be
explained by those 3 red spikes, which are almost in the middle of the
whole distribution, i.e. qemu_laio_completion_bh() is called more often,
completed requests do not stall, giving fio a chance to submit new fresh
requests.

The theoretical fix would be to schedule the completion BH just after a
successful io_submit, i.e.:

---------------------------------------------------------------------
@@ -228,6 +228,8 @@ static void ioq_submit(LinuxAioState *s)
         QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
     } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
     s->io_q.blocked = (s->io_q.n > 0);
+
+    qemu_bh_schedule(s->completion_bh);
 }
---------------------------------------------------------------------

This theoretical fix works pretty well and the numbers return to the
expected ~1800MB/s.

So, believe it or not, a BH which is not (accidentally) cancelled gives
better results on very fast backend devices.

The other interesting observation is the following: the submission
limitation (which I did in the "linux-aio: prevent submitting more than
MAX_EVENTS" patch) also fixes the issue, because before submitting more
than MAX_EVENTS we have to reap something, which obviously does not let
already completed requests stall in the queue for a long time.

--
Roman
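The per-completion-BH histogram described above can be gathered with a few lines of instrumentation; the sketch below is hypothetical (neither the bucket array nor the dump function exist in QEMU) and only illustrates the accounting, assuming account_completions() is called with the number of events reaped in one qemu_laio_completion_bh() run:

---------------------------------------------------------------------
#include <stdio.h>

#define MAX_EVENTS 128

/* Hypothetical instrumentation: how many completions per BH run. */
static unsigned long completion_bucket[MAX_EVENTS + 1];

/* Call with the number of events reaped in one completion-BH run. */
static void account_completions(unsigned int completed)
{
    if (completed > MAX_EVENTS) {
        completed = MAX_EVENTS;   /* clamp, the chart only goes to 128 */
    }
    completion_bucket[completed]++;
}

/* Dump the distribution when the benchmark finishes. */
static void dump_completion_histogram(void)
{
    unsigned int i;

    for (i = 0; i <= MAX_EVENTS; i++) {
        if (completion_bucket[i]) {
            printf("%3u completions at once: %lu times\n",
                   i, completion_bucket[i]);
        }
    }
}
---------------------------------------------------------------------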
On 15/07/2016 11:18, Roman Penyaev wrote:
> [snip]
>
> The theoretical fix would be to schedule the completion BH just after a
> successful io_submit, i.e.:

What about removing the qemu_bh_cancel but keeping the rest of the patch?

I'm also interested in a graph with this patch ("linux-aio: prevent
submitting more than MAX_EVENTS") on top of origin/master.

Thanks for the analysis. Sometimes a picture _is_ worth a thousand words,
even if it's measuring "only" second-order effects (# of completions is
not what causes the slowdown, but # of completions affects latency, which
causes the slowdown).

Paolo
On Fri, Jul 15, 2016 at 11:58 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 15/07/2016 11:18, Roman Penyaev wrote:
>> [snip]
>
> What about removing the qemu_bh_cancel but keeping the rest of the patch?

That is exactly what I did. The numbers go back to the expected values,
from ~1600MB/s to ~1800MB/s. So basically this hunk of the debatable
patch:

     if (event_notifier_test_and_clear(&s->e)) {
-        qemu_bh_schedule(s->completion_bh);
+        qemu_laio_completion_bh(s);
     }

does not have any impact and can be ignored. At least I did not notice
anything important.

> I'm also interested in a graph with this patch ("linux-aio: prevent
> submitting more than MAX_EVENTS") on top of origin/master.

I can plot it also, of course.

> Thanks for the analysis. Sometimes a picture _is_ worth a thousand
> words, even if it's measuring "only" second-order effects (# of
> completions is not what causes the slowdown, but # of completions
> affects latency, which causes the slowdown).

Yes, you are right, latency. With the ~0 cost of userspace io_getevents
we can peek at requests as often as we like to decrease latency on very
fast devices. That can also bring something. Probably after each
io_submit() it makes sense to peek and complete something.

--
Roman
On 15/07/2016 12:17, Roman Penyaev wrote:
> [snip]
>
> Yes, you are right, latency. With the ~0 cost of userspace io_getevents
> we can peek at requests as often as we like to decrease latency on very
> fast devices. That can also bring something. Probably after each
> io_submit() it makes sense to peek and complete something.

Right, especially 1) because io_getevents with timeout 0 is cheap (it
peeks at the ring buffer before the syscall); and 2) because we want to
replace io_getevents with userspace code anyway, through your other
patch.

Paolo
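A zero-timeout reap right after submission could be sketched roughly as below; this is a simplified stand-in rather than the actual linux-aio.c change, and handle_completion() is a hypothetical callback:

---------------------------------------------------------------------
#include <libaio.h>
#include <time.h>

#define MAX_EVENTS 128

/* Hypothetical completion callback, standing in for the real code that
 * completes the request associated with ev->data. */
static void handle_completion(struct io_event *ev)
{
    (void)ev;
}

/* Reap completed requests without blocking, e.g. right after io_submit(),
 * so completed requests never sit in the completion ring for long. */
static void reap_after_submit(io_context_t ctx)
{
    struct io_event events[MAX_EVENTS];
    struct timespec zero = { 0, 0 };    /* timeout 0: peek, do not block */
    int n, i;

    do {
        /* With min_nr == 0 and a zero timeout this is cheap: libaio can
         * peek at the completion ring in userspace before any syscall. */
        n = io_getevents(ctx, 0, MAX_EVENTS, events, &zero);
        for (i = 0; i < n; i++) {
            handle_completion(&events[i]);
        }
    } while (n == MAX_EVENTS);          /* more than one batch may be ready */
}
---------------------------------------------------------------------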
On Fri, Jul 15, 2016 at 12:17 PM, Roman Penyaev
<roman.penyaev@profitbricks.com> wrote:
> On Fri, Jul 15, 2016 at 11:58 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>> [snip]
>>
>> I'm also interested in a graph with this patch ("linux-aio: prevent
>> submitting more than MAX_EVENTS") on top of origin/master.
>
> I can plot it also, of course.

So, finally I have it.

Same link:
https://docs.google.com/spreadsheets/d/12CIt6EKiJLqNx0OHNqiabR-oFBrqkH0LN3mjzZ5jGeo/edit?usp=sharing

last sheet: "1789MB/s"

Not that interesting: almost all the time we complete the maximum,
MAX_EVENTS requests, at once. But of course that is expected on such a
device. Probably other good metrics should be taken into account.

--
Roman
On 15/07/2016 13:35, Roman Penyaev wrote:
> On Fri, Jul 15, 2016 at 12:17 PM, Roman Penyaev wrote:
>> [snip]
>>
>> does not have any impact and can be ignored. At least I did not notice
>> anything important.

Thanks, this means that we should either add back the other line, or wrap
qemu_laio_completion_bh in a loop. The rationale is that an io_getevents
that doesn't find any event is extremely cheap.

> So, finally I have it.
>
> Same link:
> https://docs.google.com/spreadsheets/d/12CIt6EKiJLqNx0OHNqiabR-oFBrqkH0LN3mjzZ5jGeo/edit?usp=sharing
>
> last sheet: "1789MB/s"
>
> Not that interesting: almost all the time we complete the maximum,
> MAX_EVENTS requests, at once. But of course that is expected on such a
> device. Probably other good metrics should be taken into account.

And this means that we probably should raise MAX_EVENTS.

Paolo
On Fri, Jul 15, 2016 at 11:18 AM, Roman Penyaev
<roman.penyaev@profitbricks.com> wrote:
> [snip]
>
> The other interesting observation is the following: the submission
> limitation (which I did in the "linux-aio: prevent submitting more than
> MAX_EVENTS" patch) also fixes the issue, because before submitting more
> than MAX_EVENTS we have to reap something, which obviously does not let
> already completed requests stall in the queue for a long time.

I got expected but nevertheless interesting latencies from fio:

---------------------------------------------------------------------------
master + "linux-aio: keep processing events if MAX_EVENTS reached"

    read : io=47995MB, bw=1599.8MB/s, iops=157530, runt= 30002msec
    clat (usec): min=1, max=19754, avg=1223.26, stdev=358.03
    clat percentiles (usec):
    | 30.00th=[ 1080], 40.00th=[ 1160], 50.00th=[ 1224], 60.00th=[ 1288],
    lat (usec) : 750=6.55%, 1000=14.19%, 2000=75.38%

---------------------------------------------------------------------------
master + "linux-aio: prevent submitting more than MAX_EVENTS"

    read : io=53746MB, bw=1791.4MB/s, iops=176670, runt= 30003msec
    clat (usec): min=1, max=15902, avg=1067.67, stdev=352.40
    clat percentiles (usec):
    | 30.00th=[  932], 40.00th=[ 1004], 50.00th=[ 1064], 60.00th=[ 1128],
    lat (usec) : 750=10.68%, 1000=25.06%, 2000=59.62%

---------------------------------------------------------------------------
master + "linux-aio: prevent submitting more than MAX_EVENTS"
       + schedule completion BH just after each successful io_submit()

    read : io=56875MB, bw=1895.8MB/s, iops=186986, runt= 30001msec
    clat (usec): min=2, max=17288, avg=991.57, stdev=318.86
    clat percentiles (usec):
    | 30.00th=[  868], 40.00th=[  940], 50.00th=[ 1004], 60.00th=[ 1064],
    lat (usec) : 750=13.85%, 1000=30.57%, 2000=49.84%
---------------------------------------------------------------------------

The three examples show clearly (even without charts) that the more often
we peek at and harvest completed requests, the more performance we gain.

Still a lot of room for optimization :)

--
Roman
diff --git a/block/linux-aio.c b/block/linux-aio.c
index e468960..78f4524 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -28,8 +28,6 @@
  */
 #define MAX_EVENTS 128
 
-#define MAX_QUEUED_IO 128
-
 struct qemu_laiocb {
     BlockAIOCB common;
     Coroutine *co;
@@ -44,7 +42,8 @@ struct qemu_laiocb {
 
 typedef struct {
     int plugged;
-    unsigned int n;
+    unsigned int in_queue;
+    unsigned int in_flight;
     bool blocked;
     QSIMPLEQ_HEAD(, qemu_laiocb) pending;
 } LaioQueue;
@@ -129,6 +128,7 @@ static void qemu_laio_completion_bh(void *opaque)
             s->event_max = 0;
             return; /* no more events */
         }
+        s->io_q.in_flight -= s->event_max;
     }
 
     /* Reschedule so nested event loops see currently pending completions */
@@ -190,7 +190,8 @@ static void ioq_init(LaioQueue *io_q)
 {
     QSIMPLEQ_INIT(&io_q->pending);
     io_q->plugged = 0;
-    io_q->n = 0;
+    io_q->in_queue = 0;
+    io_q->in_flight = 0;
     io_q->blocked = false;
 }
 
@@ -198,14 +199,17 @@ static void ioq_submit(LinuxAioState *s)
 {
     int ret, len;
     struct qemu_laiocb *aiocb;
-    struct iocb *iocbs[MAX_QUEUED_IO];
+    struct iocb *iocbs[MAX_EVENTS];
     QSIMPLEQ_HEAD(, qemu_laiocb) completed;
 
     do {
+        if (s->io_q.in_flight >= MAX_EVENTS) {
+            break;
+        }
         len = 0;
         QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
             iocbs[len++] = &aiocb->iocb;
-            if (len == MAX_QUEUED_IO) {
+            if (s->io_q.in_flight + len >= MAX_EVENTS) {
                 break;
             }
         }
@@ -218,11 +222,12 @@ static void ioq_submit(LinuxAioState *s)
             abort();
         }
 
-        s->io_q.n -= ret;
+        s->io_q.in_flight += ret;
+        s->io_q.in_queue -= ret;
         aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
         QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
     } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
-    s->io_q.blocked = (s->io_q.n > 0);
+    s->io_q.blocked = (s->io_q.in_queue > 0);
 }
 
 void laio_io_plug(BlockDriverState *bs, LinuxAioState *s)
@@ -263,9 +268,10 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
     io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));
 
     QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
-    s->io_q.n++;
+    s->io_q.in_queue++;
     if (!s->io_q.blocked &&
-        (!s->io_q.plugged || s->io_q.n >= MAX_QUEUED_IO)) {
+        (!s->io_q.plugged ||
+         s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS)) {
         ioq_submit(s);
     }