diff mbox series

[5/5] aio-posix: Separate AioPolledEvent per AioHandler

Message ID 20250307221634.71951-6-kwolf@redhat.com (mailing list archive)
State New
Headers show
Series block: Improve writethrough performance | expand

Commit Message

Kevin Wolf March 7, 2025, 10:16 p.m. UTC
Adaptive polling has a big problem: It doesn't consider that an event
loop can wait for many different events that may have very different
typical latencies.

For example, think of a guest that tends to send a new I/O request soon
after the previous I/O request completes, but the storage on the host is
rather slow. In this case, getting the new request from guest quickly
means that polling is enabled, but the next thing is performing the I/O
request on the backend, which is slow and disables polling again for the
next guest request. This means that in such a scenario, polling could
help for every other event, but is only ever enabled when it can't
succeed.

In order to fix this, keep a separate AioPolledEvent for each
AioHandler. We will then know that the backend file descriptor always
has a high latency and isn't worth polling for, but we also know that
the guest is always fast and we should poll for it. This solves at least
half of the problem, we can now keep polling for those cases where it
makes sense and get the improved performance from it.

Since the event loop doesn't know which event will be next, we still do
some unnecessary polling while we're waiting for the slow disk. I made
some attempts to be more clever than just randomly growing and shrinking
the polling time, and even to let callers be explicit about when they
expect a new event, but so far this hasn't resulted in improved
performance or even caused performance regressions. For now, let's just
fix the part that is easy enough to fix, we can revisit the rest later.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/aio.h |  1 -
 util/aio-posix.h    |  1 +
 util/aio-posix.c    | 24 +++++++++++++++++++++---
 util/async.c        |  2 --
 4 files changed, 22 insertions(+), 6 deletions(-)

Comments

Stefan Hajnoczi March 10, 2025, 10:55 a.m. UTC | #1
On Fri, Mar 07, 2025 at 11:16:34PM +0100, Kevin Wolf wrote:
> Adaptive polling has a big problem: It doesn't consider that an event
> loop can wait for many different events that may have very different
> typical latencies.
> 
> For example, think of a guest that tends to send a new I/O request soon
> after the previous I/O request completes, but the storage on the host is
> rather slow. In this case, getting the new request from guest quickly
> means that polling is enabled, but the next thing is performing the I/O
> request on the backend, which is slow and disables polling again for the
> next guest request. This means that in such a scenario, polling could
> help for every other event, but is only ever enabled when it can't
> succeed.
> 
> In order to fix this, keep a separate AioPolledEvent for each
> AioHandler. We will then know that the backend file descriptor always
> has a high latency and isn't worth polling for, but we also know that
> the guest is always fast and we should poll for it. This solves at least
> half of the problem, we can now keep polling for those cases where it
> makes sense and get the improved performance from it.
> 
> Since the event loop doesn't know which event will be next, we still do
> some unnecessary polling while we're waiting for the slow disk. I made
> some attempts to be more clever than just randomly growing and shrinking
> the polling time, and even to let callers be explicit about when they
> expect a new event, but so far this hasn't resulted in improved
> performance or even caused performance regressions. For now, let's just
> fix the part that is easy enough to fix, we can revisit the rest later.
> 
> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> ---
>  include/block/aio.h |  1 -
>  util/aio-posix.h    |  1 +
>  util/aio-posix.c    | 24 +++++++++++++++++++++---
>  util/async.c        |  2 --
>  4 files changed, 22 insertions(+), 6 deletions(-)
> 
> diff --git a/include/block/aio.h b/include/block/aio.h
> index 49f46e01cb..0ef7ce48e3 100644
> --- a/include/block/aio.h
> +++ b/include/block/aio.h
> @@ -233,7 +233,6 @@ struct AioContext {
>      int poll_disable_cnt;
>  
>      /* Polling mode parameters */
> -    AioPolledEvent poll;
>      int64_t poll_max_ns;    /* maximum polling time in nanoseconds */
>      int64_t poll_grow;      /* polling time growth factor */
>      int64_t poll_shrink;    /* polling time shrink factor */
> diff --git a/util/aio-posix.h b/util/aio-posix.h
> index 4264c518be..82a0201ea4 100644
> --- a/util/aio-posix.h
> +++ b/util/aio-posix.h
> @@ -38,6 +38,7 @@ struct AioHandler {
>  #endif
>      int64_t poll_idle_timeout; /* when to stop userspace polling */
>      bool poll_ready; /* has polling detected an event? */
> +    AioPolledEvent poll;
>  };
>  
>  /* Add a handler to a ready list */
> diff --git a/util/aio-posix.c b/util/aio-posix.c
> index 259827c7ad..2251871c61 100644
> --- a/util/aio-posix.c
> +++ b/util/aio-posix.c
> @@ -579,13 +579,19 @@ static bool run_poll_handlers(AioContext *ctx, AioHandlerList *ready_list,
>  static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list,
>                            int64_t *timeout)
>  {
> +    AioHandler *node;
>      int64_t max_ns;
>  
>      if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
>          return false;
>      }
>  
> -    max_ns = qemu_soonest_timeout(*timeout, ctx->poll.ns);
> +    max_ns = 0;
> +    QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
> +        max_ns = MAX(max_ns, node->poll.ns);
> +    }
> +    max_ns = qemu_soonest_timeout(*timeout, max_ns);
> +
>      if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
>          /*
>           * Enable poll mode. It pairs with the poll_set_started() in
> @@ -721,8 +727,14 @@ bool aio_poll(AioContext *ctx, bool blocking)
>  
>      /* Adjust polling time */
>      if (ctx->poll_max_ns) {
> +        AioHandler *node;
>          int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
> -        adjust_polling_time(ctx, &ctx->poll, block_ns);
> +
> +        QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
> +            if (QLIST_IS_INSERTED(node, node_ready)) {
> +                adjust_polling_time(ctx, &node->poll, block_ns);
> +            }
> +        }
>      }
>  
>      progress |= aio_bh_poll(ctx);
> @@ -772,10 +784,16 @@ void aio_context_use_g_source(AioContext *ctx)
>  void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
>                                   int64_t grow, int64_t shrink, Error **errp)
>  {
> +    AioHandler *node;
> +
>      /* No thread synchronization here, it doesn't matter if an incorrect value
>       * is used once.
>       */

If you respin this series:

This comment is confusing now that qemu_lockcnt_inc() is being used.
Lockcnt tells other threads in aio_set_fd_handler() not to remove nodes
from the aio_handlers list (because we're traversing the list).

The comment is about the poll state though, not about the aio_handlers
list. Moving it down to where poll_max_ns, etc are assigned would make
it clearer.

> -    ctx->poll.ns = 0;
> +    qemu_lockcnt_inc(&ctx->list_lock);
> +    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
> +        node->poll.ns = 0;
> +    }
> +    qemu_lockcnt_dec(&ctx->list_lock);
>  
>      ctx->poll_max_ns = max_ns;
>      ctx->poll_grow = grow;
> diff --git a/util/async.c b/util/async.c
> index 38667ea091..4124a948fd 100644
> --- a/util/async.c
> +++ b/util/async.c
> @@ -609,8 +609,6 @@ AioContext *aio_context_new(Error **errp)
>      qemu_rec_mutex_init(&ctx->lock);
>      timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);
>  
> -    ctx->poll.ns = 0;
> -
>      ctx->poll_max_ns = 0;
>      ctx->poll_grow = 0;
>      ctx->poll_shrink = 0;
> -- 
> 2.48.1
>
Kevin Wolf March 10, 2025, 11:11 a.m. UTC | #2
Am 10.03.2025 um 11:55 hat Stefan Hajnoczi geschrieben:
> On Fri, Mar 07, 2025 at 11:16:34PM +0100, Kevin Wolf wrote:
> > Adaptive polling has a big problem: It doesn't consider that an event
> > loop can wait for many different events that may have very different
> > typical latencies.
> > 
> > For example, think of a guest that tends to send a new I/O request soon
> > after the previous I/O request completes, but the storage on the host is
> > rather slow. In this case, getting the new request from guest quickly
> > means that polling is enabled, but the next thing is performing the I/O
> > request on the backend, which is slow and disables polling again for the
> > next guest request. This means that in such a scenario, polling could
> > help for every other event, but is only ever enabled when it can't
> > succeed.
> > 
> > In order to fix this, keep a separate AioPolledEvent for each
> > AioHandler. We will then know that the backend file descriptor always
> > has a high latency and isn't worth polling for, but we also know that
> > the guest is always fast and we should poll for it. This solves at least
> > half of the problem, we can now keep polling for those cases where it
> > makes sense and get the improved performance from it.
> > 
> > Since the event loop doesn't know which event will be next, we still do
> > some unnecessary polling while we're waiting for the slow disk. I made
> > some attempts to be more clever than just randomly growing and shrinking
> > the polling time, and even to let callers be explicit about when they
> > expect a new event, but so far this hasn't resulted in improved
> > performance or even caused performance regressions. For now, let's just
> > fix the part that is easy enough to fix, we can revisit the rest later.
> > 
> > Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> > ---
> >  include/block/aio.h |  1 -
> >  util/aio-posix.h    |  1 +
> >  util/aio-posix.c    | 24 +++++++++++++++++++++---
> >  util/async.c        |  2 --
> >  4 files changed, 22 insertions(+), 6 deletions(-)
> > 
> > diff --git a/include/block/aio.h b/include/block/aio.h
> > index 49f46e01cb..0ef7ce48e3 100644
> > --- a/include/block/aio.h
> > +++ b/include/block/aio.h
> > @@ -233,7 +233,6 @@ struct AioContext {
> >      int poll_disable_cnt;
> >  
> >      /* Polling mode parameters */
> > -    AioPolledEvent poll;
> >      int64_t poll_max_ns;    /* maximum polling time in nanoseconds */
> >      int64_t poll_grow;      /* polling time growth factor */
> >      int64_t poll_shrink;    /* polling time shrink factor */
> > diff --git a/util/aio-posix.h b/util/aio-posix.h
> > index 4264c518be..82a0201ea4 100644
> > --- a/util/aio-posix.h
> > +++ b/util/aio-posix.h
> > @@ -38,6 +38,7 @@ struct AioHandler {
> >  #endif
> >      int64_t poll_idle_timeout; /* when to stop userspace polling */
> >      bool poll_ready; /* has polling detected an event? */
> > +    AioPolledEvent poll;
> >  };
> >  
> >  /* Add a handler to a ready list */
> > diff --git a/util/aio-posix.c b/util/aio-posix.c
> > index 259827c7ad..2251871c61 100644
> > --- a/util/aio-posix.c
> > +++ b/util/aio-posix.c
> > @@ -579,13 +579,19 @@ static bool run_poll_handlers(AioContext *ctx, AioHandlerList *ready_list,
> >  static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list,
> >                            int64_t *timeout)
> >  {
> > +    AioHandler *node;
> >      int64_t max_ns;
> >  
> >      if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
> >          return false;
> >      }
> >  
> > -    max_ns = qemu_soonest_timeout(*timeout, ctx->poll.ns);
> > +    max_ns = 0;
> > +    QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
> > +        max_ns = MAX(max_ns, node->poll.ns);
> > +    }
> > +    max_ns = qemu_soonest_timeout(*timeout, max_ns);
> > +
> >      if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
> >          /*
> >           * Enable poll mode. It pairs with the poll_set_started() in
> > @@ -721,8 +727,14 @@ bool aio_poll(AioContext *ctx, bool blocking)
> >  
> >      /* Adjust polling time */
> >      if (ctx->poll_max_ns) {
> > +        AioHandler *node;
> >          int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
> > -        adjust_polling_time(ctx, &ctx->poll, block_ns);
> > +
> > +        QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
> > +            if (QLIST_IS_INSERTED(node, node_ready)) {
> > +                adjust_polling_time(ctx, &node->poll, block_ns);
> > +            }
> > +        }
> >      }
> >  
> >      progress |= aio_bh_poll(ctx);
> > @@ -772,10 +784,16 @@ void aio_context_use_g_source(AioContext *ctx)
> >  void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
> >                                   int64_t grow, int64_t shrink, Error **errp)
> >  {
> > +    AioHandler *node;
> > +
> >      /* No thread synchronization here, it doesn't matter if an incorrect value
> >       * is used once.
> >       */
> 
> If you respin this series:
> 
> This comment is confusing now that qemu_lockcnt_inc() is being used.
> Lockcnt tells other threads in aio_set_fd_handler() not to remove nodes
> from the aio_handlers list (because we're traversing the list).
> 
> The comment is about the poll state though, not about the aio_handlers
> list. Moving it down to where poll_max_ns, etc are assigned would make
> it clearer.

Yes, I can do that while applying the series.

Should I add your R-b after making the change?

Kevin
Stefan Hajnoczi March 11, 2025, 2:18 a.m. UTC | #3
On Mon, Mar 10, 2025 at 12:11:44PM +0100, Kevin Wolf wrote:
> Am 10.03.2025 um 11:55 hat Stefan Hajnoczi geschrieben:
> > On Fri, Mar 07, 2025 at 11:16:34PM +0100, Kevin Wolf wrote:
> > > Adaptive polling has a big problem: It doesn't consider that an event
> > > loop can wait for many different events that may have very different
> > > typical latencies.
> > > 
> > > For example, think of a guest that tends to send a new I/O request soon
> > > after the previous I/O request completes, but the storage on the host is
> > > rather slow. In this case, getting the new request from guest quickly
> > > means that polling is enabled, but the next thing is performing the I/O
> > > request on the backend, which is slow and disables polling again for the
> > > next guest request. This means that in such a scenario, polling could
> > > help for every other event, but is only ever enabled when it can't
> > > succeed.
> > > 
> > > In order to fix this, keep a separate AioPolledEvent for each
> > > AioHandler. We will then know that the backend file descriptor always
> > > has a high latency and isn't worth polling for, but we also know that
> > > the guest is always fast and we should poll for it. This solves at least
> > > half of the problem, we can now keep polling for those cases where it
> > > makes sense and get the improved performance from it.
> > > 
> > > Since the event loop doesn't know which event will be next, we still do
> > > some unnecessary polling while we're waiting for the slow disk. I made
> > > some attempts to be more clever than just randomly growing and shrinking
> > > the polling time, and even to let callers be explicit about when they
> > > expect a new event, but so far this hasn't resulted in improved
> > > performance or even caused performance regressions. For now, let's just
> > > fix the part that is easy enough to fix, we can revisit the rest later.
> > > 
> > > Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> > > ---
> > >  include/block/aio.h |  1 -
> > >  util/aio-posix.h    |  1 +
> > >  util/aio-posix.c    | 24 +++++++++++++++++++++---
> > >  util/async.c        |  2 --
> > >  4 files changed, 22 insertions(+), 6 deletions(-)
> > > 
> > > diff --git a/include/block/aio.h b/include/block/aio.h
> > > index 49f46e01cb..0ef7ce48e3 100644
> > > --- a/include/block/aio.h
> > > +++ b/include/block/aio.h
> > > @@ -233,7 +233,6 @@ struct AioContext {
> > >      int poll_disable_cnt;
> > >  
> > >      /* Polling mode parameters */
> > > -    AioPolledEvent poll;
> > >      int64_t poll_max_ns;    /* maximum polling time in nanoseconds */
> > >      int64_t poll_grow;      /* polling time growth factor */
> > >      int64_t poll_shrink;    /* polling time shrink factor */
> > > diff --git a/util/aio-posix.h b/util/aio-posix.h
> > > index 4264c518be..82a0201ea4 100644
> > > --- a/util/aio-posix.h
> > > +++ b/util/aio-posix.h
> > > @@ -38,6 +38,7 @@ struct AioHandler {
> > >  #endif
> > >      int64_t poll_idle_timeout; /* when to stop userspace polling */
> > >      bool poll_ready; /* has polling detected an event? */
> > > +    AioPolledEvent poll;
> > >  };
> > >  
> > >  /* Add a handler to a ready list */
> > > diff --git a/util/aio-posix.c b/util/aio-posix.c
> > > index 259827c7ad..2251871c61 100644
> > > --- a/util/aio-posix.c
> > > +++ b/util/aio-posix.c
> > > @@ -579,13 +579,19 @@ static bool run_poll_handlers(AioContext *ctx, AioHandlerList *ready_list,
> > >  static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list,
> > >                            int64_t *timeout)
> > >  {
> > > +    AioHandler *node;
> > >      int64_t max_ns;
> > >  
> > >      if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
> > >          return false;
> > >      }
> > >  
> > > -    max_ns = qemu_soonest_timeout(*timeout, ctx->poll.ns);
> > > +    max_ns = 0;
> > > +    QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
> > > +        max_ns = MAX(max_ns, node->poll.ns);
> > > +    }
> > > +    max_ns = qemu_soonest_timeout(*timeout, max_ns);
> > > +
> > >      if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
> > >          /*
> > >           * Enable poll mode. It pairs with the poll_set_started() in
> > > @@ -721,8 +727,14 @@ bool aio_poll(AioContext *ctx, bool blocking)
> > >  
> > >      /* Adjust polling time */
> > >      if (ctx->poll_max_ns) {
> > > +        AioHandler *node;
> > >          int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
> > > -        adjust_polling_time(ctx, &ctx->poll, block_ns);
> > > +
> > > +        QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
> > > +            if (QLIST_IS_INSERTED(node, node_ready)) {
> > > +                adjust_polling_time(ctx, &node->poll, block_ns);
> > > +            }
> > > +        }
> > >      }
> > >  
> > >      progress |= aio_bh_poll(ctx);
> > > @@ -772,10 +784,16 @@ void aio_context_use_g_source(AioContext *ctx)
> > >  void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
> > >                                   int64_t grow, int64_t shrink, Error **errp)
> > >  {
> > > +    AioHandler *node;
> > > +
> > >      /* No thread synchronization here, it doesn't matter if an incorrect value
> > >       * is used once.
> > >       */
> > 
> > If you respin this series:
> > 
> > This comment is confusing now that qemu_lockcnt_inc() is being used.
> > Lockcnt tells other threads in aio_set_fd_handler() not to remove nodes
> > from the aio_handlers list (because we're traversing the list).
> > 
> > The comment is about the poll state though, not about the aio_handlers
> > list. Moving it down to where poll_max_ns, etc are assigned would make
> > it clearer.
> 
> Yes, I can do that while applying the series.
> 
> Should I add your R-b after making the change?

Yes, please.

Stefan
diff mbox series

Patch

diff --git a/include/block/aio.h b/include/block/aio.h
index 49f46e01cb..0ef7ce48e3 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -233,7 +233,6 @@  struct AioContext {
     int poll_disable_cnt;
 
     /* Polling mode parameters */
-    AioPolledEvent poll;
     int64_t poll_max_ns;    /* maximum polling time in nanoseconds */
     int64_t poll_grow;      /* polling time growth factor */
     int64_t poll_shrink;    /* polling time shrink factor */
diff --git a/util/aio-posix.h b/util/aio-posix.h
index 4264c518be..82a0201ea4 100644
--- a/util/aio-posix.h
+++ b/util/aio-posix.h
@@ -38,6 +38,7 @@  struct AioHandler {
 #endif
     int64_t poll_idle_timeout; /* when to stop userspace polling */
     bool poll_ready; /* has polling detected an event? */
+    AioPolledEvent poll;
 };
 
 /* Add a handler to a ready list */
diff --git a/util/aio-posix.c b/util/aio-posix.c
index 259827c7ad..2251871c61 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -579,13 +579,19 @@  static bool run_poll_handlers(AioContext *ctx, AioHandlerList *ready_list,
 static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list,
                           int64_t *timeout)
 {
+    AioHandler *node;
     int64_t max_ns;
 
     if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
         return false;
     }
 
-    max_ns = qemu_soonest_timeout(*timeout, ctx->poll.ns);
+    max_ns = 0;
+    QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
+        max_ns = MAX(max_ns, node->poll.ns);
+    }
+    max_ns = qemu_soonest_timeout(*timeout, max_ns);
+
     if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
         /*
          * Enable poll mode. It pairs with the poll_set_started() in
@@ -721,8 +727,14 @@  bool aio_poll(AioContext *ctx, bool blocking)
 
     /* Adjust polling time */
     if (ctx->poll_max_ns) {
+        AioHandler *node;
         int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
-        adjust_polling_time(ctx, &ctx->poll, block_ns);
+
+        QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
+            if (QLIST_IS_INSERTED(node, node_ready)) {
+                adjust_polling_time(ctx, &node->poll, block_ns);
+            }
+        }
     }
 
     progress |= aio_bh_poll(ctx);
@@ -772,10 +784,16 @@  void aio_context_use_g_source(AioContext *ctx)
 void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
                                  int64_t grow, int64_t shrink, Error **errp)
 {
+    AioHandler *node;
+
     /* No thread synchronization here, it doesn't matter if an incorrect value
      * is used once.
      */
-    ctx->poll.ns = 0;
+    qemu_lockcnt_inc(&ctx->list_lock);
+    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+        node->poll.ns = 0;
+    }
+    qemu_lockcnt_dec(&ctx->list_lock);
 
     ctx->poll_max_ns = max_ns;
     ctx->poll_grow = grow;
diff --git a/util/async.c b/util/async.c
index 38667ea091..4124a948fd 100644
--- a/util/async.c
+++ b/util/async.c
@@ -609,8 +609,6 @@  AioContext *aio_context_new(Error **errp)
     qemu_rec_mutex_init(&ctx->lock);
     timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);
 
-    ctx->poll.ns = 0;
-
     ctx->poll_max_ns = 0;
     ctx->poll_grow = 0;
     ctx->poll_shrink = 0;