diff mbox series

[1/4] aio-posix: fix polling mode with fdmon-io_uring

Message ID 20250326183340.1010531-2-stefanha@redhat.com (mailing list archive)
State New
Headers show
Series aio-posix: integrate fdmon into glib event loop | expand

Commit Message

Stefan Hajnoczi March 26, 2025, 6:33 p.m. UTC
The io_uring(7) file descriptor monitor cannot enter polling mode
because it needs to submit a POLL_ADD SQE every time a file descriptor
becomes active. SQEs are only submitted in FDMonOps->wait(), which runs
outside of polling mode.

Fix this using the multi-shot mechanism introduced in Linux 5.13 and
liburing 2.1. Stable and enterprise Linux distros ship 5.14+ as of March
2025, so it is safe to require this. Note that fdmon-io_uring is
currently not enabled at runtime and is not essential, so QEMU can still
be built without it on older hosts.

In multi-shot mode, a POLL_ADD SQE remains active until canceled with
POLL_REMOVE. This avoids the need to submit a new SQE every time a file
descriptor becomes active.

When POLL_REMOVE is processed by the host kernel, the multi-shot
POLL_ADD operation completes with -ECANCELED. Adjust the code slightly
to take this into account.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 meson.build           |  2 +-
 util/fdmon-io_uring.c | 34 +++++++++++++++++++++-------------
 2 files changed, 22 insertions(+), 14 deletions(-)
diff mbox series

Patch

diff --git a/meson.build b/meson.build
index 41f68d3806..9f5f31ac46 100644
--- a/meson.build
+++ b/meson.build
@@ -1144,7 +1144,7 @@  linux_io_uring_test = '''
 
 linux_io_uring = not_found
 if not get_option('linux_io_uring').auto() or have_block
-  linux_io_uring = dependency('liburing', version: '>=0.3',
+  linux_io_uring = dependency('liburing', version: '>=2.1',
                               required: get_option('linux_io_uring'),
                               method: 'pkg-config')
   if not cc.links(linux_io_uring_test)
diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
index b0d68bdc44..6cd665e565 100644
--- a/util/fdmon-io_uring.c
+++ b/util/fdmon-io_uring.c
@@ -124,8 +124,7 @@  static AioHandler *dequeue(AioHandlerSList *head, unsigned *flags)
     /*
      * Don't clear FDMON_IO_URING_REMOVE.  It's sticky so it can serve two
      * purposes: telling fill_sq_ring() to submit IORING_OP_POLL_REMOVE and
-     * telling process_cqe() to delete the AioHandler when its
-     * IORING_OP_POLL_ADD completes.
+     * telling process_cqe() to ignore IORING_OP_POLL_ADD completions.
      */
     *flags = qatomic_fetch_and(&node->flags, ~(FDMON_IO_URING_PENDING |
                                               FDMON_IO_URING_ADD));
@@ -166,12 +165,12 @@  static void fdmon_io_uring_update(AioContext *ctx,
     }
 }
 
-static void add_poll_add_sqe(AioContext *ctx, AioHandler *node)
+static void add_poll_multishot_sqe(AioContext *ctx, AioHandler *node)
 {
     struct io_uring_sqe *sqe = get_sqe(ctx);
     int events = poll_events_from_pfd(node->pfd.events);
 
-    io_uring_prep_poll_add(sqe, node->pfd.fd, events);
+    io_uring_prep_poll_multishot(sqe, node->pfd.fd, events);
     io_uring_sqe_set_data(sqe, node);
 }
 
@@ -213,7 +212,7 @@  static void fill_sq_ring(AioContext *ctx)
     while ((node = dequeue(&submit_list, &flags))) {
         /* Order matters, just in case both flags were set */
         if (flags & FDMON_IO_URING_ADD) {
-            add_poll_add_sqe(ctx, node);
+            add_poll_multishot_sqe(ctx, node);
         }
         if (flags & FDMON_IO_URING_REMOVE) {
             add_poll_remove_sqe(ctx, node);
@@ -234,21 +233,30 @@  static bool process_cqe(AioContext *ctx,
         return false;
     }
 
+    flags = qatomic_read(&node->flags);
+
     /*
-     * Deletion can only happen when IORING_OP_POLL_ADD completes.  If we race
-     * with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE
-     * bit before IORING_OP_POLL_REMOVE is submitted.
+     * poll_multishot cancelled by poll_remove? Or completed early because fd
+     * was closed before poll_remove finished?
      */
-    flags = qatomic_fetch_and(&node->flags, ~FDMON_IO_URING_REMOVE);
-    if (flags & FDMON_IO_URING_REMOVE) {
+    if (cqe->res == -ECANCELED || cqe->res == -EBADF) {
+        assert(!(cqe->flags & IORING_CQE_F_MORE));
+        assert(flags & FDMON_IO_URING_REMOVE);
         QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
         return false;
     }
 
-    aio_add_ready_handler(ready_list, node, pfd_events_from_poll(cqe->res));
+    /* Ignore if it becomes ready during removal */
+    if (flags & FDMON_IO_URING_REMOVE) {
+        return false;
+    }
 
-    /* IORING_OP_POLL_ADD is one-shot so we must re-arm it */
-    add_poll_add_sqe(ctx, node);
+    /* Multi-shot can stop at any time, so re-arm if necessary */
+    if (!(cqe->flags & IORING_CQE_F_MORE)) {
+        add_poll_multishot_sqe(ctx, node);
+    }
+
+    aio_add_ready_handler(ready_list, node, pfd_events_from_poll(cqe->res));
     return true;
 }