diff mbox series

[v9,16/17] block/io_uring: adds fd registration

Message ID 20190801234031.29561-17-mehta.aaru20@gmail.com (mailing list archive)
State New, archived
Headers show
Series Add support for io_uring | expand

Commit Message

Aarushi Mehta Aug. 1, 2019, 11:40 p.m. UTC
Signed-off-by: Aarushi Mehta <mehta.aaru20@gmail.com>
---
 block/io_uring.c   | 107 ++++++++++++++++++++++++++++++++++++++++++++-
 block/trace-events |   1 +
 2 files changed, 107 insertions(+), 1 deletion(-)

Comments

Stefan Hajnoczi Aug. 2, 2019, 8:03 a.m. UTC | #1
On Fri, Aug 02, 2019 at 05:10:30AM +0530, Aarushi Mehta wrote:

The fd lifecycle/leak issue remains.  After a drive is removed the
kernel still has a reference to the file.  If this repeats many times
our process will run out of open files.

A callback in block/file-posix.c is required to unregister the file
descriptor:

  static void raw_aio_detach_aio_context(BlockDriverState *bs)
  {
  #ifdef CONFIG_LINUX_IO_URING
      BDRVRawState *s = bs->opaque;
      LuringState *luring;

      luring = aio_get_linux_io_uring(bdrv_get_aio_context(bs));

      if (luring && s->fd >= 0) {
          luring_fd_unregister(luring, s->fd);
      }
  #endif
  }

I think this should eliminate fd leaks, but please test it.  You can use
drive_add/drive_del and device_add/device_del to hotplug and unplug
-drive and -device objects on the HMP monitor.  Use "ls -l /proc/PID/fd"
to see the list of currently open files.

> +    g_hash_table_insert(lookup, GINT_TO_POINTER(fd), GINT_TO_POINTER(nr));
> +    trace_luring_fd_register(fd, nr);

This trace event can be made even more useful by including
io_uring_register_files()'s return value so we know whether the kernel
accepted fd_array[] or not.

> +    return io_uring_register_files(ring, fd_reg->fd_array, nr + 1);
> +}
> +/**
> + * luring_fd_unregister:
> + *
> + * Unregisters file descriptors, TODO: error handling
> + */
> +static void luring_fd_unregister(LuringState *s)
> +{
> +        io_uring_unregister_files(&s->ring);
> +        g_hash_table_unref(s->fd_reg.fd_lookup);
> +        g_free(s->fd_reg.fd_array);

Please use 4-space indentation.

This is missing s->fd_reg.fd_array = NULL after the g_free().  Without it,
the next g_realloc_n() would try to reallocate a freed pointer instead of
allocating a fresh array.

> +}
> +
> +/**
> + * luring_fd_lookup:
> + *
> + * Used to lookup fd index in registered array at submission time
> + * If the lookup table has not been created or the fd is not in the table,
> + * the fd is registered.
> + *
> + * If registration errors, the hash is cleared and the fd used directly
> + *
> + * Unregistering is done at luring_detach_aio_context
> + */
> +static int luring_fd_lookup(LuringState *s, int fd)
> +{
> +    int ret;
> +    void *index;
> +    GHashTable *lookup;
> +
> +    if (!s->fd_reg.fd_lookup) {
> +        s->fd_reg.fd_lookup = g_hash_table_new_full(g_direct_hash,
> +                                                    g_direct_equal,
> +                                                    g_free, g_free);
> +        luring_fd_register(&s->ring, &s->fd_reg, fd);
> +    }

This if statement can be eliminated:

  static void luring_fd_init(LuringState *s)
  {
      s->fd_reg.fd_lookup = g_hash_table_new_full(g_direct_hash,
                                                  g_direct_equal,
                                                  g_free, g_free);
  }

  static void luring_fd_cleanup(LuringState *s)
  {
      io_uring_unregister_files(&s->ring);
      g_hash_table_unref(s->fd_reg.fd_lookup);
      g_free(s->fd_reg.fd_array);
      s->fd_reg.fd_array = NULL;
  }

Call luring_fd_init() from luring_attach_aio_context() and call
luring_fd_cleanup() from luring_detach_aio_context().  This makes
luring_fd_lookup() simpler and gives a nice symmetry to attach/detach.

luring_fd_cleanup() is just luring_fd_unregister() renamed.

> +    lookup = s->fd_reg.fd_lookup;
> +    index = g_hash_table_lookup(lookup, GINT_TO_POINTER(fd));
> +
> +    if (index < 0) {
> +        ret = luring_fd_register(&s->ring, &s->fd_reg, fd);
> +
> +        if (ret < 0) {
> +            if (ret == -ENOMEM || ret == -EMFILE ||
> +                ret == -ENXIO) {
> +                return ret;
> +            } else {
> +                /* Should not reach here */
> +                g_hash_table_remove_all(lookup);
> +                g_free(s->fd_reg.fd_array);
> +                return ret;

I suggest making luring_fd_register() clean up after itself when an
error occurs.  Then you can change this code to:

  if (ret < 0) {
      return ret;
  }

It's usually convenient for a function to clean up after itself instead
of relying on the caller to do it since only the function knows exactly
what state has been modified so far.

The luring_fd_register() code becomes:

  ret = io_uring_register_files(ring, fd_reg->fd_array, nr + 1);
  if (ret == -ENOMEM || ret == -EMFILE || ret == -ENXIO) {
      /* Leave fd_array[] alone, fd will be overwritten next time anyway */
      g_hash_table_remove(lookup, GINT_TO_POINTER(fd));
  } else if (ret < 0) {
      /* A more severe error, clear out all registered fds */
      g_hash_table_remove_all(lookup);
      g_free(fd_reg->fd_array);
      fd_reg->fd_array = NULL;
  }
  return ret;

> +            }
> +        }
> +        index = g_hash_table_lookup(lookup, GINT_TO_POINTER(fd));

One final idea: make luring_fd_register() return the index on success so
callers don't need to look up the key again.  In luring_fd_register():

  if (ret < 0) {
      return ret;
  } else {
      return nr;
  }
diff mbox series

Patch

diff --git a/block/io_uring.c b/block/io_uring.c
index 86f32e18a1..1553cd2e58 100644
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -45,10 +45,16 @@  typedef struct LuringQueue {
     QSIMPLEQ_HEAD(, LuringAIOCB) submit_queue;
 } LuringQueue;
 
+typedef struct LuringFd {
+    int *fd_array;
+    GHashTable *fd_lookup;
+} LuringFd;
+
 typedef struct LuringState {
     AioContext *aio_context;
 
     struct io_uring ring;
+    LuringFd fd_reg;
 
     /* io queue for submit at batch.  Protected by AioContext lock. */
     LuringQueue io_q;
@@ -306,6 +312,94 @@  static int ioq_submit(LuringState *s)
     return ret;
 }
 
+/**
+ * luring_fd_register:
+ *
+ * Register file descriptors, see luring_fd_lookup
+ */
+static int luring_fd_register(struct io_uring *ring, LuringFd *fd_reg, int fd)
+{
+    int ret, nr;
+    GHashTable *lookup = fd_reg->fd_lookup;
+    nr = g_hash_table_size(lookup);
+
+    /* If adding new, API requires older registrations to be removed */
+    if (nr) {
+        /*
+         * See linux b19062a56726, register needs the ring mutex, any
+         * submission in progress will complete before unregistering begins
+         * and new ones will have to wait.
+         */
+        ret = io_uring_unregister_files(ring);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    fd_reg->fd_array = g_realloc_n(fd_reg->fd_array, nr + 1, sizeof(int));
+    fd_reg->fd_array[nr] = fd;
+
+    g_hash_table_insert(lookup, GINT_TO_POINTER(fd), GINT_TO_POINTER(nr));
+    trace_luring_fd_register(fd, nr);
+    return io_uring_register_files(ring, fd_reg->fd_array, nr + 1);
+}
+/**
+ * luring_fd_unregister:
+ *
+ * Unregisters file descriptors, TODO: error handling
+ */
+static void luring_fd_unregister(LuringState *s)
+{
+        io_uring_unregister_files(&s->ring);
+        g_hash_table_unref(s->fd_reg.fd_lookup);
+        g_free(s->fd_reg.fd_array);
+}
+
+/**
+ * luring_fd_lookup:
+ *
+ * Used to lookup fd index in registered array at submission time
+ * If the lookup table has not been created or the fd is not in the table,
+ * the fd is registered.
+ *
+ * If registration errors, the hash is cleared and the fd used directly
+ *
+ * Unregistering is done at luring_detach_aio_context
+ */
+static int luring_fd_lookup(LuringState *s, int fd)
+{
+    int ret;
+    void *index;
+    GHashTable *lookup;
+
+    if (!s->fd_reg.fd_lookup) {
+        s->fd_reg.fd_lookup = g_hash_table_new_full(g_direct_hash,
+                                                    g_direct_equal,
+                                                    g_free, g_free);
+        luring_fd_register(&s->ring, &s->fd_reg, fd);
+    }
+    lookup = s->fd_reg.fd_lookup;
+    index = g_hash_table_lookup(lookup, GINT_TO_POINTER(fd));
+
+    if (index < 0) {
+        ret = luring_fd_register(&s->ring, &s->fd_reg, fd);
+
+        if (ret < 0) {
+            if (ret == -ENOMEM || ret == -EMFILE ||
+                ret == -ENXIO) {
+                return ret;
+            } else {
+                /* Should not reach here */
+                g_hash_table_remove_all(lookup);
+                g_free(s->fd_reg.fd_array);
+                return ret;
+            }
+        }
+        index = g_hash_table_lookup(lookup, GINT_TO_POINTER(fd));
+    }
+    return GPOINTER_TO_INT(index);
+}
+
 void luring_io_plug(BlockDriverState *bs, LuringState *s)
 {
     trace_luring_io_plug(s);
@@ -337,9 +431,14 @@  void luring_io_unplug(BlockDriverState *bs, LuringState *s)
 static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
                             uint64_t offset, int type)
 {
-    int ret;
+    int ret, fd_index;
     struct io_uring_sqe *sqes = &luringcb->sqeq;
 
+    fd_index = luring_fd_lookup(s, fd);
+    if (fd_index >= 0) {
+        fd = fd_index;
+    }
+
     switch (type) {
     case QEMU_AIO_WRITE:
         io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
@@ -357,7 +456,11 @@  static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
                         __func__, type);
         abort();
     }
+
     io_uring_sqe_set_data(sqes, luringcb);
+    if (fd_index >= 0) {
+        io_uring_sqe_set_flags(sqes, IOSQE_FIXED_FILE);
+    }
 
     QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next);
     s->io_q.in_queue++;
@@ -383,6 +486,7 @@  int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s, int fd,
         .qiov       = qiov,
         .is_read    = (type == QEMU_AIO_READ),
     };
+
     trace_luring_co_submit(bs, s, &luringcb, fd, offset, qiov ? qiov->size : 0,
                            type);
     ret = luring_do_submit(fd, &luringcb, s, offset, type);
@@ -399,6 +503,7 @@  int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s, int fd,
 
 void luring_detach_aio_context(LuringState *s, AioContext *old_context)
 {
+    luring_fd_unregister(s);
     aio_set_fd_handler(old_context, s->ring.ring_fd, false, NULL, NULL, NULL,
                        s);
     qemu_bh_delete(s->completion_bh);
diff --git a/block/trace-events b/block/trace-events
index 66aaf8352b..13571aa182 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -71,6 +71,7 @@  luring_co_submit(void *bs, void *s, void *luringcb, int fd, uint64_t offset, siz
 luring_process_completion(void *s, void *aiocb, int ret) "LuringState %p luringcb %p ret %d"
 luring_io_uring_submit(void *s, int ret) "LuringState %p ret %d"
 luring_resubmit_short_read(void *s, void *luringcb, int nread) "LuringState %p luringcb %p nread %d"
+luring_fd_register(int fd, int index) "fd %d index %d"
 
 # qcow2.c
 qcow2_writev_start_req(void *co, int64_t offset, int bytes) "co %p offset 0x%" PRIx64 " bytes %d"