diff mbox series

[v8,16/16] block/io_uring: adds fd registration

Message ID 20190730173441.26486-17-mehta.aaru20@gmail.com (mailing list archive)
State New, archived
Headers show
Series Add support for io_uring | expand

Commit Message

Aarushi Mehta July 30, 2019, 5:34 p.m. UTC
The fd registration API in io_uring registers a series of fds
together that cannot be modified later. Thus, a hashtable
maintains an index of fds registered and their index in the
internally registered array. The existing fd array is unregistered
and a new one submitted.

Signed-off-by: Aarushi Mehta <mehta.aaru20@gmail.com>
---
 block/io_uring.c   | 86 ++++++++++++++++++++++++++++++++++++++++++++--
 block/trace-events |  1 +
 2 files changed, 85 insertions(+), 2 deletions(-)

Comments

Stefan Hajnoczi July 31, 2019, 10:26 a.m. UTC | #1
On Tue, Jul 30, 2019 at 11:04:41PM +0530, Aarushi Mehta wrote:

I'm concerned about file descriptor leaks.  fd_array[] keeps file
descriptors basically forever, even after the file is no longer in use
by the rest of QEMU.  There needs to be a call to unregister whenever a
file is closed elsewhere in QEMU.  For benchmarking and experimentation
the current code is okay, but for production usage the leak must be
prevented.

> +/**
> + * luring_fd_register:
> + *
> + * Register and unregisters file descriptors, see luring_fd_lookup
> + */
> +static int luring_fd_register(struct io_uring *ring, LuringFd *fd_reg, int fd)
> +{
> +    int ret, nr;
> +    GHashTable *lookup = fd_reg->fd_lookup;
> +    nr = g_hash_table_size(lookup);
> +
> +    /* Unregister */
> +    if (!fd) {
> +        ret = io_uring_unregister_files(ring);
> +        g_hash_table_remove_all(lookup);

Is it correct to clear the hash table be cleared if there was an error?

> +        return ret;
> +    }

Please make unregistering all files a separate function.  It's not
necessary to overload this function since this is a completely separate
operation.

> +
> +    /* If adding new, API requires older registrations to be removed */
> +    if (nr) {
> +        io_uring_unregister_files(ring);
> +    }
> +
> +    fd_reg->fd_array = g_realloc_n(fd_reg->fd_array, nr + 1, sizeof(int));
> +    fd_reg->fd_array[nr] = fd;
> +    fd_reg->fd_index = g_realloc_n(fd_reg->fd_index, nr + 1, sizeof(int));
> +    fd_reg->fd_index[nr] = nr;
> +
> +    g_hash_table_insert(lookup, &fd_reg->fd_array[nr], &fd_reg->fd_index[nr]);

fd_index[] is not necessary, you can cast nr to a gpointer instead to
store the data directly inside GHashTable:

  g_hash_table_insert(lookup, &fd_reg->fd_array[nr],
                      GINT_TO_POINTER(nr));

The hash table accesses can be made slightly more efficient by avoiding
the pointer dereference for keys as well:

  g_hash_table_insert(lookup, GINT_TO_POINTER(fd),
                      GINT_TO_POINTER(nr));

In this case fd_array[] is only used for the io_uring_register_files()
call and nothing else.  Remember to switch to g_direct_equal() and
g_direct_hash() in g_hash_table_new_full() if you make the key a direct
gpointer.

> +    trace_luring_fd_register(fd, nr);
> +    return io_uring_register_files(ring, fd_reg->fd_array, nr + 1);
> +}
> +
> +/**
> + * luring_fd_lookup:
> + *
> + * Used to lookup fd index in registered array at submission time
> + * If the lookup table has not been created or the fd is not in the table,
> + * the fd is registered.
> + *
> + * If registration errors, the hash is cleared and the fd used directly
> + *
> + * Unregistering is done at luring_detach_aio_context
> + */
> +static int luring_fd_lookup(LuringState *s, int fd)
> +{
> +    int *index, ret;
> +    if (!s->fd_reg.fd_lookup) {
> +        s->fd_reg.fd_lookup = g_hash_table_new_full(g_int_hash, g_int_equal,
> +                                                    g_free, g_free);

fd_array[] and fd_index[] are allocated in single allocations for the
entire array, therefore g_free(key) and g_free(value) on individual
elements is undefined behavior and could crash the program.  There
should be no destroy function for them.

Missing g_hash_table_unref() to free fd_lookup.

> +        luring_fd_register(&s->ring, &s->fd_reg, fd);
> +    }
> +    index = g_hash_table_lookup(s->fd_reg.fd_lookup, &fd);
> +
> +    if (!index) {
> +        ret = luring_fd_register(&s->ring, &s->fd_reg, fd);
> +        if (ret < 0) {
> +            g_hash_table_remove_all(s->fd_reg.fd_lookup);

Why is the hash table cleared and why are fd_array[]/fd_index[] left
behind?

> +            return ret;
> +        }
> +        index = g_hash_table_lookup(s->fd_reg.fd_lookup, &fd);
> +    }
> +    return *index;
> +}

What are the concerns about in-flight requests and how are they
addressed?  For example, if a request is in-flight and another request
wants to add a new fd then io_uring_unregister_files() and
io_uring_register_files() are called while a request is still in-flight.
How does the io_uring kernel code handle this?
diff mbox series

Patch

diff --git a/block/io_uring.c b/block/io_uring.c
index e2bef380e7..eb8fd23822 100644
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -45,10 +45,17 @@  typedef struct LuringQueue {
     QSIMPLEQ_HEAD(, LuringAIOCB) submit_queue;
 } LuringQueue;
 
+typedef struct LuringFd {
+    int *fd_array;
+    int *fd_index;
+    GHashTable *fd_lookup;
+} LuringFd;
+
 typedef struct LuringState {
     AioContext *aio_context;
 
     struct io_uring ring;
+    LuringFd fd_reg;
 
     /* io queue for submit at batch.  Protected by AioContext lock. */
     LuringQueue io_q;
@@ -305,6 +312,70 @@  static int ioq_submit(LuringState *s)
     }
     return ret;
 }
+/**
+ * luring_fd_register:
+ *
+ * Register and unregisters file descriptors, see luring_fd_lookup
+ */
+static int luring_fd_register(struct io_uring *ring, LuringFd *fd_reg, int fd)
+{
+    int ret, nr;
+    GHashTable *lookup = fd_reg->fd_lookup;
+    nr = g_hash_table_size(lookup);
+
+    /* Unregister */
+    if (!fd) {
+        ret = io_uring_unregister_files(ring);
+        g_hash_table_remove_all(lookup);
+        return ret;
+    }
+
+    /* If adding new, API requires older registrations to be removed */
+    if (nr) {
+        io_uring_unregister_files(ring);
+    }
+
+    fd_reg->fd_array = g_realloc_n(fd_reg->fd_array, nr + 1, sizeof(int));
+    fd_reg->fd_array[nr] = fd;
+    fd_reg->fd_index = g_realloc_n(fd_reg->fd_index, nr + 1, sizeof(int));
+    fd_reg->fd_index[nr] = nr;
+
+    g_hash_table_insert(lookup, &fd_reg->fd_array[nr], &fd_reg->fd_index[nr]);
+    trace_luring_fd_register(fd, nr);
+    return io_uring_register_files(ring, fd_reg->fd_array, nr + 1);
+}
+
+/**
+ * luring_fd_lookup:
+ *
+ * Used to lookup fd index in registered array at submission time
+ * If the lookup table has not been created or the fd is not in the table,
+ * the fd is registered.
+ *
+ * If registration errors, the hash is cleared and the fd used directly
+ *
+ * Unregistering is done at luring_detach_aio_context
+ */
+static int luring_fd_lookup(LuringState *s, int fd)
+{
+    int *index, ret;
+    if (!s->fd_reg.fd_lookup) {
+        s->fd_reg.fd_lookup = g_hash_table_new_full(g_int_hash, g_int_equal,
+                                                    g_free, g_free);
+        luring_fd_register(&s->ring, &s->fd_reg, fd);
+    }
+    index = g_hash_table_lookup(s->fd_reg.fd_lookup, &fd);
+
+    if (!index) {
+        ret = luring_fd_register(&s->ring, &s->fd_reg, fd);
+        if (ret < 0) {
+            g_hash_table_remove_all(s->fd_reg.fd_lookup);
+            return ret;
+        }
+        index = g_hash_table_lookup(s->fd_reg.fd_lookup, &fd);
+    }
+    return *index;
+}
 
 void luring_io_plug(BlockDriverState *bs, LuringState *s)
 {
@@ -357,7 +428,11 @@  static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
                         __func__, type);
         abort();
     }
+
     io_uring_sqe_set_data(sqes, luringcb);
+    if (s->fd_reg.fd_array) {
+        io_uring_sqe_set_flags(sqes, IOSQE_FIXED_FILE);
+    }
 
     QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next);
     s->io_q.in_queue++;
@@ -374,15 +449,21 @@  static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
 }
 
 int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s, int fd,
                                   uint64_t offset, QEMUIOVector *qiov, int type)
 {
-    int ret;
+    int ret, fd_index;
     LuringAIOCB luringcb = {
         .co         = qemu_coroutine_self(),
         .ret        = -EINPROGRESS,
         .qiov       = qiov,
         .is_read    = (type == QEMU_AIO_READ),
     };
+
+    fd_index = luring_fd_lookup(s, fd);
+    if (fd_index >= 0) {
+        fd = fd_index;
+    }
+
     trace_luring_co_submit(bs, s, &luringcb, fd, offset, qiov ? qiov->size : 0,
                            type);
     ret = luring_do_submit(fd, &luringcb, s, offset, type);
@@ -399,6 +480,7 @@  int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s, int fd,
 
 void luring_detach_aio_context(LuringState *s, AioContext *old_context)
 {
+    luring_fd_register(&s->ring, &s->fd_reg, 0);
     aio_set_fd_handler(old_context, s->ring.ring_fd, false, NULL, NULL, NULL,
                        s);
     qemu_bh_delete(s->completion_bh);
diff --git a/block/trace-events b/block/trace-events
index 66aaf8352b..13571aa182 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -71,6 +71,7 @@  luring_co_submit(void *bs, void *s, void *luringcb, int fd, uint64_t offset, siz
 luring_process_completion(void *s, void *aiocb, int ret) "LuringState %p luringcb %p ret %d"
 luring_io_uring_submit(void *s, int ret) "LuringState %p ret %d"
 luring_resubmit_short_read(void *s, void *luringcb, int nread) "LuringState %p luringcb %p nread %d"
+luring_fd_register(int fd, int index) "fd %d index %d"
 
 # qcow2.c
 qcow2_writev_start_req(void *co, int64_t offset, int bytes) "co %p offset 0x%" PRIx64 " bytes %d"