diff mbox series

io_uring: allow application controlled CQ ring size

Message ID 0188a3ff-6a41-1a95-f444-2ef308a83f7a@kernel.dk (mailing list archive)
State New, archived
Headers show
Series io_uring: allow application controlled CQ ring size | expand

Commit Message

Jens Axboe Oct. 4, 2019, 7:18 p.m. UTC
We currently size the CQ ring as twice the SQ ring, to allow some
flexibility in not overflowing the CQ ring. This is done because the
SQE life time is different than that of the IO request itself, the SQE
is consumed as soon as the kernel has seen the entry.

Certain application don't need a huge SQ ring size, since they just
submit IO in batches. But they may have a lot of requests pending, and
hence need a big CQ ring to hold them all. By allowing the application
to control the CQ ring size multiplier, we can cater to those
applications more efficiently.

If an application wants to define its own CQ ring size, it must set
IORING_SETUP_CQSIZE in the setup flags, and fill out
io_uring_params->cq_entries. The value must be a power of two.

Signed-off-by: Jens Axboe <axboe@kernel.dk>

---
diff mbox series

Patch

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8afb0b689523..bfbb7ab3c4e4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -76,6 +76,7 @@ 
 #include "internal.h"
 
 #define IORING_MAX_ENTRIES	32768
+#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
 #define IORING_MAX_FIXED_FILES	1024
 
 struct io_uring {
@@ -3984,10 +3985,23 @@  static int io_uring_create(unsigned entries, struct io_uring_params *p)
 	 * Use twice as many entries for the CQ ring. It's possible for the
 	 * application to drive a higher depth than the size of the SQ ring,
 	 * since the sqes are only used at submission time. This allows for
-	 * some flexibility in overcommitting a bit.
+	 * some flexibility in overcommitting a bit. If the application has
+	 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
+	 * of CQ ring entries manually.
 	 */
 	p->sq_entries = roundup_pow_of_two(entries);
-	p->cq_entries = 2 * p->sq_entries;
+	if (p->flags & IORING_SETUP_CQSIZE) {
+		/*
+		 * If IORING_SETUP_CQSIZE is set, we do the same roundup
+		 * to a power-of-two, if it isn't already. We do NOT impose
+		 * any cq vs sq ring sizing.
+		 */
+		if (!p->cq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES)
+			return -EINVAL;
+		p->cq_entries = roundup_pow_of_two(p->cq_entries);
+	} else {
+		p->cq_entries = 2 * p->sq_entries;
+	}
 
 	user = get_uid(current_user());
 	account_mem = !capable(CAP_IPC_LOCK);
@@ -4068,7 +4082,7 @@  static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 	}
 
 	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
-			IORING_SETUP_SQ_AFF))
+			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE))
 		return -EINVAL;
 
 	ret = io_uring_create(entries, &p);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 4f532d9c0554..e0137ea6ad79 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -50,6 +50,7 @@  struct io_uring_sqe {
 #define IORING_SETUP_IOPOLL	(1U << 0)	/* io_context is polled */
 #define IORING_SETUP_SQPOLL	(1U << 1)	/* SQ poll thread */
 #define IORING_SETUP_SQ_AFF	(1U << 2)	/* sq_thread_cpu is valid */
+#define IORING_SETUP_CQSIZE	(1U << 3)	/* app defines CQ size */
 
 #define IORING_OP_NOP		0
 #define IORING_OP_READV		1