diff mbox series

[RFC,3/3] io_uring: allow waiting loop to ignore some CQEs

Message ID a15bb014ecc67b004c2bd2283758c5ab3987e54a.1731205010.git.asml.silence@gmail.com (mailing list archive)
State New
Headers show
Series request parameter set api and wait termination tuning | expand

Commit Message

Pavel Begunkov Nov. 10, 2024, 2:56 p.m. UTC
The user might not care about getting results of certain requests, but
they will still wake up the task (i.e. task_work) and cause the
waiting loop to terminate.

IOSQE_SET_F_HINT_SILENT attempts to de-prioritise such completions.
The completion will be eventually posted, however the execution of the
request can and likely will be delayed to batch it with other requests.

It's an incomplete prototype, it only works with DEFER_TASKRUN, fails to
apply the optimisation for task_works queued before the waiting loop
starts, and interaction with IOSQE_SET_F_HINT_IGNORE_INLINE is likely
broken.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 include/uapi/linux/io_uring.h |  1 +
 io_uring/io_uring.c           | 43 +++++++++++++++++++++++------------
 io_uring/register.c           |  3 ++-
 3 files changed, 31 insertions(+), 16 deletions(-)
diff mbox series

Patch

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index e6d10fba8ae2..6dff0ee4e20c 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -901,6 +901,7 @@  struct io_uring_recvmsg_out {
 
 enum {
 	IOSQE_SET_F_HINT_IGNORE_INLINE		= 1,
+	IOSQE_SET_F_HINT_SILENT			= 2,
 };
 
 struct io_uring_ioset_reg {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 6e89435c243d..2e1af10fd4f2 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1270,6 +1270,7 @@  static inline void io_req_local_work_add(struct io_kiocb *req,
 {
 	unsigned nr_wait, nr_tw, nr_tw_prev;
 	struct llist_node *head;
+	bool ignore = req->ioset->flags & IOSQE_SET_F_HINT_SILENT;
 
 	/* See comment above IO_CQ_WAKE_INIT */
 	BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES);
@@ -1297,13 +1298,17 @@  static inline void io_req_local_work_add(struct io_kiocb *req,
 			nr_tw_prev = READ_ONCE(first_req->nr_tw);
 		}
 
-		/*
-		 * Theoretically, it can overflow, but that's fine as one of
-		 * previous adds should've tried to wake the task.
-		 */
-		nr_tw = nr_tw_prev + 1;
-		if (!(flags & IOU_F_TWQ_LAZY_WAKE))
-			nr_tw = IO_CQ_WAKE_FORCE;
+		nr_tw = nr_tw_prev;
+
+		if (!ignore) {
+			/*
+			 * Theoretically, it can overflow, but that's fine as
+			 * one of previous adds should've tried to wake the task.
+			 */
+			nr_tw += 1;
+			if (!(flags & IOU_F_TWQ_LAZY_WAKE))
+				nr_tw = IO_CQ_WAKE_FORCE;
+		}
 
 		req->nr_tw = nr_tw;
 		req->io_task_work.node.next = head;
@@ -1325,6 +1330,9 @@  static inline void io_req_local_work_add(struct io_kiocb *req,
 			io_eventfd_signal(ctx);
 	}
 
+	if (ignore)
+		return;
+
 	nr_wait = atomic_read(&ctx->cq_wait_nr);
 	/* not enough or no one is waiting */
 	if (nr_tw < nr_wait)
@@ -1405,7 +1413,7 @@  static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
 }
 
 static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts,
-			       int min_events)
+			       int min_events, struct io_wait_queue *waitq)
 {
 	struct llist_node *node;
 	unsigned int loops = 0;
@@ -1425,6 +1433,10 @@  static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts,
 		struct llist_node *next = node->next;
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 						    io_task_work.node);
+
+		if (req->ioset->flags & IOSQE_SET_F_HINT_SILENT)
+			waitq->cq_tail++;
+
 		INDIRECT_CALL_2(req->io_task_work.func,
 				io_poll_task_func, io_req_rw_complete,
 				req, ts);
@@ -1450,16 +1462,17 @@  static inline int io_run_local_work_locked(struct io_ring_ctx *ctx,
 
 	if (llist_empty(&ctx->work_llist))
 		return 0;
-	return __io_run_local_work(ctx, &ts, min_events);
+	return __io_run_local_work(ctx, &ts, min_events, NULL);
 }
 
-static int io_run_local_work(struct io_ring_ctx *ctx, int min_events)
+static int io_run_local_work(struct io_ring_ctx *ctx, int min_events,
+			      struct io_wait_queue *waitq)
 {
 	struct io_tw_state ts = {};
 	int ret;
 
 	mutex_lock(&ctx->uring_lock);
-	ret = __io_run_local_work(ctx, &ts, min_events);
+	ret = __io_run_local_work(ctx, &ts, min_events, waitq);
 	mutex_unlock(&ctx->uring_lock);
 	return ret;
 }
@@ -2643,7 +2656,7 @@  int io_run_task_work_sig(struct io_ring_ctx *ctx)
 {
 	if (!llist_empty(&ctx->work_llist)) {
 		__set_current_state(TASK_RUNNING);
-		if (io_run_local_work(ctx, INT_MAX) > 0)
+		if (io_run_local_work(ctx, INT_MAX, NULL) > 0)
 			return 0;
 	}
 	if (io_run_task_work() > 0)
@@ -2806,7 +2819,7 @@  static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 	if (!io_allowed_run_tw(ctx))
 		return -EEXIST;
 	if (!llist_empty(&ctx->work_llist))
-		io_run_local_work(ctx, min_events);
+		io_run_local_work(ctx, min_events, NULL);
 	io_run_task_work();
 
 	if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)))
@@ -2877,7 +2890,7 @@  static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 		 * now rather than let the caller do another wait loop.
 		 */
 		if (!llist_empty(&ctx->work_llist))
-			io_run_local_work(ctx, nr_wait);
+			io_run_local_work(ctx, nr_wait, &iowq);
 		io_run_task_work();
 
 		/*
@@ -3389,7 +3402,7 @@  static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 
 	if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
 	    io_allowed_defer_tw_run(ctx))
-		ret |= io_run_local_work(ctx, INT_MAX) > 0;
+		ret |= io_run_local_work(ctx, INT_MAX, NULL) > 0;
 	ret |= io_cancel_defer_files(ctx, tctx, cancel_all);
 	mutex_lock(&ctx->uring_lock);
 	ret |= io_poll_remove_all(ctx, tctx, cancel_all);
diff --git a/io_uring/register.c b/io_uring/register.c
index f87ec7b773bd..5462c49bebd3 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -92,7 +92,8 @@  static int io_update_ioset(struct io_ring_ctx *ctx,
 {
 	if (!(ctx->flags & IORING_SETUP_IOSET))
 		return -EINVAL;
-	if (reg->flags & ~IOSQE_SET_F_HINT_IGNORE_INLINE)
+	if (reg->flags & ~(IOSQE_SET_F_HINT_IGNORE_INLINE |
+			   IOSQE_SET_F_HINT_SILENT))
 		return -EINVAL;
 	if (reg->__resv[0] || reg->__resv[1] || reg->__resv[2])
 		return -EINVAL;