diff mbox series

[2/4] io_uring: implement our own schedule timeout handling

Message ID 20240215161002.3044270-3-axboe@kernel.dk (mailing list archive)
State New
Headers show
Series Add support for batched min timeout | expand

Commit Message

Jens Axboe Feb. 15, 2024, 4:06 p.m. UTC
In preparation for having two distinct timeouts and avoid waking the
task if we don't need to.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c | 39 +++++++++++++++++++++++++++++++++++----
 io_uring/io_uring.h |  2 ++
 2 files changed, 37 insertions(+), 4 deletions(-)
diff mbox series

Patch

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 8f52acc14ebc..ebc646ad6acf 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2488,7 +2488,7 @@  static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
 	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
 	 * the task, and the next invocation will do it.
 	 */
-	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
+	if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)
 		return autoremove_wake_function(curr, mode, wake_flags, key);
 	return -1;
 }
@@ -2516,6 +2516,37 @@  static bool current_pending_io(void)
 	return percpu_counter_read_positive(&tctx->inflight);
 }
 
+static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
+{
+	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
+	struct io_ring_ctx *ctx = iowq->ctx;
+
+	WRITE_ONCE(iowq->hit_timeout, 1);
+	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
+		wake_up_process(ctx->submitter_task);
+	else
+		io_cqring_wake(ctx);
+	return HRTIMER_NORESTART;
+}
+
+static int io_cqring_schedule_timeout(struct io_wait_queue *iowq)
+{
+	iowq->hit_timeout = 0;
+	hrtimer_init_on_stack(&iowq->t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	iowq->t.function = io_cqring_timer_wakeup;
+	hrtimer_set_expires_range_ns(&iowq->t, iowq->timeout, 0);
+	hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS);
+
+	if (!READ_ONCE(iowq->hit_timeout))
+		schedule();
+
+	hrtimer_cancel(&iowq->t);
+	destroy_hrtimer_on_stack(&iowq->t);
+	__set_current_state(TASK_RUNNING);
+
+	return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0;
+}
+
 static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 				     struct io_wait_queue *iowq)
 {
@@ -2529,10 +2560,10 @@  static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 	io_wait = current->in_iowait;
 	if (current_pending_io())
 		current->in_iowait = 1;
-	if (iowq->timeout == KTIME_MAX)
+	if (iowq->timeout != KTIME_MAX)
+		ret = io_cqring_schedule_timeout(iowq);
+	else
 		schedule();
-	else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
-		ret = -ETIME;
 	current->in_iowait = io_wait;
 	return ret;
 }
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 6426ee382276..9d1045bdc505 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -40,7 +40,9 @@  struct io_wait_queue {
 	struct io_ring_ctx *ctx;
 	unsigned cq_tail;
 	unsigned nr_timeouts;
+	int hit_timeout;
 	ktime_t timeout;
+	struct hrtimer t;
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	unsigned int napi_busy_poll_to;