diff mbox series

[3/5] io_uring: implement our own schedule timeout handling

Message ID 20240821141910.204660-4-axboe@kernel.dk (mailing list archive)
State New
Headers show
Series Add support for batched min timeout | expand

Commit Message

Jens Axboe Aug. 21, 2024, 2:16 p.m. UTC
In preparation for having two distinct timeouts, and to avoid waking
the task if we don't need to.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c | 37 ++++++++++++++++++++++++++++++++-----
 io_uring/io_uring.h |  2 ++
 2 files changed, 34 insertions(+), 5 deletions(-)

Comments

Pavel Begunkov Aug. 22, 2024, 1:22 p.m. UTC | #1
On 8/21/24 15:16, Jens Axboe wrote:
> In preparation for having two distinct timeouts, and to avoid waking
> the task if we don't need to.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>   io_uring/io_uring.c | 37 ++++++++++++++++++++++++++++++++-----
>   io_uring/io_uring.h |  2 ++
>   2 files changed, 34 insertions(+), 5 deletions(-)
> 
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 9e2b8d4c05db..4ba5292137c3 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
>   	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
>   	 * the task, and the next invocation will do it.
>   	 */
> -	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
> +	if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)

Shouldn't be needed. If the timer fires, it should wake the task,
and the task will check ->hit_timeout there and later remove
itself from the waitqueue.

>   		return autoremove_wake_function(curr, mode, wake_flags, key);
>   	return -1;
>   }
> @@ -2350,6 +2350,34 @@ static bool current_pending_io(void)
>   	return percpu_counter_read_positive(&tctx->inflight);
>   }
...
Jens Axboe Aug. 22, 2024, 3:27 p.m. UTC | #2
On 8/22/24 7:22 AM, Pavel Begunkov wrote:
> On 8/21/24 15:16, Jens Axboe wrote:
>> In preparation for having two distinct timeouts, and to avoid waking
>> the task if we don't need to.
>>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> ---
>>   io_uring/io_uring.c | 37 ++++++++++++++++++++++++++++++++-----
>>   io_uring/io_uring.h |  2 ++
>>   2 files changed, 34 insertions(+), 5 deletions(-)
>>
>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>> index 9e2b8d4c05db..4ba5292137c3 100644
>> --- a/io_uring/io_uring.c
>> +++ b/io_uring/io_uring.c
>> @@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
>>        * Cannot safely flush overflowed CQEs from here, ensure we wake up
>>        * the task, and the next invocation will do it.
>>        */
>> -    if (io_should_wake(iowq) || io_has_work(iowq->ctx))
>> +    if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)
> 
> Shouldn't be needed. If the timer fires, it should wake the task,
> and the task will check ->hit_timeout there and later remove
> itself from the waitqueue.

Good point indeed, I'll kill it.
diff mbox series

Patch

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 9e2b8d4c05db..4ba5292137c3 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2322,7 +2322,7 @@  static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
 	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
 	 * the task, and the next invocation will do it.
 	 */
-	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
+	if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)
 		return autoremove_wake_function(curr, mode, wake_flags, key);
 	return -1;
 }
@@ -2350,6 +2350,34 @@  static bool current_pending_io(void)
 	return percpu_counter_read_positive(&tctx->inflight);
 }
 
+static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
+{
+	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
+
+	WRITE_ONCE(iowq->hit_timeout, 1);
+	wake_up_process(iowq->wq.private);
+	return HRTIMER_NORESTART;
+}
+
+static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
+				      clockid_t clock_id)
+{
+	iowq->hit_timeout = 0;
+	hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS);
+	iowq->t.function = io_cqring_timer_wakeup;
+	hrtimer_set_expires_range_ns(&iowq->t, iowq->timeout, 0);
+	hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS);
+
+	if (!READ_ONCE(iowq->hit_timeout))
+		schedule();
+
+	hrtimer_cancel(&iowq->t);
+	destroy_hrtimer_on_stack(&iowq->t);
+	__set_current_state(TASK_RUNNING);
+
+	return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0;
+}
+
 static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 				     struct io_wait_queue *iowq)
 {
@@ -2362,11 +2390,10 @@  static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 	 */
 	if (current_pending_io())
 		current->in_iowait = 1;
-	if (iowq->timeout == KTIME_MAX)
+	if (iowq->timeout != KTIME_MAX)
+		ret = io_cqring_schedule_timeout(iowq, ctx->clockid);
+	else
 		schedule();
-	else if (!schedule_hrtimeout_range_clock(&iowq->timeout, 0,
-						 HRTIMER_MODE_ABS, ctx->clockid))
-		ret = -ETIME;
 	current->in_iowait = 0;
 	return ret;
 }
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 9935819f12b7..f95c1b080f4b 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -40,7 +40,9 @@  struct io_wait_queue {
 	struct io_ring_ctx *ctx;
 	unsigned cq_tail;
 	unsigned nr_timeouts;
+	int hit_timeout;
 	ktime_t timeout;
+	struct hrtimer t;
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	ktime_t napi_busy_poll_dt;