Message ID | 20240819233042.230956-4-axboe@kernel.dk (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Add support for batched min timeout | expand |
On 2024-08-19 16:28, Jens Axboe wrote: > In preparation for having two distinct timeouts and avoid waking the > task if we don't need to. > > Signed-off-by: Jens Axboe <axboe@kernel.dk> > --- > io_uring/io_uring.c | 41 ++++++++++++++++++++++++++++++++++++----- > io_uring/io_uring.h | 2 ++ > 2 files changed, 38 insertions(+), 5 deletions(-) > > diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c > index 9e2b8d4c05db..ddfbe04c61ed 100644 > --- a/io_uring/io_uring.c > +++ b/io_uring/io_uring.c > @@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, > * Cannot safely flush overflowed CQEs from here, ensure we wake up > * the task, and the next invocation will do it. > */ > - if (io_should_wake(iowq) || io_has_work(iowq->ctx)) > + if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout) iowq->hit_timeout may be modified in a timer softirq context, while this wait_queue_func_t (AIUI) may get called from any context e.g. net_rx_softirq for sockets. Does this need a READ_ONLY()? > return autoremove_wake_function(curr, mode, wake_flags, key); > return -1; > } > @@ -2350,6 +2350,38 @@ static bool current_pending_io(void) > return percpu_counter_read_positive(&tctx->inflight); > } > > +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) > +{ > + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); > + struct io_ring_ctx *ctx = iowq->ctx; > + > + WRITE_ONCE(iowq->hit_timeout, 1); > + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) > + wake_up_process(ctx->submitter_task); > + else > + io_cqring_wake(ctx); This is a bit different to schedule_hrtimeout_range_clock(). Why is io_cqring_wake() needed here for non-DEFER_TASKRUN? > + return HRTIMER_NORESTART; > +} > + > +static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, > + clockid_t clock_id) > +{ > + iowq->hit_timeout = 0; > + hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS); > + iowq->t.function = io_cqring_timer_wakeup; > + hrtimer_set_expires_range_ns(&iowq->t, iowq->timeout, 0); > + hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS); > + > + if (!READ_ONCE(iowq->hit_timeout)) > + schedule(); > + > + hrtimer_cancel(&iowq->t); > + destroy_hrtimer_on_stack(&iowq->t); > + __set_current_state(TASK_RUNNING); > + > + return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; > +} > + > static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, > struct io_wait_queue *iowq) > { > @@ -2362,11 +2394,10 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, > */ > if (current_pending_io()) > current->in_iowait = 1; > - if (iowq->timeout == KTIME_MAX) > + if (iowq->timeout != KTIME_MAX) > + ret = io_cqring_schedule_timeout(iowq, ctx->clockid); > + else > schedule(); > - else if (!schedule_hrtimeout_range_clock(&iowq->timeout, 0, > - HRTIMER_MODE_ABS, ctx->clockid)) > - ret = -ETIME; > current->in_iowait = 0; > return ret; > } > diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h > index 9935819f12b7..f95c1b080f4b 100644 > --- a/io_uring/io_uring.h > +++ b/io_uring/io_uring.h > @@ -40,7 +40,9 @@ struct io_wait_queue { > struct io_ring_ctx *ctx; > unsigned cq_tail; > unsigned nr_timeouts; > + int hit_timeout; > ktime_t timeout; > + struct hrtimer t; > > #ifdef CONFIG_NET_RX_BUSY_POLL > ktime_t napi_busy_poll_dt;
On 8/20/24 2:08 PM, David Wei wrote: > On 2024-08-19 16:28, Jens Axboe wrote: >> In preparation for having two distinct timeouts and avoid waking the >> task if we don't need to. >> >> Signed-off-by: Jens Axboe <axboe@kernel.dk> >> --- >> io_uring/io_uring.c | 41 ++++++++++++++++++++++++++++++++++++----- >> io_uring/io_uring.h | 2 ++ >> 2 files changed, 38 insertions(+), 5 deletions(-) >> >> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c >> index 9e2b8d4c05db..ddfbe04c61ed 100644 >> --- a/io_uring/io_uring.c >> +++ b/io_uring/io_uring.c >> @@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, >> * Cannot safely flush overflowed CQEs from here, ensure we wake up >> * the task, and the next invocation will do it. >> */ >> - if (io_should_wake(iowq) || io_has_work(iowq->ctx)) >> + if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout) > > iowq->hit_timeout may be modified in a timer softirq context, while this > wait_queue_func_t (AIUI) may get called from any context e.g. > net_rx_softirq for sockets. Does this need a READ_ONLY()? Yes probably not a bad idea to make it READ_ONCE(). >> return autoremove_wake_function(curr, mode, wake_flags, key); >> return -1; >> } >> @@ -2350,6 +2350,38 @@ static bool current_pending_io(void) >> return percpu_counter_read_positive(&tctx->inflight); >> } >> >> +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) >> +{ >> + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); >> + struct io_ring_ctx *ctx = iowq->ctx; >> + >> + WRITE_ONCE(iowq->hit_timeout, 1); >> + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) >> + wake_up_process(ctx->submitter_task); >> + else >> + io_cqring_wake(ctx); > > This is a bit different to schedule_hrtimeout_range_clock(). Why is > io_cqring_wake() needed here for non-DEFER_TASKRUN? That's how the wakeups work - for defer taskrun, the task isn't on a waitqueue at all. Hence we need to wake the task itself. For any other setup, they will be on the waitqueue, and we just call io_cqring_wake() to wake up anyone waiting on the waitqueue. That will iterate the wake queue and call handlers for each item. Having a separate handler for that will allow to NOT wake up the task if we don't need to. taskrun, the waker
On 2024-08-20 14:34, Jens Axboe wrote: > On 8/20/24 2:08 PM, David Wei wrote: >> On 2024-08-19 16:28, Jens Axboe wrote: >>> In preparation for having two distinct timeouts and avoid waking the >>> task if we don't need to. >>> >>> Signed-off-by: Jens Axboe <axboe@kernel.dk> >>> --- >>> io_uring/io_uring.c | 41 ++++++++++++++++++++++++++++++++++++----- >>> io_uring/io_uring.h | 2 ++ >>> 2 files changed, 38 insertions(+), 5 deletions(-) >>> >>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c >>> index 9e2b8d4c05db..ddfbe04c61ed 100644 >>> --- a/io_uring/io_uring.c >>> +++ b/io_uring/io_uring.c >>> @@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, >>> * Cannot safely flush overflowed CQEs from here, ensure we wake up >>> * the task, and the next invocation will do it. >>> */ >>> - if (io_should_wake(iowq) || io_has_work(iowq->ctx)) >>> + if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout) >> >> iowq->hit_timeout may be modified in a timer softirq context, while this >> wait_queue_func_t (AIUI) may get called from any context e.g. >> net_rx_softirq for sockets. Does this need a READ_ONLY()? > > Yes probably not a bad idea to make it READ_ONCE(). > >>> return autoremove_wake_function(curr, mode, wake_flags, key); >>> return -1; >>> } >>> @@ -2350,6 +2350,38 @@ static bool current_pending_io(void) >>> return percpu_counter_read_positive(&tctx->inflight); >>> } >>> >>> +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) >>> +{ >>> + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); >>> + struct io_ring_ctx *ctx = iowq->ctx; >>> + >>> + WRITE_ONCE(iowq->hit_timeout, 1); >>> + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) >>> + wake_up_process(ctx->submitter_task); >>> + else >>> + io_cqring_wake(ctx); >> >> This is a bit different to schedule_hrtimeout_range_clock(). Why is >> io_cqring_wake() needed here for non-DEFER_TASKRUN? > > That's how the wakeups work - for defer taskrun, the task isn't on a > waitqueue at all. Hence we need to wake the task itself. For any other > setup, they will be on the waitqueue, and we just call io_cqring_wake() > to wake up anyone waiting on the waitqueue. That will iterate the wake > queue and call handlers for each item. Having a separate handler for > that will allow to NOT wake up the task if we don't need to. > taskrun, the waker To rephase the question, why is the original code calling schedule_hrtimeout_range_clock() not needing to differentiate behaviour between defer taskrun and not?
On 8/20/24 3:37 PM, David Wei wrote: > On 2024-08-20 14:34, Jens Axboe wrote: >> On 8/20/24 2:08 PM, David Wei wrote: >>> On 2024-08-19 16:28, Jens Axboe wrote: >>>> In preparation for having two distinct timeouts and avoid waking the >>>> task if we don't need to. >>>> >>>> Signed-off-by: Jens Axboe <axboe@kernel.dk> >>>> --- >>>> io_uring/io_uring.c | 41 ++++++++++++++++++++++++++++++++++++----- >>>> io_uring/io_uring.h | 2 ++ >>>> 2 files changed, 38 insertions(+), 5 deletions(-) >>>> >>>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c >>>> index 9e2b8d4c05db..ddfbe04c61ed 100644 >>>> --- a/io_uring/io_uring.c >>>> +++ b/io_uring/io_uring.c >>>> @@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, >>>> * Cannot safely flush overflowed CQEs from here, ensure we wake up >>>> * the task, and the next invocation will do it. >>>> */ >>>> - if (io_should_wake(iowq) || io_has_work(iowq->ctx)) >>>> + if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout) >>> >>> iowq->hit_timeout may be modified in a timer softirq context, while this >>> wait_queue_func_t (AIUI) may get called from any context e.g. >>> net_rx_softirq for sockets. Does this need a READ_ONLY()? >> >> Yes probably not a bad idea to make it READ_ONCE(). >> >>>> return autoremove_wake_function(curr, mode, wake_flags, key); >>>> return -1; >>>> } >>>> @@ -2350,6 +2350,38 @@ static bool current_pending_io(void) >>>> return percpu_counter_read_positive(&tctx->inflight); >>>> } >>>> >>>> +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) >>>> +{ >>>> + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); >>>> + struct io_ring_ctx *ctx = iowq->ctx; >>>> + >>>> + WRITE_ONCE(iowq->hit_timeout, 1); >>>> + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) >>>> + wake_up_process(ctx->submitter_task); >>>> + else >>>> + io_cqring_wake(ctx); >>> >>> This is a bit different to schedule_hrtimeout_range_clock(). Why is >>> io_cqring_wake() needed here for non-DEFER_TASKRUN? >> >> That's how the wakeups work - for defer taskrun, the task isn't on a >> waitqueue at all. Hence we need to wake the task itself. For any other >> setup, they will be on the waitqueue, and we just call io_cqring_wake() >> to wake up anyone waiting on the waitqueue. That will iterate the wake >> queue and call handlers for each item. Having a separate handler for >> that will allow to NOT wake up the task if we don't need to. >> taskrun, the waker > > To rephase the question, why is the original code calling > schedule_hrtimeout_range_clock() not needing to differentiate behaviour > between defer taskrun and not? Because that part is the same, the task schedules out and goes to sleep. That has always been the same regardless of how the ring is setup. Only difference is that DEFER_TASKRUN doesn't add itself to ctx->wait, and hence cannot be woken by a wake_up(ctx->wait). We have to wake the task manually.
On 8/20/24 22:39, Jens Axboe wrote: > On 8/20/24 3:37 PM, David Wei wrote: >> On 2024-08-20 14:34, Jens Axboe wrote: >>> On 8/20/24 2:08 PM, David Wei wrote: >>>> On 2024-08-19 16:28, Jens Axboe wrote: >>>>> In preparation for having two distinct timeouts and avoid waking the >>>>> task if we don't need to. >>>>> >>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk> >>>>> --- >>>>> io_uring/io_uring.c | 41 ++++++++++++++++++++++++++++++++++++----- >>>>> io_uring/io_uring.h | 2 ++ >>>>> 2 files changed, 38 insertions(+), 5 deletions(-) >>>>> >>>>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c >>>>> index 9e2b8d4c05db..ddfbe04c61ed 100644 >>>>> --- a/io_uring/io_uring.c >>>>> +++ b/io_uring/io_uring.c >>>>> @@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, >>>>> * Cannot safely flush overflowed CQEs from here, ensure we wake up >>>>> * the task, and the next invocation will do it. >>>>> */ >>>>> - if (io_should_wake(iowq) || io_has_work(iowq->ctx)) >>>>> + if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout) >>>> >>>> iowq->hit_timeout may be modified in a timer softirq context, while this >>>> wait_queue_func_t (AIUI) may get called from any context e.g. >>>> net_rx_softirq for sockets. Does this need a READ_ONLY()? >>> >>> Yes probably not a bad idea to make it READ_ONCE(). >>> >>>>> return autoremove_wake_function(curr, mode, wake_flags, key); >>>>> return -1; >>>>> } >>>>> @@ -2350,6 +2350,38 @@ static bool current_pending_io(void) >>>>> return percpu_counter_read_positive(&tctx->inflight); >>>>> } >>>>> >>>>> +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) >>>>> +{ >>>>> + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); >>>>> + struct io_ring_ctx *ctx = iowq->ctx; >>>>> + >>>>> + WRITE_ONCE(iowq->hit_timeout, 1); >>>>> + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) >>>>> + wake_up_process(ctx->submitter_task); >>>>> + else >>>>> + io_cqring_wake(ctx); >>>> >>>> This is a bit different to schedule_hrtimeout_range_clock(). Why is >>>> io_cqring_wake() needed here for non-DEFER_TASKRUN? >>> >>> That's how the wakeups work - for defer taskrun, the task isn't on a >>> waitqueue at all. Hence we need to wake the task itself. For any other >>> setup, they will be on the waitqueue, and we just call io_cqring_wake() >>> to wake up anyone waiting on the waitqueue. That will iterate the wake >>> queue and call handlers for each item. Having a separate handler for >>> that will allow to NOT wake up the task if we don't need to. >>> taskrun, the waker >> >> To rephase the question, why is the original code calling >> schedule_hrtimeout_range_clock() not needing to differentiate behaviour >> between defer taskrun and not? > > Because that part is the same, the task schedules out and goes to sleep. > That has always been the same regardless of how the ring is setup. Only > difference is that DEFER_TASKRUN doesn't add itself to ctx->wait, and > hence cannot be woken by a wake_up(ctx->wait). We have to wake the task > manually. Answering the question, for !IORING_SETUP_DEFER_TASKRUN, before it was waking only the task that started the timeout. Now, io_cqring_wake(ctx) wakes all waiters, so if there are multiple tasks waiting, a timeout of one waiter will try to wake all of them. I believe it's unintentional, but I don't think we care either. You can save the waiter task struct and wake it specifically.
On 2024-08-20 14:39, Jens Axboe wrote: > On 8/20/24 3:37 PM, David Wei wrote: >> On 2024-08-20 14:34, Jens Axboe wrote: >>> On 8/20/24 2:08 PM, David Wei wrote: >>>> On 2024-08-19 16:28, Jens Axboe wrote: >>>>> In preparation for having two distinct timeouts and avoid waking the >>>>> task if we don't need to. >>>>> >>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk> >>>>> --- >>>>> io_uring/io_uring.c | 41 ++++++++++++++++++++++++++++++++++++----- >>>>> io_uring/io_uring.h | 2 ++ >>>>> 2 files changed, 38 insertions(+), 5 deletions(-) >>>>> >>>>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c >>>>> index 9e2b8d4c05db..ddfbe04c61ed 100644 >>>>> --- a/io_uring/io_uring.c >>>>> +++ b/io_uring/io_uring.c >>>>> @@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, >>>>> * Cannot safely flush overflowed CQEs from here, ensure we wake up >>>>> * the task, and the next invocation will do it. >>>>> */ >>>>> - if (io_should_wake(iowq) || io_has_work(iowq->ctx)) >>>>> + if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout) >>>> >>>> iowq->hit_timeout may be modified in a timer softirq context, while this >>>> wait_queue_func_t (AIUI) may get called from any context e.g. >>>> net_rx_softirq for sockets. Does this need a READ_ONLY()? >>> >>> Yes probably not a bad idea to make it READ_ONCE(). >>> >>>>> return autoremove_wake_function(curr, mode, wake_flags, key); >>>>> return -1; >>>>> } >>>>> @@ -2350,6 +2350,38 @@ static bool current_pending_io(void) >>>>> return percpu_counter_read_positive(&tctx->inflight); >>>>> } >>>>> >>>>> +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) >>>>> +{ >>>>> + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); >>>>> + struct io_ring_ctx *ctx = iowq->ctx; >>>>> + >>>>> + WRITE_ONCE(iowq->hit_timeout, 1); >>>>> + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) >>>>> + wake_up_process(ctx->submitter_task); >>>>> + else >>>>> + io_cqring_wake(ctx); >>>> >>>> This is a bit different to schedule_hrtimeout_range_clock(). Why is >>>> io_cqring_wake() needed here for non-DEFER_TASKRUN? >>> >>> That's how the wakeups work - for defer taskrun, the task isn't on a >>> waitqueue at all. Hence we need to wake the task itself. For any other >>> setup, they will be on the waitqueue, and we just call io_cqring_wake() >>> to wake up anyone waiting on the waitqueue. That will iterate the wake >>> queue and call handlers for each item. Having a separate handler for >>> that will allow to NOT wake up the task if we don't need to. >>> taskrun, the waker >> >> To rephase the question, why is the original code calling >> schedule_hrtimeout_range_clock() not needing to differentiate behaviour >> between defer taskrun and not? > > Because that part is the same, the task schedules out and goes to sleep. > That has always been the same regardless of how the ring is setup. Only > difference is that DEFER_TASKRUN doesn't add itself to ctx->wait, and > hence cannot be woken by a wake_up(ctx->wait). We have to wake the task > manually. > io_cqring_timer_wakeup() is the timer expired callback which calls wake_up_process() or io_cqring_wake() depending on DEFER_TASKRUN. The original code calling schedule_hrtimeout_range_clock() uses hrtimer_sleeper instead, which has a default timer expired callback set to hrtimer_wakeup(). hrtimer_wakeup() only calls wake_up_process(). My question is: why this asymmetry? Why does the new custom callback require io_cqring_wake()?
On 8/20/24 23:06, David Wei wrote: > On 2024-08-20 14:39, Jens Axboe wrote: >> On 8/20/24 3:37 PM, David Wei wrote: >>> On 2024-08-20 14:34, Jens Axboe wrote: >>>> On 8/20/24 2:08 PM, David Wei wrote: >>>>> On 2024-08-19 16:28, Jens Axboe wrote: >>>>>> In preparation for having two distinct timeouts and avoid waking the >>>>>> task if we don't need to. >>>>>> >>>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk> >>>>>> --- >>>>>> io_uring/io_uring.c | 41 ++++++++++++++++++++++++++++++++++++----- >>>>>> io_uring/io_uring.h | 2 ++ >>>>>> 2 files changed, 38 insertions(+), 5 deletions(-) >>>>>> >>>>>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c >>>>>> index 9e2b8d4c05db..ddfbe04c61ed 100644 >>>>>> --- a/io_uring/io_uring.c >>>>>> +++ b/io_uring/io_uring.c >>>>>> @@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, >>>>>> * Cannot safely flush overflowed CQEs from here, ensure we wake up >>>>>> * the task, and the next invocation will do it. >>>>>> */ >>>>>> - if (io_should_wake(iowq) || io_has_work(iowq->ctx)) >>>>>> + if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout) >>>>> >>>>> iowq->hit_timeout may be modified in a timer softirq context, while this >>>>> wait_queue_func_t (AIUI) may get called from any context e.g. >>>>> net_rx_softirq for sockets. Does this need a READ_ONLY()? >>>> >>>> Yes probably not a bad idea to make it READ_ONCE(). >>>> >>>>>> return autoremove_wake_function(curr, mode, wake_flags, key); >>>>>> return -1; >>>>>> } >>>>>> @@ -2350,6 +2350,38 @@ static bool current_pending_io(void) >>>>>> return percpu_counter_read_positive(&tctx->inflight); >>>>>> } >>>>>> >>>>>> +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) >>>>>> +{ >>>>>> + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); >>>>>> + struct io_ring_ctx *ctx = iowq->ctx; >>>>>> + >>>>>> + WRITE_ONCE(iowq->hit_timeout, 1); >>>>>> + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) >>>>>> + wake_up_process(ctx->submitter_task); >>>>>> + else >>>>>> + io_cqring_wake(ctx); >>>>> >>>>> This is a bit different to schedule_hrtimeout_range_clock(). Why is >>>>> io_cqring_wake() needed here for non-DEFER_TASKRUN? >>>> >>>> That's how the wakeups work - for defer taskrun, the task isn't on a >>>> waitqueue at all. Hence we need to wake the task itself. For any other >>>> setup, they will be on the waitqueue, and we just call io_cqring_wake() >>>> to wake up anyone waiting on the waitqueue. That will iterate the wake >>>> queue and call handlers for each item. Having a separate handler for >>>> that will allow to NOT wake up the task if we don't need to. >>>> taskrun, the waker >>> >>> To rephase the question, why is the original code calling >>> schedule_hrtimeout_range_clock() not needing to differentiate behaviour >>> between defer taskrun and not? >> >> Because that part is the same, the task schedules out and goes to sleep. >> That has always been the same regardless of how the ring is setup. Only >> difference is that DEFER_TASKRUN doesn't add itself to ctx->wait, and >> hence cannot be woken by a wake_up(ctx->wait). We have to wake the task >> manually. >> > > io_cqring_timer_wakeup() is the timer expired callback which calls > wake_up_process() or io_cqring_wake() depending on DEFER_TASKRUN. > > The original code calling schedule_hrtimeout_range_clock() uses > hrtimer_sleeper instead, which has a default timer expired callback set > to hrtimer_wakeup(). > > hrtimer_wakeup() only calls wake_up_process(). > > My question is: why this asymmetry? Why does the new custom callback > require io_cqring_wake()? That's what I'm saying, it doesn't need and doesn't really want it. From the correctness point of view, it's ok since we wake up a (unnecessarily) larger set of tasks.
On 2024-08-20 15:13, Pavel Begunkov wrote: >>>>> On 8/20/24 2:08 PM, David Wei wrote: >>>> To rephase the question, why is the original code calling >>>> schedule_hrtimeout_range_clock() not needing to differentiate behaviour >>>> between defer taskrun and not? >>> >>> Because that part is the same, the task schedules out and goes to sleep. >>> That has always been the same regardless of how the ring is setup. Only >>> difference is that DEFER_TASKRUN doesn't add itself to ctx->wait, and >>> hence cannot be woken by a wake_up(ctx->wait). We have to wake the task >>> manually. >>> >> >> io_cqring_timer_wakeup() is the timer expired callback which calls >> wake_up_process() or io_cqring_wake() depending on DEFER_TASKRUN. >> >> The original code calling schedule_hrtimeout_range_clock() uses >> hrtimer_sleeper instead, which has a default timer expired callback set >> to hrtimer_wakeup(). >> >> hrtimer_wakeup() only calls wake_up_process(). >> >> My question is: why this asymmetry? Why does the new custom callback >> require io_cqring_wake()? > > That's what I'm saying, it doesn't need and doesn't really want it. > From the correctness point of view, it's ok since we wake up a > (unnecessarily) larger set of tasks. > Yeah your explanation that came in while I was writing the email answered it, thanks Pavel.
On 8/20/24 4:14 PM, David Wei wrote: > On 2024-08-20 15:13, Pavel Begunkov wrote: >>>>>> On 8/20/24 2:08 PM, David Wei wrote: >>>>> To rephase the question, why is the original code calling >>>>> schedule_hrtimeout_range_clock() not needing to differentiate behaviour >>>>> between defer taskrun and not? >>>> >>>> Because that part is the same, the task schedules out and goes to sleep. >>>> That has always been the same regardless of how the ring is setup. Only >>>> difference is that DEFER_TASKRUN doesn't add itself to ctx->wait, and >>>> hence cannot be woken by a wake_up(ctx->wait). We have to wake the task >>>> manually. >>>> >>> >>> io_cqring_timer_wakeup() is the timer expired callback which calls >>> wake_up_process() or io_cqring_wake() depending on DEFER_TASKRUN. >>> >>> The original code calling schedule_hrtimeout_range_clock() uses >>> hrtimer_sleeper instead, which has a default timer expired callback set >>> to hrtimer_wakeup(). >>> >>> hrtimer_wakeup() only calls wake_up_process(). >>> >>> My question is: why this asymmetry? Why does the new custom callback >>> require io_cqring_wake()? >> >> That's what I'm saying, it doesn't need and doesn't really want it. >> From the correctness point of view, it's ok since we wake up a >> (unnecessarily) larger set of tasks. >> > > Yeah your explanation that came in while I was writing the email > answered it, thanks Pavel. Hah and now I see what you meant - yeah we can just remove the distinction. I didn't see anything in testing, but I also didn't have multiple tasks waiting on a ring, nor would you. So it doesn't really matter, but I'll clean it up so there's no confusion.
On 8/20/24 4:19 PM, Jens Axboe wrote: > On 8/20/24 4:14 PM, David Wei wrote: >> On 2024-08-20 15:13, Pavel Begunkov wrote: >>>>>>> On 8/20/24 2:08 PM, David Wei wrote: >>>>>> To rephase the question, why is the original code calling >>>>>> schedule_hrtimeout_range_clock() not needing to differentiate behaviour >>>>>> between defer taskrun and not? >>>>> >>>>> Because that part is the same, the task schedules out and goes to sleep. >>>>> That has always been the same regardless of how the ring is setup. Only >>>>> difference is that DEFER_TASKRUN doesn't add itself to ctx->wait, and >>>>> hence cannot be woken by a wake_up(ctx->wait). We have to wake the task >>>>> manually. >>>>> >>>> >>>> io_cqring_timer_wakeup() is the timer expired callback which calls >>>> wake_up_process() or io_cqring_wake() depending on DEFER_TASKRUN. >>>> >>>> The original code calling schedule_hrtimeout_range_clock() uses >>>> hrtimer_sleeper instead, which has a default timer expired callback set >>>> to hrtimer_wakeup(). >>>> >>>> hrtimer_wakeup() only calls wake_up_process(). >>>> >>>> My question is: why this asymmetry? Why does the new custom callback >>>> require io_cqring_wake()? >>> >>> That's what I'm saying, it doesn't need and doesn't really want it. >>> From the correctness point of view, it's ok since we wake up a >>> (unnecessarily) larger set of tasks. >>> >> >> Yeah your explanation that came in while I was writing the email >> answered it, thanks Pavel. > > Hah and now I see what you meant - yeah we can just remove the > distinction. I didn't see anything in testing, but I also didn't have > multiple tasks waiting on a ring, nor would you. So it doesn't really > matter, but I'll clean it up so there's no confusion. Actually probably better to just leave it as-is, as we'd otherwise need to store a task in io_wait_queue. Which we of course could, and would remove a branch in there...
On 8/20/24 4:51 PM, Jens Axboe wrote: > On 8/20/24 4:19 PM, Jens Axboe wrote: >> On 8/20/24 4:14 PM, David Wei wrote: >>> On 2024-08-20 15:13, Pavel Begunkov wrote: >>>>>>>> On 8/20/24 2:08 PM, David Wei wrote: >>>>>>> To rephase the question, why is the original code calling >>>>>>> schedule_hrtimeout_range_clock() not needing to differentiate behaviour >>>>>>> between defer taskrun and not? >>>>>> >>>>>> Because that part is the same, the task schedules out and goes to sleep. >>>>>> That has always been the same regardless of how the ring is setup. Only >>>>>> difference is that DEFER_TASKRUN doesn't add itself to ctx->wait, and >>>>>> hence cannot be woken by a wake_up(ctx->wait). We have to wake the task >>>>>> manually. >>>>>> >>>>> >>>>> io_cqring_timer_wakeup() is the timer expired callback which calls >>>>> wake_up_process() or io_cqring_wake() depending on DEFER_TASKRUN. >>>>> >>>>> The original code calling schedule_hrtimeout_range_clock() uses >>>>> hrtimer_sleeper instead, which has a default timer expired callback set >>>>> to hrtimer_wakeup(). >>>>> >>>>> hrtimer_wakeup() only calls wake_up_process(). >>>>> >>>>> My question is: why this asymmetry? Why does the new custom callback >>>>> require io_cqring_wake()? >>>> >>>> That's what I'm saying, it doesn't need and doesn't really want it. >>>> From the correctness point of view, it's ok since we wake up a >>>> (unnecessarily) larger set of tasks. >>>> >>> >>> Yeah your explanation that came in while I was writing the email >>> answered it, thanks Pavel. >> >> Hah and now I see what you meant - yeah we can just remove the >> distinction. I didn't see anything in testing, but I also didn't have >> multiple tasks waiting on a ring, nor would you. So it doesn't really >> matter, but I'll clean it up so there's no confusion. > > Actually probably better to just leave it as-is, as we'd otherwise need > to store a task in io_wait_queue. Which we of course could, and would > remove a branch in there... I guess I should actually look at the code first, we have it via wq->private already. Hence: diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ddfbe04c61ed..4ba5292137c3 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2353,13 +2353,9 @@ static bool current_pending_io(void) static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) { struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); - struct io_ring_ctx *ctx = iowq->ctx; WRITE_ONCE(iowq->hit_timeout, 1); - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) - wake_up_process(ctx->submitter_task); - else - io_cqring_wake(ctx); + wake_up_process(iowq->wq.private); return HRTIMER_NORESTART; }
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9e2b8d4c05db..ddfbe04c61ed 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, * Cannot safely flush overflowed CQEs from here, ensure we wake up * the task, and the next invocation will do it. */ - if (io_should_wake(iowq) || io_has_work(iowq->ctx)) + if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout) return autoremove_wake_function(curr, mode, wake_flags, key); return -1; } @@ -2350,6 +2350,38 @@ static bool current_pending_io(void) return percpu_counter_read_positive(&tctx->inflight); } +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) +{ + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); + struct io_ring_ctx *ctx = iowq->ctx; + + WRITE_ONCE(iowq->hit_timeout, 1); + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + wake_up_process(ctx->submitter_task); + else + io_cqring_wake(ctx); + return HRTIMER_NORESTART; +} + +static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, + clockid_t clock_id) +{ + iowq->hit_timeout = 0; + hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS); + iowq->t.function = io_cqring_timer_wakeup; + hrtimer_set_expires_range_ns(&iowq->t, iowq->timeout, 0); + hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS); + + if (!READ_ONCE(iowq->hit_timeout)) + schedule(); + + hrtimer_cancel(&iowq->t); + destroy_hrtimer_on_stack(&iowq->t); + __set_current_state(TASK_RUNNING); + + return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; +} + static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { @@ -2362,11 +2394,10 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, */ if (current_pending_io()) current->in_iowait = 1; - if (iowq->timeout == KTIME_MAX) + if (iowq->timeout != KTIME_MAX) + ret = io_cqring_schedule_timeout(iowq, ctx->clockid); + else schedule(); - else if (!schedule_hrtimeout_range_clock(&iowq->timeout, 0, - HRTIMER_MODE_ABS, ctx->clockid)) - ret = -ETIME; current->in_iowait = 0; return ret; } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 9935819f12b7..f95c1b080f4b 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -40,7 +40,9 @@ struct io_wait_queue { struct io_ring_ctx *ctx; unsigned cq_tail; unsigned nr_timeouts; + int hit_timeout; ktime_t timeout; + struct hrtimer t; #ifdef CONFIG_NET_RX_BUSY_POLL ktime_t napi_busy_poll_dt;
In preparation for having two distinct timeouts and avoid waking the task if we don't need to. Signed-off-by: Jens Axboe <axboe@kernel.dk> --- io_uring/io_uring.c | 41 ++++++++++++++++++++++++++++++++++++----- io_uring/io_uring.h | 2 ++ 2 files changed, 38 insertions(+), 5 deletions(-)