
[4/5] io_uring: add support for batch wait timeout

Message ID 20240819233042.230956-5-axboe@kernel.dk (mailing list archive)
State New
Series Add support for batched min timeout

Commit Message

Jens Axboe Aug. 19, 2024, 11:28 p.m. UTC
Waiting for events with io_uring has two knobs that can be set:

1) The number of events to wake for
2) The timeout associated with the event

Waiting will abort when either of those conditions are met, as expected.

This adds support for a third condition, which is associated with the number
of events to wait for. Applications generally like to handle batches of
completions, and right now they'd set a number of events to wait for and
the timeout for that. If no events have been received but the timeout
triggers, control is returned to the application and it can wait again.
However, if the application doesn't have anything to do until events are
reaped, then it's possible to make this waiting more efficient.

For example, the application may have a latency budget of 50 usecs and
want to handle a batch of 8 requests at a time. If it uses 50 usecs
as the timeout, then it'll be doing 20K context switches per second even
if nothing is happening.

This introduces the notion of min batch wait time. If the min batch wait
time expires, then we'll return to userspace if we have any events at all.
If none are available, the general wait time is applied. Any completion
arriving after the min batch wait time has expired will cause waiting to
stop and return control to the application.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c | 75 +++++++++++++++++++++++++++++++++++++++------
 io_uring/io_uring.h |  2 ++
 2 files changed, 67 insertions(+), 10 deletions(-)
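
As an illustration of the wait semantics described in the commit message
above (an editorial sketch, not code from the patch; the helper name and
signature are hypothetical), the decision of when a wait should return to
userspace looks roughly like this:

	/*
	 * Sketch only: when should a wait return to userspace?
	 * wait_nr     - batch size the application asked for
	 * min_timeout - min batch wait time, relative to start
	 * timeout     - overall wait deadline (absolute)
	 */
	static bool should_return_to_user(unsigned int nr_events, unsigned int wait_nr,
					  ktime_t now, ktime_t start,
					  ktime_t min_timeout, ktime_t timeout)
	{
		/* batch fulfilled: always return */
		if (nr_events >= wait_nr)
			return true;
		/* overall timeout hit: always return */
		if (ktime_after(now, timeout))
			return true;
		/* min batch wait expired: return if we have anything at all */
		if (ktime_after(now, ktime_add(start, min_timeout)))
			return nr_events > 0;
		/* still inside the min batch window with an unfilled batch */
		return false;
	}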

Comments

David Wei Aug. 20, 2024, 9:10 p.m. UTC | #1
On 2024-08-19 16:28, Jens Axboe wrote:
> Waiting for events with io_uring has two knobs that can be set:
> 
> 1) The number of events to wake for
> 2) The timeout associated with the event
> 
> Waiting will abort when either of those conditions are met, as expected.
> 
> This adds support for a third event, which is associated with the number
> of events to wait for. Applications generally like to handle batches of
> completions, and right now they'd set a number of events to wait for and
> the timeout for that. If no events have been received but the timeout
> triggers, control is returned to the application and it can wait again.
> However, if the application doesn't have anything to do until events are
> reaped, then it's possible to make this waiting more efficient.
> 
> For example, the application may have a latency time of 50 usecs and
> wanting to handle a batch of 8 requests at the time. If it uses 50 usecs
> as the timeout, then it'll be doing 20K context switches per second even
> if nothing is happening.
> 
> This introduces the notion of min batch wait time. If the min batch wait
> time expires, then we'll return to userspace if we have any events at all.
> If none are available, the general wait time is applied. Any request
> arriving after the min batch wait time will cause waiting to stop and
> return control to the application.

I think the batch request count should be applied to the min_timeout,
such that:

start_time          min_timeout            timeout
    |--------------------|--------------------|

Return to user between [start_time, min_timeout) if there are wait_nr
completions. Is that checked by io_req_local_work_add(), or by
io_wake_function()?

Return to user between [min_timeout, timeout) if there is at least one
completion.

Return to user at timeout always.

> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  io_uring/io_uring.c | 75 +++++++++++++++++++++++++++++++++++++++------
>  io_uring/io_uring.h |  2 ++
>  2 files changed, 67 insertions(+), 10 deletions(-)
> 
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index ddfbe04c61ed..d09a7c2e1096 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -2363,13 +2363,62 @@ static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
>  	return HRTIMER_NORESTART;
>  }
>  
> +/*
> + * Doing min_timeout portion. If we saw any timeouts, events, or have work,
> + * wake up. If not, and we have a normal timeout, switch to that and keep
> + * sleeping.
> + */
> +static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
> +{
> +	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
> +	struct io_ring_ctx *ctx = iowq->ctx;
> +
> +	/* no general timeout, or shorter, we are done */
> +	if (iowq->timeout == KTIME_MAX ||
> +	    ktime_after(iowq->min_timeout, iowq->timeout))
> +		goto out_wake;
> +	/* work we may need to run, wake function will see if we need to wake */
> +	if (io_has_work(ctx))
> +		goto out_wake;
> +	/* got events since we started waiting, min timeout is done */
> +	if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
> +		goto out_wake;
> +	/* if we have any events and min timeout expired, we're done */
> +	if (io_cqring_events(ctx))
> +		goto out_wake;

How can ctx->rings->cq.tail be modified if the task is sleeping while
waiting for completions? What is doing the work?

> +
> +	/*
> +	 * If using deferred task_work running and application is waiting on
> +	 * more than one request, ensure we reset it now where we are switching
> +	 * to normal sleeps. Any request completion post min_wait should wake
> +	 * the task and return.
> +	 */
> +	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
> +		atomic_set(&ctx->cq_wait_nr, 1);
> +
> +	iowq->t.function = io_cqring_timer_wakeup;
> +	hrtimer_set_expires(timer, iowq->timeout);
> +	return HRTIMER_RESTART;
> +out_wake:
> +	return io_cqring_timer_wakeup(timer);
> +}
> +
>  static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
> -				      clockid_t clock_id)
> +				      clockid_t clock_id, ktime_t start_time)
>  {
> +	ktime_t timeout;
> +
>  	iowq->hit_timeout = 0;
>  	hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS);
> -	iowq->t.function = io_cqring_timer_wakeup;
> -	hrtimer_set_expires_range_ns(&iowq->t, iowq->timeout, 0);
> +	if (iowq->min_timeout) {
> +		timeout = ktime_add_ns(iowq->min_timeout, start_time);
> +		iowq->t.function = io_cqring_min_timer_wakeup;
> +	} else {
> +		timeout = iowq->timeout;
> +		iowq->t.function = io_cqring_timer_wakeup;
> +	}
> +
> +	hrtimer_set_expires_range_ns(&iowq->t, timeout, 0);
>  	hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS);
>  
>  	if (!READ_ONCE(iowq->hit_timeout))
Jens Axboe Aug. 20, 2024, 9:31 p.m. UTC | #2
On 8/20/24 3:10 PM, David Wei wrote:
> On 2024-08-19 16:28, Jens Axboe wrote:
>> Waiting for events with io_uring has two knobs that can be set:
>>
>> 1) The number of events to wake for
>> 2) The timeout associated with the event
>>
>> Waiting will abort when either of those conditions are met, as expected.
>>
>> This adds support for a third event, which is associated with the number
>> of events to wait for. Applications generally like to handle batches of
>> completions, and right now they'd set a number of events to wait for and
>> the timeout for that. If no events have been received but the timeout
>> triggers, control is returned to the application and it can wait again.
>> However, if the application doesn't have anything to do until events are
>> reaped, then it's possible to make this waiting more efficient.
>>
>> For example, the application may have a latency time of 50 usecs and
>> wanting to handle a batch of 8 requests at the time. If it uses 50 usecs
>> as the timeout, then it'll be doing 20K context switches per second even
>> if nothing is happening.
>>
>> This introduces the notion of min batch wait time. If the min batch wait
>> time expires, then we'll return to userspace if we have any events at all.
>> If none are available, the general wait time is applied. Any request
>> arriving after the min batch wait time will cause waiting to stop and
>> return control to the application.
> 
> I think the batch request count should be applied to the min_timeout,
> such that:
> 
> start_time          min_timeout            timeout
>     |--------------------|--------------------|
> 
> Return to user between [start_time, min_timeout) if there are wait_nr
> number of completions, checked by io_req_local_work_add(), or is it
> io_wake_function()?

Right, if we get the batch fulfilled, we should ALWAYS return.

If we have any events and min_timeout expires, return.

If not, sleep the full timeout.

> Return to user between [min_timeout, timeout) if there are at least one
> completion.

Yes

> Return to user at timeout always.

Yes

This should be how it works, and how I described it in the commit
message.
Jens Axboe Aug. 20, 2024, 9:36 p.m. UTC | #3
On 8/20/24 3:10 PM, David Wei wrote:
>> +/*
>> + * Doing min_timeout portion. If we saw any timeouts, events, or have work,
>> + * wake up. If not, and we have a normal timeout, switch to that and keep
>> + * sleeping.
>> + */
>> +static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
>> +{
>> +	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
>> +	struct io_ring_ctx *ctx = iowq->ctx;
>> +
>> +	/* no general timeout, or shorter, we are done */
>> +	if (iowq->timeout == KTIME_MAX ||
>> +	    ktime_after(iowq->min_timeout, iowq->timeout))
>> +		goto out_wake;
>> +	/* work we may need to run, wake function will see if we need to wake */
>> +	if (io_has_work(ctx))
>> +		goto out_wake;
>> +	/* got events since we started waiting, min timeout is done */
>> +	if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
>> +		goto out_wake;
>> +	/* if we have any events and min timeout expired, we're done */
>> +	if (io_cqring_events(ctx))
>> +		goto out_wake;
> 
> How can ctx->rings->cq.tail be modified if the task is sleeping while
> waiting for completions? What is doing the work?

Good question. If we have a min_timeout of <something> and a batch count
of <something>, ideally we don't want to wake the task to process work when a
single completion comes in. And this is how we handle DEFER_TASKRUN, but
for anything else, the task will wake and process items. So it may have
woken up to process an item and posted a completion before this timeout
triggers. If that's the case, and min_timeout has expired (which it has
when this handler is called), then we should wake up and return.
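
Concretely, the mechanism described here is the tail snapshot in the patch:
io_cqring_wait() records cq.tail when the wait starts, and the min-timer
callback compares it against the live tail, so a completion posted while
sleeping (by the waiter itself after running task_work on a !DEFER_TASKRUN
ring, or by another poster) trips the check. Excerpted from the patch:

	/* at wait start, in io_cqring_wait() */
	iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);

	/* in io_cqring_min_timer_wakeup(): any completion posted since the
	 * wait started means the min timeout portion is satisfied */
	if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
		goto out_wake;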
David Wei Aug. 20, 2024, 9:59 p.m. UTC | #4
On 2024-08-20 14:31, Jens Axboe wrote:
> On 8/20/24 3:10 PM, David Wei wrote:
>> On 2024-08-19 16:28, Jens Axboe wrote:
>>> Waiting for events with io_uring has two knobs that can be set:
>>>
>>> 1) The number of events to wake for
>>> 2) The timeout associated with the event
>>>
>>> Waiting will abort when either of those conditions are met, as expected.
>>>
>>> This adds support for a third event, which is associated with the number
>>> of events to wait for. Applications generally like to handle batches of
>>> completions, and right now they'd set a number of events to wait for and
>>> the timeout for that. If no events have been received but the timeout
>>> triggers, control is returned to the application and it can wait again.
>>> However, if the application doesn't have anything to do until events are
>>> reaped, then it's possible to make this waiting more efficient.
>>>
>>> For example, the application may have a latency time of 50 usecs and
>>> wanting to handle a batch of 8 requests at the time. If it uses 50 usecs
>>> as the timeout, then it'll be doing 20K context switches per second even
>>> if nothing is happening.
>>>
>>> This introduces the notion of min batch wait time. If the min batch wait
>>> time expires, then we'll return to userspace if we have any events at all.
>>> If none are available, the general wait time is applied. Any request
>>> arriving after the min batch wait time will cause waiting to stop and
>>> return control to the application.
>>
>> I think the batch request count should be applied to the min_timeout,
>> such that:
>>
>> start_time          min_timeout            timeout
>>     |--------------------|--------------------|
>>
>> Return to user between [start_time, min_timeout) if there are wait_nr
>> number of completions, checked by io_req_local_work_add(), or is it
>> io_wake_function()?
> 
> Right, if we get the batch fulfilled, we should ALWAYS return.
> 
> If we have any events and min_timeout expires, return.
> 
> If not, sleep the full timeout.
> 
>> Return to user between [min_timeout, timeout) if there are at least one
>> completion.
> 
> Yes
> 
>> Return to user at timeout always.
> 
> Yes
> 
> This should be how it works, and how I described it in the commit
> message.
> 

You're right, thanks. With DEFER_TASKRUN, the wakeup either happens in
the timer expired callback io_cqring_min_timer_wakeup(), or in
io_req_local_work_add().

In both cases control returns to after schedule() in
io_cqring_schedule_timeout() and the timer is cancelled.

Is it possible for the two to race at all?
Pavel Begunkov Aug. 20, 2024, 10:08 p.m. UTC | #5
On 8/20/24 22:36, Jens Axboe wrote:
> On 8/20/24 3:10 PM, David Wei wrote:
>>> +/*
>>> + * Doing min_timeout portion. If we saw any timeouts, events, or have work,
>>> + * wake up. If not, and we have a normal timeout, switch to that and keep
>>> + * sleeping.
>>> + */
>>> +static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
>>> +{
>>> +	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
>>> +	struct io_ring_ctx *ctx = iowq->ctx;
>>> +
>>> +	/* no general timeout, or shorter, we are done */
>>> +	if (iowq->timeout == KTIME_MAX ||
>>> +	    ktime_after(iowq->min_timeout, iowq->timeout))
>>> +		goto out_wake;
>>> +	/* work we may need to run, wake function will see if we need to wake */
>>> +	if (io_has_work(ctx))
>>> +		goto out_wake;
>>> +	/* got events since we started waiting, min timeout is done */
>>> +	if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
>>> +		goto out_wake;
>>> +	/* if we have any events and min timeout expired, we're done */
>>> +	if (io_cqring_events(ctx))
>>> +		goto out_wake;
>>
>> How can ctx->rings->cq.tail be modified if the task is sleeping while
>> waiting for completions? What is doing the work?
> 
> Good question. If we have a min_timeout of <something> and a batch count
> of <something>, ideally we don't want to wake the task to process when a
> single completion comes in. And this is how we handle DEFER_TASKRUN, but
> for anything else, the task will wake and process items. So it may have
> woken up to process an item and posted a completion before this timeout
> triggers. If that's the case, and min_timeout has expired (which it has
> when this handler is called), then we should wake up and return.
Also, for !DEFER_TASKRUN, it can be io-wq or another user thread sharing
the ring.
Pavel Begunkov Aug. 20, 2024, 10:46 p.m. UTC | #6
On 8/20/24 00:28, Jens Axboe wrote:
> Waiting for events with io_uring has two knobs that can be set:
> 
> 1) The number of events to wake for
> 2) The timeout associated with the event
> 
> Waiting will abort when either of those conditions are met, as expected.
> 
> This adds support for a third event, which is associated with the number
> of events to wait for. Applications generally like to handle batches of
> completions, and right now they'd set a number of events to wait for and
> the timeout for that. If no events have been received but the timeout
> triggers, control is returned to the application and it can wait again.
> However, if the application doesn't have anything to do until events are
> reaped, then it's possible to make this waiting more efficient.
> 
> For example, the application may have a latency time of 50 usecs and
> wanting to handle a batch of 8 requests at the time. If it uses 50 usecs
> as the timeout, then it'll be doing 20K context switches per second even
> if nothing is happening.
> 
> This introduces the notion of min batch wait time. If the min batch wait
> time expires, then we'll return to userspace if we have any events at all.
> If none are available, the general wait time is applied. Any request
> arriving after the min batch wait time will cause waiting to stop and
> return control to the application.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>   io_uring/io_uring.c | 75 +++++++++++++++++++++++++++++++++++++++------
>   io_uring/io_uring.h |  2 ++
>   2 files changed, 67 insertions(+), 10 deletions(-)
> 
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index ddfbe04c61ed..d09a7c2e1096 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -2363,13 +2363,62 @@ static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
>   	return HRTIMER_NORESTART;
>   }
>   
> +/*
> + * Doing min_timeout portion. If we saw any timeouts, events, or have work,
> + * wake up. If not, and we have a normal timeout, switch to that and keep
> + * sleeping.
> + */
> +static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
> +{
> +	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
> +	struct io_ring_ctx *ctx = iowq->ctx;
> +
> +	/* no general timeout, or shorter, we are done */
> +	if (iowq->timeout == KTIME_MAX ||
> +	    ktime_after(iowq->min_timeout, iowq->timeout))
> +		goto out_wake;
> +	/* work we may need to run, wake function will see if we need to wake */
> +	if (io_has_work(ctx))
> +		goto out_wake;
> +	/* got events since we started waiting, min timeout is done */
> +	if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
> +		goto out_wake;
> +	/* if we have any events and min timeout expired, we're done */
> +	if (io_cqring_events(ctx))
> +		goto out_wake;
> +
> +	/*
> +	 * If using deferred task_work running and application is waiting on
> +	 * more than one request, ensure we reset it now where we are switching
> +	 * to normal sleeps. Any request completion post min_wait should wake
> +	 * the task and return.
> +	 */
> +	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
> +		atomic_set(&ctx->cq_wait_nr, 1);

racy

atomic_set(&ctx->cq_wait_nr, 1);
smp_mb();
if (llist_empty(&ctx->work_llist))
	// wake;


> +
> +	iowq->t.function = io_cqring_timer_wakeup;
> +	hrtimer_set_expires(timer, iowq->timeout);
> +	return HRTIMER_RESTART;
> +out_wake:
> +	return io_cqring_timer_wakeup(timer);
> +}
> +
Pavel Begunkov Aug. 20, 2024, 10:47 p.m. UTC | #7
On 8/20/24 23:46, Pavel Begunkov wrote:
> On 8/20/24 00:28, Jens Axboe wrote:
>> Waiting for events with io_uring has two knobs that can be set:
>>
>> 1) The number of events to wake for
>> 2) The timeout associated with the event
>>
>> Waiting will abort when either of those conditions are met, as expected.
>>
>> This adds support for a third event, which is associated with the number
>> of events to wait for. Applications generally like to handle batches of
>> completions, and right now they'd set a number of events to wait for and
>> the timeout for that. If no events have been received but the timeout
>> triggers, control is returned to the application and it can wait again.
>> However, if the application doesn't have anything to do until events are
>> reaped, then it's possible to make this waiting more efficient.
>>
>> For example, the application may have a latency time of 50 usecs and
>> wanting to handle a batch of 8 requests at the time. If it uses 50 usecs
>> as the timeout, then it'll be doing 20K context switches per second even
>> if nothing is happening.
>>
>> This introduces the notion of min batch wait time. If the min batch wait
>> time expires, then we'll return to userspace if we have any events at all.
>> If none are available, the general wait time is applied. Any request
>> arriving after the min batch wait time will cause waiting to stop and
>> return control to the application.
>>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> ---
>>   io_uring/io_uring.c | 75 +++++++++++++++++++++++++++++++++++++++------
>>   io_uring/io_uring.h |  2 ++
>>   2 files changed, 67 insertions(+), 10 deletions(-)
>>
>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>> index ddfbe04c61ed..d09a7c2e1096 100644
>> --- a/io_uring/io_uring.c
>> +++ b/io_uring/io_uring.c
>> @@ -2363,13 +2363,62 @@ static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
>>       return HRTIMER_NORESTART;
>>   }
>> +/*
>> + * Doing min_timeout portion. If we saw any timeouts, events, or have work,
>> + * wake up. If not, and we have a normal timeout, switch to that and keep
>> + * sleeping.
>> + */
>> +static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
>> +{
>> +    struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
>> +    struct io_ring_ctx *ctx = iowq->ctx;
>> +
>> +    /* no general timeout, or shorter, we are done */
>> +    if (iowq->timeout == KTIME_MAX ||
>> +        ktime_after(iowq->min_timeout, iowq->timeout))
>> +        goto out_wake;
>> +    /* work we may need to run, wake function will see if we need to wake */
>> +    if (io_has_work(ctx))
>> +        goto out_wake;
>> +    /* got events since we started waiting, min timeout is done */
>> +    if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
>> +        goto out_wake;
>> +    /* if we have any events and min timeout expired, we're done */
>> +    if (io_cqring_events(ctx))
>> +        goto out_wake;
>> +
>> +    /*
>> +     * If using deferred task_work running and application is waiting on
>> +     * more than one request, ensure we reset it now where we are switching
>> +     * to normal sleeps. Any request completion post min_wait should wake
>> +     * the task and return.
>> +     */
>> +    if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
>> +        atomic_set(&ctx->cq_wait_nr, 1);
> 
> racy
> 
> atomic_set(&ctx->cq_wait_nr, 1);
> smp_mb();
> if (llist_empty(&ctx->work_llist))
>      // wake;

rather if _not_ empty


  
>> +
>> +    iowq->t.function = io_cqring_timer_wakeup;
>> +    hrtimer_set_expires(timer, iowq->timeout);
>> +    return HRTIMER_RESTART;
>> +out_wake:
>> +    return io_cqring_timer_wakeup(timer);
>> +}
>> +
Jens Axboe Aug. 20, 2024, 10:58 p.m. UTC | #8
On 8/20/24 4:47 PM, Pavel Begunkov wrote:
> On 8/20/24 23:46, Pavel Begunkov wrote:
>> On 8/20/24 00:28, Jens Axboe wrote:
>>> Waiting for events with io_uring has two knobs that can be set:
>>>
>>> 1) The number of events to wake for
>>> 2) The timeout associated with the event
>>>
>>> Waiting will abort when either of those conditions are met, as expected.
>>>
>>> This adds support for a third event, which is associated with the number
>>> of events to wait for. Applications generally like to handle batches of
>>> completions, and right now they'd set a number of events to wait for and
>>> the timeout for that. If no events have been received but the timeout
>>> triggers, control is returned to the application and it can wait again.
>>> However, if the application doesn't have anything to do until events are
>>> reaped, then it's possible to make this waiting more efficient.
>>>
>>> For example, the application may have a latency time of 50 usecs and
>>> wanting to handle a batch of 8 requests at the time. If it uses 50 usecs
>>> as the timeout, then it'll be doing 20K context switches per second even
>>> if nothing is happening.
>>>
>>> This introduces the notion of min batch wait time. If the min batch wait
>>> time expires, then we'll return to userspace if we have any events at all.
>>> If none are available, the general wait time is applied. Any request
>>> arriving after the min batch wait time will cause waiting to stop and
>>> return control to the application.
>>>
>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>> ---
>>>   io_uring/io_uring.c | 75 +++++++++++++++++++++++++++++++++++++++------
>>>   io_uring/io_uring.h |  2 ++
>>>   2 files changed, 67 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>>> index ddfbe04c61ed..d09a7c2e1096 100644
>>> --- a/io_uring/io_uring.c
>>> +++ b/io_uring/io_uring.c
>>> @@ -2363,13 +2363,62 @@ static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
>>>       return HRTIMER_NORESTART;
>>>   }
>>> +/*
>>> + * Doing min_timeout portion. If we saw any timeouts, events, or have work,
>>> + * wake up. If not, and we have a normal timeout, switch to that and keep
>>> + * sleeping.
>>> + */
>>> +static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
>>> +{
>>> +    struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
>>> +    struct io_ring_ctx *ctx = iowq->ctx;
>>> +
>>> +    /* no general timeout, or shorter, we are done */
>>> +    if (iowq->timeout == KTIME_MAX ||
>>> +        ktime_after(iowq->min_timeout, iowq->timeout))
>>> +        goto out_wake;
>>> +    /* work we may need to run, wake function will see if we need to wake */
>>> +    if (io_has_work(ctx))
>>> +        goto out_wake;
>>> +    /* got events since we started waiting, min timeout is done */
>>> +    if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
>>> +        goto out_wake;
>>> +    /* if we have any events and min timeout expired, we're done */
>>> +    if (io_cqring_events(ctx))
>>> +        goto out_wake;
>>> +
>>> +    /*
>>> +     * If using deferred task_work running and application is waiting on
>>> +     * more than one request, ensure we reset it now where we are switching
>>> +     * to normal sleeps. Any request completion post min_wait should wake
>>> +     * the task and return.
>>> +     */
>>> +    if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
>>> +        atomic_set(&ctx->cq_wait_nr, 1);
>>
>> racy
>>
>> atomic_set(&ctx->cq_wait_nr, 1);
>> smp_mb();
>> if (llist_empty(&ctx->work_llist))
>>      // wake;
> 
> rather if _not_ empty

Yep that one was a given :-)

Updated it, we'll punt to out_wake at that point.
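
A minimal sketch of what that update might look like (hypothetical; based
on Pavel's pseudo-code plus his "not empty" correction, not the actual next
iteration of the patch):

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 1);
		/* pairs with the barrier on the task_work add side */
		smp_mb();
		/* work was queued around the reset of the counter: wake now */
		if (!llist_empty(&ctx->work_llist))
			goto out_wake;
	}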
Pavel Begunkov Aug. 21, 2024, 12:08 a.m. UTC | #9
On 8/20/24 23:58, Jens Axboe wrote:
> On 8/20/24 4:47 PM, Pavel Begunkov wrote:
>> On 8/20/24 23:46, Pavel Begunkov wrote:
>>> On 8/20/24 00:28, Jens Axboe wrote:
>>>> Waiting for events with io_uring has two knobs that can be set:
>>>>
>>>> 1) The number of events to wake for
>>>> 2) The timeout associated with the event
>>>>
>>>> Waiting will abort when either of those conditions are met, as expected.
>>>>
>>>> This adds support for a third event, which is associated with the number
>>>> of events to wait for. Applications generally like to handle batches of
>>>> completions, and right now they'd set a number of events to wait for and
>>>> the timeout for that. If no events have been received but the timeout
>>>> triggers, control is returned to the application and it can wait again.
>>>> However, if the application doesn't have anything to do until events are
>>>> reaped, then it's possible to make this waiting more efficient.
>>>>
>>>> For example, the application may have a latency time of 50 usecs and
>>>> wanting to handle a batch of 8 requests at the time. If it uses 50 usecs
>>>> as the timeout, then it'll be doing 20K context switches per second even
>>>> if nothing is happening.
>>>>
>>>> This introduces the notion of min batch wait time. If the min batch wait
>>>> time expires, then we'll return to userspace if we have any events at all.
>>>> If none are available, the general wait time is applied. Any request
>>>> arriving after the min batch wait time will cause waiting to stop and
>>>> return control to the application.
>>>>
>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>> ---
>>>>    io_uring/io_uring.c | 75 +++++++++++++++++++++++++++++++++++++++------
>>>>    io_uring/io_uring.h |  2 ++
>>>>    2 files changed, 67 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>>>> index ddfbe04c61ed..d09a7c2e1096 100644
>>>> --- a/io_uring/io_uring.c
>>>> +++ b/io_uring/io_uring.c
>>>> @@ -2363,13 +2363,62 @@ static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
>>>>        return HRTIMER_NORESTART;
>>>>    }
>>>> +/*
>>>> + * Doing min_timeout portion. If we saw any timeouts, events, or have work,
>>>> + * wake up. If not, and we have a normal timeout, switch to that and keep
>>>> + * sleeping.
>>>> + */
>>>> +static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
>>>> +{
>>>> +    struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
>>>> +    struct io_ring_ctx *ctx = iowq->ctx;
>>>> +
>>>> +    /* no general timeout, or shorter, we are done */
>>>> +    if (iowq->timeout == KTIME_MAX ||
>>>> +        ktime_after(iowq->min_timeout, iowq->timeout))
>>>> +        goto out_wake;
>>>> +    /* work we may need to run, wake function will see if we need to wake */
>>>> +    if (io_has_work(ctx))
>>>> +        goto out_wake;
>>>> +    /* got events since we started waiting, min timeout is done */
>>>> +    if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
>>>> +        goto out_wake;
>>>> +    /* if we have any events and min timeout expired, we're done */
>>>> +    if (io_cqring_events(ctx))
>>>> +        goto out_wake;
>>>> +
>>>> +    /*
>>>> +     * If using deferred task_work running and application is waiting on
>>>> +     * more than one request, ensure we reset it now where we are switching
>>>> +     * to normal sleeps. Any request completion post min_wait should wake
>>>> +     * the task and return.
>>>> +     */
>>>> +    if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
>>>> +        atomic_set(&ctx->cq_wait_nr, 1);
>>>
>>> racy
>>>
>>> atomic_set(&ctx->cq_wait_nr, 1);
>>> smp_mb();
>>> if (llist_empty(&ctx->work_llist))
>>>       // wake;
>>
>> rather if _not_ empty
> 
> Yep that one was a given :-)
> 
> Updated it, we'll punt to out_wake at that point.

Another concern is racing with the task [re]setting ->cq_wait_nr
in io_cqring_wait(), e.g. because of a spurious wake up.
Jens Axboe Aug. 21, 2024, 2:22 p.m. UTC | #10
On 8/20/24 6:08 PM, Pavel Begunkov wrote:
> On 8/20/24 23:58, Jens Axboe wrote:
>> On 8/20/24 4:47 PM, Pavel Begunkov wrote:
>>> On 8/20/24 23:46, Pavel Begunkov wrote:
>>>> On 8/20/24 00:28, Jens Axboe wrote:
>>>>> Waiting for events with io_uring has two knobs that can be set:
>>>>>
>>>>> 1) The number of events to wake for
>>>>> 2) The timeout associated with the event
>>>>>
>>>>> Waiting will abort when either of those conditions are met, as expected.
>>>>>
>>>>> This adds support for a third event, which is associated with the number
>>>>> of events to wait for. Applications generally like to handle batches of
>>>>> completions, and right now they'd set a number of events to wait for and
>>>>> the timeout for that. If no events have been received but the timeout
>>>>> triggers, control is returned to the application and it can wait again.
>>>>> However, if the application doesn't have anything to do until events are
>>>>> reaped, then it's possible to make this waiting more efficient.
>>>>>
>>>>> For example, the application may have a latency time of 50 usecs and
>>>>> wanting to handle a batch of 8 requests at the time. If it uses 50 usecs
>>>>> as the timeout, then it'll be doing 20K context switches per second even
>>>>> if nothing is happening.
>>>>>
>>>>> This introduces the notion of min batch wait time. If the min batch wait
>>>>> time expires, then we'll return to userspace if we have any events at all.
>>>>> If none are available, the general wait time is applied. Any request
>>>>> arriving after the min batch wait time will cause waiting to stop and
>>>>> return control to the application.
>>>>>
>>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>>> ---
>>>>>    io_uring/io_uring.c | 75 +++++++++++++++++++++++++++++++++++++++------
>>>>>    io_uring/io_uring.h |  2 ++
>>>>>    2 files changed, 67 insertions(+), 10 deletions(-)
>>>>>
>>>>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>>>>> index ddfbe04c61ed..d09a7c2e1096 100644
>>>>> --- a/io_uring/io_uring.c
>>>>> +++ b/io_uring/io_uring.c
>>>>> @@ -2363,13 +2363,62 @@ static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
>>>>>        return HRTIMER_NORESTART;
>>>>>    }
>>>>> +/*
>>>>> + * Doing min_timeout portion. If we saw any timeouts, events, or have work,
>>>>> + * wake up. If not, and we have a normal timeout, switch to that and keep
>>>>> + * sleeping.
>>>>> + */
>>>>> +static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
>>>>> +{
>>>>> +    struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
>>>>> +    struct io_ring_ctx *ctx = iowq->ctx;
>>>>> +
>>>>> +    /* no general timeout, or shorter, we are done */
>>>>> +    if (iowq->timeout == KTIME_MAX ||
>>>>> +        ktime_after(iowq->min_timeout, iowq->timeout))
>>>>> +        goto out_wake;
>>>>> +    /* work we may need to run, wake function will see if we need to wake */
>>>>> +    if (io_has_work(ctx))
>>>>> +        goto out_wake;
>>>>> +    /* got events since we started waiting, min timeout is done */
>>>>> +    if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
>>>>> +        goto out_wake;
>>>>> +    /* if we have any events and min timeout expired, we're done */
>>>>> +    if (io_cqring_events(ctx))
>>>>> +        goto out_wake;
>>>>> +
>>>>> +    /*
>>>>> +     * If using deferred task_work running and application is waiting on
>>>>> +     * more than one request, ensure we reset it now where we are switching
>>>>> +     * to normal sleeps. Any request completion post min_wait should wake
>>>>> +     * the task and return.
>>>>> +     */
>>>>> +    if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
>>>>> +        atomic_set(&ctx->cq_wait_nr, 1);
>>>>
>>>> racy
>>>>
>>>> atomic_set(&ctx->cq_wait_nr, 1);
>>>> smp_mb();
>>>> if (llist_empty(&ctx->work_llist))
>>>>       // wake;
>>>
>>> rather if _not_ empty
>>
>> Yep that one was a given :-)
>>
>> Updated it, we'll punt to out_wake at that point.
> 
> Another concern is racing with the task [re]setting ->cq_wait_nr
> in io_cqring_wait(), e.g. because of a spurious wake up.

I tried to close that up somewhat in the next iteration.

Patch

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index ddfbe04c61ed..d09a7c2e1096 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2363,13 +2363,62 @@  static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
+/*
+ * Doing min_timeout portion. If we saw any timeouts, events, or have work,
+ * wake up. If not, and we have a normal timeout, switch to that and keep
+ * sleeping.
+ */
+static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
+{
+	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
+	struct io_ring_ctx *ctx = iowq->ctx;
+
+	/* no general timeout, or shorter, we are done */
+	if (iowq->timeout == KTIME_MAX ||
+	    ktime_after(iowq->min_timeout, iowq->timeout))
+		goto out_wake;
+	/* work we may need to run, wake function will see if we need to wake */
+	if (io_has_work(ctx))
+		goto out_wake;
+	/* got events since we started waiting, min timeout is done */
+	if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
+		goto out_wake;
+	/* if we have any events and min timeout expired, we're done */
+	if (io_cqring_events(ctx))
+		goto out_wake;
+
+	/*
+	 * If using deferred task_work running and application is waiting on
+	 * more than one request, ensure we reset it now where we are switching
+	 * to normal sleeps. Any request completion post min_wait should wake
+	 * the task and return.
+	 */
+	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
+		atomic_set(&ctx->cq_wait_nr, 1);
+
+	iowq->t.function = io_cqring_timer_wakeup;
+	hrtimer_set_expires(timer, iowq->timeout);
+	return HRTIMER_RESTART;
+out_wake:
+	return io_cqring_timer_wakeup(timer);
+}
+
 static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
-				      clockid_t clock_id)
+				      clockid_t clock_id, ktime_t start_time)
 {
+	ktime_t timeout;
+
 	iowq->hit_timeout = 0;
 	hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS);
-	iowq->t.function = io_cqring_timer_wakeup;
-	hrtimer_set_expires_range_ns(&iowq->t, iowq->timeout, 0);
+	if (iowq->min_timeout) {
+		timeout = ktime_add_ns(iowq->min_timeout, start_time);
+		iowq->t.function = io_cqring_min_timer_wakeup;
+	} else {
+		timeout = iowq->timeout;
+		iowq->t.function = io_cqring_timer_wakeup;
+	}
+
+	hrtimer_set_expires_range_ns(&iowq->t, timeout, 0);
 	hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS);
 
 	if (!READ_ONCE(iowq->hit_timeout))
@@ -2383,7 +2432,8 @@  static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
 }
 
 static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
-				     struct io_wait_queue *iowq)
+				     struct io_wait_queue *iowq,
+				     ktime_t start_time)
 {
 	int ret = 0;
 
@@ -2394,8 +2444,8 @@  static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 	 */
 	if (current_pending_io())
 		current->in_iowait = 1;
-	if (iowq->timeout != KTIME_MAX)
-		ret = io_cqring_schedule_timeout(iowq, ctx->clockid);
+	if (iowq->timeout != KTIME_MAX || iowq->min_timeout != KTIME_MAX)
+		ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time);
 	else
 		schedule();
 	current->in_iowait = 0;
@@ -2404,7 +2454,8 @@  static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 
 /* If this returns > 0, the caller should retry */
 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
-					  struct io_wait_queue *iowq)
+					  struct io_wait_queue *iowq,
+					  ktime_t start_time)
 {
 	if (unlikely(READ_ONCE(ctx->check_cq)))
 		return 1;
@@ -2417,7 +2468,7 @@  static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 	if (unlikely(io_should_wake(iowq)))
 		return 0;
 
-	return __io_cqring_wait_schedule(ctx, iowq);
+	return __io_cqring_wait_schedule(ctx, iowq, start_time);
 }
 
 struct ext_arg {
@@ -2435,6 +2486,7 @@  static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 {
 	struct io_wait_queue iowq;
 	struct io_rings *rings = ctx->rings;
+	ktime_t start_time;
 	int ret;
 
 	if (!io_allowed_run_tw(ctx))
@@ -2453,8 +2505,11 @@  static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 	INIT_LIST_HEAD(&iowq.wq.entry);
 	iowq.ctx = ctx;
 	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
+	iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
 	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
+	iowq.min_timeout = 0;
 	iowq.timeout = KTIME_MAX;
+	start_time = io_get_time(ctx);
 
 	if (ext_arg->ts) {
 		struct timespec64 ts;
@@ -2464,7 +2519,7 @@  static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 
 		iowq.timeout = timespec64_to_ktime(ts);
 		if (!(flags & IORING_ENTER_ABS_TIMER))
-			iowq.timeout = ktime_add(iowq.timeout, io_get_time(ctx));
+			iowq.timeout = ktime_add(iowq.timeout, start_time);
 	}
 
 	if (ext_arg->sig) {
@@ -2495,7 +2550,7 @@  static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 							TASK_INTERRUPTIBLE);
 		}
 
-		ret = io_cqring_wait_schedule(ctx, &iowq);
+		ret = io_cqring_wait_schedule(ctx, &iowq, start_time);
 		__set_current_state(TASK_RUNNING);
 		atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
 
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index f95c1b080f4b..65078e641390 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -39,8 +39,10 @@  struct io_wait_queue {
 	struct wait_queue_entry wq;
 	struct io_ring_ctx *ctx;
 	unsigned cq_tail;
+	unsigned cq_min_tail;
 	unsigned nr_timeouts;
 	int hit_timeout;
+	ktime_t min_timeout;
 	ktime_t timeout;
 	struct hrtimer t;