
[v1,1/4] io_uring: only account cqring wait time as iowait if enabled for a ring

Message ID 20240224050735.1759733-1-dw@davidwei.uk (mailing list archive)
State New
Series [v1,1/4] io_uring: only account cqring wait time as iowait if enabled for a ring

Commit Message

David Wei Feb. 24, 2024, 5:07 a.m. UTC
Currently we unconditionally account time spent waiting for events in CQ
ring as iowait time.

Some userspace tools consider iowait time to be CPU util/load which can
be misleading as the process is sleeping. High iowait time might be
indicative of issues for storage IO, but for network IO e.g. socket
recv() we do not control when the completions happen so its value
misleads userspace tooling.

This patch gates the previously unconditional iowait accounting behind a
new IORING_REGISTER opcode. By default time is not accounted as iowait,
unless this is explicitly enabled for a ring. Thus userspace can decide,
depending on the type of work it expects to do, whether it wants to
consider cqring wait time as iowait or not.

Signed-off-by: David Wei <dw@davidwei.uk>
---
 include/linux/io_uring_types.h |  1 +
 include/uapi/linux/io_uring.h  |  3 +++
 io_uring/io_uring.c            |  9 +++++----
 io_uring/register.c            | 17 +++++++++++++++++
 4 files changed, 26 insertions(+), 4 deletions(-)
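
For reference, a minimal sketch of the userspace side this enables, assuming the patch is applied. IORING_REGISTER_IOWAIT and its value are taken from the hunks below and are not in released uapi headers, so the sketch defines the opcode itself and uses the raw syscall:

#include <sys/syscall.h>
#include <unistd.h>

#ifndef IORING_REGISTER_IOWAIT
#define IORING_REGISTER_IOWAIT	29	/* from this patch */
#endif

/*
 * Enable/disable iowait accounting for one ring. Per the patch, arg
 * must be NULL and nr_args carries the on/off value; the return value
 * is the previous setting, or -1 with errno set on error.
 */
static int ring_set_iowait(int ring_fd, int enable)
{
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_IOWAIT, NULL, enable ? 1 : 0);
}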

Comments

Jens Axboe Feb. 24, 2024, 3:28 p.m. UTC | #1
On 2/23/24 10:07 PM, David Wei wrote:
> Currently we unconditionally account time spent waiting for events in CQ
> ring as iowait time.
> 
> Some userspace tools consider iowait time to be CPU util/load which can
> be misleading as the process is sleeping. High iowait time might be
> indicative of issues for storage IO, but for network IO e.g. socket
> recv() we do not control when the completions happen so its value
> misleads userspace tooling.
> 
> This patch gates the previously unconditional iowait accounting behind a
> new IORING_REGISTER opcode. By default time is not accounted as iowait,
> unless this is explicitly enabled for a ring. Thus userspace can decide,
> depending on the type of work it expects to do, whether it wants to
> consider cqring wait time as iowait or not.

Looks good, thanks!
Pavel Begunkov Feb. 24, 2024, 3:31 p.m. UTC | #2
On 2/24/24 05:07, David Wei wrote:
> Currently we unconditionally account time spent waiting for events in CQ
> ring as iowait time.
> 
> Some userspace tools consider iowait time to be CPU util/load which can
> be misleading as the process is sleeping. High iowait time might be
> indicative of issues for storage IO, but for network IO e.g. socket
> recv() we do not control when the completions happen so its value
> misleads userspace tooling.
> 
> This patch gates the previously unconditional iowait accounting behind a
> new IORING_REGISTER opcode. By default time is not accounted as iowait,
> unless this is explicitly enabled for a ring. Thus userspace can decide,
> depending on the type of work it expects to do, whether it wants to
> consider cqring wait time as iowait or not.

I don't believe it's a sane approach. I think we agree that per-cpu
iowait is a silly and misleading metric. I have a hard time defining
what it is, and I'm sure most of the people complaining wouldn't be
able to either. Now we're taking that metric and exposing even more
knobs to userspace.

Another argument against it is that per-ctx is not the right place
for it. It's a system metric, and you can imagine some system admin
looking at it. Even in cases where it had some meaning without
io_uring, it's now completely meaningless unless you also look at
what flags io_uring has set, and that's too much to ask.

I don't understand why people freak out at seeing high iowait;
IMHO it perfectly fits the definition of io_uring waiting for
IO / completions. But at this point it might be better to just
revert to the old behaviour of not reporting iowait at all.
And if we want to keep the cpufreq iowait optimisation, we
should split the notion of iowait reporting from iowait cpufreq
tuning.


> Signed-off-by: David Wei <dw@davidwei.uk>
> ---
>   include/linux/io_uring_types.h |  1 +
>   include/uapi/linux/io_uring.h  |  3 +++
>   io_uring/io_uring.c            |  9 +++++----
>   io_uring/register.c            | 17 +++++++++++++++++
>   4 files changed, 26 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index bd7071aeec5d..c568e6b8c9f9 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -242,6 +242,7 @@ struct io_ring_ctx {
>   		unsigned int		drain_disabled: 1;
>   		unsigned int		compat: 1;
>   		unsigned int		iowq_limits_set : 1;
> +		unsigned int		iowait_enabled: 1;
>   
>   		struct task_struct	*submitter_task;
>   		struct io_rings		*rings;
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 7bd10201a02b..b068898c2283 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -575,6 +575,9 @@ enum {
>   	IORING_REGISTER_NAPI			= 27,
>   	IORING_UNREGISTER_NAPI			= 28,
>   
> +	/* account time spent in cqring wait as iowait */
> +	IORING_REGISTER_IOWAIT			= 29,
> +
>   	/* this goes last */
>   	IORING_REGISTER_LAST,
>   
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index cf2f514b7cc0..7f8d2a03cce6 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -2533,12 +2533,13 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>   		return 0;
>   
>   	/*
> -	 * Mark us as being in io_wait if we have pending requests, so cpufreq
> -	 * can take into account that the task is waiting for IO - turns out
> -	 * to be important for low QD IO.
> +	 * Mark us as being in io_wait if we have pending requests if enabled
> +	 * via IORING_REGISTER_IOWAIT, so cpufreq can take into account that
> +	 * the task is waiting for IO - turns out to be important for low QD
> +	 * IO.
>   	 */
>   	io_wait = current->in_iowait;
> -	if (current_pending_io())
> +	if (ctx->iowait_enabled && current_pending_io())
>   		current->in_iowait = 1;
>   	ret = 0;
>   	if (iowq->timeout == KTIME_MAX)
> diff --git a/io_uring/register.c b/io_uring/register.c
> index 99c37775f974..fbdf3d3461d8 100644
> --- a/io_uring/register.c
> +++ b/io_uring/register.c
> @@ -387,6 +387,17 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
>   	return ret;
>   }
>   
> +static int io_register_iowait(struct io_ring_ctx *ctx, int val)
> +{
> +	int was_enabled = ctx->iowait_enabled;
> +
> +	if (val)
> +		ctx->iowait_enabled = 1;
> +	else
> +		ctx->iowait_enabled = 0;
> +	return was_enabled;
> +}
> +
>   static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
>   			       void __user *arg, unsigned nr_args)
>   	__releases(ctx->uring_lock)
> @@ -563,6 +574,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
>   			break;
>   		ret = io_unregister_napi(ctx, arg);
>   		break;
> +	case IORING_REGISTER_IOWAIT:
> +		ret = -EINVAL;
> +		if (arg)
> +			break;
> +		ret = io_register_iowait(ctx, nr_args);
> +		break;
>   	default:
>   		ret = -EINVAL;
>   		break;
David Wei Feb. 24, 2024, 5:20 p.m. UTC | #3
On 2024-02-24 07:31, Pavel Begunkov wrote:
> On 2/24/24 05:07, David Wei wrote:
>> Currently we unconditionally account time spent waiting for events in CQ
>> ring as iowait time.
>>
>> Some userspace tools consider iowait time to be CPU util/load which can
>> be misleading as the process is sleeping. High iowait time might be
>> indicative of issues for storage IO, but for network IO e.g. socket
>> recv() we do not control when the completions happen so its value
>> misleads userspace tooling.
>>
>> This patch gates the previously unconditional iowait accounting behind a
>> new IORING_REGISTER opcode. By default time is not accounted as iowait,
>> unless this is explicitly enabled for a ring. Thus userspace can decide,
>> depending on the type of work it expects to do, whether it wants to
>> consider cqring wait time as iowait or not.
> 
> I don't believe it's a sane approach. I think we agree that per
> cpu iowait is a silly and misleading metric. I have hard time to
> define what it is, and I'm sure most probably people complaining
> wouldn't be able to tell as well. Now we're taking that metric
> and expose even more knobs to userspace.
> 
> Another argument against is that per ctx is not the right place
> to have it. It's a system metric, and you can imagine some system
> admin looking for it. Even in cases when had some meaning w/o
> io_uring now without looking at what flags io_uring has it's
> completely meaningless, and it's too much to ask.
> 
> I don't understand why people freak out at seeing hi iowait,
> IMHO it perfectly fits the definition of io_uring waiting for
> IO / completions, but at this point it might be better to just
> revert it to the old behaviour of not reporting iowait at all.

Irrespective of how misleading iowait is, many tools include it in
their CPU util/load calculations, and users then use those metrics for
e.g. load balancing. In situations with storage workloads, iowait can
be useful even if its usefulness is limited. The problem that this
patch is trying to resolve is mixed storage/network workloads on the
same system, where iowait has some usefulness (due to the storage
workloads) _but_ I don't want network workloads contributing to the
metric.

This does put the onus on userspace to do the right thing - decide
whether iowait makes sense for a workload or not. I don't have enough
kernel experience to know whether this expectation is realistic or not.
But, it is turned off by default, so if userspace does not set it
(which seems like the most likely thing) then iowait accounting is off,
just like the old behaviour. Perhaps we need to make it clearer to
storage use-cases that they should turn it on in order to get the
optimisation?
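
To make that concrete, the intended split for such a mixed workload might look like the sketch below, reusing the hypothetical ring_set_iowait() helper sketched after the diffstat above (the liburing setup calls are standard; error handling is omitted):

#include <liburing.h>

int main(void)
{
	struct io_uring storage_ring, net_ring;

	io_uring_queue_init(256, &storage_ring, 0);
	io_uring_queue_init(256, &net_ring, 0);

	/* storage ring: opt in, so low-QD disk IO keeps the cpufreq
	 * boost and shows up as iowait, as admins expect */
	ring_set_iowait(storage_ring.ring_fd, 1);

	/* net ring: leave the default (off), so long socket waits
	 * don't inflate the system iowait metric */

	/* ... submit storage SQEs on storage_ring, socket SQEs on
	 * net_ring, and wait on each ring's CQ as usual ... */
	return 0;
}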

> And if we want to save the cpu freq iowait optimisation, we
> should just split notion of iowait reporting and iowait cpufreq
> tuning.

Yeah, that could be an option. I'll take a look at it.

> 
> 
>> Signed-off-by: David Wei <dw@davidwei.uk>
>> ---
>>   include/linux/io_uring_types.h |  1 +
>>   include/uapi/linux/io_uring.h  |  3 +++
>>   io_uring/io_uring.c            |  9 +++++----
>>   io_uring/register.c            | 17 +++++++++++++++++
>>   4 files changed, 26 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
>> index bd7071aeec5d..c568e6b8c9f9 100644
>> --- a/include/linux/io_uring_types.h
>> +++ b/include/linux/io_uring_types.h
>> @@ -242,6 +242,7 @@ struct io_ring_ctx {
>>           unsigned int        drain_disabled: 1;
>>           unsigned int        compat: 1;
>>           unsigned int        iowq_limits_set : 1;
>> +        unsigned int        iowait_enabled: 1;
>>             struct task_struct    *submitter_task;
>>           struct io_rings        *rings;
>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>> index 7bd10201a02b..b068898c2283 100644
>> --- a/include/uapi/linux/io_uring.h
>> +++ b/include/uapi/linux/io_uring.h
>> @@ -575,6 +575,9 @@ enum {
>>       IORING_REGISTER_NAPI            = 27,
>>       IORING_UNREGISTER_NAPI            = 28,
>>   +    /* account time spent in cqring wait as iowait */
>> +    IORING_REGISTER_IOWAIT            = 29,
>> +
>>       /* this goes last */
>>       IORING_REGISTER_LAST,
>>   diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>> index cf2f514b7cc0..7f8d2a03cce6 100644
>> --- a/io_uring/io_uring.c
>> +++ b/io_uring/io_uring.c
>> @@ -2533,12 +2533,13 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>>           return 0;
>>         /*
>> -     * Mark us as being in io_wait if we have pending requests, so cpufreq
>> -     * can take into account that the task is waiting for IO - turns out
>> -     * to be important for low QD IO.
>> +     * Mark us as being in io_wait if we have pending requests if enabled
>> +     * via IORING_REGISTER_IOWAIT, so cpufreq can take into account that
>> +     * the task is waiting for IO - turns out to be important for low QD
>> +     * IO.
>>        */
>>       io_wait = current->in_iowait;
>> -    if (current_pending_io())
>> +    if (ctx->iowait_enabled && current_pending_io())
>>           current->in_iowait = 1;
>>       ret = 0;
>>       if (iowq->timeout == KTIME_MAX)
>> diff --git a/io_uring/register.c b/io_uring/register.c
>> index 99c37775f974..fbdf3d3461d8 100644
>> --- a/io_uring/register.c
>> +++ b/io_uring/register.c
>> @@ -387,6 +387,17 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
>>       return ret;
>>   }
>>   +static int io_register_iowait(struct io_ring_ctx *ctx, int val)
>> +{
>> +    int was_enabled = ctx->iowait_enabled;
>> +
>> +    if (val)
>> +        ctx->iowait_enabled = 1;
>> +    else
>> +        ctx->iowait_enabled = 0;
>> +    return was_enabled;
>> +}
>> +
>>   static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
>>                      void __user *arg, unsigned nr_args)
>>       __releases(ctx->uring_lock)
>> @@ -563,6 +574,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
>>               break;
>>           ret = io_unregister_napi(ctx, arg);
>>           break;
>> +    case IORING_REGISTER_IOWAIT:
>> +        ret = -EINVAL;
>> +        if (arg)
>> +            break;
>> +        ret = io_register_iowait(ctx, nr_args);
>> +        break;
>>       default:
>>           ret = -EINVAL;
>>           break;
>
Jens Axboe Feb. 24, 2024, 6:51 p.m. UTC | #4
On 2/24/24 8:31 AM, Pavel Begunkov wrote:
> On 2/24/24 05:07, David Wei wrote:
>> Currently we unconditionally account time spent waiting for events in CQ
>> ring as iowait time.
>>
>> Some userspace tools consider iowait time to be CPU util/load which can
>> be misleading as the process is sleeping. High iowait time might be
>> indicative of issues for storage IO, but for network IO e.g. socket
>> recv() we do not control when the completions happen so its value
>> misleads userspace tooling.
>>
>> This patch gates the previously unconditional iowait accounting behind a
>> new IORING_REGISTER opcode. By default time is not accounted as iowait,
>> unless this is explicitly enabled for a ring. Thus userspace can decide,
>> depending on the type of work it expects to do, whether it wants to
>> consider cqring wait time as iowait or not.
> 
> I don't believe it's a sane approach. I think we agree that per
> cpu iowait is a silly and misleading metric. I have hard time to
> define what it is, and I'm sure most probably people complaining
> wouldn't be able to tell as well. Now we're taking that metric
> and expose even more knobs to userspace.

For sure, it's a stupid metric. But at the same time, educating people
on this can be like talking to a brick wall, and it'll be years of doing
that before we're making a dent in it. Hence I do think that just
exposing the knob and letting the storage side use it, if they want, is
the path of least resistance. I'm personally not going to do a crusade
on iowait to eliminate it, I don't have the time for that. I'll educate
people when it comes up, like I have been doing, but pulling this to a
conclusion would easily take 10+ years.

> Another argument against is that per ctx is not the right place
> to have it. It's a system metric, and you can imagine some system
> admin looking for it. Even in cases when had some meaning w/o
> io_uring now without looking at what flags io_uring has it's
> completely meaningless, and it's too much to ask.
> 
> I don't understand why people freak out at seeing hi iowait,
> IMHO it perfectly fits the definition of io_uring waiting for
> IO / completions, but at this point it might be better to just
> revert it to the old behaviour of not reporting iowait at all.
> And if we want to save the cpu freq iowait optimisation, we
> should just split notion of iowait reporting and iowait cpufreq
> tuning.

For io_uring, splitting the cpufreq handling from iowait is certainly
the right approach, and then just getting rid of iowait completely on
the io_uring side. This can be done without preaching about iowait to
everyone that has bad metrics in their health monitoring, which is why
I like it a lot. I did ponder that the other day as well.

You still kind of run into a problem with that in terms of when short vs
long waits are expected. On the io_uring side, we use the "do I have
any requests pending" for that, which is obviously not fine grained
enough. We could apply it on just "do I have any requests against
regular files" instead, which would then translate to needing further
tracking on our side. Probably fine to just apply it for the existing
logic, imho.
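
For context, the pending-requests check mentioned above is roughly the following helper from io_uring.c (sketched from the upstream code): it counts every inflight request owned by the task, with no distinction between storage and network targets, which is why it is not fine-grained on its own:

static bool current_pending_io(void)
{
	struct io_uring_task *tctx = current->io_uring;

	if (!tctx)
		return false;
	/* any inflight request counts, regardless of file type */
	return percpu_counter_read_positive(&tctx->inflight);
}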
Jens Axboe Feb. 24, 2024, 6:55 p.m. UTC | #5
On 2/24/24 10:20 AM, David Wei wrote:
>> I don't believe it's a sane approach. I think we agree that per
>> cpu iowait is a silly and misleading metric. I have hard time to
>> define what it is, and I'm sure most probably people complaining
>> wouldn't be able to tell as well. Now we're taking that metric
>> and expose even more knobs to userspace.
>>
>> Another argument against is that per ctx is not the right place
>> to have it. It's a system metric, and you can imagine some system
>> admin looking for it. Even in cases when had some meaning w/o
>> io_uring now without looking at what flags io_uring has it's
>> completely meaningless, and it's too much to ask.
>> 
>> I don't understand why people freak out at seeing hi iowait,
>> IMHO it perfectly fits the definition of io_uring waiting for
>> IO / completions, but at this point it might be better to just
>> revert it to the old behaviour of not reporting iowait at all.
> 
> Irrespective of how misleading iowait is, many tools include it in its
> CPU util/load calculations and users then use those metrics for e.g.
> load balancing. In situations with storage workloads, iowait can be
> useful even if its usefulness is limited. The problem that this patch is
> trying to resolve is in mixed storage/network workloads on the same
> system, where iowait has some usefulness (due to storage workloads)
> _but_ I don't want network workloads contributing to the metric.
> 
> This does put the onus on userspace to do the right thing - decide
> whether iowait makes sense for a workload or not. I don't have enough
> kernel experience to know whether this expectation is realistic or not.
> But, it is turned off by default so if userspace does not set it (which
> seems like the most likely thing) then iowait accounting is off just
> like the old behaviour. Perhaps we need to make it clearer to storage
> use-cases to turn it on in order to get the optimisation?

Personally I don't care too much about per-ctx iowait, I don't think
it's an issue at all. Fact is, most workloads that do storage and
networking would likely use a ring for each. And if they do mix, then
just pick if you care about iowait or not. Long term, would the toggle
iowait thing most likely just go away? Yep it would. But it's not like
it's any kind of maintenance burden. tldr - if we can do cpufreq
boosting easily on waits without adding iowait to the mix, then that'd
be great and we can just do that. If not, let's add the iowait toggle
and just be done with it.

>> And if we want to save the cpu freq iowait optimisation, we
>> should just split notion of iowait reporting and iowait cpufreq
>> tuning.
> 
> Yeah, that could be an option. I'll take a look at it.

It'd be trivial to do; the only issue I see is that it'd require
another set of per-runqueue atomics to account for short waits on top
of the nr_iowait accounting we already do. I doubt the scheduling side
will be receptive to that.
Pavel Begunkov Feb. 25, 2024, 12:58 a.m. UTC | #6
On 2/24/24 18:51, Jens Axboe wrote:
> On 2/24/24 8:31 AM, Pavel Begunkov wrote:
>> On 2/24/24 05:07, David Wei wrote:
>>> Currently we unconditionally account time spent waiting for events in CQ
>>> ring as iowait time.
>>>
>>> Some userspace tools consider iowait time to be CPU util/load which can
>>> be misleading as the process is sleeping. High iowait time might be
>>> indicative of issues for storage IO, but for network IO e.g. socket
>>> recv() we do not control when the completions happen so its value
>>> misleads userspace tooling.
>>>
>>> This patch gates the previously unconditional iowait accounting behind a
>>> new IORING_REGISTER opcode. By default time is not accounted as iowait,
>>> unless this is explicitly enabled for a ring. Thus userspace can decide,
>>> depending on the type of work it expects to do, whether it wants to
>>> consider cqring wait time as iowait or not.
>>
>> I don't believe it's a sane approach. I think we agree that per
>> cpu iowait is a silly and misleading metric. I have hard time to
>> define what it is, and I'm sure most probably people complaining
>> wouldn't be able to tell as well. Now we're taking that metric
>> and expose even more knobs to userspace.
> 
> For sure, it's a stupid metric. But at the same time, educating people
> on this can be like talking to a brick wall, and it'll be years of doing
> that before we're making a dent in it. Hence I do think that just
> exposing the knob and letting the storage side use it, if they want, is
> the path of least resistance. I'm personally not going to do a crusade
> on iowait to eliminate it, I don't have the time for that. I'll educate

Exactly my point, but with a different conclusion. The path of least
resistance is to have io_uring not accounted as iowait. That's how
it was, so nobody should complain about it: you don't have to care
about it at all, you don't have to educate people on iowait when it
comes up in the context of that knob, you don't have to educate folks
on what this knob is and wtf it's there for, and we're not pretending
that it works when it doesn't.

> people when it comes up, like I have been doing, but pulling this to
> conclusion would be 10+ years easily.
> 
>> Another argument against is that per ctx is not the right place
>> to have it. It's a system metric, and you can imagine some system
>> admin looking for it. Even in cases when had some meaning w/o
>> io_uring now without looking at what flags io_uring has it's
>> completely meaningless, and it's too much to ask.
>>
>> I don't understand why people freak out at seeing hi iowait,
>> IMHO it perfectly fits the definition of io_uring waiting for
>> IO / completions, but at this point it might be better to just
>> revert it to the old behaviour of not reporting iowait at all.
>> And if we want to save the cpu freq iowait optimisation, we
>> should just split notion of iowait reporting and iowait cpufreq
>> tuning.
> 
> For io_uring, splitting the cpufreq from iowait is certainly the right
> approach. And then just getting rid of iowait completely on the io_uring
> side. This can be done without preaching about iowait to everyone that
> has bad metrics for their healt monitoring, which is why I like that a
> lot. I did ponder that the other day as well.
> 
> You still kind of run into a problem with that in terms of when short vs
> long waits are expected. On the io_uring side, we use the "do I have
> any requests pending" for that, which is obviously not fine grained
> enough. We could apply it on just "do I have any requests against
> regular files" instead, which would then translate to needing further
> tracking on our side. Probably fine to just apply it for the existing
> logic, imho.

Let's say there are two problems: one is the accounting mess, which
is IMHO clear. The second is the optimisation, which is not mentioned
in the patch and is kind of an orthogonal issue. If we want a knob to
disable/enable the cpufreq thing, it should be separate from iowait
accounting, because it's a really unfortunate side effect if you
enable the optimisation and iowait goes through the roof. Then people
never touch it, especially in a framework, because a system admin
will be pretty surprised by the metric.

Not that I'm a fan of the idea of having a userspace knob for the
optimisation; it'd be pretty complicated to use, and hopefully there
is a more intelligent approach.
Pavel Begunkov Feb. 25, 2024, 1:39 a.m. UTC | #7
On 2/24/24 18:55, Jens Axboe wrote:
> On 2/24/24 10:20 AM, David Wei wrote:
>>> I don't believe it's a sane approach. I think we agree that per
>>> cpu iowait is a silly and misleading metric. I have hard time to
>>> define what it is, and I'm sure most probably people complaining
>>> wouldn't be able to tell as well. Now we're taking that metric
>>> and expose even more knobs to userspace.
>>>
>>> Another argument against is that per ctx is not the right place
>>> to have it. It's a system metric, and you can imagine some system
>>> admin looking for it. Even in cases when had some meaning w/o
>>> io_uring now without looking at what flags io_uring has it's
>>> completely meaningless, and it's too much to ask.
>>>
>>> I don't understand why people freak out at seeing hi iowait,
>>> IMHO it perfectly fits the definition of io_uring waiting for
>>> IO / completions, but at this point it might be better to just
>>> revert it to the old behaviour of not reporting iowait at all.
>>
>> Irrespective of how misleading iowait is, many tools include it in its
>> CPU util/load calculations and users then use those metrics for e.g.
>> load balancing. In situations with storage workloads, iowait can be
>> useful even if its usefulness is limited. The problem that this patch is
>> trying to resolve is in mixed storage/network workloads on the same
>> system, where iowait has some usefulness (due to storage workloads)
>> _but_ I don't want network workloads contributing to the metric.
>>
>> This does put the onus on userspace to do the right thing - decide
>> whether iowait makes sense for a workload or not. I don't have enough
>> kernel experience to know whether this expectation is realistic or not.
>> But, it is turned off by default so if userspace does not set it (which
>> seems like the most likely thing) then iowait accounting is off just
>> like the old behaviour. Perhaps we need to make it clearer to storage
>> use-cases to turn it on in order to get the optimisation?
> 
> Personally I don't care too much about per-ctx iowait, I don't think
> it's an issue at all. Fact is, most workloads that do storage and
> networking would likely use a ring for each. And if they do mix, then
> just pick if you care about iowait or not. Long term, would the toggle

Let's say you want the optimisation, but don't want to screw up the
system iowait stats because, as we've seen, there will be people
complaining. What do you do? 99% of frameworks and libraries would
never enable it, which is a shame. If it's some container hosting,
the vendors might start complaining, especially since it's
inconsistent and depends on the user, and then we might need to
blacklist it globally. And in the cases where you control the entire
stack, you need to tell people from other teams and PEs that that's
how it is.

> iowait thing most likely just go away? Yep it would. But it's not like

I predict that, given enough time, if you tried to root it out
there would be someone complaining that iowait is unexpectedly
0 and it's a regression, please bring the flag back. I doubt it
would just go away, but that probably depends on the timeline.

> it's any kind of maintenance burden. tldr - if we can do cpufreq

a la death by a thousand useless APIs that nobody cares about
nor truly understands.

> boosting easily on waits without adding iowait to the mix, then that'd
> be great and we can just do that. If not, let's add the iowait toggle
> and just be done with it.
> 
>>> And if we want to save the cpu freq iowait optimisation, we
>>> should just split notion of iowait reporting and iowait cpufreq
>>> tuning.
>>
>> Yeah, that could be an option. I'll take a look at it.
> 
> It'd be trivial to do, only issue I see is that it'd require another set
> of per-runqueue atomics to count for short waits on top of the
> nr_iowaits we already do. I doubt the scheduling side will be receptive
> to that.

Looked it up; sounds unfortunate, but it also seems like the
status quo could be optimised with an additional CPU-local
variable.
Jens Axboe Feb. 25, 2024, 4:39 p.m. UTC | #8
On 2/24/24 5:58 PM, Pavel Begunkov wrote:
> On 2/24/24 18:51, Jens Axboe wrote:
>> On 2/24/24 8:31 AM, Pavel Begunkov wrote:
>>> On 2/24/24 05:07, David Wei wrote:
>>>> Currently we unconditionally account time spent waiting for events in CQ
>>>> ring as iowait time.
>>>>
>>>> Some userspace tools consider iowait time to be CPU util/load which can
>>>> be misleading as the process is sleeping. High iowait time might be
>>>> indicative of issues for storage IO, but for network IO e.g. socket
>>>> recv() we do not control when the completions happen so its value
>>>> misleads userspace tooling.
>>>>
>>>> This patch gates the previously unconditional iowait accounting behind a
>>>> new IORING_REGISTER opcode. By default time is not accounted as iowait,
>>>> unless this is explicitly enabled for a ring. Thus userspace can decide,
>>>> depending on the type of work it expects to do, whether it wants to
>>>> consider cqring wait time as iowait or not.
>>>
>>> I don't believe it's a sane approach. I think we agree that per
>>> cpu iowait is a silly and misleading metric. I have hard time to
>>> define what it is, and I'm sure most probably people complaining
>>> wouldn't be able to tell as well. Now we're taking that metric
>>> and expose even more knobs to userspace.
>>
>> For sure, it's a stupid metric. But at the same time, educating people
>> on this can be like talking to a brick wall, and it'll be years of doing
>> that before we're making a dent in it. Hence I do think that just
>> exposing the knob and letting the storage side use it, if they want, is
>> the path of least resistance. I'm personally not going to do a crusade
>> on iowait to eliminate it, I don't have the time for that. I'll educate
> 
> Exactly my point but with a different conclusion. The path of least

I think that's because I'm a realist, and you are an idealist ;-)

> resistance is to have io_uring not accounted to iowait. That's how
> it was so nobody should complain about it, you don't have to care about
> it at all, you don't have to educate people on iowait when it comes up
> with in the context of that knob, and you don't have to educate folks
> on what this knob is and wtf it's there, and we're not pretending that
> it works when it's not.

I don't think anyone cares about iowait going away for waiting on
events with io_uring, but some would very much care about losing the
cpufreq connection, which is why it got added in the first place. If
we can trivially do that without iowait, then we should certainly just
do that and call it good. THAT is the main question to answer, in the
form of a patch.

>> people when it comes up, like I have been doing, but pulling this to
>> conclusion would be 10+ years easily.
>>
>>> Another argument against is that per ctx is not the right place
>>> to have it. It's a system metric, and you can imagine some system
>>> admin looking for it. Even in cases when had some meaning w/o
>>> io_uring now without looking at what flags io_uring has it's
>>> completely meaningless, and it's too much to ask.
>>>
>>> I don't understand why people freak out at seeing hi iowait,
>>> IMHO it perfectly fits the definition of io_uring waiting for
>>> IO / completions, but at this point it might be better to just
>>> revert it to the old behaviour of not reporting iowait at all.
>>> And if we want to save the cpu freq iowait optimisation, we
>>> should just split notion of iowait reporting and iowait cpufreq
>>> tuning.
>>
>> For io_uring, splitting the cpufreq from iowait is certainly the right
>> approach. And then just getting rid of iowait completely on the io_uring
>> side. This can be done without preaching about iowait to everyone that
>> has bad metrics for their healt monitoring, which is why I like that a
>> lot. I did ponder that the other day as well.
>>
>> You still kind of run into a problem with that in terms of when short vs
>> long waits are expected. On the io_uring side, we use the "do I have
>> any requests pending" for that, which is obviously not fine grained
>> enough. We could apply it on just "do I have any requests against
>> regular files" instead, which would then translate to needing further
>> tracking on our side. Probably fine to just apply it for the existing
>> logic, imho.
> 
> Let's say there are two problems, one is the accounting mess, which
> is IMHO clear. The second is the optimisation, which is not mentioned
> in the patch and kind of an orthogonal issue. If we want a knob to
> disable/enable the cpufreq thing, it should be separate from iowait
> accounting, because it sounds like a really unfortunate side effect
> when you enable the optimisation and the iowait goes pounding the roof.
> Then people never touch it, especially in a framework, because an
> system admin will be pretty surprised by the metric.

Fully agree, it's all about not ending up with high-latency wakeups on
short sleeps, which is what caused the performance to drop for low QD
IOs.

> Not like I'm a fan of the idea of having a userspace knob for the
> optimisation, it'd be pretty complicated to use, and hopefully there
> is a more intelligent approach.

That would certainly be nice.
Jens Axboe Feb. 25, 2024, 4:43 p.m. UTC | #9
On 2/24/24 6:39 PM, Pavel Begunkov wrote:
> On 2/24/24 18:55, Jens Axboe wrote:
>> On 2/24/24 10:20 AM, David Wei wrote:
>>>> I don't believe it's a sane approach. I think we agree that per
>>>> cpu iowait is a silly and misleading metric. I have hard time to
>>>> define what it is, and I'm sure most probably people complaining
>>>> wouldn't be able to tell as well. Now we're taking that metric
>>>> and expose even more knobs to userspace.
>>>>
>>>> Another argument against is that per ctx is not the right place
>>>> to have it. It's a system metric, and you can imagine some system
>>>> admin looking for it. Even in cases when had some meaning w/o
>>>> io_uring now without looking at what flags io_uring has it's
>>>> completely meaningless, and it's too much to ask.
>>>>
>>>> I don't understand why people freak out at seeing hi iowait,
>>>> IMHO it perfectly fits the definition of io_uring waiting for
>>>> IO / completions, but at this point it might be better to just
>>>> revert it to the old behaviour of not reporting iowait at all.
>>>
>>> Irrespective of how misleading iowait is, many tools include it in its
>>> CPU util/load calculations and users then use those metrics for e.g.
>>> load balancing. In situations with storage workloads, iowait can be
>>> useful even if its usefulness is limited. The problem that this patch is
>>> trying to resolve is in mixed storage/network workloads on the same
>>> system, where iowait has some usefulness (due to storage workloads)
>>> _but_ I don't want network workloads contributing to the metric.
>>>
>>> This does put the onus on userspace to do the right thing - decide
>>> whether iowait makes sense for a workload or not. I don't have enough
>>> kernel experience to know whether this expectation is realistic or not.
>>> But, it is turned off by default so if userspace does not set it (which
>>> seems like the most likely thing) then iowait accounting is off just
>>> like the old behaviour. Perhaps we need to make it clearer to storage
>>> use-cases to turn it on in order to get the optimisation?
>>
>> Personally I don't care too much about per-ctx iowait, I don't think
>> it's an issue at all. Fact is, most workloads that do storage and
>> networking would likely use a ring for each. And if they do mix, then
>> just pick if you care about iowait or not. Long term, would the toggle
> 
> Let's say you want the optimisation, but don't want to screw up system
> iowait stats because as we've seen there will be people complaining.
> What do you do? 99% of frameworks and libraries would never enable it,
> which is a shame. If it's some container hosting, the vendors might
> start complaining, especially since it's inconsistent and depends on
> the user, then we might need to blacklist it globally. Then the cases
> when you control the entire stack, you need to tell people from other
> teams and PEs that it's how it is.

I suspect the best way would be to leave it as-is, i.e. iowait is on by
default. That way, if people start noticing, they will look around,
find the knob, and tweak it. For a pure storage kind of workload,
people are used to iowait and they will probably think nothing of it.
For a pure networking workload, people might find it odd, and that'll
give them the incentive to start searching for it, and then they will
pretty quickly figure out how to turn it off.

>> iowait thing most likely just go away? Yep it would. But it's not like
> 
> I predict that with enough time if you'd try to root it out
> there will be someone complaining that iowait is unexpectedly
> 0, it's a regression please return the flag back. I doubt it
> would just go away, but probably depends on the timeline.

Right, so just keep it as-is by default.

>> it's any kind of maintenance burden. tldr - if we can do cpufreq
> 
> ala death from thousand useless apis nobody cares about
> nor truly understands.

Let's be real, some toggle function isn't really going to cause any
maintenance burden going forward. It's not like it's a complicated API.
If it rots in a corner years from now, which I hope it will, it won't
really add any burden on maintainers. It'll most likely never get
touched again once it has landed.

>> boosting easily on waits without adding iowait to the mix, then that'd
>> be great and we can just do that. If not, let's add the iowait toggle
>> and just be done with it.
>>
>>>> And if we want to save the cpu freq iowait optimisation, we
>>>> should just split notion of iowait reporting and iowait cpufreq
>>>> tuning.
>>>
>>> Yeah, that could be an option. I'll take a look at it.
>>
>> It'd be trivial to do, only issue I see is that it'd require another set
>> of per-runqueue atomics to count for short waits on top of the
>> nr_iowaits we already do. I doubt the scheduling side will be receptive
>> to that.
> 
> Looked it up, sounds unfortunate, but also seems like the
> status quo can be optimised with an additional cpu local
> var.

If you are motivated, please dig into it. If not, I guess I will take a
look this week.
Jens Axboe Feb. 25, 2024, 9:11 p.m. UTC | #10
On 2/25/24 9:43 AM, Jens Axboe wrote:
> If you are motivated, please dig into it. If not, I guess I will take a
> look this week. 

The straightforward approach: add a nr_short_wait and ->in_short_wait
and ensure that the idle governor factors that in. Not sure how
palatable it is; it would be nice to fold iowait under this, but that
doesn't really work with how we pass back the previous state.

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index b96e3da0fedd..39f05d6062e1 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -119,7 +119,7 @@ struct menu_device {
 	int		interval_ptr;
 };
 
-static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
+static inline int which_bucket(u64 duration_ns, unsigned int nr_short_waiters)
 {
 	int bucket = 0;
 
@@ -129,7 +129,7 @@ static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
 	 * This allows us to calculate
 	 * E(duration)|iowait
 	 */
-	if (nr_iowaiters)
+	if (nr_short_waiters)
 		bucket = BUCKETS/2;
 
 	if (duration_ns < 10ULL * NSEC_PER_USEC)
@@ -152,10 +152,10 @@ static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
  * to be, the higher this multiplier, and thus the higher
  * the barrier to go to an expensive C state.
  */
-static inline int performance_multiplier(unsigned int nr_iowaiters)
+static inline int performance_multiplier(unsigned int nr_short_waiters)
 {
 	/* for IO wait tasks (per cpu!) we add 10x each */
-	return 1 + 10 * nr_iowaiters;
+	return 1 + 10 * nr_short_waiters;
 }
 
 static DEFINE_PER_CPU(struct menu_device, menu_devices);
@@ -266,7 +266,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
 	u64 predicted_ns;
 	u64 interactivity_req;
-	unsigned int nr_iowaiters;
+	unsigned int nr_short_waiters;
 	ktime_t delta, delta_tick;
 	int i, idx;
 
@@ -275,7 +275,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		data->needs_update = 0;
 	}
 
-	nr_iowaiters = nr_iowait_cpu(dev->cpu);
+	nr_short_waiters = nr_iowait_cpu(dev->cpu) + nr_short_wait_cpu(dev->cpu);
 
 	/* Find the shortest expected idle interval. */
 	predicted_ns = get_typical_interval(data) * NSEC_PER_USEC;
@@ -290,7 +290,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		}
 
 		data->next_timer_ns = delta;
-		data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters);
+		data->bucket = which_bucket(data->next_timer_ns, nr_short_waiters);
 
 		/* Round up the result for half microseconds. */
 		timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 +
@@ -308,7 +308,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		 */
 		data->next_timer_ns = KTIME_MAX;
 		delta_tick = TICK_NSEC / 2;
-		data->bucket = which_bucket(KTIME_MAX, nr_iowaiters);
+		data->bucket = which_bucket(KTIME_MAX, nr_short_waiters);
 	}
 
 	if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
@@ -341,7 +341,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		 * latency_req to determine the maximum exit latency.
 		 */
 		interactivity_req = div64_u64(predicted_ns,
-					      performance_multiplier(nr_iowaiters));
+					      performance_multiplier(nr_short_waiters));
 		if (latency_req > interactivity_req)
 			latency_req = interactivity_req;
 	}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ffe8f618ab86..b04c08cadf1f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -923,6 +923,7 @@ struct task_struct {
 	/* Bit to tell TOMOYO we're in execve(): */
 	unsigned			in_execve:1;
 	unsigned			in_iowait:1;
+	unsigned			in_short_wait:1;
 #ifndef TIF_RESTORE_SIGMASK
 	unsigned			restore_sigmask:1;
 #endif
diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index 0108a38bb64d..12f5795c4c32 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -21,6 +21,7 @@ extern unsigned int nr_running(void);
 extern bool single_task_running(void);
 extern unsigned int nr_iowait(void);
 extern unsigned int nr_iowait_cpu(int cpu);
+unsigned int nr_short_wait_cpu(int cpu);
 
 static inline int sched_info_on(void)
 {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index f6332fc56bed..024af44ead12 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2519,7 +2519,7 @@ static bool current_pending_io(void)
 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 					  struct io_wait_queue *iowq)
 {
-	int io_wait, ret;
+	int short_wait, ret;
 
 	if (unlikely(READ_ONCE(ctx->check_cq)))
 		return 1;
@@ -2537,15 +2537,15 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 	 * can take into account that the task is waiting for IO - turns out
 	 * to be important for low QD IO.
 	 */
-	io_wait = current->in_iowait;
+	short_wait = current->in_short_wait;
 	if (current_pending_io())
-		current->in_iowait = 1;
+		current->in_short_wait = 1;
 	ret = 0;
 	if (iowq->timeout == KTIME_MAX)
 		schedule();
 	else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
 		ret = -ETIME;
-	current->in_iowait = io_wait;
+	current->in_short_wait = short_wait;
 	return ret;
 }
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9116bcc90346..dc7a08db8921 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3790,6 +3790,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 	if (p->in_iowait) {
 		delayacct_blkio_end(p);
 		atomic_dec(&task_rq(p)->nr_iowait);
+	} else if (p->in_short_wait) {
+		atomic_dec(&task_rq(p)->nr_short_wait);
 	}
 
 	activate_task(rq, p, en_flags);
@@ -4356,6 +4358,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 			if (p->in_iowait) {
 				delayacct_blkio_end(p);
 				atomic_dec(&task_rq(p)->nr_iowait);
+			} else if (p->in_short_wait) {
+				atomic_dec(&task_rq(p)->nr_short_wait);
 			}
 
 			wake_flags |= WF_MIGRATED;
@@ -5506,6 +5510,11 @@ unsigned int nr_iowait(void)
 	return sum;
 }
 
+unsigned int nr_short_wait_cpu(int cpu)
+{
+	return atomic_read(&cpu_rq(cpu)->nr_short_wait);
+}
+
 #ifdef CONFIG_SMP
 
 /*
@@ -6683,6 +6692,8 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 			if (prev->in_iowait) {
 				atomic_inc(&rq->nr_iowait);
 				delayacct_blkio_start();
+			} else if (prev->in_short_wait) {
+				atomic_inc(&rq->nr_short_wait);
 			}
 		}
 		switch_count = &prev->nvcsw;
@@ -10030,6 +10041,7 @@ void __init sched_init(void)
 #endif /* CONFIG_SMP */
 		hrtick_rq_init(rq);
 		atomic_set(&rq->nr_iowait, 0);
+		atomic_set(&rq->nr_short_wait, 0);
 
 #ifdef CONFIG_SCHED_CORE
 		rq->core = rq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 533547e3c90a..9d4fc0b9de26 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6721,7 +6721,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	 * utilization updates, so do it here explicitly with the IOWAIT flag
 	 * passed.
 	 */
-	if (p->in_iowait)
+	if (p->in_iowait || p->in_short_wait)
 		cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
 
 	for_each_sched_entity(se) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 001fe047bd5d..0530e9e97ecb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1050,6 +1050,7 @@ struct rq {
 #endif
 
 	atomic_t		nr_iowait;
+	atomic_t		nr_short_wait;
 
 #ifdef CONFIG_SCHED_DEBUG
 	u64 last_seen_need_resched_ns;
Jens Axboe Feb. 25, 2024, 9:33 p.m. UTC | #11
On 2/25/24 2:11 PM, Jens Axboe wrote:
> On 2/25/24 9:43 AM, Jens Axboe wrote:
>> If you are motivated, please dig into it. If not, I guess I will take a
>> look this week. 
> 
> The straight forward approach - add a nr_short_wait and ->in_short_wait
> and ensure that the idle governor factors that in. Not sure how
> palatable it is, would be nice fold iowait under this, but doesn't
> really work with how we pass back the previous state.

Split into 3 separate patches here. No changes from the previous email.
Tested it, and it works.

https://git.kernel.dk/cgit/linux/log/?h=iowait
Pavel Begunkov Feb. 26, 2024, 2:56 p.m. UTC | #12
On 2/25/24 21:11, Jens Axboe wrote:
> On 2/25/24 9:43 AM, Jens Axboe wrote:
>> If you are motivated, please dig into it. If not, I guess I will take a
>> look this week.

I tried to split the atomic as mentioned, but I don't think anybody
cares; it was 0.1% in perf, so it wouldn't even be benchmarkable,
and it's an iowait-only patch anyway. If anything you'd need to read
two vars every tick now, so never mind.

> The straight forward approach - add a nr_short_wait and ->in_short_wait
> and ensure that the idle governor factors that in. Not sure how
> palatable it is, would be nice fold iowait under this, but doesn't
> really work with how we pass back the previous state.

It might look nicer if, instead of adding nr_short_waiters, you
added an nr_iowait_account for the iowait% and left nr_iowait
for cpufreq.

The block iowaiting / io_schedule / etc. would need to set
both flags...

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 9116bcc90346..dc7a08db8921 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3790,6 +3790,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
>   	if (p->in_iowait) {
>   		delayacct_blkio_end(p);
>   		atomic_dec(&task_rq(p)->nr_iowait);
> +	} else if (p->in_short_wait) {
> +		atomic_dec(&task_rq(p)->nr_short_wait);
>   	}
>   
>   	activate_task(rq, p, en_flags);
> @@ -4356,6 +4358,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
>   			if (p->in_iowait) {
>   				delayacct_blkio_end(p);
>   				atomic_dec(&task_rq(p)->nr_iowait);
> +			} else if (p->in_short_wait) {
> +				atomic_dec(&task_rq(p)->nr_short_wait);

... which would get this branch folded into the previous one,
which should sit better with the sched guys.
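
A sketch of that shape, with hypothetical names (nr_iowait_acct / in_iowait_acct are made up here; this inverts Jens' patch so the existing nr_iowait keeps feeding cpufreq while only accounted waits feed the reported iowait%):

/* __schedule(): task is blocking */
if (prev->in_iowait) {
	atomic_inc(&rq->nr_iowait);			/* cpufreq hint */
	if (prev->in_iowait_acct) {			/* block IO sets both */
		atomic_inc(&rq->nr_iowait_acct);	/* reported iowait% */
		delayacct_blkio_start();
	}
}

/* ttwu_do_activate(): the fold suggested above - one branch, not two */
if (p->in_iowait) {
	atomic_dec(&task_rq(p)->nr_iowait);
	if (p->in_iowait_acct) {
		atomic_dec(&task_rq(p)->nr_iowait_acct);
		delayacct_blkio_end(p);
	}
}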
Jens Axboe Feb. 26, 2024, 3:22 p.m. UTC | #13
On 2/26/24 7:56 AM, Pavel Begunkov wrote:
> On 2/25/24 21:11, Jens Axboe wrote:
>> On 2/25/24 9:43 AM, Jens Axboe wrote:
>>> If you are motivated, please dig into it. If not, I guess I will take a
>>> look this week.
> 
> I tried to split the atomic as mentioned, but I don't think anybody
> cares, it was 0.1% in perf, so wouldn't even be benchmarkeable,
> and it's iowait only patch anyway. If anything you'd need to read
> two vars every tick now, so nevermind

Agree, I did ponder that too, but seems not worth it at all.

>> The straight forward approach - add a nr_short_wait and ->in_short_wait
>> and ensure that the idle governor factors that in. Not sure how
>> palatable it is, would be nice fold iowait under this, but doesn't
>> really work with how we pass back the previous state.
> 
> It might look nicer if instead adding nr_short_waiters you'd
> do nr_iowait_account for the iowait% and leave nr_iowait
> for cpufreq.
> 
> The block iowaiting / io_schedule / etc. would need to set
> both flags...

That's what I meant with the nesting too, but then we need to return
flags from e.g. io_schedule_prepare(). Not a big issue as I think that's
the only spot, and we can even just keep the type the same. Callers
should treat it as a cookie/token anyway.

I'll make that change.
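
Something along these lines, as a sketch on top of the hypothetical in_iowait_acct flag from above - the return type stays int, and callers just hand the value back as an opaque cookie:

int io_schedule_prepare(void)
{
	/* pack both previous states into the one cookie */
	int token = current->in_iowait | (current->in_iowait_acct << 1);

	/* block IO: account as iowait AND take the cpufreq hint */
	current->in_iowait = 1;
	current->in_iowait_acct = 1;
	blk_flush_plug(current->plug, true);
	return token;
}

void io_schedule_finish(int token)
{
	current->in_iowait = token & 1;
	current->in_iowait_acct = (token >> 1) & 1;
}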
Christian Loehle March 5, 2024, 2:59 p.m. UTC | #14
Hi folks,

On 25/02/2024 16:39, Jens Axboe wrote:
> On 2/24/24 5:58 PM, Pavel Begunkov wrote:
>> On 2/24/24 18:51, Jens Axboe wrote:
>>> On 2/24/24 8:31 AM, Pavel Begunkov wrote:
>>>> On 2/24/24 05:07, David Wei wrote:
>>>>> Currently we unconditionally account time spent waiting for events in CQ
>>>>> ring as iowait time.
>>>>>
>>>>> Some userspace tools consider iowait time to be CPU util/load which can
>>>>> be misleading as the process is sleeping. High iowait time might be
>>>>> indicative of issues for storage IO, but for network IO e.g. socket
>>>>> recv() we do not control when the completions happen so its value
>>>>> misleads userspace tooling.
>>>>>
>>>>> This patch gates the previously unconditional iowait accounting behind a
>>>>> new IORING_REGISTER opcode. By default time is not accounted as iowait,
>>>>> unless this is explicitly enabled for a ring. Thus userspace can decide,
>>>>> depending on the type of work it expects to do, whether it wants to
>>>>> consider cqring wait time as iowait or not.
>>>>
>>>> I don't believe it's a sane approach. I think we agree that per
>>>> cpu iowait is a silly and misleading metric. I have hard time to
>>>> define what it is, and I'm sure most probably people complaining
>>>> wouldn't be able to tell as well. Now we're taking that metric
>>>> and expose even more knobs to userspace.
>>>
>>> For sure, it's a stupid metric. But at the same time, educating people
>>> on this can be like talking to a brick wall, and it'll be years of doing
>>> that before we're making a dent in it. Hence I do think that just
>>> exposing the knob and letting the storage side use it, if they want, is
>>> the path of least resistance. I'm personally not going to do a crusade
>>> on iowait to eliminate it, I don't have the time for that. I'll educate
>>
>> Exactly my point but with a different conclusion. The path of least
> 
> I think that's because I'm a realist, and you are an idealist ;-)
> 
>> resistance is to have io_uring not accounted to iowait. That's how
>> it was so nobody should complain about it, you don't have to care about
>> it at all, you don't have to educate people on iowait when it comes up
>> with in the context of that knob, and you don't have to educate folks
>> on what this knob is and wtf it's there, and we're not pretending that
>> it works when it's not.
> 
> I don't think anyone cares about iowait going away for waiting on events
> with io_uring, but some would very much care about losing the cpufreq
> connection which is why it got added in the first place. If we can
> trivially do that without iowait, then we should certainly just do that
> and call it good. THAT is the main question to answer, in form of a
> patch.

I commented on Jens' patch regarding iowait and iowait_acct, which is
probably the path of least resistance for that specific issue, but let
me expand a bit on the cpufreq iowait connection problem.
cpufreq iowait handling and cpuidle iowait handling are, I would say,
vastly different in that respect, and it seems to me that improving
the cpufreq side should be a feasible effort (if it only catches 90%
of scenarios at first, then so be it).
I'm thinking of something like an in_iowait_boost (or its opposite,
in_iowait_queue_full, with "full" meaning reasonably non-empty).
The current behaviour of boosting CPU frequency on any iowait is just
very unfortunate, and for io_uring in particular it destroys all of
the power savings it could otherwise get from reduced CPU usage.
(Just to be clear, current iowait boosting is not just broken for
io_uring, but rather everywhere, particularly because of the lack of
a definition of what iowait even means and when it should be set.
Thus boosting on any iowait seen is like taking a sledgehammer to
crack a nut.)
I'm happy to propose some io_uring patch (that is probably nonsensical
for a lot of reasons) or test whatever ideas you have.
Something like

	if pending_requests > device_queue_size: don't boost

would be an improvement, from what I can tell.
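
As a rough sketch of where that check could sit in io_cqring_wait_schedule() (pending_requests and device_queue_size are placeholders; neither exists as such in io_uring today):

/* only request the cpufreq boost when the device can still absorb
 * more work; if the queue is already saturated, a faster CPU won't
 * make completions arrive any sooner */
if (current_pending_io() && pending_requests <= device_queue_size)
	current->in_iowait = 1;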

And I know I'm heavily reducing io_uring to block IO here, and I am
aware of how wrong that is, but storage device IO was the original
story that got iowait boosting introduced in the first place.
If you have any cpufreq- (or rather DVFS-) related issues with
anything else io_uring does, like networking, I'll give reproducing
those a shot as well.
Would love to hear your thoughts!

Kind Regards,
Christian

Patch

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index bd7071aeec5d..c568e6b8c9f9 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -242,6 +242,7 @@  struct io_ring_ctx {
 		unsigned int		drain_disabled: 1;
 		unsigned int		compat: 1;
 		unsigned int		iowq_limits_set : 1;
+		unsigned int		iowait_enabled: 1;
 
 		struct task_struct	*submitter_task;
 		struct io_rings		*rings;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 7bd10201a02b..b068898c2283 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -575,6 +575,9 @@  enum {
 	IORING_REGISTER_NAPI			= 27,
 	IORING_UNREGISTER_NAPI			= 28,
 
+	/* account time spent in cqring wait as iowait */
+	IORING_REGISTER_IOWAIT			= 29,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index cf2f514b7cc0..7f8d2a03cce6 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2533,12 +2533,13 @@  static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 		return 0;
 
 	/*
-	 * Mark us as being in io_wait if we have pending requests, so cpufreq
-	 * can take into account that the task is waiting for IO - turns out
-	 * to be important for low QD IO.
+	 * Mark us as being in io_wait if we have pending requests if enabled
+	 * via IORING_REGISTER_IOWAIT, so cpufreq can take into account that
+	 * the task is waiting for IO - turns out to be important for low QD
+	 * IO.
 	 */
 	io_wait = current->in_iowait;
-	if (current_pending_io())
+	if (ctx->iowait_enabled && current_pending_io())
 		current->in_iowait = 1;
 	ret = 0;
 	if (iowq->timeout == KTIME_MAX)
diff --git a/io_uring/register.c b/io_uring/register.c
index 99c37775f974..fbdf3d3461d8 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -387,6 +387,17 @@  static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
 	return ret;
 }
 
+static int io_register_iowait(struct io_ring_ctx *ctx, int val)
+{
+	int was_enabled = ctx->iowait_enabled;
+
+	if (val)
+		ctx->iowait_enabled = 1;
+	else
+		ctx->iowait_enabled = 0;
+	return was_enabled;
+}
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -563,6 +574,12 @@  static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_unregister_napi(ctx, arg);
 		break;
+	case IORING_REGISTER_IOWAIT:
+		ret = -EINVAL;
+		if (arg)
+			break;
+		ret = io_register_iowait(ctx, nr_args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;