[v6,08/16] sched/cpufreq: uclamp: Add utilization clamping for FAIR tasks

Message ID 20190115101513.2822-9-patrick.bellasi@arm.com (mailing list archive)
State Not Applicable, archived
Series Add utilization clamping support

Commit Message

Patrick Bellasi Jan. 15, 2019, 10:15 a.m. UTC
Each time a frequency update is required via schedutil, a frequency is
selected to (possibly) satisfy the utilization reported by each
scheduling class. However, when utilization clamping is in use, the
frequency selection should consider userspace utilization clamping
hints.  This will make it possible, for example, to:

 - boost tasks which are directly affecting the user experience
   by running them at least at a minimum "requested" frequency

 - cap low priority tasks not directly affecting the user experience
   by running them only up to a maximum "allowed" frequency

These constraints are meant to support per-task tuning of the
frequency selection, thus allowing a fine-grained definition of
performance boosting vs energy saving strategies in kernel space.

Add support to clamp the utilization and IOWait boost of RUNNABLE FAIR
tasks within the boundaries defined by their aggregated utilization
clamp constraints.
The CPU clamp values are max-aggregated from the (util_min, util_max)
of each RUNNABLE task, in a way that gives boosted tasks the
performance they need when they happen to be co-scheduled with other
capped tasks.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

---
Changes in v6:
 Message-ID: <20181107113849.GC14309@e110439-lin>
 - sanity check util_max >= util_min
 Others:
 - wholesale s/group/bucket/
 - wholesale s/_{get,put}/_{inc,dec}/ to match refcount APIs
---
 kernel/sched/cpufreq_schedutil.c | 27 ++++++++++++++++++++++++---
 kernel/sched/sched.h             | 23 +++++++++++++++++++++++
 2 files changed, 47 insertions(+), 3 deletions(-)

Comments

Rafael J. Wysocki Jan. 22, 2019, 10:37 a.m. UTC | #1
On Tuesday, January 15, 2019 11:15:05 AM CET Patrick Bellasi wrote:
> Each time a frequency update is required via schedutil, a frequency is
> selected to (possibly) satisfy the utilization reported by each
> scheduling class. However, when utilization clamping is in use, the
> frequency selection should consider userspace utilization clamping
> hints.  This will make it possible, for example, to:
> 
>  - boost tasks which are directly affecting the user experience
>    by running them at least at a minimum "requested" frequency
> 
>  - cap low priority tasks not directly affecting the user experience
>    by running them only up to a maximum "allowed" frequency
> 
> These constraints are meant to support per-task tuning of the
> frequency selection, thus allowing a fine-grained definition of
> performance boosting vs energy saving strategies in kernel space.
> 
> Add support to clamp the utilization and IOWait boost of RUNNABLE FAIR
> tasks within the boundaries defined by their aggregated utilization
> clamp constraints.
> The CPU clamp values are max-aggregated from the (util_min, util_max)
> of each RUNNABLE task, in a way that gives boosted tasks the
> performance they need when they happen to be co-scheduled with other
> capped tasks.
> 
> Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> 
> ---
> Changes in v6:
>  Message-ID: <20181107113849.GC14309@e110439-lin>
>  - sanity check util_max >= util_min
>  Others:
>  - wholesale s/group/bucket/
>  - wholesale s/_{get,put}/_{inc,dec}/ to match refcount APIs
> ---
>  kernel/sched/cpufreq_schedutil.c | 27 ++++++++++++++++++++++++---
>  kernel/sched/sched.h             | 23 +++++++++++++++++++++++
>  2 files changed, 47 insertions(+), 3 deletions(-)
> 
> diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
> index 033ec7c45f13..520ee2b785e7 100644
> --- a/kernel/sched/cpufreq_schedutil.c
> +++ b/kernel/sched/cpufreq_schedutil.c
> @@ -218,8 +218,15 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
>  	 * CFS tasks and we use the same metric to track the effective
>  	 * utilization (PELT windows are synchronized) we can directly add them
>  	 * to obtain the CPU's actual utilization.
> +	 *
> +	 * CFS utilization can be boosted or capped, depending on utilization
> +	 * clamp constraints requested by currently RUNNABLE tasks.
> +	 * When there are no CFS RUNNABLE tasks, clamps are released and
> +	 * frequency will be gracefully reduced with the utilization decay.
>  	 */
> -	util = util_cfs;
> +	util = (type == ENERGY_UTIL)
> +		? util_cfs
> +		: uclamp_util(rq, util_cfs);
>  	util += cpu_util_rt(rq);
>  
>  	dl_util = cpu_util_dl(rq);
> @@ -327,6 +334,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
>  			       unsigned int flags)
>  {
>  	bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;
> +	unsigned int max_boost;
>  
>  	/* Reset boost if the CPU appears to have been idle enough */
>  	if (sg_cpu->iowait_boost &&
> @@ -342,11 +350,24 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
>  		return;
>  	sg_cpu->iowait_boost_pending = true;
>  
> +	/*
> +	 * Boost FAIR tasks only up to the CPU clamped utilization.
> +	 *
> +	 * Since DL tasks have a much more advanced bandwidth control, it's
> +	 * safe to assume that IO boost does not apply to those tasks.
> +	 * Instead, since RT tasks are not utilization clamped, we don't want
> +	 * to apply clamping on IO boost while there is blocked RT
> +	 * utilization.
> +	 */
> +	max_boost = sg_cpu->iowait_boost_max;
> +	if (!cpu_util_rt(cpu_rq(sg_cpu->cpu)))
> +		max_boost = uclamp_util(cpu_rq(sg_cpu->cpu), max_boost);
> +
>  	/* Double the boost at each request */
>  	if (sg_cpu->iowait_boost) {
>  		sg_cpu->iowait_boost <<= 1;
> -		if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
> -			sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
> +		if (sg_cpu->iowait_boost > max_boost)
> +			sg_cpu->iowait_boost = max_boost;
>  		return;
>  	}
>  
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index b7f3ee8ba164..95d62a2a0b44 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2267,6 +2267,29 @@ static inline unsigned int uclamp_none(int clamp_id)
>  	return SCHED_CAPACITY_SCALE;
>  }
>  
> +#ifdef CONFIG_UCLAMP_TASK
> +static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
> +{
> +	unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
> +	unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
> +
> +	/*
> +	 * Since CPU's {min,max}_util clamps are MAX aggregated considering
> +	 * RUNNABLE tasks with _different_ clamps, we can end up with an
> +	 * inversion, which we can fix at usage time.
> +	 */
> +	if (unlikely(min_util >= max_util))
> +		return min_util;
> +
> +	return clamp(util, min_util, max_util);
> +}
> +#else /* CONFIG_UCLAMP_TASK */
> +static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
> +{
> +	return util;
> +}
> +#endif /* CONFIG_UCLAMP_TASK */
> +
>  #ifdef arch_scale_freq_capacity
>  # ifndef arch_scale_freq_invariant
>  #  define arch_scale_freq_invariant()	true
> 

IMO it would be better to combine this patch with the next one.

At least some things in it I was about to ask about would go away
then. :-)

Besides, I don't really see a reason for the split here.
Patrick Bellasi Jan. 22, 2019, 11:02 a.m. UTC | #2
On 22-Jan 11:37, Rafael J. Wysocki wrote:
> On Tuesday, January 15, 2019 11:15:05 AM CET Patrick Bellasi wrote:
> > Each time a frequency update is required via schedutil, a frequency is
> > selected to (possibly) satisfy the utilization reported by each
> > scheduling class. However, when utilization clamping is in use, the
> > frequency selection should consider userspace utilization clamping
> > hints.  This will make it possible, for example, to:
> > 
> >  - boost tasks which are directly affecting the user experience
> >    by running them at least at a minimum "requested" frequency
> > 
> >  - cap low priority tasks not directly affecting the user experience
> >    by running them only up to a maximum "allowed" frequency
> > 
> > These constraints are meant to support per-task tuning of the
> > frequency selection, thus allowing a fine-grained definition of
> > performance boosting vs energy saving strategies in kernel space.
> > 
> > Add support to clamp the utilization and IOWait boost of RUNNABLE FAIR
> > tasks within the boundaries defined by their aggregated utilization
> > clamp constraints.
> > The CPU clamp values are max-aggregated from the (util_min, util_max)
> > of each RUNNABLE task, in a way that gives boosted tasks the
> > performance they need when they happen to be co-scheduled with other
> > capped tasks.
> > 
> > Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
> > Cc: Ingo Molnar <mingo@redhat.com>
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > 
> > ---
> > Changes in v6:
> >  Message-ID: <20181107113849.GC14309@e110439-lin>
> >  - sanity check util_max >= util_min
> >  Others:
> >  - wholesale s/group/bucket/
> >  - wholesale s/_{get,put}/_{inc,dec}/ to match refcount APIs
> > ---
> >  kernel/sched/cpufreq_schedutil.c | 27 ++++++++++++++++++++++++---
> >  kernel/sched/sched.h             | 23 +++++++++++++++++++++++
> >  2 files changed, 47 insertions(+), 3 deletions(-)
> > 
> > diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
> > index 033ec7c45f13..520ee2b785e7 100644
> > --- a/kernel/sched/cpufreq_schedutil.c
> > +++ b/kernel/sched/cpufreq_schedutil.c
> > @@ -218,8 +218,15 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
> >  	 * CFS tasks and we use the same metric to track the effective
> >  	 * utilization (PELT windows are synchronized) we can directly add them
> >  	 * to obtain the CPU's actual utilization.
> > +	 *
> > +	 * CFS utilization can be boosted or capped, depending on utilization
> > +	 * clamp constraints requested by currently RUNNABLE tasks.
> > +	 * When there are no CFS RUNNABLE tasks, clamps are released and
> > +	 * frequency will be gracefully reduced with the utilization decay.
> >  	 */
> > -	util = util_cfs;
> > +	util = (type == ENERGY_UTIL)
> > +		? util_cfs
> > +		: uclamp_util(rq, util_cfs);
> >  	util += cpu_util_rt(rq);
> >  
> >  	dl_util = cpu_util_dl(rq);
> > @@ -327,6 +334,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
> >  			       unsigned int flags)
> >  {
> >  	bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;
> > +	unsigned int max_boost;
> >  
> >  	/* Reset boost if the CPU appears to have been idle enough */
> >  	if (sg_cpu->iowait_boost &&
> > @@ -342,11 +350,24 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
> >  		return;
> >  	sg_cpu->iowait_boost_pending = true;
> >  
> > +	/*
> > +	 * Boost FAIR tasks only up to the CPU clamped utilization.
> > +	 *
> > +	 * Since DL tasks have a much more advanced bandwidth control, it's
> > +	 * safe to assume that IO boost does not apply to those tasks.
> > +	 * Instead, since RT tasks are not utilization clamped, we don't want
> > +	 * to apply clamping on IO boost while there is blocked RT
> > +	 * utilization.
> > +	 */
> > +	max_boost = sg_cpu->iowait_boost_max;
> > +	if (!cpu_util_rt(cpu_rq(sg_cpu->cpu)))
> > +		max_boost = uclamp_util(cpu_rq(sg_cpu->cpu), max_boost);
> > +
> >  	/* Double the boost at each request */
> >  	if (sg_cpu->iowait_boost) {
> >  		sg_cpu->iowait_boost <<= 1;
> > -		if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
> > -			sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
> > +		if (sg_cpu->iowait_boost > max_boost)
> > +			sg_cpu->iowait_boost = max_boost;
> >  		return;
> >  	}
> >  
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index b7f3ee8ba164..95d62a2a0b44 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -2267,6 +2267,29 @@ static inline unsigned int uclamp_none(int clamp_id)
> >  	return SCHED_CAPACITY_SCALE;
> >  }
> >  
> > +#ifdef CONFIG_UCLAMP_TASK
> > +static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
> > +{
> > +	unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
> > +	unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
> > +
> > +	/*
> > +	 * Since CPU's {min,max}_util clamps are MAX aggregated considering
> > +	 * RUNNABLE tasks with _different_ clamps, we can end up with an
> > +	 * inversion, which we can fix at usage time.
> > +	 */
> > +	if (unlikely(min_util >= max_util))
> > +		return min_util;
> > +
> > +	return clamp(util, min_util, max_util);
> > +}
> > +#else /* CONFIG_UCLAMP_TASK */
> > +static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
> > +{
> > +	return util;
> > +}
> > +#endif /* CONFIG_UCLAMP_TASK */
> > +
> >  #ifdef arch_scale_freq_capacity
> >  # ifndef arch_scale_freq_invariant
> >  #  define arch_scale_freq_invariant()	true
> > 
> 
> IMO it would be better to combine this patch with the next one.

Main reason was to better document in the changelog what we do for the
two different classes...

> At least some things in it I was about to ask about would go away
> then. :-)

... but if it creates confusion I can certainly merge them.

Or maybe I can clarify better in this patch what's not clear: may I
ask what your questions were?

> Besides, I don't really see a reason for the split here.

Was mainly to make the changes required for RT more self-contained.

For that class only, not for FAIR, we have additional code in the
following patch, which adds uclamp_default_perf: system defaults
used to track/account tasks requesting the maximum frequency.

Again, I can either better clarify the above patch or just merge the
two together: what do you prefer?
Rafael J. Wysocki Jan. 22, 2019, 11:04 a.m. UTC | #3
On Tue, Jan 22, 2019 at 12:02 PM Patrick Bellasi
<patrick.bellasi@arm.com> wrote:
>
> On 22-Jan 11:37, Rafael J. Wysocki wrote:
> > On Tuesday, January 15, 2019 11:15:05 AM CET Patrick Bellasi wrote:

[cut]

> >
> > IMO it would be better to combine this patch with the next one.
>
> Main reason was to better document in the changelog what we do for the
> two different classes...
>
> > At least some things in it I was about to ask about would go away
> > then. :-)
>
> ... but if it creates confusion I can certainly merge them.
>
> Or maybe clarify better in this patch what's not clear: may I ask what
> were your questions ?
>
> > Besides, I don't really see a reason for the split here.
>
> Was mainly to make the changes required for RT more self-contained.
>
> For that class only, not for FAIR, we have additional code in the
> following patch which add uclamp_default_perf which are system
> defaults used to track/account tasks requesting the maximum frequency.
>
> Again, I can either better clarify the above patch or just merge the
> two together: what do you prefer ?

Merge the two together, please.
Patrick Bellasi Jan. 22, 2019, 11:27 a.m. UTC | #4
On 22-Jan 12:04, Rafael J. Wysocki wrote:
> On Tue, Jan 22, 2019 at 12:02 PM Patrick Bellasi
> <patrick.bellasi@arm.com> wrote:
> >
> > On 22-Jan 11:37, Rafael J. Wysocki wrote:
> > > On Tuesday, January 15, 2019 11:15:05 AM CET Patrick Bellasi wrote:
> 
> Merge the two together, please.

Ok, will do in v7, thanks.
Peter Zijlstra Jan. 22, 2019, 3:21 p.m. UTC | #5
On Tue, Jan 15, 2019 at 10:15:05AM +0000, Patrick Bellasi wrote:
> --- a/kernel/sched/cpufreq_schedutil.c
> +++ b/kernel/sched/cpufreq_schedutil.c
> @@ -218,8 +218,15 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
>  	 * CFS tasks and we use the same metric to track the effective
>  	 * utilization (PELT windows are synchronized) we can directly add them
>  	 * to obtain the CPU's actual utilization.
> +	 *
> +	 * CFS utilization can be boosted or capped, depending on utilization
> +	 * clamp constraints requested by currently RUNNABLE tasks.
> +	 * When there are no CFS RUNNABLE tasks, clamps are released and
> +	 * frequency will be gracefully reduced with the utilization decay.
>  	 */
> -	util = util_cfs;
> +	util = (type == ENERGY_UTIL)
> +		? util_cfs
> +		: uclamp_util(rq, util_cfs);

That's pretty horrible; what's wrong with:

	util = util_cfs;
	if (type == FREQUENCY_UTIL)
		util = uclamp_util(rq, util);

That should generate the same code, but is (IMO) far easier to read.

>  	util += cpu_util_rt(rq);
>  
>  	dl_util = cpu_util_dl(rq);
Patrick Bellasi Jan. 22, 2019, 3:45 p.m. UTC | #6
On 22-Jan 16:21, Peter Zijlstra wrote:
> On Tue, Jan 15, 2019 at 10:15:05AM +0000, Patrick Bellasi wrote:
> > --- a/kernel/sched/cpufreq_schedutil.c
> > +++ b/kernel/sched/cpufreq_schedutil.c
> > @@ -218,8 +218,15 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
> >  	 * CFS tasks and we use the same metric to track the effective
> >  	 * utilization (PELT windows are synchronized) we can directly add them
> >  	 * to obtain the CPU's actual utilization.
> > +	 *
> > +	 * CFS utilization can be boosted or capped, depending on utilization
> > +	 * clamp constraints requested by currently RUNNABLE tasks.
> > +	 * When there are no CFS RUNNABLE tasks, clamps are released and
> > +	 * frequency will be gracefully reduced with the utilization decay.
> >  	 */
> > -	util = util_cfs;
> > +	util = (type == ENERGY_UTIL)
> > +		? util_cfs
> > +		: uclamp_util(rq, util_cfs);
> 
> That's pretty horrible; what's wrong with:
> 
> 	util = util_cfs;
> 	if (type == FREQUENCY_UTIL)
> 		util = uclamp_util(rq, util);
> 
> That should generate the same code, but is (IMO) far easier to read.

Yes, right... and that's also the pattern we end up with in the
following patch for RT integration.

However, as suggested by Rafael, I'll squash these two patches
together and we will get rid of the above for free ;)

> >  	util += cpu_util_rt(rq);
> >  
> >  	dl_util = cpu_util_dl(rq);
Peter Zijlstra Jan. 22, 2019, 5:13 p.m. UTC | #7
On Tue, Jan 15, 2019 at 10:15:05AM +0000, Patrick Bellasi wrote:
> @@ -342,11 +350,24 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
>  		return;
>  	sg_cpu->iowait_boost_pending = true;
>  
> +	/*
> +	 * Boost FAIR tasks only up to the CPU clamped utilization.
> +	 *
> +	 * Since DL tasks have a much more advanced bandwidth control, it's
> +	 * safe to assume that IO boost does not apply to those tasks.

I'm not buying that argument. IO-boost isn't related to b/w management.

IO-boost is more about compensating for hidden dependencies, and those
don't get less hidden for using a different scheduling class.

Now, arguably DL should not be doing IO in the first place, but that's a
whole different discussion.

> +	 * Instead, since RT tasks are not utilization clamped, we don't want
> +	 * to apply clamping on IO boost while there is blocked RT
> +	 * utilization.
> +	 */
> +	max_boost = sg_cpu->iowait_boost_max;
> +	if (!cpu_util_rt(cpu_rq(sg_cpu->cpu)))
> +		max_boost = uclamp_util(cpu_rq(sg_cpu->cpu), max_boost);
> +
>  	/* Double the boost at each request */
>  	if (sg_cpu->iowait_boost) {
>  		sg_cpu->iowait_boost <<= 1;
> -		if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
> -			sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
> +		if (sg_cpu->iowait_boost > max_boost)
> +			sg_cpu->iowait_boost = max_boost;
>  		return;
>  	}

Hurmph...  so I'm not sold on this bit.
Patrick Bellasi Jan. 22, 2019, 6:18 p.m. UTC | #8
On 22-Jan 18:13, Peter Zijlstra wrote:
> On Tue, Jan 15, 2019 at 10:15:05AM +0000, Patrick Bellasi wrote:
> > @@ -342,11 +350,24 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
> >  		return;
> >  	sg_cpu->iowait_boost_pending = true;
> >  
> > +	/*
> > +	 * Boost FAIR tasks only up to the CPU clamped utilization.
> > +	 *
> > +	 * Since DL tasks have a much more advanced bandwidth control, it's
> > +	 * safe to assume that IO boost does not apply to those tasks.
> 
> I'm not buying that argument. IO-boost isn't related to b/w management.
> 
> IO-boost is more about compensating for hidden dependencies, and those
> don't get less hidden for using a different scheduling class.
> 
> Now, arguably DL should not be doing IO in the first place, but that's a
> whole different discussion.

My understanding is that IOBoost is there to help tasks doing many
and _frequent_ IO operations, which are relatively _not so much_
computationally intensive on the CPU.

Those tasks generate a small utilization and, without IOBoost, will be
executed at a lower frequency and will add undesired latency when
triggering the next IO operation.

Isn't that mainly the reason for it?

IO operations also have to be _frequent_, since we don't go to max OPP
at the very first wakeup from IO. We double the frequency and get to
max only if we have a stable stream of IO operations.
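
To give an illustrative (made up) sequence of numbers: with an initial
boost of 128 and iowait_boost_max = 1024, a stable stream of IO
wakeups ramps the boost as:

   128 -> 256 -> 512 -> 1024 (capped at iowait_boost_max)

i.e. we reach the max OPP only after a few back-to-back IO wakeups.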

IMHO, it makes perfect sense to use DL for these kinds of operations,
but I would expect that, since you care about latency, we should come
up with a proper description of the required bandwidth... possibly
accounting for an additional headroom to compensate for "hidden
dependencies"... without relying on a rather dumb policy like
IOBoost to get our DL tasks working.

In the end, DL is now quite good at driving the freq as high as it
needs... and by closing userspace feedback loops it can also
compensate for all sorts of fluctuations and noise... as demonstrated
by Alessio during the last OSPM:

   http://retis.sssup.it/luca/ospm-summit/2018/Downloads/OSPM_deadline_audio.pdf

> > +	 * Instead, since RT tasks are not utilization clamped, we don't want
> > +	 * to apply clamping on IO boost while there is blocked RT
> > +	 * utilization.
> > +	 */
> > +	max_boost = sg_cpu->iowait_boost_max;
> > +	if (!cpu_util_rt(cpu_rq(sg_cpu->cpu)))
> > +		max_boost = uclamp_util(cpu_rq(sg_cpu->cpu), max_boost);
> > +
> >  	/* Double the boost at each request */
> >  	if (sg_cpu->iowait_boost) {
> >  		sg_cpu->iowait_boost <<= 1;
> > -		if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
> > -			sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
> > +		if (sg_cpu->iowait_boost > max_boost)
> > +			sg_cpu->iowait_boost = max_boost;
> >  		return;
> >  	}
> 
> Hurmph...  so I'm not sold on this bit.

If a task is not clamped we execute it at its required utilization or
even max frequency in case of wakeup from IO.

When a task is util_max clamped instead, we are saying that we don't
care to run it above the specified clamp value and, if possible, we
should run it below that capacity level.

If that's the case, why should this clamping hint not be enforced on
IO wakeups too?

In the end it's still a user-space decision: we basically allow
userspace to define the max IO boost they'd like to get.
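
For reference, a minimal userspace sketch of how such a limit would be
requested (hypothetical example; flag and field names as proposed by
this series' sched_setattr() patch):

	#include <linux/sched/types.h>	/* uapi: struct sched_attr */
	#include <sys/syscall.h>
	#include <unistd.h>

	/* no glibc wrapper for this syscall */
	static int sched_setattr(pid_t pid, struct sched_attr *attr,
				 unsigned int flags)
	{
		return syscall(SYS_sched_setattr, pid, attr, flags);
	}

	int main(void)
	{
		struct sched_attr attr = {
			.size		= sizeof(attr),
			.sched_flags	= SCHED_FLAG_UTIL_CLAMP,
			.sched_util_min	= 0,
			/* cap at 25% of SCHED_CAPACITY_SCALE */
			.sched_util_max	= 256,
		};

		/* clamp the calling task */
		return sched_setattr(0, &attr, 0);
	}

System-wide defaults and cgroup support are handled by other patches
in the series.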
Peter Zijlstra Jan. 23, 2019, 9:52 a.m. UTC | #9
On Tue, Jan 22, 2019 at 06:18:31PM +0000, Patrick Bellasi wrote:
> On 22-Jan 18:13, Peter Zijlstra wrote:
> > On Tue, Jan 15, 2019 at 10:15:05AM +0000, Patrick Bellasi wrote:
> > > @@ -342,11 +350,24 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
> > >  		return;
> > >  	sg_cpu->iowait_boost_pending = true;
> > >  
> > > +	/*
> > > +	 * Boost FAIR tasks only up to the CPU clamped utilization.
> > > +	 *
> > > +	 * Since DL tasks have a much more advanced bandwidth control, it's
> > > +	 * safe to assume that IO boost does not apply to those tasks.
> > 
> > I'm not buying that argument. IO-boost isn't related to b/w management.
> > 
> > IO-boost is more about compensating for hidden dependencies, and those
> > don't get less hidden for using a different scheduling class.
> > 
> > Now, arguably DL should not be doing IO in the first place, but that's a
> > whole different discussion.
> 
> My understanding is that IOBoost is there to help tasks doing many
> and _frequent_ IO operations, which are relatively _not so much_
> computationally intensive on the CPU.
> 
> Those tasks generate a small utilization and, without IOBoost, will be
> executed at a lower frequency and will add undesired latency when
> triggering the next IO operation.
> 
> Isn't that mainly the reason for it?

  http://lkml.kernel.org/r/20170522082154.f57cqovterd2qajv@hirez.programming.kicks-ass.net

Using a lower frequency will allow the IO device to go idle while we try
and get the next request going.

The connection between IO device and task/freq selection is hidden/lost.
We could potentially do better here, but fundamentally a completion
doesn't have an 'owner', there can be multiple waiters etc.

We lose (through our software architecture, and this we could possibly
improve, although it would be fairly invasive) the device busy state,
and it would be the device that raises the CPU frequency (to the point
where request submission is no longer the bottleneck to staying busy).

Currently all we do is mark a task as sleeping on IO and lose any
and all device relations/metrics.

So I don't think the task clamping should affect the IO boosting, as
that is meant to represent the device state, not the task utilization.

> IMHO, it makes perfectly sense to use DL for these kind of operations
> but I would expect that, since you care about latency we should come
> up with a proper description of the required bandwidth... eventually
> accounting for an additional headroom to compensate for "hidden
> dependencies"... without relaying on a quite dummy policy like
> IOBoost to get our DL tasks working.

Deadline is about determinism; (file/disk) IO is typically the
antithesis of that.

> In the end, DL is now quite good at driving the freq as high as it
> needs... and by closing userspace feedback loops it can also
> compensate for all sorts of fluctuations and noise... as demonstrated
> by Alessio during the last OSPM:
> 
>    http://retis.sssup.it/luca/ospm-summit/2018/Downloads/OSPM_deadline_audio.pdf

Audio is special in that it is indeed a deterministic device; also, I
don't think ALSA touches the IO-wait code, that is typically all
filesystem stuff.

> > > +	 * Instead, since RT tasks are not utilization clamped, we don't want
> > > +	 * to apply clamping on IO boost while there is blocked RT
> > > +	 * utilization.
> > > +	 */
> > > +	max_boost = sg_cpu->iowait_boost_max;
> > > +	if (!cpu_util_rt(cpu_rq(sg_cpu->cpu)))
> > > +		max_boost = uclamp_util(cpu_rq(sg_cpu->cpu), max_boost);
> > > +
> > >  	/* Double the boost at each request */
> > >  	if (sg_cpu->iowait_boost) {
> > >  		sg_cpu->iowait_boost <<= 1;
> > > -		if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
> > > -			sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
> > > +		if (sg_cpu->iowait_boost > max_boost)
> > > +			sg_cpu->iowait_boost = max_boost;
> > >  		return;
> > >  	}
> > 
> > Hurmph...  so I'm not sold on this bit.
> 
> If a task is not clamped we execute it at its required utilization or
> even max frequency in case of wakeup from IO.
> 
> When a task is util_max clamped instead, we are saying that we don't
> care to run it above the specified clamp value and, if possible, we
> should run it below that capacity level.
> 
> > If that's the case, why should this clamping hint not be enforced on
> IO wakeups too?
> 
> > In the end it's still a user-space decision: we basically allow
> > userspace to define the max IO boost they'd like to get.

Because it is the wrong knob for it.

Ideally we'd extend the IO-wait state to include the device-busy state
at the time of sleep. At the very least double the io_schedule() state
space from 1 to 2 bits, where we not only indicate: yes this is an
IO-sleep, but also can indicate device saturation. When the device is
saturated, we don't need to boost further.

(this binary state will of course cause oscillations where we drop the
freq, drop device saturation, then ramp the freq, regain device
saturation etc..)

However, doing this is going to require fairly massive surgery on our
whole IO stack.
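
As a purely hypothetical sketch of that 2-bit state (none of these
names exist today):

	/* extend the current one-bit in_iowait with a saturation hint */
	enum io_sleep_state {
		IO_SLEEP_NONE,		/* not an IO sleep */
		IO_SLEEP_WAIT,		/* IO sleep, device not saturated */
		IO_SLEEP_DEV_SATURATED,	/* IO sleep, device saturated */
	};

	/*
	 * schedutil would then ramp the IO boost only for IO_SLEEP_WAIT
	 * wakeups: once the device is saturated, submitting requests
	 * faster buys nothing, so there is no point boosting further.
	 */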

Also; how big of a problem is 'spurious' boosting really? Joel tried to
introduce a boost_max tunable, but the gradual boosting thing was good
enough at the time.
Patrick Bellasi Jan. 23, 2019, 2:24 p.m. UTC | #10
On 23-Jan 10:52, Peter Zijlstra wrote:
> On Tue, Jan 22, 2019 at 06:18:31PM +0000, Patrick Bellasi wrote:
> > On 22-Jan 18:13, Peter Zijlstra wrote:
> > > On Tue, Jan 15, 2019 at 10:15:05AM +0000, Patrick Bellasi wrote:

[...]

> > If a task is not clamped we execute it at its required utilization or
> > even max frequency in case of wakeup from IO.
> > 
> > When a task is util_max clamped instead, we are saying that we don't
> > care to run it above the specified clamp value and, if possible, we
> > should run it below that capacity level.
> > 
> > If that's the case, why this clamping hints should not be enforced on
> > IO wakeups too?
> > 
> > At the end it's still a user-space decision, we basically allow
> > userspace to defined what's the max IO boost they like to get.
> 
> Because it is the wrong knob for it.
> 
> Ideally we'd extend the IO-wait state to include the device-busy state
> at the time of sleep. At the very least double the io_schedule() state
> space from 1 to 2 bits, where we not only indicate: yes this is an
> IO-sleep, but also can indicate device saturation. When the device is
> saturated, we don't need to boost further.
> 
> (this binary state will of course cause oscillations where we drop the
> freq, drop device saturation, then ramp the freq, regain device
> saturation etc..)
> 
> However, doing this is going to require fairly massive surgery on our
> whole IO stack.
> 
> Also; how big of a problem is 'spurious' boosting really? Joel tried to
> introduce a boost_max tunable, but the gradual boosting thing was good
> enough at the time.

Ok then, I'll drop the clamp on IOBoost... you're right, and moreover
we can always investigate a better solution in the future with a
real use-case at hand.

Cheers.

Patch

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 033ec7c45f13..520ee2b785e7 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -218,8 +218,15 @@  unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
 	 * CFS tasks and we use the same metric to track the effective
 	 * utilization (PELT windows are synchronized) we can directly add them
 	 * to obtain the CPU's actual utilization.
+	 *
+	 * CFS utilization can be boosted or capped, depending on utilization
+	 * clamp constraints requested by currently RUNNABLE tasks.
+	 * When there are no CFS RUNNABLE tasks, clamps are released and
+	 * frequency will be gracefully reduced with the utilization decay.
 	 */
-	util = util_cfs;
+	util = (type == ENERGY_UTIL)
+		? util_cfs
+		: uclamp_util(rq, util_cfs);
 	util += cpu_util_rt(rq);
 
 	dl_util = cpu_util_dl(rq);
@@ -327,6 +334,7 @@  static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
 			       unsigned int flags)
 {
 	bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;
+	unsigned int max_boost;
 
 	/* Reset boost if the CPU appears to have been idle enough */
 	if (sg_cpu->iowait_boost &&
@@ -342,11 +350,24 @@  static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
 		return;
 	sg_cpu->iowait_boost_pending = true;
 
+	/*
+	 * Boost FAIR tasks only up to the CPU clamped utilization.
+	 *
+	 * Since DL tasks have a much more advanced bandwidth control, it's
+	 * safe to assume that IO boost does not apply to those tasks.
+	 * Instead, since RT tasks are not utilization clamped, we don't want
+	 * to apply clamping on IO boost while there is blocked RT
+	 * utilization.
+	 */
+	max_boost = sg_cpu->iowait_boost_max;
+	if (!cpu_util_rt(cpu_rq(sg_cpu->cpu)))
+		max_boost = uclamp_util(cpu_rq(sg_cpu->cpu), max_boost);
+
 	/* Double the boost at each request */
 	if (sg_cpu->iowait_boost) {
 		sg_cpu->iowait_boost <<= 1;
-		if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
-			sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+		if (sg_cpu->iowait_boost > max_boost)
+			sg_cpu->iowait_boost = max_boost;
 		return;
 	}
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b7f3ee8ba164..95d62a2a0b44 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2267,6 +2267,29 @@  static inline unsigned int uclamp_none(int clamp_id)
 	return SCHED_CAPACITY_SCALE;
 }
 
+#ifdef CONFIG_UCLAMP_TASK
+static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
+{
+	unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
+	unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
+
+	/*
+	 * Since CPU's {min,max}_util clamps are MAX aggregated considering
+	 * RUNNABLE tasks with _different_ clamps, we can end up with an
+	 * inversion, which we can fix at usage time.
+	 */
+	if (unlikely(min_util >= max_util))
+		return min_util;
+
+	return clamp(util, min_util, max_util);
+}
+#else /* CONFIG_UCLAMP_TASK */
+static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
+{
+	return util;
+}
+#endif /* CONFIG_UCLAMP_TASK */
+
 #ifdef arch_scale_freq_capacity
 # ifndef arch_scale_freq_invariant
 #  define arch_scale_freq_invariant()	true