
[1/1] sched/eas: introduce system-wide overutil indicator

Message ID 1568877622-28073-1-git-send-email-yt.chang@mediatek.com (mailing list archive)
State New, archived
Headers show
Series [1/1] sched/eas: introduce system-wide overutil indicator

Commit Message

YT Chang Sept. 19, 2019, 7:20 a.m. UTC
When the system is overutilized, cross-cluster load balancing is
triggered and the scheduler no longer uses energy-aware scheduling
to choose CPUs.

Overutilization means the utilization of any CPU exceeds the
threshold (80%).

However, even a single heavy task (e.g. a while(1) loop) running on
the highest-capacity CPUs is enough to trigger overutilization, so
the system stops using Energy Aware scheduling.

To avoid this, introduce a system-wide over-utilization indicator to
decide when to trigger cross-cluster load balancing.

The policy is:
	the utilization of ALL CPUs in the highest-capacity cluster
						exceeds the threshold (80%), or
	the utilization of ANY CPU not in the highest-capacity cluster
						exceeds the threshold (80%)
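A minimal C-style sketch of this policy (illustration only, not the patch
itself; the helper is hypothetical, the field and function names mirror the
scheduler code below, and the 80% margin is written with the usual
1280/1024 capacity_margin convention):

	static bool system_overutilized_sketch(struct root_domain *rd)
	{
		unsigned long big_util = 0, big_cap = 0;
		int cpu;

		for_each_cpu(cpu, rd->span) {
			unsigned long cap = capacity_orig_of(cpu);

			if (cap == rd->max_cpu_capacity) {
				/* Highest-capacity CPUs are judged as one aggregate. */
				big_util += cpu_util(cpu);
				big_cap += cap;
			} else if (cpu_util(cpu) * 1280 > cap * 1024) {
				/* Any smaller CPU over ~80% tips the system. */
				return true;
			}
		}

		/* ... or the big cluster as a whole is over ~80%. */
		return big_util * 1280 > big_cap * 1024;
	}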

Signed-off-by: YT Chang <yt.chang@mediatek.com>
---
 kernel/sched/fair.c | 76 +++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 65 insertions(+), 11 deletions(-)

Comments

Vincent Guittot Sept. 19, 2019, 8 a.m. UTC | #1
On Thu, 19 Sep 2019 at 09:20, YT Chang <yt.chang@mediatek.com> wrote:
>
> When the system is overutilization, the load-balance crossing

s/overutilization/overutilized/

> clusters will be triggered and scheduler will not use energy
> aware scheduling to choose CPUs.
>
> The overutilization means the loading of  ANY CPUs

s/ANY/any/

> exceeds threshold (80%).
>
> However, only 1 heavy task or while-1 program will run on highest
> capacity CPUs and it still result to trigger overutilization. So
> the system will not use Energy Aware scheduling.
>
> To avoid it, a system-wide over-utilization indicator to trigger
> load-balance cross clusters.

The current rd->overutilized is already system wide. I mean that as
soon as one CPU is overutilized, the whole system is considered
overutilized, whereas you would like a finer-grained level of
overutilization.
I remember a patch that proposed per-sched_domain overutilization
detection. load_balance at one sched_domain level was enabled only if
the child level was not able to handle the overutilization, and energy
aware scheduling was still used in the other sched_domains.

>
> The policy is:
>         The loading of "ALL CPUs in the highest capacity"
>                                                 exceeds threshold(80%) or
>         The loading of "Any CPUs not in the highest capacity"
>                                                 exceed threshold(80%)

Do you have use cases or figures that show a benefit with this change?

>
> Signed-off-by: YT Chang <yt.chang@mediatek.com>
> ---
>  kernel/sched/fair.c | 76 +++++++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 65 insertions(+), 11 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 036be95..f4c3d70 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5182,10 +5182,71 @@ static inline bool cpu_overutilized(int cpu)
>  static inline void update_overutilized_status(struct rq *rq)
>  {
>         if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
> -               WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
> -               trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
> +               if (capacity_orig_of(cpu_of(rq)) < rq->rd->max_cpu_capacity) {
> +                       WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
> +                       trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
> +               }
>         }
>  }
> +
> +static
> +void update_system_overutilized(struct sched_domain *sd, struct cpumask *cpus)
> +{
> +       unsigned long group_util;
> +       bool intra_overutil = false;
> +       unsigned long max_capacity;
> +       struct sched_group *group = sd->groups;
> +       struct root_domain *rd;
> +       int this_cpu;
> +       bool overutilized;
> +       int i;
> +
> +       this_cpu = smp_processor_id();
> +       rd = cpu_rq(this_cpu)->rd;
> +       overutilized = READ_ONCE(rd->overutilized);
> +       max_capacity = rd->max_cpu_capacity;
> +
> +       do {
> +               group_util = 0;
> +               for_each_cpu_and(i, sched_group_span(group), cpus) {
> +                       group_util += cpu_util(i);
> +                       if (cpu_overutilized(i)) {
> +                               if (capacity_orig_of(i) < max_capacity) {
> +                                       intra_overutil = true;
> +                                       break;
> +                               }
> +                       }
> +               }
> +
> +               /*
> +                * A capacity base hint for over-utilization.
> +                * Not to trigger system overutiled if heavy tasks
> +                * in Big.cluster, so
> +                * add the free room(20%) of Big.cluster is impacted which means
> +                * system-wide over-utilization,
> +                * that considers whole cluster not single cpu
> +                */
> +               if (group->group_weight > 1 && (group->sgc->capacity * 1024 <
> +                                               group_util * capacity_margin)) {
> +                       intra_overutil = true;
> +                       break;
> +               }
> +
> +               group = group->next;
> +
> +       } while (group != sd->groups && !intra_overutil);
> +
> +       if (overutilized != intra_overutil) {
> +               if (intra_overutil == true) {
> +                       WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
> +                       trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
> +               } else {
> +                       WRITE_ONCE(rd->overutilized, 0);
> +                       trace_sched_overutilized_tp(rd, 0);
> +               }
> +       }
> +}
> +
>  #else
>  static inline void update_overutilized_status(struct rq *rq) { }
>  #endif
> @@ -8242,15 +8303,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>
>                 /* update overload indicator if we are at root domain */
>                 WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
> -
> -               /* Update over-utilization (tipping point, U >= 0) indicator */
> -               WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
> -               trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
> -       } else if (sg_status & SG_OVERUTILIZED) {
> -               struct root_domain *rd = env->dst_rq->rd;
> -
> -               WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
> -               trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
>         }
>  }
>
> @@ -8476,6 +8528,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
>          */
>         update_sd_lb_stats(env, &sds);
>
> +       update_system_overutilized(env->sd, env->cpus);

This should be called only if (sched_energy_enabled())
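i.e. something along these lines (untested sketch, simply reusing the
sched_energy_enabled() check that already appears a few lines below):

	if (sched_energy_enabled())
		update_system_overutilized(env->sd, env->cpus);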

> +
>         if (sched_energy_enabled()) {
>                 struct root_domain *rd = env->dst_rq->rd;
>
> --
> 1.9.1
>
Quentin Perret Sept. 19, 2019, 8:10 a.m. UTC | #2
Hi,

Could you please CC me on later versions of this ? I'm interested.

On Thursday 19 Sep 2019 at 15:20:22 (+0800), YT Chang wrote:
> When the system is overutilization, the load-balance crossing
> clusters will be triggered and scheduler will not use energy
> aware scheduling to choose CPUs.
> 
> The overutilization means the loading of  ANY CPUs
> exceeds threshold (80%).
> 
> However, only 1 heavy task or while-1 program will run on highest
> capacity CPUs and it still result to trigger overutilization. So
> the system will not use Energy Aware scheduling.
> 
> To avoid it, a system-wide over-utilization indicator to trigger
> load-balance cross clusters.
> 
> The policy is:
> 	The loading of "ALL CPUs in the highest capacity"
> 						exceeds threshold(80%) or
> 	The loading of "Any CPUs not in the highest capacity"
> 						exceed threshold(80%)
> 
> Signed-off-by: YT Chang <yt.chang@mediatek.com>

Right, so we originally went for the simpler implementation because in
general when you have the biggest CPUs of the system running flat out at
max freq, the micro-optimizations for energy on littles don't matter all
that much. Is there a use-case where you see a big difference ?

A second thing is RT pressure. If a big CPU is used at 50% by a CFS task
and 50% by RT, we should mark it overutilized. Otherwise EAS will think
the CFS task is 50% and try to down-migrate it. But the truth is, we
don't know the size of the task ... So, I believe your patch breaks that
ATM.
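For reference, a rough sketch of the mainline per-CPU check (paraphrased,
not the exact mainline code), which is what makes RT pressure visible
today: capacity_of() reflects RT/DL/IRQ pressure, while capacity_orig_of()
and rd->max_cpu_capacity do not:

	static inline bool cpu_overutilized(int cpu)	/* paraphrased */
	{
		/* capacity_of() < capacity_orig_of() under RT/DL/IRQ pressure */
		return cpu_util(cpu) * 1280 > capacity_of(cpu) * 1024;
	}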

And there is a similar problem with misfit. That is, a task running flat
out on a big CPU will be flagged as misfit, even if there is nothing we
can do about it (we can't up-migrate it for obvious reasons). So perhaps we
should look at a common solution for both issues, if deemed useful.

> ---
>  kernel/sched/fair.c | 76 +++++++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 65 insertions(+), 11 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 036be95..f4c3d70 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5182,10 +5182,71 @@ static inline bool cpu_overutilized(int cpu)
>  static inline void update_overutilized_status(struct rq *rq)
>  {
>  	if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
> -		WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
> -		trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
> +		if (capacity_orig_of(cpu_of(rq)) < rq->rd->max_cpu_capacity) {
> +			WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
> +			trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
> +		}
>  	}
>  }
> +
> +static
> +void update_system_overutilized(struct sched_domain *sd, struct cpumask *cpus)
> +{
> +	unsigned long group_util;
> +	bool intra_overutil = false;
> +	unsigned long max_capacity;
> +	struct sched_group *group = sd->groups;
> +	struct root_domain *rd;
> +	int this_cpu;
> +	bool overutilized;
> +	int i;
> +
> +	this_cpu = smp_processor_id();
> +	rd = cpu_rq(this_cpu)->rd;
> +	overutilized = READ_ONCE(rd->overutilized);
> +	max_capacity = rd->max_cpu_capacity;
> +
> +	do {
> +		group_util = 0;
> +		for_each_cpu_and(i, sched_group_span(group), cpus) {
> +			group_util += cpu_util(i);
> +			if (cpu_overutilized(i)) {
> +				if (capacity_orig_of(i) < max_capacity) {

This is what breaks things with RT pressure I think.

> +					intra_overutil = true;
> +					break;
> +				}
> +			}
> +		}
> +
> +		/*
> +		 * A capacity base hint for over-utilization.
> +		 * Not to trigger system overutiled if heavy tasks
> +		 * in Big.cluster, so
> +		 * add the free room(20%) of Big.cluster is impacted which means
> +		 * system-wide over-utilization,
> +		 * that considers whole cluster not single cpu
> +		 */
> +		if (group->group_weight > 1 && (group->sgc->capacity * 1024 <
> +						group_util * capacity_margin)) {
> +			intra_overutil = true;
> +			break;
> +		}

What if we have only one big MC domain with both big and little CPUs and
no DIE? Say you have 4 big tasks, 4 big CPUs, 4 little CPUs (idle).
You'll fail to mark the system overutilized, no?

> +
> +		group = group->next;
> +
> +	} while (group != sd->groups && !intra_overutil);
> +
> +	if (overutilized != intra_overutil) {
> +		if (intra_overutil == true) {
> +			WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
> +			trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
> +		} else {
> +			WRITE_ONCE(rd->overutilized, 0);
> +			trace_sched_overutilized_tp(rd, 0);
> +		}
> +	}
> +}
> +
>  #else
>  static inline void update_overutilized_status(struct rq *rq) { }
>  #endif
> @@ -8242,15 +8303,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>  
>  		/* update overload indicator if we are at root domain */
>  		WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
> -
> -		/* Update over-utilization (tipping point, U >= 0) indicator */
> -		WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
> -		trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
> -	} else if (sg_status & SG_OVERUTILIZED) {
> -		struct root_domain *rd = env->dst_rq->rd;
> -
> -		WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
> -		trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
>  	}
>  }
>  
> @@ -8476,6 +8528,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
>  	 */
>  	update_sd_lb_stats(env, &sds);
>  
> +	update_system_overutilized(env->sd, env->cpus);
> +
>  	if (sched_energy_enabled()) {
>  		struct root_domain *rd = env->dst_rq->rd;
>  
> -- 
> 1.9.1
>
kernel test robot Sept. 21, 2019, 2:44 p.m. UTC | #3
Hi YT,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[cannot apply to v5.3 next-20190918]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/YT-Chang/sched-eas-introduce-system-wide-overutil-indicator/20190919-152213
config: x86_64-randconfig-s1-201937 (attached as .config)
compiler: gcc-6 (Debian 6.3.0-18+deb9u1) 6.3.0 20170516
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 
:::::: branch date: 2 hours ago
:::::: commit date: 2 hours ago

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   kernel/sched/fair.c: In function 'update_system_overutilized':
>> kernel/sched/fair.c:5234:20: error: 'capacity_margin' undeclared (first use in this function)
          group_util * capacity_margin)) {
                       ^~~~~~~~~~~~~~~
   kernel/sched/fair.c:5234:20: note: each undeclared identifier is reported only once for each function it appears in
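
A possible fix against linus/master, where capacity_margin has been
replaced by the fits_capacity() helper, would be something along these
lines (untested sketch):

		if (group->group_weight > 1 &&
		    !fits_capacity(group_util, group->sgc->capacity)) {
			intra_overutil = true;
			break;
		}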

# https://github.com/0day-ci/linux/commit/58f2ed2a11501d4de287fafc0a7b3385d54f8238
git remote add linux-review https://github.com/0day-ci/linux
git remote update linux-review
git checkout 58f2ed2a11501d4de287fafc0a7b3385d54f8238
vim +/capacity_margin +5234 kernel/sched/fair.c

58f2ed2a11501d YT Chang 2019-09-19  5195  
58f2ed2a11501d YT Chang 2019-09-19  5196  static
58f2ed2a11501d YT Chang 2019-09-19  5197  void update_system_overutilized(struct sched_domain *sd, struct cpumask *cpus)
58f2ed2a11501d YT Chang 2019-09-19  5198  {
58f2ed2a11501d YT Chang 2019-09-19  5199  	unsigned long group_util;
58f2ed2a11501d YT Chang 2019-09-19  5200  	bool intra_overutil = false;
58f2ed2a11501d YT Chang 2019-09-19  5201  	unsigned long max_capacity;
58f2ed2a11501d YT Chang 2019-09-19  5202  	struct sched_group *group = sd->groups;
58f2ed2a11501d YT Chang 2019-09-19  5203  	struct root_domain *rd;
58f2ed2a11501d YT Chang 2019-09-19  5204  	int this_cpu;
58f2ed2a11501d YT Chang 2019-09-19  5205  	bool overutilized;
58f2ed2a11501d YT Chang 2019-09-19  5206  	int i;
58f2ed2a11501d YT Chang 2019-09-19  5207  
58f2ed2a11501d YT Chang 2019-09-19  5208  	this_cpu = smp_processor_id();
58f2ed2a11501d YT Chang 2019-09-19  5209  	rd = cpu_rq(this_cpu)->rd;
58f2ed2a11501d YT Chang 2019-09-19  5210  	overutilized = READ_ONCE(rd->overutilized);
58f2ed2a11501d YT Chang 2019-09-19  5211  	max_capacity = rd->max_cpu_capacity;
58f2ed2a11501d YT Chang 2019-09-19  5212  
58f2ed2a11501d YT Chang 2019-09-19  5213  	do {
58f2ed2a11501d YT Chang 2019-09-19  5214  		group_util = 0;
58f2ed2a11501d YT Chang 2019-09-19  5215  		for_each_cpu_and(i, sched_group_span(group), cpus) {
58f2ed2a11501d YT Chang 2019-09-19  5216  			group_util += cpu_util(i);
58f2ed2a11501d YT Chang 2019-09-19  5217  			if (cpu_overutilized(i)) {
58f2ed2a11501d YT Chang 2019-09-19  5218  				if (capacity_orig_of(i) < max_capacity) {
58f2ed2a11501d YT Chang 2019-09-19  5219  					intra_overutil = true;
58f2ed2a11501d YT Chang 2019-09-19  5220  					break;
58f2ed2a11501d YT Chang 2019-09-19  5221  				}
58f2ed2a11501d YT Chang 2019-09-19  5222  			}
58f2ed2a11501d YT Chang 2019-09-19  5223  		}
58f2ed2a11501d YT Chang 2019-09-19  5224  
58f2ed2a11501d YT Chang 2019-09-19  5225  		/*
58f2ed2a11501d YT Chang 2019-09-19  5226  		 * A capacity base hint for over-utilization.
58f2ed2a11501d YT Chang 2019-09-19  5227  		 * Not to trigger system overutiled if heavy tasks
58f2ed2a11501d YT Chang 2019-09-19  5228  		 * in Big.cluster, so
58f2ed2a11501d YT Chang 2019-09-19  5229  		 * add the free room(20%) of Big.cluster is impacted which means
58f2ed2a11501d YT Chang 2019-09-19  5230  		 * system-wide over-utilization,
58f2ed2a11501d YT Chang 2019-09-19  5231  		 * that considers whole cluster not single cpu
58f2ed2a11501d YT Chang 2019-09-19  5232  		 */
58f2ed2a11501d YT Chang 2019-09-19  5233  		if (group->group_weight > 1 && (group->sgc->capacity * 1024 <
58f2ed2a11501d YT Chang 2019-09-19 @5234  						group_util * capacity_margin)) {
58f2ed2a11501d YT Chang 2019-09-19  5235  			intra_overutil = true;
58f2ed2a11501d YT Chang 2019-09-19  5236  			break;
58f2ed2a11501d YT Chang 2019-09-19  5237  		}
58f2ed2a11501d YT Chang 2019-09-19  5238  
58f2ed2a11501d YT Chang 2019-09-19  5239  		group = group->next;
58f2ed2a11501d YT Chang 2019-09-19  5240  
58f2ed2a11501d YT Chang 2019-09-19  5241  	} while (group != sd->groups && !intra_overutil);
58f2ed2a11501d YT Chang 2019-09-19  5242  
58f2ed2a11501d YT Chang 2019-09-19  5243  	if (overutilized != intra_overutil) {
58f2ed2a11501d YT Chang 2019-09-19  5244  		if (intra_overutil == true) {
58f2ed2a11501d YT Chang 2019-09-19  5245  			WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
58f2ed2a11501d YT Chang 2019-09-19  5246  			trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
58f2ed2a11501d YT Chang 2019-09-19  5247  		} else {
58f2ed2a11501d YT Chang 2019-09-19  5248  			WRITE_ONCE(rd->overutilized, 0);
58f2ed2a11501d YT Chang 2019-09-19  5249  			trace_sched_overutilized_tp(rd, 0);
58f2ed2a11501d YT Chang 2019-09-19  5250  		}
58f2ed2a11501d YT Chang 2019-09-19  5251  	}
58f2ed2a11501d YT Chang 2019-09-19  5252  }
58f2ed2a11501d YT Chang 2019-09-19  5253  

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Dietmar Eggemann Sept. 23, 2019, 8:05 a.m. UTC | #4
On 9/19/19 9:20 AM, YT Chang wrote:
> When the system is overutilization, the load-balance crossing
> clusters will be triggered and scheduler will not use energy
> aware scheduling to choose CPUs.

We're currently transitioning from traditional big.LITTLE (where the
CPUs of one cluster, all with the same original CPU capacity, form a DIE
Sched Domain (SD) level Sched Group (SG)) to DynamIQ systems. The latter
can share CPUs with different original CPU capacities in one cluster.
In Linux mainline, today's DynamIQ systems (1 cluster) have only 1 MC SD
level SG.

For those systems the current approach is much more applicable.

Or do you apply the out-of-tree Phantom Domain concept, which creates n
(n=2 or 3 ((huge,) big, little)) DIE SGs on your 1 cluster DynamIQ system?

> The overutilization means the loading of  ANY CPUs
> exceeds threshold (80%).
> 
> However, only 1 heavy task or while-1 program will run on highest
> capacity CPUs and it still result to trigger overutilization. So
> the system will not use Energy Aware scheduling.

The patch-header of commit 2802bf3cd936 ("sched/fair: Add
over-utilization/tipping point indicator") mentioned why the current
approach is so conservatively defined.

> To avoid it, a system-wide over-utilization indicator to trigger
> load-balance cross clusters.
> 
> The policy is:
> 	The loading of "ALL CPUs in the highest capacity"
> 						exceeds threshold(80%) or
> 	The loading of "Any CPUs not in the highest capacity"
> 						exceed threshold(80%)

We experimented with an overutilized (tipping point) indicator per SD
from Thara Gopinath (Linaro), already mentioned by Vincent, up to v2 of
the Energy Aware Scheduling patch set in 2018, but we couldn't find any
advantage using it over the approach you now find in mainline.

https://lore.kernel.org/r/20180406153607.17815-4-dietmar.eggemann@arm.com

Maybe you can have a look at this patch and see if it gives you an
advantage with your use cases and system topology layout?

The 'system-wide' in the name of the patch is misleading. The current
approach is also system-wide: the overutilized information lives on the
root domain (system here stands for root domain). What you change is the
detection mechanism, from per-CPU to mixed-mode detection (per-CPU and
per-SG).

> Signed-off-by: YT Chang <yt.chang@mediatek.com>
> ---
>  kernel/sched/fair.c | 76 +++++++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 65 insertions(+), 11 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 036be95..f4c3d70 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5182,10 +5182,71 @@ static inline bool cpu_overutilized(int cpu)
>  static inline void update_overutilized_status(struct rq *rq)
>  {
>  	if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
> -		WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
> -		trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
> +		if (capacity_orig_of(cpu_of(rq)) < rq->rd->max_cpu_capacity) {
> +			WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
> +			trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
> +		}
>  	}
>  }
> +
> +static
> +void update_system_overutilized(struct sched_domain *sd, struct cpumask *cpus)
> +{
> +	unsigned long group_util;
> +	bool intra_overutil = false;
> +	unsigned long max_capacity;
> +	struct sched_group *group = sd->groups;
> +	struct root_domain *rd;
> +	int this_cpu;
> +	bool overutilized;
> +	int i;
> +
> +	this_cpu = smp_processor_id();
> +	rd = cpu_rq(this_cpu)->rd;
> +	overutilized = READ_ONCE(rd->overutilized);
> +	max_capacity = rd->max_cpu_capacity;
> +
> +	do {
> +		group_util = 0;
> +		for_each_cpu_and(i, sched_group_span(group), cpus) {
> +			group_util += cpu_util(i);
> +			if (cpu_overutilized(i)) {
> +				if (capacity_orig_of(i) < max_capacity) {
> +					intra_overutil = true;
> +					break;
> +				}
> +			}
> +		}
> +
> +		/*
> +		 * A capacity base hint for over-utilization.
> +		 * Not to trigger system overutiled if heavy tasks
> +		 * in Big.cluster, so
> +		 * add the free room(20%) of Big.cluster is impacted which means
> +		 * system-wide over-utilization,
> +		 * that considers whole cluster not single cpu
> +		 */
> +		if (group->group_weight > 1 && (group->sgc->capacity * 1024 <
> +						group_util * capacity_margin)) {

Why 'group->group_weight > 1'? Do you have some out-of-tree code which
lets SGs with 1 CPU survive?

[...]

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 036be95..f4c3d70 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5182,10 +5182,71 @@  static inline bool cpu_overutilized(int cpu)
 static inline void update_overutilized_status(struct rq *rq)
 {
 	if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
-		WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
-		trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
+		if (capacity_orig_of(cpu_of(rq)) < rq->rd->max_cpu_capacity) {
+			WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
+			trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
+		}
 	}
 }
+
+static
+void update_system_overutilized(struct sched_domain *sd, struct cpumask *cpus)
+{
+	unsigned long group_util;
+	bool intra_overutil = false;
+	unsigned long max_capacity;
+	struct sched_group *group = sd->groups;
+	struct root_domain *rd;
+	int this_cpu;
+	bool overutilized;
+	int i;
+
+	this_cpu = smp_processor_id();
+	rd = cpu_rq(this_cpu)->rd;
+	overutilized = READ_ONCE(rd->overutilized);
+	max_capacity = rd->max_cpu_capacity;
+
+	do {
+		group_util = 0;
+		for_each_cpu_and(i, sched_group_span(group), cpus) {
+			group_util += cpu_util(i);
+			if (cpu_overutilized(i)) {
+				if (capacity_orig_of(i) < max_capacity) {
+					intra_overutil = true;
+					break;
+				}
+			}
+		}
+
+		/*
+		 * A capacity-based hint for over-utilization.
+		 * Don't mark the system overutilized merely because
+		 * heavy tasks sit on the big cluster; only when the
+		 * big cluster as a whole (not a single CPU) has eaten
+		 * into its ~20% headroom do we consider the system
+		 * over-utilized.
+		 */
+		if (group->group_weight > 1 && (group->sgc->capacity * 1024 <
+						group_util * capacity_margin)) {
+			intra_overutil = true;
+			break;
+		}
+
+		group = group->next;
+
+	} while (group != sd->groups && !intra_overutil);
+
+	if (overutilized != intra_overutil) {
+		if (intra_overutil == true) {
+			WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
+			trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
+		} else {
+			WRITE_ONCE(rd->overutilized, 0);
+			trace_sched_overutilized_tp(rd, 0);
+		}
+	}
+}
+
 #else
 static inline void update_overutilized_status(struct rq *rq) { }
 #endif
@@ -8242,15 +8303,6 @@  static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 
 		/* update overload indicator if we are at root domain */
 		WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
-
-		/* Update over-utilization (tipping point, U >= 0) indicator */
-		WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
-		trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
-	} else if (sg_status & SG_OVERUTILIZED) {
-		struct root_domain *rd = env->dst_rq->rd;
-
-		WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
-		trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
 	}
 }
 
@@ -8476,6 +8528,8 @@  static struct sched_group *find_busiest_group(struct lb_env *env)
 	 */
 	update_sd_lb_stats(env, &sds);
 
+	update_system_overutilized(env->sd, env->cpus);
+
 	if (sched_energy_enabled()) {
 		struct root_domain *rd = env->dst_rq->rd;