sched: dynamic config sd_flags if described in DT

Message ID 1647331137-69890-1-git-send-email-wangqing@vivo.com (mailing list archive)
State New, archived
Series sched: dynamic config sd_flags if described in DT

Commit Message

王擎 March 15, 2022, 7:58 a.m. UTC
From: Wang Qing <wangqing@vivo.com>

The device tree can describe the cache topology via "next-level-cache".
Prefer configuring SD_SHARE_PKG_RESOURCES from the DT instead of using a default value.
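
For illustration, a hypothetical DT fragment (node names and phandles invented
here) showing how two cores of one complex could chain through a shared L2 into
a common L3 via "next-level-cache":

	cpu0: cpu@0 {
		next-level-cache = <&cpx0_l2>;
	};

	cpu1: cpu@100 {
		next-level-cache = <&cpx0_l2>;
	};

	cpx0_l2: l2-cache0 {
		compatible = "cache";
		next-level-cache = <&dsu_l3>;
	};

	dsu_l3: l3-cache {
		compatible = "cache";
	};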

Signed-off-by: Wang Qing <wangqing@vivo.com>
---
 arch/arm/kernel/smp.c         |  1 +
 arch/arm64/kernel/smp.c       |  1 +
 arch/riscv/kernel/smpboot.c   |  1 +
 drivers/base/arch_topology.c  | 59 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/arch_topology.h |  2 ++
 kernel/sched/topology.c       | 10 ++++++++
 6 files changed, 74 insertions(+)

Comments

Peter Zijlstra March 15, 2022, 9:29 a.m. UTC | #1
On Tue, Mar 15, 2022 at 12:58:54AM -0700, Qing Wang wrote:
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1527,6 +1527,7 @@ sd_init(struct sched_domain_topology_level *tl,
>  	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
>  	int sd_id, sd_weight, sd_flags = 0;
>  	struct cpumask *sd_span;
> +	int ret;
>  
>  #ifdef CONFIG_NUMA
>  	/*
> @@ -1539,6 +1540,15 @@ sd_init(struct sched_domain_topology_level *tl,
>  
>  	if (tl->sd_flags)
>  		sd_flags = (*tl->sd_flags)();
> +
> +#ifdef CONFIG_GENERIC_ARCH_TOPOLOGY
> +	ret = cpus_share_self_cache(cpu_map);
> +	if (ret == 1)
> +		sd_flags |= SD_SHARE_PKG_RESOURCES;
> +	else if (ret == 0)
> +		sd_flags &= ~SD_SHARE_PKG_RESOURCES;
> +#endif
> +
>  	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
>  			"wrong sd_flags in topology description\n"))
>  		sd_flags &= TOPOLOGY_SD_FLAGS;

Still NAK, you don't have to touch generic code for any of this.
kernel test robot March 15, 2022, 11:52 a.m. UTC | #2
Hi Qing,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on arm64/for-next/core]
[also build test ERROR on driver-core/driver-core-testing tip/sched/core linus/master v5.17-rc8 next-20220310]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Qing-Wang/sched-dynamic-config-sd_flags-if-described-in-DT/20220315-160039
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git for-next/core
config: riscv-nommu_k210_sdcard_defconfig (https://download.01.org/0day-ci/archive/20220315/202203151939.NI7Rsold-lkp@intel.com/config)
compiler: riscv64-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/3322560249c42b0a3e719b19842a3ace7d5ffb6a
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Qing-Wang/sched-dynamic-config-sd_flags-if-described-in-DT/20220315-160039
        git checkout 3322560249c42b0a3e719b19842a3ace7d5ffb6a
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross O=build_dir ARCH=riscv SHELL=/bin/bash drivers/base/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   drivers/base/arch_topology.c: In function 'cpus_share_self_cache':
>> drivers/base/arch_topology.c:648:37: error: 'cpu' undeclared (first use in this function)
     648 |                 if (!cache_topology[cpu][cache_level])
         |                                     ^~~
   drivers/base/arch_topology.c:648:37: note: each undeclared identifier is reported only once for each function it appears in


vim +/cpu +648 drivers/base/arch_topology.c

   639	
   640	int cpus_share_self_cache(const struct cpumask *cpu_map)
   641	{
   642		int cache_level, cpu_id;
   643		int first, last;
   644		int id = cpumask_first(cpu_map);
   645		int size = cpumask_weight(cpu_map);
   646	
   647		for (cache_level = 0; cache_level < MAX_CACHE_LEVEL; cache_level++) {
 > 648			if (!cache_topology[cpu][cache_level])
   649				return -1;
   650	
   651			first = -1;
   652			last = id;
   653			for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) {
   654				if (cache_topology[cpu][cache_level] == cache_topology[cpu_id][cache_level]) {
   655					if (cpu_id < id || cpu_id >= id + size)
   656						return 0;
   657	
   658					first = (first == -1)?cpu_id:first;
   659					last = cpu_id;
   660				}
   661			}
   662	
   663			if (first == id && last == id + size)
   664				return 1;
   665		}
   666	
   667		return 0;
   668	}
   669	
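Presumably the lookups were meant to use `id` (the first CPU of cpu_map) rather
than the undeclared `cpu`. A hedged sketch of what the loop appears to intend,
assuming that, and assuming (as the original does) that cpu_map is a contiguous
CPU range. It also swaps the first/last endpoint test for a sharer count, since
`last == id + size` can never hold (the largest in-range cpu_id is id + size - 1):

int cpus_share_self_cache(const struct cpumask *cpu_map)
{
	int cache_level, cpu_id, nr_sharers;
	int id = cpumask_first(cpu_map);
	int size = cpumask_weight(cpu_map);

	for (cache_level = 0; cache_level < MAX_CACHE_LEVEL; cache_level++) {
		/* No cache node recorded at this level: DT info exhausted. */
		if (!cache_topology[id][cache_level])
			return -1;

		nr_sharers = 0;
		for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) {
			if (cache_topology[id][cache_level] !=
			    cache_topology[cpu_id][cache_level])
				continue;

			/* A sharer outside cpu_map: this level spans wider. */
			if (cpu_id < id || cpu_id >= id + size)
				return 0;

			nr_sharers++;
		}

		/* Exactly the CPUs of cpu_map share this cache level. */
		if (nr_sharers == size)
			return 1;
	}

	return 0;
}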

---
0-DAY CI Kernel Test Service
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
kernel test robot March 15, 2022, 1:24 p.m. UTC | #3
Hi Qing,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on arm64/for-next/core]
[also build test ERROR on driver-core/driver-core-testing tip/sched/core linus/master v5.17-rc8 next-20220310]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Qing-Wang/sched-dynamic-config-sd_flags-if-described-in-DT/20220315-160039
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git for-next/core
config: riscv-randconfig-r025-20220314 (https://download.01.org/0day-ci/archive/20220315/202203152145.vPr4Qv42-lkp@intel.com/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project a6b2f50fb47da3baeee10b1906da6e30ac5d26ec)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install riscv cross compiling tool for clang build
        # apt-get install binutils-riscv64-linux-gnu
        # https://github.com/0day-ci/linux/commit/3322560249c42b0a3e719b19842a3ace7d5ffb6a
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Qing-Wang/sched-dynamic-config-sd_flags-if-described-in-DT/20220315-160039
        git checkout 3322560249c42b0a3e719b19842a3ace7d5ffb6a
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=riscv SHELL=/bin/bash drivers/base/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> drivers/base/arch_topology.c:648:23: error: use of undeclared identifier 'cpu'
                   if (!cache_topology[cpu][cache_level])
                                       ^
   drivers/base/arch_topology.c:654:23: error: use of undeclared identifier 'cpu'
                           if (cache_topology[cpu][cache_level] == cache_topology[cpu_id][cache_level]) {
                                              ^
   2 errors generated.


vim +/cpu +648 drivers/base/arch_topology.c

   639	
   640	int cpus_share_self_cache(const struct cpumask *cpu_map)
   641	{
   642		int cache_level, cpu_id;
   643		int first, last;
   644		int id = cpumask_first(cpu_map);
   645		int size = cpumask_weight(cpu_map);
   646	
   647		for (cache_level = 0; cache_level < MAX_CACHE_LEVEL; cache_level++) {
 > 648			if (!cache_topology[cpu][cache_level])
   649				return -1;
   650	
   651			first = -1;
   652			last = id;
   653			for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) {
   654				if (cache_topology[cpu][cache_level] == cache_topology[cpu_id][cache_level]) {
   655					if (cpu_id < id || cpu_id >= id + size)
   656						return 0;
   657	
   658					first = (first == -1)?cpu_id:first;
   659					last = cpu_id;
   660				}
   661			}
   662	
   663			if (first == id && last == id + size)
   664				return 1;
   665		}
   666	
   667		return 0;
   668	}
   669	

---
0-DAY CI Kernel Test Service
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Dietmar Eggemann March 15, 2022, 5:18 p.m. UTC | #4
On 15/03/2022 08:58, Qing Wang wrote:
> From: Wang Qing <wangqing@vivo.com>

(1) Can you share more information about your CPU topology?

I guess it is a single DSU (DynamIQ Shared Unit) ARMv9 system with 8
CPUs? So L3 spans over [CPU0..CPU7].

You also mentioned complexes. Am I right in assuming that [CPU0..CPU3]
are Cortex-A510 cores where each 2 CPUs share a complex?

What kind of uarch are the CPUs in [CPU4..CPU7]? Are they Cortex-A510's
as well? I'm not sure after reading your email:

https://lkml.kernel.org/r/SL2PR06MB30828CF9FF2879AFC9DC53D2BD0C9@SL2PR06MB3082.apcprd06.prod.outlook.com

You might run into the issue that individual CPUs of your system see a
different SD hierarchy in case that [CPU4..CPU7] aren't Cortex-A510's,
i.e. CPUs not sharing complexes.

(2) Related to your MC Sched Domain (SD) layer:

If you have a single DSU ARMv9 system, then in Linux kernel mainline you
shouldn't have sub-clustering of [CPU0..CPU3] and [CPU4...CPU7].

I.e. the cpu-map entry in your dts file should only list cores, not
clusters.

I know that in Android the cluster entries are used to sub-group
different uarch CPUs in an asymmetric CPU capacity system (a.k.a. Arm
DynamIQ and Phantom domains) but this is eclipsing the true L3 (LLC)
information and is not "supported" (in the sense of "used") in mainline.

But I have a hard time seeing what [CPU0..CPU3] or [CPU4..CPU7] are
sharing in your system.

(3) Why do you want this different SD hierarchy?

I assume in mainline your system will have a single SD which is MC (w/o
the Phantom domain approach from Android).

You mentioned cpus_share_cache(). Or is it the extra SD level which
changes the behaviour of CFS load-balancing? I'm just wondering since
EAS wouldn't be affected here. I'm sure I can understand this better
once we know more about your CPU topology.

[...]
王擎 March 16, 2022, 2:46 a.m. UTC | #5
>(1) Can you share more information about your CPU topology?
>
>I guess it is a single DSU (DynamIQ Shared Unit) ARMv9 system with 8
>CPUs? So L3 spans over [CPU0..CPU7].
>
>You also mentioned complexes. Am I right in assuming that [CPU0..CPU3]
>are Cortex-A510 cores where each 2 CPUs share a complex?
>
>What kind of uarch are the CPUs in [CPU4..CPU7]? Are they Cortex-A510's
>as well? I'm not sure after reading your email:

Yes, Android systems currently use the default_domain with the wrong sd_flags;
take Qualcomm SM8450 as an example, with the following CPU and cache topology (1+3+4):
|                           DSU                            |
|           cluster0         |       cluster1     |cluster2|
| core0  core1  core2  core3 |  core4 core5 core6 | core7  |
|   complex0  |   complex1   |  ------------------------   |
|   L2 cache  |   L2 cache   |   L2  |  L2 |  L2  |   L2   |
|                         L3 cache                         |

The sched domain now:
DIE[0-7]  (no SD_SHARE_PKG_RESOURCES)
MC[0-3][4-6][7] (SD_SHARE_PKG_RESOURCES)

The sched domain should be:
DIE[0-7]  (SD_SHARE_PKG_RESOURCES)
MC[0-3][4-6][7] (no SD_SHARE_PKG_RESOURCES)
*CLS[0-1][2-3](SD_SHARE_PKG_RESOURCES)

>https://lkml.kernel.org/r/SL2PR06MB30828CF9FF2879AFC9DC53D2BD0C9@SL2PR06MB3082.apcprd06.prod.outlook.com
>
>You might run into the issue that individual CPUs of your system see a
>different SD hierarchy in case that [CPU4..CPU7] aren't Cortex-A510's,
>i.e. CPUs not sharing complexes.
>
>(2) Related to your MC Sched Domain (SD) layer:
>
>If you have a single DSU ARMv9 system, then in Linux kernel mainline you
>shouldn't have sub-clustering of [CPU0..CPU3] and [CPU4...CPU7].
>
>I.e. the cpu-map entry in your dts file should only list cores, not
>clusters.

But in fact we will, as mentioned above.
>
>I know that in Android the cluster entries are used to sub-group
>different uarch CPUs in an asymmetric CPU capacity system (a.k.a. Arm
>DynamIQ and Phantom domains) but this is eclipsing the true L3 (LLC)
>information and is not "supported" (in the sense of "used") in mainline.
>
>But I have a hard time seeing what [CPU0..CPU3] or [CPU4..CPU7] are
>sharing in your system.

They share the L3 cache, but not L2, which is only shared within a complex.
>
>(3) Why do you want this different SD hierarchy?
>
>I assume in mainline your system will have a single SD which is MC (w/o
>the Phantom domain approach from Android).
>
>You mentioned cpus_share_cache(). Or is it the extra SD level which
>changes the behaviour of CFS load-balancing? I'm just wondering since
>EAS wouldn't be affected here. I'm sure I can understand this better
>once we know more about your CPU topology.

What I want to do is:
1. Configure the right sd_llc for the sd, ideally derived dynamically from the DT.
2. Benefit from the shared (L2) cache of the complex,
i.e. when we look for an idle sibling CPU, prefer the CPU sharing L2 (within the complex) if L3 is shared by all.

Thanks,
Wang
Dietmar Eggemann March 17, 2022, 12:10 p.m. UTC | #6
On 16/03/2022 03:46, 王擎 wrote:
> 
>> (1) Can you share more information about your CPU topology?
>>
>> I guess it is a single DSU (DynamIQ Shared Unit) ARMv9 system with 8
>> CPUs? So L3 spans over [CPU0..CPU7].
>>
>> You also mentioned complexes. Am I right in assuming that [CPU0..CPU3]
>> are Cortex-A510 cores where each 2 CPUs share a complex?
>>
>> What kind of uarch are the CPUs in [CPU4..CPU7]? Are they Cortex-A510's
>> as well? I'm not sure after reading your email:
> 
> Yes, Android systems currently use the default_domain with the wrong sd_flags;
> take Qualcomm SM8450 as an example, with the following CPU and cache topology (1+3+4):

Ah, your system looks like this:

      .---------------.
CPU   |0 1 2 3 4 5 6 7|
      +---------------+
uarch |l l l l m m m b| (so called tri-gear: little, medium, big)
      +---------------+
  L2  |   |   | | | | |
      +---------------+
  L3  |<--         -->|
      +---------------+
      |<-- cluster -->|
      +---------------+
      |<--   DSU   -->|
      '---------------'

> |                           DSU                            |
> |           cluster0         |       cluster1     |cluster2|

^^^ Those aren't real clusters, hence the name <Phantom> SD. The cluster
is [CPU0...CPU7]. Android uses Phantom SD to subgroup CPUs with the same
uarch. That's why you get your MC->DIE SD's on your system and
SHARE_PKG_RESOURCES (ShPR) on MC, rather than DIE.

Note, you should already have an asymmetric SD hierarchy. CPU7 should
only have DIE not MC! Each CPU has its own SD hierarchy!

> | core0  core1  core2  core3 |  core4 core5 core6 | core7  |
> |   complex0  |   complex1   |  ------------------------   |
> |   L2 cache  |   L2 cache   |   L2  |  L2 |  L2  |   L2   |
> |                         L3 cache                         |
> 
> The sched domain now:
> DIE[0-7]  (no SD_SHARE_PKG_RESOURCES)
> MC[0-3][4-6][7] (SD_SHARE_PKG_RESOURCES)
> 
> The sched domain should be:
> DIE[0-7]  (SD_SHARE_PKG_RESOURCES)
> MC[0-3][4-6][7] (no SD_SHARE_PKG_RESOURCES)

First remember, using Phantom SD in Android is already a hack. Normally
your system should only have an MC SD for each CPU (with ShPR).

Now, if you want to move ShPR from MC to DIE then a custom topology
table should do it, i.e. you don't have to change any generic task
scheduler code.

static inline int cpu_cpu_flags(void)
{
       return SD_SHARE_PKG_RESOURCES;
}

static struct sched_domain_topology_level custom_topology[] = {
#ifdef CONFIG_SCHED_SMT
        { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif

#ifdef CONFIG_SCHED_CLUSTER
        { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
#endif

#ifdef CONFIG_SCHED_MC
        { cpu_coregroup_mask, SD_INIT_NAME(MC) },
                            ^^^^
#endif
        { cpu_cpu_mask, cpu_cpu_flags, SD_INIT_NAME(DIE) },
                        ^^^^^^^^^^^^^
        { NULL, },
};

set_sched_topology(custom_topology);

> *CLS[0-1][2-3](SD_SHARE_PKG_RESOURCES)

But why do you want to have yet another SD underneath MC for CPU0-CPU3?
sd_llc is assigned to the highest ShPR SD, which would be DIE.
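
For reference, mainline derives sd_llc roughly like this (a trimmed sketch of
update_top_cache_domain() in kernel/sched/topology.c; details vary by version):

static void update_top_cache_domain(int cpu)
{
	struct sched_domain *sd;
	int id = cpu;

	/* The highest domain with SD_SHARE_PKG_RESOURCES set becomes sd_llc. */
	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
	if (sd)
		id = cpumask_first(sched_domain_span(sd));

	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
	per_cpu(sd_llc_id, cpu) = id;
}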

>> https://lkml.kernel.org/r/SL2PR06MB30828CF9FF2879AFC9DC53D2BD0C9@SL2PR06MB3082.apcprd06.prod.outlook.com
>>
>> You might run into the issue that individual CPUs of your system see a
>> different SD hierarchy in case that [CPU4..CPU7] aren't Cortex-A510's,
>> i.e. CPUs not sharing complexes.
>>
>> (2) Related to your MC Sched Domain (SD) layer:
>>
>> If you have a single DSU ARMv9 system, then in Linux kernel mainline you
>> shouldn't have sub-clustering of [CPU0..CPU3] and [CPU4...CPU7].
>>
>> I.e. the cpu-map entry in your dts file should only list cores, not
>> clusters.
> 
> But in fact we will, as mentioned above.

OK, so your system needs this `fake` sub-grouping on uarch boundaries.
Probably because of (out-of-tree) Android/platform code? Have you tried
to run on a clean mainline SD hierarchy (only MC)? Is the Phantom SD
still required?

>> I know that in Android the cluster entries are used to sub-group
>> different uarch CPUs in an asymmetric CPU capacity system (a.k.a. Arm
>> DynamIQ and Phantom domains) but this is eclipsing the true L3 (LLC)
>> information and is not "supported" (in the sense of "used") in mainline.
>>
>> But I have a hard time seeing what [CPU0..CPU3] or [CPU4..CPU7] are
>> sharing in your system.
> 
> They share the L3 cache, but not L2, which is only shared within a complex.

I should have written: what do they share exclusively? I can only see
that they ([CPU0..CPU3], [CPU4..CPU6], [CPU7]) share a uarch exclusively,
which relates to this fake (Phantom) SD. L3 is shared between all CPUs.

>> (3) Why do you want this different SD hierarchy?
>>
>> I assume in mainline your system will have a single SD which is MC (w/o
>> the Phantom domain approach from Android).
>>
>> You mentioned cpus_share_cache(). Or is it the extra SD level which
>> changes the behaviour of CFS load-balancing? I'm just wondering since
>> EAS wouldn't be affected here. I'm sure I can understand this better
>> once we know more about your CPU topology.
> 
> What I want to do is:
> 1. Configure the right sd_llc for the sd, ideally derived dynamically from the DT.

So this should be ShPR on DIE. You have two choices here: use a flat MC SD
(mainline) or Phantom SDs + custom_topology.

> 2. Benefit from the shared (L2) cache of the complex,
> i.e. when we look for an idle sibling CPU, prefer the CPU sharing L2 (within the complex) if L3 is shared by all.

But cpus_share_cache() operates on sd_llc_id which relates to DIE SD
even for [CPU0..CPU3]?
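
For context, cpus_share_cache() in mainline (kernel/sched/core.c) is just a
comparison of the two CPUs' sd_llc_id:

bool cpus_share_cache(int this_cpu, int that_cpu)
{
	if (this_cpu == that_cpu)
		return true;

	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}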
王擎 March 23, 2022, 6:45 a.m. UTC | #7
>> 
>>> (1) Can you share more information about your CPU topology?
>>>
>>> I guess it is a single DSU (DynamIQ Shared Unit) ARMv9 system with 8
>>> CPUs? So L3 spans over [CPU0..CPU7].
>>>
>>> You also mentioned complexes. Am I right in assuming that [CPU0..CPU3]
>>> are Cortex-A510 cores where each 2 CPUs share a complex?
>>>
>>> What kind of uarch are the CPUs in [CPU4..CPU7]? Are they Cortex-A510's
>>> as well? I'm not sure after reading your email:
>> 
>> Yes, Android systems currently use the default_domain with the wrong sd_flags;
>> take Qualcomm SM8450 as an example, with the following CPU and cache topology (1+3+4):
>
>Ah, your system looks like this:
>
>      .---------------.
>CPU   |0 1 2 3 4 5 6 7|
>      +---------------+
>uarch |l l l l m m m b| (so called tri-gear: little, medium, big)
>      +---------------+
>  L2  |   |   | | | | |
>      +---------------+
>  L3  |<--         -->|
>      +---------------+
>      |<-- cluster -->|
>      +---------------+
>      |<--   DSU   -->|
>      '---------------'
>
>> |                           DSU                            |
>> |           cluster0         |       cluster1     |cluster2|
>
>^^^ Those aren't real clusters, hence the name <Phantom> SD. The cluster
>is [CPU0...CPU7]. Android uses Phantom SD to subgroup CPUs with the same
>uarch. That's why you get your MC->DIE SD's on your system and
>SHARE_PKG_RESOURCES (ShPR) on MC, rather than DIE.
>
>Note, you should already have an asymmetric SD hierarchy. CPU7 should
>only have DIE not MC! Each CPU has its own SD hierarchy!
>
>> | core0  core1  core2  core3 |  core4 core5 core6 | core7  |
>> |   complex0  |   complex1   |  ------------------------   |
>> |   L2 cache  |   L2 cache   |   L2  |  L2 |  L2  |   L2   |
>> |                         L3 cache                         |
>> 
>> The sched domain now:
>> DIE[0-7]  (no SD_SHARE_PKG_RESOURCES)
>> MC[0-3][4-6][7] (SD_SHARE_PKG_RESOURCES)
>> 
>> The sched domain should be:
>> DIE[0-7]  (SD_SHARE_PKG_RESOURCES)
>> MC[0-3][4-6][7] (no SD_SHARE_PKG_RESOURCES)
>
>First remember, using Phantom SD in Android is already a hack. Normally
>your system should only have an MC SD for each CPU (with ShPR).
>
>Now, if you want to move ShPR from MC to DIE then a custom topology
>table should do it, i.e. you don't have to change any generic task
>scheduler code.
>
>static inline int cpu_cpu_flags(void)
>{
>       return SD_SHARE_PKG_RESOURCES;
>}
>
>static struct sched_domain_topology_level custom_topology[] = {
>#ifdef CONFIG_SCHED_SMT
>        { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
>#endif
>
>#ifdef CONFIG_SCHED_CLUSTER
>        { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
>#endif
>
>#ifdef CONFIG_SCHED_MC
>        { cpu_coregroup_mask, SD_INIT_NAME(MC) },
>                            ^^^^
>#endif
>        { cpu_cpu_mask, cpu_cpu_flags, SD_INIT_NAME(DIE) },
>                        ^^^^^^^^^^^^^
>        { NULL, },
>};
>
>set_sched_topology(custom_topology);

However, due to the limitations of GKI, we cannot change the sd topology
ourselves. But we can configure the CPU and cache topology through the DT.

So why not get the ShPR from DT first? If not configured, use the default.
>
>> *CLS[0-1][2-3](SD_SHARE_PKG_RESOURCES)
>
>But why do you want to have yet another SD underneath MC for CPU0-CPU3?
>sd_llc is assigned to the highest ShPR SD, which would be DIE.

We want to do something with the shared L2 cache (for the complex, like WALT);
you can ignore it here, and we can talk about it once it's done.

Thanks,
Wang
Dietmar Eggemann March 25, 2022, 12:06 p.m. UTC | #8
On 23/03/2022 07:45, 王擎 wrote:

[...]

>> Now, if you want to move ShPR from MC to DIE then a custom topology
>> table should do it, i.e. you don't have to change any generic task
>> scheduler code.
>>
>> static inline int cpu_cpu_flags(void)
>> {
>>        return SD_SHARE_PKG_RESOURCES;
>> }
>>
>> static struct sched_domain_topology_level custom_topology[] = {
>> #ifdef CONFIG_SCHED_SMT
>>         { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
>> #endif
>>
>> #ifdef CONFIG_SCHED_CLUSTER
>>         { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
>> #endif
>>
>> #ifdef CONFIG_SCHED_MC
>>         { cpu_coregroup_mask, SD_INIT_NAME(MC) },
>>                             ^^^^
>> #endif
>>         { cpu_cpu_mask, cpu_cpu_flags, SD_INIT_NAME(DIE) },
>>                         ^^^^^^^^^^^^^
>>         { NULL, },
>> };
>>
>> set_sched_topology(custom_topology);
> 
> However, due to the limitations of GKI, we cannot change the sd topology
> ourselves. But we can configure the CPU and cache topology through the DT.

IMHO, mainline can't do anything here. You should talk to your Android
platform provider in this case. Android concepts like Generic Kernel
Image (GKI) are normally not discussed here.

From a mainline perspective we're OK with scheduling such a system flat,
e.g. with only a single MC SD [CPU0..CPU7] for each CPU.
It could be that the Phantom SD is still needed for additional
proprietary or Android add-ons though?

In case you removed `clusterX` from your DT cpu-map (the Phantom SD
information, i.e. the reason why you have e.g. for CPU0: `MC (ShPR)
[CPU0..CPU3] and DIE [CPU0..CPU7]`), you should see the natural
topology: only `MC (ShPR) [CPU0..CPU7]`.
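
For illustration, such a cores-only cpu-map could look like this (labels
invented; the cpu-map binding still wants a single clusterN container around
the cores):

	cpu-map {
		cluster0 {
			core0 { cpu = <&cpu0>; };
			core1 { cpu = <&cpu1>; };
			core2 { cpu = <&cpu2>; };
			core3 { cpu = <&cpu3>; };
			core4 { cpu = <&cpu4>; };
			core5 { cpu = <&cpu5>; };
			core6 { cpu = <&cpu6>; };
			core7 { cpu = <&cpu7>; };
		};
	};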

> So why not get the ShPR from DT first? If not configured, use the default.

I'm not convinced that mainline will accept a change which is necessary
for an out-of-tree tweak (Phantom SD).

>>> *CLS[0-1][2-3](SD_SHARE_PKG_RESOURCES)
>>
>> But why do you want to have yet another SD underneath MC for CPU0-CPU3?
>> sd_llc is assigned to the highest ShPR SD, which would be DIE.
> 
> We want to do something with the shared L2 cache (for the complex, like WALT);
> you can ignore it here, and we can talk about it once it's done.

I assume you refer to the proprietary load-tracking mechanism `Window
Assisted Load Tracking` (WALT) here? It's also not in mainline.

Patch

diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 73fc645..62bbd9a
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -508,6 +508,7 @@  void __init smp_prepare_cpus(unsigned int max_cpus)
 	unsigned int ncores = num_possible_cpus();
 
 	init_cpu_topology();
+	init_cpu_cache_topology();
 
 	smp_store_cpu_info(smp_processor_id());
 
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 27df5c1..94cf649
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -723,6 +723,7 @@  void __init smp_prepare_cpus(unsigned int max_cpus)
 	unsigned int this_cpu;
 
 	init_cpu_topology();
+	init_cpu_cache_topology();
 
 	this_cpu = smp_processor_id();
 	store_cpu_topology(this_cpu);
diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
index 622f226..4f5a8b7
--- a/arch/riscv/kernel/smpboot.c
+++ b/arch/riscv/kernel/smpboot.c
@@ -41,6 +41,7 @@  static DECLARE_COMPLETION(cpu_running);
 void __init smp_prepare_boot_cpu(void)
 {
 	init_cpu_topology();
+	init_cpu_cache_topology();
 #ifdef CONFIG_RISCV_ERRATA_ALTERNATIVE
 	apply_boot_alternatives();
 #endif
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 9761541..127f540
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -609,6 +609,65 @@  static int __init parse_dt_topology(void)
 #endif
 
 /*
+ * cpu cache topology table
+ */
+#define MAX_CACHE_LEVEL 7
+struct device_node *cache_topology[NR_CPUS][MAX_CACHE_LEVEL];
+
+void init_cpu_cache_topology(void)
+{
+	struct device_node *node_cpu, *node_cache;
+	int cpu, level;
+
+	for_each_possible_cpu(cpu) {
+		node_cpu = of_get_cpu_node(cpu, NULL);
+		if (!node_cpu)
+			continue;
+
+		level = 0;
+		node_cache = node_cpu;
+		while (level < MAX_CACHE_LEVEL) {
+			node_cache = of_parse_phandle(node_cache, "next-level-cache", 0);
+			if (!node_cache)
+				break;
+
+			cache_topology[cpu][level++] = node_cache;
+		}
+		of_node_put(node_cpu);
+	}
+}
+
+int cpus_share_self_cache(const struct cpumask *cpu_map)
+{
+	int cache_level, cpu_id;
+	int first, last;
+	int id = cpumask_first(cpu_map);
+	int size = cpumask_weight(cpu_map);
+
+	for (cache_level = 0; cache_level < MAX_CACHE_LEVEL; cache_level++) {
+		if (!cache_topology[cpu][cache_level])
+			return -1;
+
+		first = -1;
+		last = id;
+		for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) {
+			if (cache_topology[cpu][cache_level] == cache_topology[cpu_id][cache_level]) {
+				if (cpu_id < id || cpu_id >= id + size)
+					return 0;
+
+				first = (first == -1)?cpu_id:first;
+				last = cpu_id;
+			}
+		}
+
+		if (first == id && last == id + size)
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
  * cpu topology table
  */
 struct cpu_topology cpu_topology[NR_CPUS];
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index cce6136b..862e584
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -82,6 +82,8 @@  extern struct cpu_topology cpu_topology[NR_CPUS];
 #define topology_cluster_cpumask(cpu)	(&cpu_topology[cpu].cluster_sibling)
 #define topology_llc_cpumask(cpu)	(&cpu_topology[cpu].llc_sibling)
 void init_cpu_topology(void);
+void init_cpu_cache_topology(void);
+int cpus_share_self_cache(const struct cpumask *cpu_map);
 void store_cpu_topology(unsigned int cpuid);
 const struct cpumask *cpu_coregroup_mask(int cpu);
 const struct cpumask *cpu_clustergroup_mask(int cpu);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d201a70..8264e2d
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1527,6 +1527,7 @@  sd_init(struct sched_domain_topology_level *tl,
 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
 	int sd_id, sd_weight, sd_flags = 0;
 	struct cpumask *sd_span;
+	int ret;
 
 #ifdef CONFIG_NUMA
 	/*
@@ -1539,6 +1540,15 @@  sd_init(struct sched_domain_topology_level *tl,
 
 	if (tl->sd_flags)
 		sd_flags = (*tl->sd_flags)();
+
+#ifdef CONFIG_GENERIC_ARCH_TOPOLOGY
+	ret = cpus_share_self_cache(cpu_map);
+	if (ret == 1)
+		sd_flags |= SD_SHARE_PKG_RESOURCES;
+	else if (ret == 0)
+		sd_flags &= ~SD_SHARE_PKG_RESOURCES;
+#endif
+
 	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
 			"wrong sd_flags in topology description\n"))
 		sd_flags &= TOPOLOGY_SD_FLAGS;