
[v2,3/8] arch_topology: Set cluster identifier in each core/thread from /cpu-map

Message ID 20220518093325.2070336-4-sudeep.holla@arm.com (mailing list archive)
State New, archived
Series arch_topology: Updates to add socket support and fix cluster ids

Commit Message

Sudeep Holla May 18, 2022, 9:33 a.m. UTC
Let us set the cluster identifier as parsed from the device tree
cluster nodes within /cpu-map.

We don't support nesting of clusters yet, as there is no real hardware
with clusters of clusters.

Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/base/arch_topology.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)
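
For context, the field being filled in by this patch lives in the generic
topology structure in include/linux/arch_topology.h, which at this point
looks roughly like this (a paraphrased sketch, not verbatim from the tree):

struct cpu_topology {
	int thread_id;
	int core_id;
	int cluster_id;
	int package_id;
	int llc_id;
	cpumask_t thread_sibling;
	cpumask_t core_sibling;
	cpumask_t cluster_sibling;
	cpumask_t llc_sibling;
};

DT parsing already fills thread_id, core_id and package_id; this patch
starts filling cluster_id as well.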

Comments

Ionela Voinescu May 19, 2022, 4:55 p.m. UTC | #1
Hi,

As said before, this creates trouble for CONFIG_SCHED_CLUSTER=y.
The output below is obtained from Juno.

When cluster_id is populated, a new CLS level is created by the scheduler
topology code. In this case the clusters in DT determine that the cluster
siblings and llc siblings are the same so the MC scheduler domain will
be removed and, for Juno, only CLS and DIE will be kept.

root@debian-arm64-buster:/sys/kernel/debug/sched/domains/cpu1# grep . */*
domain0/busy_factor:16
domain0/cache_nice_tries:1
domain0/flags:SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_WAKE_AFFINE SD_SHARE_PKG_RESOURCES SD_PREFER_SIBLING
domain0/imbalance_pct:117
domain0/max_interval:4
domain0/max_newidle_lb_cost:14907
domain0/min_interval:2
domain0/name:CLS
domain1/busy_factor:16
domain1/cache_nice_tries:1
domain1/flags:SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_WAKE_AFFINE SD_ASYM_CPUCAPACITY SD_ASYM_CPUCAPACITY_FULL SD_PREFER_SIBLING
domain1/imbalance_pct:117
domain1/max_interval:12
domain1/max_newidle_lb_cost:11858
domain1/min_interval:6
domain1/name:DIE
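
For reference, the CLS level comes from the scheduler's default topology
table in kernel/sched/topology.c, which for this kernel generation looks
roughly as follows (a trimmed, non-verbatim sketch):

static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_CLUSTER
	{ cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
#endif
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};

With cluster_id populated on Juno, cpu_clustergroup_mask() and
cpu_coregroup_mask() end up returning the same span, so the MC level is
degenerated and only CLS and DIE survive, as in the dump above.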

To be noted that we also get a new flag SD_PREFER_SIBLING for the CLS
level that is not appropriate. We usually remove it for the child of a
SD_ASYM_CPUCAPACITY domain, but we don't currently redo this after some
levels are degenerated. This is a fixable issue.

But looking at the bigger picture, a good question is what is the best
thing to do when cluster domains and llc domains span the same CPUs?

Possibly it would be best to restrict clusters (which are almost an
arbitrary concept) to always span a subset of CPUs of the llc domain,
if llc siblings can be obtained? If those clusters are not properly set
up in DT to respect this condition, cluster_siblings would need to be
cleared (or set to the current CPU) so the CLS domain is not created at
all.
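
One possible shape for that restriction, sketched against the existing
arch_topology helpers (purely an illustration, not something this series
does; it narrows what the scheduler sees rather than clearing
cluster_sibling itself):

const struct cpumask *cpu_clustergroup_mask(int cpu)
{
	/*
	 * Hide the cluster span whenever it covers the whole coregroup
	 * (LLC) span or more, so no CLS domain is built for this CPU.
	 */
	if (cpumask_subset(cpu_coregroup_mask(cpu),
			   &cpu_topology[cpu].cluster_sibling))
		return get_cpu_mask(cpu);

	return &cpu_topology[cpu].cluster_sibling;
}

Clusters that are a strict subset of the LLC span would still produce a
CLS level below MC.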

Additionally, should we use cluster information from DT (cluster_id) to
create an MC level if we don't have llc information, even if
CONFIG_SCHED_CLUSTER=n?

I currently don't have a very clear picture of how cluster domains and
llc domains would "live" together in a variety of topologies. I'll try
other DT topologies to see if there are others that can lead to trouble.

Thanks,
Ionela.

On Wednesday 18 May 2022 at 10:33:20 (+0100), Sudeep Holla wrote:
> Let us set the cluster identifier as parsed from the device tree
> cluster nodes within /cpu-map.
> 
> We don't support nesting of clusters yet, as there is no real hardware
> with clusters of clusters.
> 
> Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
> ---
>  drivers/base/arch_topology.c | 13 ++++++++-----
>  1 file changed, 8 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
> index 7f5aa655c1f4..bdb6f2a17df0 100644
> --- a/drivers/base/arch_topology.c
> +++ b/drivers/base/arch_topology.c
> @@ -491,7 +491,7 @@ static int __init get_cpu_for_node(struct device_node *node)
>  }
>  
>  static int __init parse_core(struct device_node *core, int package_id,
> -			     int core_id)
> +			     int cluster_id, int core_id)
>  {
>  	char name[20];
>  	bool leaf = true;
> @@ -507,6 +507,7 @@ static int __init parse_core(struct device_node *core, int package_id,
>  			cpu = get_cpu_for_node(t);
>  			if (cpu >= 0) {
>  				cpu_topology[cpu].package_id = package_id;
> +				cpu_topology[cpu].cluster_id = cluster_id;
>  				cpu_topology[cpu].core_id = core_id;
>  				cpu_topology[cpu].thread_id = i;
>  			} else if (cpu != -ENODEV) {
> @@ -528,6 +529,7 @@ static int __init parse_core(struct device_node *core, int package_id,
>  		}
>  
>  		cpu_topology[cpu].package_id = package_id;
> +		cpu_topology[cpu].cluster_id = cluster_id;
>  		cpu_topology[cpu].core_id = core_id;
>  	} else if (leaf && cpu != -ENODEV) {
>  		pr_err("%pOF: Can't get CPU for leaf core\n", core);
> @@ -537,7 +539,8 @@ static int __init parse_core(struct device_node *core, int package_id,
>  	return 0;
>  }
>  
> -static int __init parse_cluster(struct device_node *cluster, int depth)
> +static int __init
> +parse_cluster(struct device_node *cluster, int cluster_id, int depth)
>  {
>  	char name[20];
>  	bool leaf = true;
> @@ -557,7 +560,7 @@ static int __init parse_cluster(struct device_node *cluster, int depth)
>  		c = of_get_child_by_name(cluster, name);
>  		if (c) {
>  			leaf = false;
> -			ret = parse_cluster(c, depth + 1);
> +			ret = parse_cluster(c, i, depth + 1);
>  			of_node_put(c);
>  			if (ret != 0)
>  				return ret;
> @@ -581,7 +584,7 @@ static int __init parse_cluster(struct device_node *cluster, int depth)
>  			}
>  
>  			if (leaf) {
> -				ret = parse_core(c, 0, core_id++);
> +				ret = parse_core(c, 0, cluster_id, core_id++);
>  			} else {
>  				pr_err("%pOF: Non-leaf cluster with core %s\n",
>  				       cluster, name);
> @@ -621,7 +624,7 @@ static int __init parse_dt_topology(void)
>  	if (!map)
>  		goto out;
>  
> -	ret = parse_cluster(map, 0);
> +	ret = parse_cluster(map, -1, 0);
>  	if (ret != 0)
>  		goto out_map;
>  
> -- 
> 2.36.1
> 
>
Dietmar Eggemann May 20, 2022, 12:33 p.m. UTC | #2
On 19/05/2022 18:55, Ionela Voinescu wrote:
> Hi,
> 
> As said before, this creates trouble for CONFIG_SCHED_CLUSTER=y.
> The output below is obtained from Juno.
> 
> When cluster_id is populated, a new CLS level is created by the scheduler
> topology code. In this case the clusters in DT determine that the cluster
> siblings and llc siblings are the same so the MC scheduler domain will
> be removed and, for Juno, only CLS and DIE will be kept.

[...]

> To be noted that we also get a new flag SD_PREFER_SIBLING for the CLS
> level that is not appropriate. We usually remove it for the child of a
> SD_ASYM_CPUCAPACITY domain, but we don't currently redo this after some
> levels are degenerated. This is a fixable issue.
> 
> But looking at the bigger picture, a good question is what is the best
> thing to do when cluster domains and llc domains span the same CPUs?
> 
> Possibly it would be best to restrict clusters (which are almost an
> arbitrary concept) to always span a subset of CPUs of the llc domain,
> if llc siblings can be obtained? If those clusters are not properly set
> up in DT to respect this condition, cluster_siblings would need to be
> cleared (or set to the current CPU) so the CLS domain is not created at
> all.
> 
> Additionally, should we use cluster information from DT (cluster_id) to
> create an MC level if we don't have llc information, even if
> CONFIG_SCHED_CLUSTER=n?
> 
> I currently don't have a very clear picture of how cluster domains and
> llc domains would "live" together in a variety of topologies. I'll try
> other DT topologies to see if there are others that can lead to trouble.

This would be an issue. Depending on CONFIG_SCHED_CLUSTER we would get
two different systems from the viewpoint of the scheduler.

To me `cluster_id/_sibling` don't describe a certain level of CPU
grouping (e.g. one level above core or one level below package).

They were introduced to describe one level below LLC (e.g. Kunpeng920 L3
(24 CPUs LLC) -> L3 tag (4 CPUs) or x86 Jacobsville L3 -> L2), (Commit
                 ^^^^^^                                   ^^
c5e22feffdd7 ("topology: Represent clusters of CPUs within a die")).

The Ampere Altra issue already gave us a taste of the possible issues of
this definition, commit db1e59483dfd ("topology: make core_mask include
at least cluster_siblings").

If we link `cluster_id/_sibling` to the first-level cpu-map cluster
nodes while also using llc information, then with `cluster_sibling >=
llc_sibling` we will run into these issues.
Sudeep Holla May 20, 2022, 1:54 p.m. UTC | #3
On Fri, May 20, 2022 at 02:33:19PM +0200, Dietmar Eggemann wrote:
> On 19/05/2022 18:55, Ionela Voinescu wrote:
> > Hi,
> > 
> > As said before, this creates trouble for CONFIG_SCHED_CLUSTER=y.
> > The output below is obtained from Juno.
> > 
> > When cluster_id is populated, a new CLS level is created by the scheduler
> > topology code. In this case the clusters in DT determine that the cluster
> > siblings and llc siblings are the same so the MC scheduler domain will
> > be removed and, for Juno, only CLS and DIE will be kept.
> 
> [...]
> 
> > To be noted that we also get a new flag SD_PREFER_SIBLING for the CLS
> > level that is not appropriate. We usually remove it for the child of a
> > SD_ASYM_CPUCAPACITY domain, but we don't currently redo this after some
> > levels are degenerated. This is a fixable issue.
> > 
> > But looking at the bigger picture, a good question is what is the best
> > thing to do when cluster domains and llc domains span the same CPUs?
> > 
> > Possibly it would be best to restrict clusters (which are almost an
> > arbitrary concept) to always span a subset of CPUs of the llc domain,
> > if llc siblings can be obtained? If those clusters are not properly set
> > up in DT to respect this condition, cluster_siblings would need to be
> > cleared (or set to the current CPU) so the CLS domain is not created at
> > all.
> > 
> > Additionally, should we use cluster information from DT (cluster_id) to
> > create an MC level if we don't have llc information, even if
> > CONFIG_SCHED_CLUSTER=n?
> > 
> > I currently don't have a very clear picture of how cluster domains and
> > llc domains would "live" together in a variety of topologies. I'll try
> > other DT topologies to see if there are others that can lead to trouble.
> 
> This would be an issue. Depending on CONFIG_SCHED_CLUSTER we would get
> two different systems from the viewpoint of the scheduler.
>

Agreed, but that is the issue with the change that updates cpu_coregroup_mask
based on CONFIG_SCHED_CLUSTER, the one we added recently for Ampere
systems. Sorry, I admit I was OK with the workaround then, but all
these discussions have made me disagree with that change now.

> To me `cluster_id/_sibling` don't describe a certain level of CPU
> grouping (e.g. one level above core or one level below package).
>

True, but based on how it is extracted from the firmware today (ACPI, and DT
with this series), it is one level above the cores.

> They were introduced to describe one level below LLC (e.g. Kunpeng920 L3
> (24 CPUs LLC) -> L3 tag (4 CPUs) or x86 Jacobsville L3 -> L2), (Commit
>                  ^^^^^^                                   ^^
> c5e22feffdd7 ("topology: Represent clusters of CPUs within a die")).
>

Again correct, but the sysfs description is what we need to rely on from now
on. If that is not clear, we need to make it clear. But I am not really
concerned about how it relates to the LLC, as that is known to vary across
different systems.

> The Ampere Altra issue already gave us a taste of the possible issues of
> this definition, commit db1e59483dfd ("topology: make core_mask include
> at least cluster_siblings").
>

Yes, this is the one I am referring to above. I tend to disagree with that now.

> If we link `cluster_id/_sibling` to the first-level cpu-map cluster
> nodes while also using llc information, then with `cluster_sibling >=
> llc_sibling` we will run into these issues.

As I said, you can't change the topology masks just because they cause issues
in the way we build sched_domains. You need to figure out how to build sched
domains for such systems. If the current set of domains, or the way the masks
for each of the domains are derived, is insufficient or incorrect, we need to
fix that. We are not changing the topology masks for that, whatever the reason
might be, sorry, as these are visible to userspace.

--
Regards,
Sudeep
Sudeep Holla May 20, 2022, 3:27 p.m. UTC | #4
On Thu, May 19, 2022 at 05:55:30PM +0100, Ionela Voinescu wrote:
> Hi,
> 
> As said before, this creates trouble for CONFIG_SCHED_CLUSTER=y.
> The output below is obtained from Juno.
> 
> When cluster_id is populated, a new CLS level is created by the scheduler
> topology code. In this case the clusters in DT determine that the cluster
> siblings and llc siblings are the same so the MC scheduler domain will
> be removed and, for Juno, only CLS and DIE will be kept.
>

Yes I have seen that. 

1. Will that differ with ACPI on Juno?

2. Is that a problem? I mean, we are fixing the user-visible masks with this
series, and if using them as-is in the sched domains is incorrect or not
sufficient, we need to fix that. We can't change these masks.

> root@debian-arm64-buster:/sys/kernel/debug/sched/domains/cpu1# grep . */*
> domain0/busy_factor:16
> domain0/cache_nice_tries:1
> domain0/flags:SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_WAKE_AFFINE SD_SHARE_PKG_RESOURCES SD_PREFER_SIBLING
> domain0/imbalance_pct:117
> domain0/max_interval:4
> domain0/max_newidle_lb_cost:14907
> domain0/min_interval:2
> domain0/name:CLS
> domain1/busy_factor:16
> domain1/cache_nice_tries:1
> domain1/flags:SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_WAKE_AFFINE SD_ASYM_CPUCAPACITY SD_ASYM_CPUCAPACITY_FULL SD_PREFER_SIBLING
> domain1/imbalance_pct:117
> domain1/max_interval:12
> domain1/max_newidle_lb_cost:11858
> domain1/min_interval:6
> domain1/name:DIE
> 
> To be noted that we also get a new flag SD_PREFER_SIBLING for the CLS
> level that is not appropriate. We usually remove it for the child of a
> SD_ASYM_CPUCAPACITY domain, but we don't currently redo this after some
> levels are degenerated. This is a fixable issue.
>

OK good.

> But looking at the bigger picture, a good question is what is the best
> thing to do when cluster domains and llc domains span the same CPUs?
>

Indeed, I will leave that to the scheduler experts.

Patch

diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 7f5aa655c1f4..bdb6f2a17df0 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -491,7 +491,7 @@  static int __init get_cpu_for_node(struct device_node *node)
 }
 
 static int __init parse_core(struct device_node *core, int package_id,
-			     int core_id)
+			     int cluster_id, int core_id)
 {
 	char name[20];
 	bool leaf = true;
@@ -507,6 +507,7 @@  static int __init parse_core(struct device_node *core, int package_id,
 			cpu = get_cpu_for_node(t);
 			if (cpu >= 0) {
 				cpu_topology[cpu].package_id = package_id;
+				cpu_topology[cpu].cluster_id = cluster_id;
 				cpu_topology[cpu].core_id = core_id;
 				cpu_topology[cpu].thread_id = i;
 			} else if (cpu != -ENODEV) {
@@ -528,6 +529,7 @@  static int __init parse_core(struct device_node *core, int package_id,
 		}
 
 		cpu_topology[cpu].package_id = package_id;
+		cpu_topology[cpu].cluster_id = cluster_id;
 		cpu_topology[cpu].core_id = core_id;
 	} else if (leaf && cpu != -ENODEV) {
 		pr_err("%pOF: Can't get CPU for leaf core\n", core);
@@ -537,7 +539,8 @@  static int __init parse_core(struct device_node *core, int package_id,
 	return 0;
 }
 
-static int __init parse_cluster(struct device_node *cluster, int depth)
+static int __init
+parse_cluster(struct device_node *cluster, int cluster_id, int depth)
 {
 	char name[20];
 	bool leaf = true;
@@ -557,7 +560,7 @@  static int __init parse_cluster(struct device_node *cluster, int depth)
 		c = of_get_child_by_name(cluster, name);
 		if (c) {
 			leaf = false;
-			ret = parse_cluster(c, depth + 1);
+			ret = parse_cluster(c, i, depth + 1);
 			of_node_put(c);
 			if (ret != 0)
 				return ret;
@@ -581,7 +584,7 @@  static int __init parse_cluster(struct device_node *cluster, int depth)
 			}
 
 			if (leaf) {
-				ret = parse_core(c, 0, core_id++);
+				ret = parse_core(c, 0, cluster_id, core_id++);
 			} else {
 				pr_err("%pOF: Non-leaf cluster with core %s\n",
 				       cluster, name);
@@ -621,7 +624,7 @@  static int __init parse_dt_topology(void)
 	if (!map)
 		goto out;
 
-	ret = parse_cluster(map, 0);
+	ret = parse_cluster(map, -1, 0);
 	if (ret != 0)
 		goto out_map;