
[v6,05/14] sched/topology: Reference the Energy Model of CPUs when available

Message ID 20180820094420.26590-6-quentin.perret@arm.com (mailing list archive)
State Changes Requested, archived
Series Energy Aware Scheduling

Commit Message

Quentin Perret Aug. 20, 2018, 9:44 a.m. UTC
The existing scheduling domain hierarchy is defined to map to the cache
topology of the system. However, Energy Aware Scheduling (EAS) requires
more knowledge about the platform, and specifically needs to know about
the span of Performance Domains (PD), which do not always align with
caches.

To address this issue, use the Energy Model (EM) of the system to extend
the scheduler topology code with a representation of the PDs, alongside
the scheduling domains. More specifically, a linked list of PDs is
attached to each root domain. When multiple root domains are in use,
each list contains only the PDs covering the CPUs of its root domain. If
a PD spans over CPUs of two different root domains, it will be
duplicated in both lists.

The lists are fully maintained by the scheduler from
partition_sched_domains() in order to cope with hotplug and cpuset
changes. As with scheduling domains, the lists are protected by RCU to
ensure safe concurrent updates.

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Quentin Perret <quentin.perret@arm.com>
---
 kernel/sched/sched.h    |  21 +++++++
 kernel/sched/topology.c | 134 ++++++++++++++++++++++++++++++++++++++--
 2 files changed, 151 insertions(+), 4 deletions(-)

Comments

Patrick Bellasi Aug. 29, 2018, 4:22 p.m. UTC | #1
Hi Quentin,
likely the organization and naming in this file would benefit from
some lovely refactoring and cleanup... but of course that's outside
the scope of this patch.

Still, maybe we can improve its current state a bit based on the
following comments?

On 20-Aug 10:44, Quentin Perret wrote:
> The existing scheduling domain hierarchy is defined to map to the cache
> topology of the system. However, Energy Aware Scheduling (EAS) requires
> more knowledge about the platform, and specifically needs to know about
> the span of Performance Domains (PD), which do not always align with
> caches.
> 
> To address this issue, use the Energy Model (EM) of the system to extend
> the scheduler topology code with a representation of the PDs, alongside
> the scheduling domains. More specifically, a linked list of PDs is
> attached to each root domain. When multiple root domains are in use,
> each list contains only the PDs covering the CPUs of its root domain. If
> a PD spans over CPUs of two different root domains, it will be
> duplicated in both lists.
> 
> The lists are fully maintained by the scheduler from
> partition_sched_domains() in order to cope with hotplug and cpuset
> changes. As with scheduling domains, the lists are protected by RCU to
> ensure safe concurrent updates.
> 
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Quentin Perret <quentin.perret@arm.com>
> ---
>  kernel/sched/sched.h    |  21 +++++++
>  kernel/sched/topology.c | 134 ++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 151 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 481e6759441b..97d79429e755 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -44,6 +44,7 @@
>  #include <linux/ctype.h>
>  #include <linux/debugfs.h>
>  #include <linux/delayacct.h>
> +#include <linux/energy_model.h>
>  #include <linux/init_task.h>
>  #include <linux/kprobes.h>
>  #include <linux/kthread.h>
> @@ -700,6 +701,12 @@ static inline bool sched_asym_prefer(int a, int b)
>  	return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
>  }
>  
> +struct perf_domain {
> +	struct em_perf_domain *obj;
> +	struct perf_domain *next;
> +	struct rcu_head rcu;
> +};
> +
>  /*
>   * We add the notion of a root-domain which will be used to define per-domain
>   * variables. Each exclusive cpuset essentially defines an island domain by
> @@ -752,6 +759,12 @@ struct root_domain {
>  	struct cpupri		cpupri;
>  
>  	unsigned long		max_cpu_capacity;
> +
> +	/*
> +	 * NULL-terminated list of performance domains intersecting with the
> +	 * CPUs of the rd. Protected by RCU.
> +	 */
> +	struct perf_domain	*pd;
>  };
>  
>  extern struct root_domain def_root_domain;
> @@ -2228,3 +2241,11 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
>  	return util;
>  }
>  #endif
> +
> +#ifdef CONFIG_SMP
> +#ifdef CONFIG_ENERGY_MODEL
> +#define perf_domain_span(pd) (to_cpumask(((pd)->obj->cpus)))
> +#else
> +#define perf_domain_span(pd) NULL
> +#endif
> +#endif
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index c4444ed77a55..545e2c7b6e66 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -201,6 +201,116 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
>  	return 1;
>  }
>  
> +#ifdef CONFIG_ENERGY_MODEL
> +static void free_pd(struct perf_domain *pd)
> +{
> +	struct perf_domain *tmp;
> +
> +	while (pd) {
> +		tmp = pd->next;
> +		kfree(pd);
> +		pd = tmp;
> +	}
> +}
> +
> +static struct perf_domain *find_pd(struct perf_domain *pd, int cpu)
> +{
> +	while (pd) {
> +		if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
> +			return pd;
> +		pd = pd->next;
> +	}
> +
> +	return NULL;
> +}
> +
> +static struct perf_domain *pd_init(int cpu)
> +{
> +	struct em_perf_domain *obj = em_cpu_get(cpu);
> +	struct perf_domain *pd;
> +
> +	if (!obj) {
> +		if (sched_debug())
> +			pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
> +		return NULL;
> +	}
> +
> +	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
> +	if (!pd)
> +		return NULL;
> +	pd->obj = obj;
> +
> +	return pd;
> +}
> +
> +static void perf_domain_debug(const struct cpumask *cpu_map,
> +						struct perf_domain *pd)
> +{
> +	if (!sched_debug() || !pd)
> +		return;
> +
> +	printk(KERN_DEBUG "root_domain %*pbl: ", cpumask_pr_args(cpu_map));
> +
> +	while (pd) {
> +		printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
> +				cpumask_first(perf_domain_span(pd)),
> +				cpumask_pr_args(perf_domain_span(pd)),
> +				em_pd_nr_cap_states(pd->obj));
> +		pd = pd->next;
> +	}
> +
> +	printk(KERN_CONT "\n");
> +}
> +
> +static void destroy_perf_domain_rcu(struct rcu_head *rp)
> +{
> +	struct perf_domain *pd;
> +
> +	pd = container_of(rp, struct perf_domain, rcu);
> +	free_pd(pd);
> +}
> +
> +static void build_perf_domains(const struct cpumask *cpu_map)
> +{
> +	struct perf_domain *pd = NULL, *tmp;
> +	int cpu = cpumask_first(cpu_map);
> +	struct root_domain *rd = cpu_rq(cpu)->rd;
> +	int i;
> +
> +	for_each_cpu(i, cpu_map) {
> +		/* Skip already covered CPUs. */
> +		if (find_pd(pd, i))
> +			continue;
> +
> +		/* Create the new pd and add it to the local list. */
> +		tmp = pd_init(i);
> +		if (!tmp)
> +			goto free;
> +		tmp->next = pd;
> +		pd = tmp;
> +	}
> +
> +	perf_domain_debug(cpu_map, pd);
> +
> +	/* Attach the new list of performance domains to the root domain. */
> +	tmp = rd->pd;
> +	rcu_assign_pointer(rd->pd, pd);
> +	if (tmp)
> +		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);

We have:

  sched_cpu_activate/cpuset_cpu_inactive
    cpuset_cpu_active/sched_cpu_deactivate
      partition_sched_domains
        build_perf_domains

thus here we are building new SDs and, specifically, above we are
attaching the local list "pd" to a _new_ root domain... so there
cannot already be users of these new SDs and this root domain at this
stage, can there?

Do we really need that rcu_assign_pointer ?
Is the rcu_assign_pointer there just to "match" the following call_rcu ?

What about this path:

  sched_init_domains
     partition_sched_domains

in which case we do not call build_perf_domains... is that intended ?

> +
> +	return;
> +
> +free:
> +	free_pd(pd);
> +	tmp = rd->pd;
> +	rcu_assign_pointer(rd->pd, NULL);
> +	if (tmp)
> +		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
> +}

All the above functions use different naming conventions:

   "_pd" suffix, "pd_" prefix and "perf_domain_" prefix.

and you do it like that because it better matches the corresponding
call sites further down the file.

However, since we are in a "CONFIG_ENERGY_MODEL" guarded section,
why not start using a common prefix for all PD related functions?

I really like "perf_domain_" (or "pd_") as a prefix and I would try to
use it for all the functions you defined above:

   perf_domain_free
   perf_domain_find
   perf_domain_debug
   perf_domain_destroy_rcu
   perf_domain_build

> +#else
> +static void free_pd(struct perf_domain *pd) { }
> +#endif

Maybe better:

  #endif /* CONFIG_ENERGY_MODEL */

> +
>  static void free_rootdomain(struct rcu_head *rcu)
>  {
>  	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
> @@ -211,6 +321,7 @@ static void free_rootdomain(struct rcu_head *rcu)
>  	free_cpumask_var(rd->rto_mask);
>  	free_cpumask_var(rd->online);
>  	free_cpumask_var(rd->span);
> +	free_pd(rd->pd);
>  	kfree(rd);
>  }
>  
> @@ -1964,8 +2075,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
>  	/* Destroy deleted domains: */
>  	for (i = 0; i < ndoms_cur; i++) {
>  		for (j = 0; j < n && !new_topology; j++) {
> -			if (cpumask_equal(doms_cur[i], doms_new[j])
> -			    && dattrs_equal(dattr_cur, i, dattr_new, j))
> +			if (cpumask_equal(doms_cur[i], doms_new[j]) &&
> +			    dattrs_equal(dattr_cur, i, dattr_new, j))

This chunk looks more like a cleanup which is not really changing
anything: is it intentional?

>  				goto match1;
>  		}
>  		/* No match - a current sched domain not in new doms_new[] */
> @@ -1985,8 +2096,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
>  	/* Build new domains: */
>  	for (i = 0; i < ndoms_new; i++) {
>  		for (j = 0; j < n && !new_topology; j++) {
> -			if (cpumask_equal(doms_new[i], doms_cur[j])
> -			    && dattrs_equal(dattr_new, i, dattr_cur, j))
> +			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
> +			    dattrs_equal(dattr_new, i, dattr_cur, j))


Same comment for the chunk above

>  				goto match2;
>  		}
>  		/* No match - add a new doms_new */
> @@ -1995,6 +2106,21 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
>  		;
>  	}
>  
> +#ifdef CONFIG_ENERGY_MODEL
> +	/* Build perf. domains: */
> +	for (i = 0; i < ndoms_new; i++) {
> +		for (j = 0; j < n; j++) {
> +			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
> +			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd)
> +				goto match3;
> +		}
> +		/* No match - add perf. domains for a new rd */
> +		build_perf_domains(doms_new[i]);
> +match3:
> +		;
> +	}
> +#endif
> +


Since we already have a CONFIG_ENERGY_MODEL guarded section above,
maybe we can s/build_perf_domains/build_perf_root_domain/ and use
build_perf_domains to provide an inline function for the chunk above
in the guarded section at the beginning of the file ?

The #else section will then provide just an empty implementation.

Something like the diff below seems to work, and it should do the
"cleanup" job by also moving the definition of the global variables
(required by some functions) to the beginning of the source file.

Perhaps that's a bit of cleanup code that the maintainer can accept...
but... to be verified. ;)

---8<---
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 2b6df8edca2a..647667b89180 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -10,6 +10,22 @@ DEFINE_MUTEX(sched_domains_mutex);
 cpumask_var_t sched_domains_tmpmask;
 cpumask_var_t sched_domains_tmpmask2;
 
+/* Current sched domains: */
+static cpumask_var_t			*doms_cur;
+
+/* Number of sched domains in 'doms_cur': */
+static int				ndoms_cur;
+
+/* Attribues of custom domains in 'doms_cur' */
+static struct sched_domain_attr		*dattr_cur;
+
+/*
+ * Special case: If a kmalloc() of a doms_cur partition (array of
+ * cpumask) fails, then fallback to a single sched domain,
+ * as determined by the single cpumask fallback_doms.
+ */
+static cpumask_var_t			fallback_doms;
+
 #ifdef CONFIG_SCHED_DEBUG
 
 static int __init sched_debug_setup(char *str)
@@ -294,7 +310,7 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp)
 #define EM_MAX_COMPLEXITY 2048
 
 extern struct cpufreq_governor schedutil_gov;
-static void build_perf_domains(const struct cpumask *cpu_map)
+static void build_perf_root_domain(const struct cpumask *cpu_map)
 {
 	int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
 	struct perf_domain *pd = NULL, *tmp;
@@ -395,9 +411,30 @@ static void sched_energy_start(int ndoms_new, cpumask_var_t doms_new[])
 		static_branch_enable_cpuslocked(&sched_energy_present);
 	}
 }
+
+static void build_perf_domains(int ndoms_new, cpumask_var_t doms_new[])
+{
+	int i, j;
+
+	/* Build perf. domains: */
+	for (i = 0; i < ndoms_new; i++) {
+		for (j = 0; j < ndoms_cur && !sched_energy_update; j++) {
+			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
+			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd)
+				goto done;
+		}
+		build_perf_root_domain(doms_new[i]);
+done:
+		;
+	}
+	sched_energy_start(ndoms_new, doms_new);
+}
+
 #else
 static void free_pd(struct perf_domain *pd) { }
-#endif
+static void build_perf_domains(int ndoms_new, cpumask_var_t doms_new[]) { }
+#endif /* CONFIG_ENERGY_MODEL */
 
 static void free_rootdomain(struct rcu_head *rcu)
 {
@@ -2004,22 +2041,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	return ret;
 }
 
-/* Current sched domains: */
-static cpumask_var_t			*doms_cur;
-
-/* Number of sched domains in 'doms_cur': */
-static int				ndoms_cur;
-
-/* Attribues of custom domains in 'doms_cur' */
-static struct sched_domain_attr		*dattr_cur;
-
-/*
- * Special case: If a kmalloc() of a doms_cur partition (array of
- * cpumask) fails, then fallback to a single sched domain,
- * as determined by the single cpumask fallback_doms.
- */
-static cpumask_var_t			fallback_doms;
-
 /*
  * arch_update_cpu_topology lets virtualized architectures update the
  * CPU core maps. It is supposed to return 1 if the topology changed
@@ -2198,21 +2219,7 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 		;
 	}
 
-#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
-	/* Build perf. domains: */
-	for (i = 0; i < ndoms_new; i++) {
-		for (j = 0; j < n && !sched_energy_update; j++) {
-			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
-			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd)
-				goto match3;
-		}
-		/* No match - add perf. domains for a new rd */
-		build_perf_domains(doms_new[i]);
-match3:
-		;
-	}
-	sched_energy_start(ndoms_new, doms_new);
-#endif
+	build_perf_domains(ndoms_new, doms_new);
 
 	/* Remember the new sched domains: */
 	if (doms_cur != &fallback_doms)
---8<---



>  	/* Remember the new sched domains: */
>  	if (doms_cur != &fallback_doms)
>  		free_sched_domains(doms_cur, ndoms_cur);
> -- 
> 2.17.1
>
Quentin Perret Aug. 29, 2018, 4:56 p.m. UTC | #2
On Wednesday 29 Aug 2018 at 17:22:38 (+0100), Patrick Bellasi wrote:
> > +static void build_perf_domains(const struct cpumask *cpu_map)
> > +{
> > +	struct perf_domain *pd = NULL, *tmp;
> > +	int cpu = cpumask_first(cpu_map);
> > +	struct root_domain *rd = cpu_rq(cpu)->rd;
> > +	int i;
> > +
> > +	for_each_cpu(i, cpu_map) {
> > +		/* Skip already covered CPUs. */
> > +		if (find_pd(pd, i))
> > +			continue;
> > +
> > +		/* Create the new pd and add it to the local list. */
> > +		tmp = pd_init(i);
> > +		if (!tmp)
> > +			goto free;
> > +		tmp->next = pd;
> > +		pd = tmp;
> > +	}
> > +
> > +	perf_domain_debug(cpu_map, pd);
> > +
> > +	/* Attach the new list of performance domains to the root domain. */
> > +	tmp = rd->pd;
> > +	rcu_assign_pointer(rd->pd, pd);
> > +	if (tmp)
> > +		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
> 
> We have:
> 
>   sched_cpu_activate/cpuset_cpu_inactive
>     cpuset_cpu_active/sched_cpu_deactivate
>       partition_sched_domains
>         build_perf_domains
> 
> thus here we are building new SDs and, specifically, above we are
> attaching the local list "pd" to a _new_ root domain... so there
> cannot already be users of these new SDs and this root domain at this
> stage, can there?

Hmm, actually you can end up here even if the rd isn't new. That would
happen if you call rebuild_sched_domains() after the EM has been
registered for example. At this point, you might skip
detach_destroy_domains() and build_sched_domains() from
partition_sched_domains(), but still call build_perf_domains(), which
would then attach the pd list to the current rd.

That's one reason why rcu_assign_pointer() is probably a good idea. And
it's also nice from a doc standpoint I suppose.
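
To illustrate (sketch only, not part of the patch; the reader side is
hypothetical here, nothing in this patch reads the list yet), the
pairing that needs to stay correct when a reader can run concurrently
is:

	/* Updater, e.g. build_perf_domains(), under sched_domains_mutex: */
	tmp = rd->pd;
	rcu_assign_pointer(rd->pd, pd);	/* publish the fully built list */
	if (tmp)
		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);

	/* Reader, possibly on another CPU: */
	rcu_read_lock();
	for (pd = rcu_dereference(rd->pd); pd; pd = pd->next) {
		if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
			break;	/* found the pd covering 'cpu' */
	}
	rcu_read_unlock();

rcu_assign_pointer() guarantees the list nodes are fully initialized
before the new pointer becomes visible, and rcu_dereference() orders
the dependent loads on the reader side.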

> 
> Do we really need that rcu_assign_pointer ?
> Is the rcu_assign_pointer there just to "match" the following call_rcu ?
> 
> What about this path:
> 
>   sched_init_domains
>      partition_sched_domains
> 
> in which case we do not call build_perf_domains... is that intended ?

I assume you meant:

   sched_init_domains
     build_sched_domains

Is that right ?

If yes, I didn't bother calling build_perf_domains() from there because
I don't think there is a single platform out there which would have a
registered Energy Model that early in the boot process. Or maybe there
is one I don't know ?

Anyway, that is probably easy to fix, if need be.
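
If it ever turns out to be needed, something like this in
sched_init_domains() would probably do (untested sketch, assuming an
EM could actually be registered that early):

	err = build_sched_domains(doms_cur[0], NULL);
	if (!err)
		build_perf_domains(doms_cur[0]);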

> > +
> > +	return;
> > +
> > +free:
> > +	free_pd(pd);
> > +	tmp = rd->pd;
> > +	rcu_assign_pointer(rd->pd, NULL);
> > +	if (tmp)
> > +		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
> > +}
> 
> All the above functions use different naming conventions:
> 
>    "_pd" suffix, "pd_" prefix and "perf_domain_" prefix.
> 
> and you do it like that because it better matches the corresponding
> call sites further down the file.

That's right. The functions are supposed to vaguely look like existing
functions dealing with sched domains.

> However, since we are in a "CONFIG_ENERGY_MODEL" guarded section,
> why not start using a common prefix for all PD related functions?
> 
> I really like "perf_domain_" (or "pd_") as a prefix and I would try to
> use it for all the functions you defined above:
> 
>    perf_domain_free
>    perf_domain_find
>    perf_domain_debug
>    perf_domain_destroy_rcu
>    perf_domain_build

I kinda like the idea of keeping things consistent with the existing
code TBH. Especially because I'm terrible at naming things ... But if
there is a general agreement that I should rename everything I won't
argue.

> > +#else
> > +static void free_pd(struct perf_domain *pd) { }
> > +#endif
> 
> Maybe better:
> 
>   #endif /* CONFIG_ENERGY_MODEL */

Ack

> 
> > +
> >  static void free_rootdomain(struct rcu_head *rcu)
> >  {
> >  	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
> > @@ -211,6 +321,7 @@ static void free_rootdomain(struct rcu_head *rcu)
> >  	free_cpumask_var(rd->rto_mask);
> >  	free_cpumask_var(rd->online);
> >  	free_cpumask_var(rd->span);
> > +	free_pd(rd->pd);
> >  	kfree(rd);
> >  }
> >  
> > @@ -1964,8 +2075,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
> >  	/* Destroy deleted domains: */
> >  	for (i = 0; i < ndoms_cur; i++) {
> >  		for (j = 0; j < n && !new_topology; j++) {
> > -			if (cpumask_equal(doms_cur[i], doms_new[j])
> > -			    && dattrs_equal(dattr_cur, i, dattr_new, j))
> > +			if (cpumask_equal(doms_cur[i], doms_new[j]) &&
> > +			    dattrs_equal(dattr_cur, i, dattr_new, j))
> 
> This chunk looks more like a cleanup which is not really changing
> anything: is it intentional?

Yep, that's a cleanup Peter requested:
	20180705181407.GI2494@hirez.programming.kicks-ass.net

> 
> >  				goto match1;
> >  		}
> >  		/* No match - a current sched domain not in new doms_new[] */
> > @@ -1985,8 +2096,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
> >  	/* Build new domains: */
> >  	for (i = 0; i < ndoms_new; i++) {
> >  		for (j = 0; j < n && !new_topology; j++) {
> > -			if (cpumask_equal(doms_new[i], doms_cur[j])
> > -			    && dattrs_equal(dattr_new, i, dattr_cur, j))
> > +			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
> > +			    dattrs_equal(dattr_new, i, dattr_cur, j))
> 
> 
> Same comment for the chunk above

Ditto :-)

> 
> >  				goto match2;
> >  		}
> >  		/* No match - add a new doms_new */
> > @@ -1995,6 +2106,21 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
> >  		;
> >  	}
> >  
> > +#ifdef CONFIG_ENERGY_MODEL
> > +	/* Build perf. domains: */
> > +	for (i = 0; i < ndoms_new; i++) {
> > +		for (j = 0; j < n; j++) {
> > +			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
> > +			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd)
> > +				goto match3;
> > +		}
> > +		/* No match - add perf. domains for a new rd */
> > +		build_perf_domains(doms_new[i]);
> > +match3:
> > +		;
> > +	}
> > +#endif
> > +
> 
> 
> Since we already have a CONFIG_ENERGY_MODEL guarded section above,
> maybe we can s/build_perf_domains/build_perf_root_domain/ and use
> build_perf_domains to provide an inline function for the chunk above
> in the guarded section at the beginning of the file ?
> 
> The #else section will then provide just an empty implementation.

The only reason I didn't do this was because I wanted to keep all the
logic to skip (or not) building things centralized in
partition_sched_domains(), simply because I find it easier to
understand.

> Something like the diff below seems to work, and it should do the
> "cleanup" job by also moving the definition of the global variables
> (required by some functions) to the beginning of the source file.
> 
> Perhaps that's a bit of cleanup code that the maintainer can accept...
> but... to be verified. ;)

Right, I guess it's mainly a matter of 'taste' here. So let's see ... :-)

Thanks !
Quentin
Patrick Bellasi Aug. 30, 2018, 10 a.m. UTC | #3
On 29-Aug 17:56, Quentin Perret wrote:
> On Wednesday 29 Aug 2018 at 17:22:38 (+0100), Patrick Bellasi wrote:
> > > +static void build_perf_domains(const struct cpumask *cpu_map)
> > > +{
> > > +	struct perf_domain *pd = NULL, *tmp;
> > > +	int cpu = cpumask_first(cpu_map);
> > > +	struct root_domain *rd = cpu_rq(cpu)->rd;
> > > +	int i;
> > > +
> > > +	for_each_cpu(i, cpu_map) {
> > > +		/* Skip already covered CPUs. */
> > > +		if (find_pd(pd, i))
> > > +			continue;
> > > +
> > > +		/* Create the new pd and add it to the local list. */
> > > +		tmp = pd_init(i);
> > > +		if (!tmp)
> > > +			goto free;
> > > +		tmp->next = pd;
> > > +		pd = tmp;
> > > +	}
> > > +
> > > +	perf_domain_debug(cpu_map, pd);
> > > +
> > > +	/* Attach the new list of performance domains to the root domain. */
> > > +	tmp = rd->pd;
> > > +	rcu_assign_pointer(rd->pd, pd);
> > > +	if (tmp)
> > > +		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
> > 
> > We have:
> > 
> >   sched_cpu_activate/cpuset_cpu_inactive
> >     cpuset_cpu_active/sched_cpu_deactivate
> >       partition_sched_domains
> >         build_perf_domains
> > 
> > thus here we are building new SDs and, specifically, above we are
> > attaching the local list "pd" to a _new_ root domain... so there
> > cannot already be users of these new SDs and this root domain at this
> > stage, can there?
> 
> Hmm, actually you can end up here even if the rd isn't new. That would
> happen if you call rebuild_sched_domains() after the EM has been
> registered for example.
> At this point, you might skip
> detach_destroy_domains() and build_sched_domains() from
> partition_sched_domains(), but still call build_perf_domains(), which
> would then attach the pd list to the current rd.

Ok... then it's just me who needs to go back and better study how and
when SDs are rebuilt.

> That's one reason why rcu_assign_pointer() is probably a good idea. And
> it's also nice from a doc standpoint I suppose.

If we can call into build_perf_domains() and reach the assignment
above with an existing RD, then yes, it makes perfect sense.

> > Do we really need that rcu_assign_pointer ?
> > Is the rcu_assign_pointer there just to "match" the following call_rcu ?
> > 
> > What about this path:
> > 
> >   sched_init_domains
> >      partition_sched_domains
> > 
> > in which case we do not call build_perf_domains... is that intended ?
> 
> I assume you meant:
> 
>    sched_init_domains
>      build_sched_domains
> 
> Is that right ?

Mmm... yes... seems so.

> If yes, I didn't bother calling build_perf_domains() from there because
> I don't think there is a single platform out there which would have a
> registered Energy Model that early in the boot process. Or maybe there
> is one I don't know ?

Dunno... but, in any case, probably we don't care about using EAS until
boot completes, do we?

Just to better understand, what is the most common activation path we expect ?

  1. system boot
  2. CPUs online
  3. CPUFreq initialization
  4. EM registered
  X. ???
  N. partition_sched_domains
        build_perf_domains

IOW, who do we expect to call build_perf_domains for the first time ?

> Anyway, that is probably easy to fix, if need be.
> 
> > > +
> > > +	return;
> > > +
> > > +free:
> > > +	free_pd(pd);
> > > +	tmp = rd->pd;
> > > +	rcu_assign_pointer(rd->pd, NULL);
> > > +	if (tmp)
> > > +		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
> > > +}
> > 
> > All the above functions use different naming conventions:
> > 
> >    "_pd" suffix, "pd_" prefix and "perf_domain_" prefix.
> > 
> > and you do it like that because it better matches the corresponding
> > call sites further down the file.
> 
> That's right. The functions are supposed to vaguely look like existing
> functions dealing with sched domains.
> 
> > However, since we are in a "CONFIG_ENERGY_MODEL" guarded section,
> > why not start using a common prefix for all PD related functions?
> > 
> > I really like "perf_domain_" (or "pd_") as a prefix and I would try to
> > use it for all the functions you defined above:
> > 
> >    perf_domain_free
> >    perf_domain_find
> >    perf_domain_debug
> >    perf_domain_destroy_rcu
> >    perf_domain_build
> 
> I kinda like the idea of keeping things consistent with the existing
> code TBH. Especially because I'm terrible at naming things ... But if
> there is a general agreement that I should rename everything I won't
> argue.

I've just got the impression that naming in this file is a bit
fuzzy and it could be worth a cleanup... but of course there is also
value in minimizing the changes.

Was just wondering if a better file organization in general could help
to make topology.c easier to compile for humans... but yes... let's
keep this for another time ;)

Cheers Patrick
Quentin Perret Aug. 30, 2018, 10:47 a.m. UTC | #4
On Thursday 30 Aug 2018 at 11:00:20 (+0100), Patrick Bellasi wrote:
> Dunno... but, in any case, probably we don't care about using EAS until
> boot completes, do we?

So, as of now, EAS will typically start soon after CPUFreq. I don't see a
point in starting before, and I'm not sure if starting it later is
really what we want either ... It probably makes sense to start the
DVFS-related features (CPUFreq, EAS, ...) together.

> Just to better understand, what is the most common activation path we expect ?
> 
>   1. system boot
>   2. CPUs online
>   3. CPUFreq initialization
>   4. EM registered
>   X. ???

X is sort of arch dependent (like the EM registration actually). For
arm/arm64, the arch topology driver (see drivers/base/arch_topology.c)
will rebuild the sched_domains once the capacities of the CPUs are
discovered. In previous versions, I had a patch that did that explicitly:

https://lore.kernel.org/lkml/20180724122521.22109-14-quentin.perret@arm.com/

However, I dropped this patch for v6 because I rebased the series on top
of Morten's automatic flag detection patches, which happen to already do
that for me. More precisely, Morten's patches will rebuild the sched
domains if the SD_ASYM_CPUCAPACITY flag needs to be set somewhere in the
hierarchy. EAS depends on this flag anyway, so I guess it's fine to rely
on this rebuild to start EAS at the same time.

I have been wondering for a while if such a 'loose' way of starting EAS
was alright or not. My conclusion was that it should be OK for now,
because that should suit all the users I can see in the short/medium
term. An alternative could be to introduce something like a notifier in
the EM framework in order to let other subsystems know that the EM is
complete or something along those lines. And then we could register a
callback on this notifier to rebuild the sched_domains. But that's added
complexity, which isn't really needed (yet). If that needs to be done to
accommodate the needs of new users/archs, we can still implement that
later :-)
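
FWIW, a rough sketch of what such a notifier could look like (purely
hypothetical, nothing like this exists in the EM framework today):

	/* EM framework side (hypothetical): */
	static BLOCKING_NOTIFIER_HEAD(em_ready_chain);

	/* ... called once the EM of the system is complete: */
	blocking_notifier_call_chain(&em_ready_chain, 0, NULL);

	/* Scheduler side (hypothetical): */
	static int em_ready_cb(struct notifier_block *nb, unsigned long action,
			       void *data)
	{
		rebuild_sched_domains();
		return NOTIFY_OK;
	}

	static struct notifier_block em_ready_nb = {
		.notifier_call = em_ready_cb,
	};

	/* ... registered at init: */
	blocking_notifier_chain_register(&em_ready_chain, &em_ready_nb);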

>   N. partition_sched_domains
>         build_perf_domains
> 
> IOW, who do we expect to call build_perf_domains for the first time ?

On arm/arm64, when the arch_topology driver calls rebuild_sched_domains.

Thanks !
Quentin
Patrick Bellasi Aug. 30, 2018, 12:50 p.m. UTC | #5
On 30-Aug 11:47, Quentin Perret wrote:
> On Thursday 30 Aug 2018 at 11:00:20 (+0100), Patrick Bellasi wrote:
> > Dunno... but, in any case, probably we don't care about using EAS until
> > boot completes, do we?
> 
> So, as of now, EAS will typically start soon after CPUFreq. I don't see a
> point in starting before, and I'm not sure if starting it later is
> really what we want either ... It probably makes sense to start the
> DVFS-related features (CPUFreq, EAS, ...) together.
> 
> > Just to better understand, what is the most common activation path we expect ?
> > 
> >   1. system boot
> >   2. CPUs online
> >   3. CPUFreq initialization
> >   4. EM registered
> >   X. ???
> 
> X is sort of arch dependent (like the EM registration actually). For
> arm/arm64, the arch topology driver (see drivers/base/arch_topology.c)
> will rebuild the sched_domains once the capacities of the CPUs are
> discovered. In previous versions, I had a patch that did that explicitly:
> 
> https://lore.kernel.org/lkml/20180724122521.22109-14-quentin.perret@arm.com/
> 
> However, I dropped this patch for v6 because I rebased the series on top
> of Morten's automatic flag detection patches, which happen to already do
> that for me. More precisely, Morten's patches will rebuild the sched
> domains if the SD_ASYM_CPUCAPACITY flag needs to be set somewhere in the
> hierarchy. EAS depends on this flag anyway, so I guess it's fine to rely
> on this rebuild to start EAS at the same time.
> 
> I have been wondering for a while if such a 'loose' way of starting EAS
> was alright or not. My conclusion was that it should be OK for now,
> because that should suit all the users I can see in the short/medium
> term. An alternative could be to introduce something like a notifier in
> the EM framework in order to let other subsystems know that the EM is
> complete or something along those lines. And then we could register a
> callback on this notifier to rebuild the sched_domains. But that's added
> complexity, which isn't really needed (yet). If that needs to be done to
> accommodate the needs of new users/archs, we can still implement that
> later :-)
> 
> >   N. partition_sched_domains
> >         build_perf_domains
> > 
> > IOW, who do we expect to call build_perf_domains for the first time ?
> 
> On arm/arm64, when the arch_topology driver calls rebuild_sched_domains.

Great, I missed that point, which you actually call out in the cover
letter. Thanks also for the more comprehensive explanation above!

> Thanks !
> Quentin

Cheers Patrick

Patch

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 481e6759441b..97d79429e755 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -44,6 +44,7 @@
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
 #include <linux/delayacct.h>
+#include <linux/energy_model.h>
 #include <linux/init_task.h>
 #include <linux/kprobes.h>
 #include <linux/kthread.h>
@@ -700,6 +701,12 @@ static inline bool sched_asym_prefer(int a, int b)
 	return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
 }
 
+struct perf_domain {
+	struct em_perf_domain *obj;
+	struct perf_domain *next;
+	struct rcu_head rcu;
+};
+
 /*
  * We add the notion of a root-domain which will be used to define per-domain
  * variables. Each exclusive cpuset essentially defines an island domain by
@@ -752,6 +759,12 @@ struct root_domain {
 	struct cpupri		cpupri;
 
 	unsigned long		max_cpu_capacity;
+
+	/*
+	 * NULL-terminated list of performance domains intersecting with the
+	 * CPUs of the rd. Protected by RCU.
+	 */
+	struct perf_domain	*pd;
 };
 
 extern struct root_domain def_root_domain;
@@ -2228,3 +2241,11 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
 	return util;
 }
 #endif
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_ENERGY_MODEL
+#define perf_domain_span(pd) (to_cpumask(((pd)->obj->cpus)))
+#else
+#define perf_domain_span(pd) NULL
+#endif
+#endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index c4444ed77a55..545e2c7b6e66 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -201,6 +201,116 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 	return 1;
 }
 
+#ifdef CONFIG_ENERGY_MODEL
+static void free_pd(struct perf_domain *pd)
+{
+	struct perf_domain *tmp;
+
+	while (pd) {
+		tmp = pd->next;
+		kfree(pd);
+		pd = tmp;
+	}
+}
+
+static struct perf_domain *find_pd(struct perf_domain *pd, int cpu)
+{
+	while (pd) {
+		if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
+			return pd;
+		pd = pd->next;
+	}
+
+	return NULL;
+}
+
+static struct perf_domain *pd_init(int cpu)
+{
+	struct em_perf_domain *obj = em_cpu_get(cpu);
+	struct perf_domain *pd;
+
+	if (!obj) {
+		if (sched_debug())
+			pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
+		return NULL;
+	}
+
+	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+	if (!pd)
+		return NULL;
+	pd->obj = obj;
+
+	return pd;
+}
+
+static void perf_domain_debug(const struct cpumask *cpu_map,
+						struct perf_domain *pd)
+{
+	if (!sched_debug() || !pd)
+		return;
+
+	printk(KERN_DEBUG "root_domain %*pbl: ", cpumask_pr_args(cpu_map));
+
+	while (pd) {
+		printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
+				cpumask_first(perf_domain_span(pd)),
+				cpumask_pr_args(perf_domain_span(pd)),
+				em_pd_nr_cap_states(pd->obj));
+		pd = pd->next;
+	}
+
+	printk(KERN_CONT "\n");
+}
+
+static void destroy_perf_domain_rcu(struct rcu_head *rp)
+{
+	struct perf_domain *pd;
+
+	pd = container_of(rp, struct perf_domain, rcu);
+	free_pd(pd);
+}
+
+static void build_perf_domains(const struct cpumask *cpu_map)
+{
+	struct perf_domain *pd = NULL, *tmp;
+	int cpu = cpumask_first(cpu_map);
+	struct root_domain *rd = cpu_rq(cpu)->rd;
+	int i;
+
+	for_each_cpu(i, cpu_map) {
+		/* Skip already covered CPUs. */
+		if (find_pd(pd, i))
+			continue;
+
+		/* Create the new pd and add it to the local list. */
+		tmp = pd_init(i);
+		if (!tmp)
+			goto free;
+		tmp->next = pd;
+		pd = tmp;
+	}
+
+	perf_domain_debug(cpu_map, pd);
+
+	/* Attach the new list of performance domains to the root domain. */
+	tmp = rd->pd;
+	rcu_assign_pointer(rd->pd, pd);
+	if (tmp)
+		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
+
+	return;
+
+free:
+	free_pd(pd);
+	tmp = rd->pd;
+	rcu_assign_pointer(rd->pd, NULL);
+	if (tmp)
+		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
+}
+#else
+static void free_pd(struct perf_domain *pd) { }
+#endif
+
 static void free_rootdomain(struct rcu_head *rcu)
 {
 	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
@@ -211,6 +321,7 @@ static void free_rootdomain(struct rcu_head *rcu)
 	free_cpumask_var(rd->rto_mask);
 	free_cpumask_var(rd->online);
 	free_cpumask_var(rd->span);
+	free_pd(rd->pd);
 	kfree(rd);
 }
 
@@ -1964,8 +2075,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 	/* Destroy deleted domains: */
 	for (i = 0; i < ndoms_cur; i++) {
 		for (j = 0; j < n && !new_topology; j++) {
-			if (cpumask_equal(doms_cur[i], doms_new[j])
-			    && dattrs_equal(dattr_cur, i, dattr_new, j))
+			if (cpumask_equal(doms_cur[i], doms_new[j]) &&
+			    dattrs_equal(dattr_cur, i, dattr_new, j))
 				goto match1;
 		}
 		/* No match - a current sched domain not in new doms_new[] */
@@ -1985,8 +2096,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 	/* Build new domains: */
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < n && !new_topology; j++) {
-			if (cpumask_equal(doms_new[i], doms_cur[j])
-			    && dattrs_equal(dattr_new, i, dattr_cur, j))
+			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
+			    dattrs_equal(dattr_new, i, dattr_cur, j))
 				goto match2;
 		}
 		/* No match - add a new doms_new */
@@ -1995,6 +2106,21 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 		;
 	}
 
+#ifdef CONFIG_ENERGY_MODEL
+	/* Build perf. domains: */
+	for (i = 0; i < ndoms_new; i++) {
+		for (j = 0; j < n; j++) {
+			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
+			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd)
+				goto match3;
+		}
+		/* No match - add perf. domains for a new rd */
+		build_perf_domains(doms_new[i]);
+match3:
+		;
+	}
+#endif
+
 	/* Remember the new sched domains: */
 	if (doms_cur != &fallback_doms)
 		free_sched_domains(doms_cur, ndoms_cur);