diff mbox

[2/6] drivers base/arch_topology: frequency-invariant load-tracking support

Message ID 20170608075513.12475-3-dietmar.eggemann@arm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Dietmar Eggemann June 8, 2017, 7:55 a.m. UTC
Implements an arch-specific frequency-scaling function
topology_get_freq_scale() which provides the following frequency
scaling factor:

  (current_freq(cpu) << SCHED_CAPACITY_SHIFT) / max_supported_freq(cpu)

The debug output in init_cpu_capacity_callback() has been changed to
make it possible to distinguish whether both cpu capacity and max
frequency have been set, or only max frequency. The latter case happens
on systems with missing or broken cpu capacity binding information (cpu
node property capacity-dmips-mhz).

One possible consumer of this is the Per-Entity Load Tracking (PELT)
mechanism of the task scheduler.
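
For illustration (numbers made up; SCHED_CAPACITY_SHIFT is 10, so the
factor is expressed relative to SCHED_CAPACITY_SCALE = 1024), a cpu
currently running at 1.2 GHz with a maximum supported frequency of
2.0 GHz gets:

  freq_scale = (1200000 << 10) / 2000000 = 614   /* ~60% of 1024 */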

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Juri Lelli <juri.lelli@arm.com>
Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
---
 drivers/base/arch_topology.c  | 52 ++++++++++++++++++++++++++++++++++++++++---
 include/linux/arch_topology.h |  2 ++
 2 files changed, 51 insertions(+), 3 deletions(-)

Comments

Vincent Guittot June 12, 2017, 2:27 p.m. UTC | #1
On 8 June 2017 at 09:55, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
> Implements an arch-specific frequency-scaling function
> topology_get_freq_scale() which provides the following frequency
> scaling factor:
>
>   (current_freq(cpu) << SCHED_CAPACITY_SHIFT) / max_supported_freq(cpu)
>
> The debug output in init_cpu_capacity_callback() has been changed to
> make it possible to distinguish whether both cpu capacity and max
> frequency have been set, or only max frequency. The latter case happens
> on systems with missing or broken cpu capacity binding information (cpu
> node property capacity-dmips-mhz).
>
> One possible consumer of this is the Per-Entity Load Tracking (PELT)
> mechanism of the task scheduler.
>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will.deacon@arm.com>
> Cc: Russell King <linux@arm.linux.org.uk>
> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> Cc: Juri Lelli <juri.lelli@arm.com>
> Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
> ---
>  drivers/base/arch_topology.c  | 52 ++++++++++++++++++++++++++++++++++++++++---
>  include/linux/arch_topology.h |  2 ++
>  2 files changed, 51 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
> index 272831c89feb..f6f14670bdab 100644
> --- a/drivers/base/arch_topology.c
> +++ b/drivers/base/arch_topology.c
> @@ -24,12 +24,18 @@
>
>  static DEFINE_MUTEX(cpu_scale_mutex);
>  static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
> +static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
>
>  unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu)
>  {
>         return per_cpu(cpu_scale, cpu);
>  }
>
> +unsigned long topology_get_freq_scale(struct sched_domain *sd, int cpu)
> +{
> +       return per_cpu(freq_scale, cpu);
> +}
> +
>  void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity)
>  {
>         per_cpu(cpu_scale, cpu) = capacity;
> @@ -164,6 +170,7 @@ static cpumask_var_t cpus_to_visit;
>  static bool cap_parsing_done;
>  static void parsing_done_workfn(struct work_struct *work);
>  static DECLARE_WORK(parsing_done_work, parsing_done_workfn);
> +static DEFINE_PER_CPU(unsigned long, max_freq);
>
>  static int
>  init_cpu_capacity_callback(struct notifier_block *nb,
> @@ -185,6 +192,7 @@ init_cpu_capacity_callback(struct notifier_block *nb,
>                                cpus_to_visit,
>                                policy->related_cpus);
>                 for_each_cpu(cpu, policy->related_cpus) {
> +                       per_cpu(max_freq, cpu) = policy->cpuinfo.max_freq;
>                         if (cap_parsing_failed)
>                                 continue;
>                         raw_capacity[cpu] = topology_get_cpu_scale(NULL, cpu) *
> @@ -195,8 +203,10 @@ init_cpu_capacity_callback(struct notifier_block *nb,
>                         if (!cap_parsing_failed) {
>                                 topology_normalize_cpu_scale();
>                                 kfree(raw_capacity);
> +                               pr_debug("cpu_capacity: parsing done\n");
> +                       } else {
> +                               pr_debug("cpu_capacity: max frequency parsing done\n");
>                         }
> -                       pr_debug("cpu_capacity: parsing done\n");
>                         cap_parsing_done = true;
>                         schedule_work(&parsing_done_work);
>                 }
> @@ -208,8 +218,38 @@ static struct notifier_block init_cpu_capacity_notifier = {
>         .notifier_call = init_cpu_capacity_callback,
>  };
>
> +static void set_freq_scale(unsigned int cpu, unsigned long freq)
> +{
> +       unsigned long max = per_cpu(max_freq, cpu);
> +
> +       if (!max)
> +               return;
> +
> +       per_cpu(freq_scale, cpu) = (freq << SCHED_CAPACITY_SHIFT) / max;
> +}
> +
> +static int set_freq_scale_callback(struct notifier_block *nb,
> +                                  unsigned long val,
> +                                  void *data)
> +{
> +       struct cpufreq_freqs *freq = data;
> +
> +       switch (val) {
> +       case CPUFREQ_PRECHANGE:
> +               set_freq_scale(freq->cpu, freq->new);
> +       }
> +
> +       return 0;
> +}
> +
> +static struct notifier_block set_freq_scale_notifier = {
> +       .notifier_call = set_freq_scale_callback,
> +};
> +
>  static int __init register_cpufreq_notifier(void)
>  {
> +       int ret;
> +
>         /*
>          * on ACPI-based systems we need to use the default cpu capacity
>          * until we have the necessary code to parse the cpu capacity, so
> @@ -225,8 +265,14 @@ static int __init register_cpufreq_notifier(void)
>
>         cpumask_copy(cpus_to_visit, cpu_possible_mask);
>
> -       return cpufreq_register_notifier(&init_cpu_capacity_notifier,
> -                                        CPUFREQ_POLICY_NOTIFIER);
> +       ret = cpufreq_register_notifier(&init_cpu_capacity_notifier,
> +                                       CPUFREQ_POLICY_NOTIFIER);
> +
> +       if (ret)

Don't you have to free the memory allocated for cpus_to_visit in case of
error? It was not done before your patch either.

> +               return ret;
> +
> +       return cpufreq_register_notifier(&set_freq_scale_notifier,
> +                                        CPUFREQ_TRANSITION_NOTIFIER);

Don't you have to unregister the other cpufreq notifier if an error is
returned, and free the memory allocated for cpus_to_visit?

>  }
>  core_initcall(register_cpufreq_notifier);
>
> diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
> index 9af3c174c03a..3fb4d8ccb179 100644
> --- a/include/linux/arch_topology.h
> +++ b/include/linux/arch_topology.h
> @@ -12,6 +12,8 @@ int topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu);
>  struct sched_domain;
>  unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu);
>
> +unsigned long topology_get_freq_scale(struct sched_domain *sd, int cpu);
> +
>  void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity);
>
>  #endif /* _LINUX_ARCH_TOPOLOGY_H_ */
> --
> 2.11.0
>
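
A sketch of the cleanup Vincent is asking for (hypothetical, not part of
the posted patch; it assumes cpus_to_visit was allocated with
alloc_cpumask_var() earlier in register_cpufreq_notifier()):

        ret = cpufreq_register_notifier(&init_cpu_capacity_notifier,
                                        CPUFREQ_POLICY_NOTIFIER);
        if (ret)
                goto free_cpumask;

        ret = cpufreq_register_notifier(&set_freq_scale_notifier,
                                        CPUFREQ_TRANSITION_NOTIFIER);
        if (ret)
                goto unregister_policy_notifier;

        return 0;

unregister_policy_notifier:
        cpufreq_unregister_notifier(&init_cpu_capacity_notifier,
                                    CPUFREQ_POLICY_NOTIFIER);
free_cpumask:
        free_cpumask_var(cpus_to_visit);
        return ret;
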
Viresh Kumar June 20, 2017, 6:17 a.m. UTC | #2
On Thu, Jun 8, 2017 at 1:25 PM, Dietmar Eggemann
<dietmar.eggemann@arm.com> wrote:

> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c

>  static int
>  init_cpu_capacity_callback(struct notifier_block *nb,
> @@ -185,6 +192,7 @@ init_cpu_capacity_callback(struct notifier_block *nb,
>                                cpus_to_visit,
>                                policy->related_cpus);
>                 for_each_cpu(cpu, policy->related_cpus) {
> +                       per_cpu(max_freq, cpu) = policy->cpuinfo.max_freq;

I am not sure about this, but why shouldn't we use policy->max here?
That is the max frequency we can actually set right now.

>                         if (cap_parsing_failed)
>                                 continue;
>                         raw_capacity[cpu] = topology_get_cpu_scale(NULL, cpu) *
> @@ -195,8 +203,10 @@ init_cpu_capacity_callback(struct notifier_block *nb,
>                         if (!cap_parsing_failed) {
>                                 topology_normalize_cpu_scale();
>                                 kfree(raw_capacity);
> +                               pr_debug("cpu_capacity: parsing done\n");
> +                       } else {
> +                               pr_debug("cpu_capacity: max frequency parsing done\n");
>                         }
> -                       pr_debug("cpu_capacity: parsing done\n");
>                         cap_parsing_done = true;
>                         schedule_work(&parsing_done_work);
>                 }
> @@ -208,8 +218,38 @@ static struct notifier_block init_cpu_capacity_notifier = {
>         .notifier_call = init_cpu_capacity_callback,
>  };
>
> +static void set_freq_scale(unsigned int cpu, unsigned long freq)
> +{
> +       unsigned long max = per_cpu(max_freq, cpu);
> +
> +       if (!max)
> +               return;
> +
> +       per_cpu(freq_scale, cpu) = (freq << SCHED_CAPACITY_SHIFT) / max;
> +}
> +
> +static int set_freq_scale_callback(struct notifier_block *nb,
> +                                  unsigned long val,
> +                                  void *data)
> +{
> +       struct cpufreq_freqs *freq = data;
> +
> +       switch (val) {
> +       case CPUFREQ_PRECHANGE:
> +               set_freq_scale(freq->cpu, freq->new);

Any specific reason why we are doing this from PRECHANGE and
not POSTCHANGE? I.e., we are doing this before the frequency is
really updated.

> +       }
> +
> +       return 0;
> +}
> +
> +static struct notifier_block set_freq_scale_notifier = {
> +       .notifier_call = set_freq_scale_callback,
> +};
> +
>  static int __init register_cpufreq_notifier(void)
>  {
> +       int ret;
> +
>         /*
>          * on ACPI-based systems we need to use the default cpu capacity
>          * until we have the necessary code to parse the cpu capacity, so
> @@ -225,8 +265,14 @@ static int __init register_cpufreq_notifier(void)
>
>         cpumask_copy(cpus_to_visit, cpu_possible_mask);
>
> -       return cpufreq_register_notifier(&init_cpu_capacity_notifier,
> -                                        CPUFREQ_POLICY_NOTIFIER);
> +       ret = cpufreq_register_notifier(&init_cpu_capacity_notifier,
> +                                       CPUFREQ_POLICY_NOTIFIER);

Wanted to make sure that we all understand the constraints this is going to add
for the ARM64 platforms.

With the introduction of this transition notifier, we would not be able to use
the fast-switch path in the schedutil governor. I am not sure if there are any
ARM platforms that can actually use the fast-switch path in future or not
though. The requirement of fast-switch path is that the freq can be changed
without sleeping in the hot-path.

So, will we ever want fast-switching for ARM platforms ?

--
viresh
Saravana Kannan June 21, 2017, 12:31 a.m. UTC | #3
On 06/19/2017 11:17 PM, Viresh Kumar wrote:
> On Thu, Jun 8, 2017 at 1:25 PM, Dietmar Eggemann
> <dietmar.eggemann@arm.com> wrote:
>
>> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
>
>>   static int __init register_cpufreq_notifier(void)
>>   {
>> +       int ret;
>> +
>>          /*
>>           * on ACPI-based systems we need to use the default cpu capacity
>>           * until we have the necessary code to parse the cpu capacity, so
>> @@ -225,8 +265,14 @@ static int __init register_cpufreq_notifier(void)
>>
>>          cpumask_copy(cpus_to_visit, cpu_possible_mask);
>>
>> -       return cpufreq_register_notifier(&init_cpu_capacity_notifier,
>> -                                        CPUFREQ_POLICY_NOTIFIER);
>> +       ret = cpufreq_register_notifier(&init_cpu_capacity_notifier,
>> +                                       CPUFREQ_POLICY_NOTIFIER);
>
> Wanted to make sure that we all understand the constraints this is going to add
> for the ARM64 platforms.
>
> With the introduction of this transition notifier, we would not be able to use
> the fast-switch path in the schedutil governor. I am not sure if there are any
> ARM platforms that can actually use the fast-switch path in future or not
> though. The requirement of fast-switch path is that the freq can be changed
> without sleeping in the hot-path.
>
> So, will we ever want fast-switching for ARM platforms ?
>

I don't think we should go down a path that'll prevent ARM platforms from
switching over to fast-switching in the future.

Having said that, I'm not sure I fully agree with the decision to 
completely disable notifiers in the fast-switching case. How many of the 
current users of notifiers truly need support for sleeping in the 
notifier? Why not make all the transition notifiers atomic? Or at least 
add atomic transition notifiers that can be registered for separately if 
the client doesn't need the ability to sleep?

Most of the clients don't seem like ones that'll need to sleep.

There are a bunch of generic off-tree drivers (can't upstream them yet 
because they depend on the bus scaling framework) that also depend on
CPUfreq transition notifiers that are going to stop working if fast 
switching becomes available in the future. So, this decision to disallow 
transition notifiers is painful for other reasons too.

-Saravana
Viresh Kumar June 21, 2017, 5:37 a.m. UTC | #4
On 20-06-17, 17:31, Saravana Kannan wrote:
> On 06/19/2017 11:17 PM, Viresh Kumar wrote:
> >On Thu, Jun 8, 2017 at 1:25 PM, Dietmar Eggemann
> ><dietmar.eggemann@arm.com> wrote:
> >
> >>diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
> >
> >>  static int __init register_cpufreq_notifier(void)
> >>  {
> >>+       int ret;
> >>+
> >>         /*
> >>          * on ACPI-based systems we need to use the default cpu capacity
> >>          * until we have the necessary code to parse the cpu capacity, so
> >>@@ -225,8 +265,14 @@ static int __init register_cpufreq_notifier(void)
> >>
> >>         cpumask_copy(cpus_to_visit, cpu_possible_mask);
> >>
> >>-       return cpufreq_register_notifier(&init_cpu_capacity_notifier,
> >>-                                        CPUFREQ_POLICY_NOTIFIER);
> >>+       ret = cpufreq_register_notifier(&init_cpu_capacity_notifier,
> >>+                                       CPUFREQ_POLICY_NOTIFIER);
> >
> >Wanted to make sure that we all understand the constraints this is going to add
> >for the ARM64 platforms.
> >
> >With the introduction of this transition notifier, we would not be able to use
> >the fast-switch path in the schedutil governor. I am not sure if there are any
> >ARM platforms that can actually use the fast-switch path in future or not
> >though. The requirement of fast-switch path is that the freq can be changed
> >without sleeping in the hot-path.
> >
> >So, will we ever want fast-switching for ARM platforms ?
> >
> 
> I don't think we should go down a path that'll prevent ARM platforms from
> switching over to fast-switching in the future.

Yeah, that's why I brought attention to this stuff.

I think this patch doesn't really need to go down the notifiers way.

We can do something like this in the implementation of
topology_get_freq_scale():

        return (policy->cur << SCHED_CAPACITY_SHIFT) / max;

Though, we would be required to take care of policy structure in this
case somehow.

> Having said that, I'm not sure I fully agree with the decision to completely
> disable notifiers in the fast-switching case. How many of the current users
> of notifiers truly need support for sleeping in the notifier?

It's not just about sleeping here. We do not wish to call too much
stuff from the scheduler hot path, even if it doesn't sleep.

> Why not make
> all the transition notifiers atomic? Or at least add atomic transition
> notifiers that can be registered for separately if the client doesn't need
> the ability to sleep?
> 
> Most of the clients don't seem like ones that'll need to sleep.

Only if the scheduler maintainers agree to getting these notifiers
called from the hot path, which I don't think is going to happen.

> There are a bunch of generic off-tree drivers (can't upstream them yet
> because it depends on the bus scaling framework) that also depend on CPUfreq
> transition notifiers that are going to stop working if fast switching
> becomes available in the future. So, this decision to disallow transition
> notifiers is painful for other reasons too.

I think it's kind of fine to work without fast switch in those cases,
as we are anyway ready to call notifiers which may end up taking any
amount of time.

This case was special as it affects an entire arch here, and so I
pointed it out.
Morten Rasmussen June 21, 2017, 4:57 p.m. UTC | #5
On Wed, Jun 21, 2017 at 11:07:35AM +0530, Viresh Kumar wrote:
> On 20-06-17, 17:31, Saravana Kannan wrote:
> > On 06/19/2017 11:17 PM, Viresh Kumar wrote:
> > >On Thu, Jun 8, 2017 at 1:25 PM, Dietmar Eggemann
> > ><dietmar.eggemann@arm.com> wrote:
> > >
> > >>diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
> > >
> > >>  static int __init register_cpufreq_notifier(void)
> > >>  {
> > >>+       int ret;
> > >>+
> > >>         /*
> > >>          * on ACPI-based systems we need to use the default cpu capacity
> > >>          * until we have the necessary code to parse the cpu capacity, so
> > >>@@ -225,8 +265,14 @@ static int __init register_cpufreq_notifier(void)
> > >>
> > >>         cpumask_copy(cpus_to_visit, cpu_possible_mask);
> > >>
> > >>-       return cpufreq_register_notifier(&init_cpu_capacity_notifier,
> > >>-                                        CPUFREQ_POLICY_NOTIFIER);
> > >>+       ret = cpufreq_register_notifier(&init_cpu_capacity_notifier,
> > >>+                                       CPUFREQ_POLICY_NOTIFIER);
> > >
> > >Wanted to make sure that we all understand the constraints this is going to add
> > >for the ARM64 platforms.
> > >
> > >With the introduction of this transition notifier, we would not be able to use
> > >the fast-switch path in the schedutil governor. I am not sure if there are any
> > >ARM platforms that can actually use the fast-switch path in future or not
> > >though. The requirement of fast-switch path is that the freq can be changed
> > >without sleeping in the hot-path.
> > >
> > >So, will we ever want fast-switching for ARM platforms ?

I hope that one day we will have such platforms.

> > >
> > 
> > I don't think we should go down a path that'll prevent ARM platforms from
> > switching over to fast-switching in the future.
> 
> Yeah, that's why I brought attention to this stuff.

It is true that this patch relies on the notifiers, but I don't see how
that prevents us from adding a non-notifier based solution for
fast-switch enabled platforms later?

> 
> I think this patch doesn't really need to go down the notifiers way.
> 
> We can do something like this in the implementation of
> topology_get_freq_scale():
> 
>         return (policy->cur << SCHED_CAPACITY_SHIFT) / max;
> 
> Though, we would be required to take care of policy structure in this
> case somehow.

This is exactly what this patch implements. Unfortunately we can't be
sure that there is a valid policy data structure where we can read the
information from. Isn't the policy protected by a lock as well?

I don't quite see how you would solve that problem without having some
cached version of the scaling factor that is safe to read without
locking and is always available, even before cpufreq has come up.

Another thing is that I don't think a transition notifier based solution
or any other solution based on the cur/max ratio is really the right way
to go for fast-switching platforms. If we can do very frequent frequency
switching it makes less sense to use the current ratio whenever we
update the PELT averages as the frequency might have changed multiple
times since the last update. So it would make more sense to have an
average ratio instead.

If the platform has HW counters (e.g. APERF/MPERF) that can provide the
ratio then we should of course use those, if not, one solution could be
to let cpufreq track the average frequency for each cpu over a suitable
time window (around one sched period I think). It should be fairly low
overhead to maintain. In the topology driver, we would then choose
whether the scaling factor is provided by the cpufreq average frequency
ratio or the current transition notifier based approach based on the
capabilities of the platform.
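
As a very rough sketch of that cpufreq-side tracking (all names are made
up, and locking and window rollover are ignored; div64_u64() is from
linux/math64.h):

struct avg_freq {
        u64 window_start;       /* ns */
        u64 last_update;        /* ns, last transition or window start */
        u64 freq_time_sum;      /* sum of freq (kHz) * time (ns) */
        unsigned int cur_freq;  /* kHz */
};

/* called on every frequency transition */
static void avg_freq_update(struct avg_freq *af, unsigned int new_freq,
                            u64 now)
{
        af->freq_time_sum += (u64)af->cur_freq * (now - af->last_update);
        af->last_update = now;
        af->cur_freq = new_freq;
}

/* average frequency (kHz) since window_start */
static unsigned int avg_freq_read(struct avg_freq *af, u64 now)
{
        u64 sum = af->freq_time_sum +
                  (u64)af->cur_freq * (now - af->last_update);

        if (now <= af->window_start)
                return af->cur_freq;

        return div64_u64(sum, now - af->window_start);
}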

Morten
Dietmar Eggemann June 21, 2017, 5:08 p.m. UTC | #6
On 21/06/17 01:31, Saravana Kannan wrote:
> On 06/19/2017 11:17 PM, Viresh Kumar wrote:
>> On Thu, Jun 8, 2017 at 1:25 PM, Dietmar Eggemann
>> <dietmar.eggemann@arm.com> wrote:
>>
>>> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
>>
>>>   static int __init register_cpufreq_notifier(void)
>>>   {
>>> +       int ret;
>>> +
>>>          /*
>>>           * on ACPI-based systems we need to use the default cpu
>>> capacity
>>>           * until we have the necessary code to parse the cpu
>>> capacity, so
>>> @@ -225,8 +265,14 @@ static int __init register_cpufreq_notifier(void)
>>>
>>>          cpumask_copy(cpus_to_visit, cpu_possible_mask);
>>>
>>> -       return cpufreq_register_notifier(&init_cpu_capacity_notifier,
>>> -                                        CPUFREQ_POLICY_NOTIFIER);
>>> +       ret = cpufreq_register_notifier(&init_cpu_capacity_notifier,
>>> +                                       CPUFREQ_POLICY_NOTIFIER);
>>
>> Wanted to make sure that we all understand the constraints this is
>> going to add
>> for the ARM64 platforms.
>>
>> With the introduction of this transition notifier, we would not be
>> able to use
>> the fast-switch path in the schedutil governor. I am not sure if there
>> are any
>> ARM platforms that can actually use the fast-switch path in future or not
>> though. The requirement of fast-switch path is that the freq can be
>> changed
>> without sleeping in the hot-path.
>>
>> So, will we ever want fast-switching for ARM platforms ?
>>
> 
> I don't think we should go down a path that'll prevent ARM platforms from
> switching over to fast-switching in the future.

Understood. But IMHO implementing a cpufreq transition notifier based
Frequency Invariance Engine (FIE) which provides frequency-invariant
accounting for 100% of today's arm/arm64 systems is legitimate.

Like I said in the other email in this thread today, I can make sure
that I only register the cpufreq transition notifier if none of the
policies support fast frequency switching. In this case people can use
mainline to experiment with cpufreq drivers supporting fast frequency
switching (without FIE support).
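
One possible shape for that check, as a sketch (fast_switch_possible is
the flag cpufreq drivers set to advertise fast frequency switching; the
helper and the static bool are made up and would be called from
init_cpu_capacity_callback() once per policy):

static bool fast_switch_seen;

static void maybe_register_freq_scale_notifier(struct cpufreq_policy *policy)
{
        if (policy->fast_switch_possible)
                fast_switch_seen = true;

        /* defer registration until all policies have been seen */
        if (cpumask_empty(cpus_to_visit) && !fast_switch_seen)
                cpufreq_register_notifier(&set_freq_scale_notifier,
                                          CPUFREQ_TRANSITION_NOTIFIER);
}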

> Having said that, I'm not sure I fully agree with the decision to
> completely disable notifiers in the fast-switching case. How many of the
> current users of notifiers truly need support for sleeping in the
> notifier? Why not make all the transition notifiers atomic? Or at least
> add atomic transition notifiers that can be registered for separately if
> the client doesn't need the ability to sleep?

IMHO, that's a different construction site inside the cpufreq framework.
Patches which introduced the fast frequency switching support in cpufreq
clearly state that "... fast frequency switching is inherently
incompatible with cpufreq transition notifiers ...".

If we can get rid of this restriction, the cpufreq transition notifier
based FIE implementation could stay. Otherwise we need a FIE for systems
with fast frequency switching based on something else, e.g. performance
counters.

> Most of the clients don't seem like ones that'll need to sleep.
> 
> There are a bunch of generic off-tree drivers (can't upstream them yet
> because they depend on the bus scaling framework) that also depend on
> CPUfreq transition notifiers that are going to stop working if fast
> switching becomes available in the future. So, this decision to disallow
> transition notifiers is painful for other reasons too.

Falls into the same bucket for me ... you have a requirement against the
cpufreq framework to let cpufreq transition notifiers work with fast
frequency switching drivers.

[...]
Viresh Kumar June 22, 2017, 4:06 a.m. UTC | #7
On 21-06-17, 17:57, Morten Rasmussen wrote:
> It is true that this patch relies on the notifiers, but I don't see how
> that prevents us from adding a non-notifier based solution for
> fast-switch enabled platforms later?

No it doesn't, but I thought it would be better to have a single
solution (if possible) for all the cases here.

> > I think this patch doesn't really need to go down the notifiers way.
> > 
> > We can do something like this in the implementation of
> > topology_get_freq_scale():
> > 
> >         return (policy->cur << SCHED_CAPACITY_SHIFT) / max;
> > 
> > Though, we would be required to take care of policy structure in this
> > case somehow.
> 
> This is exactly what this patch implements. Unfortunately we can't be
> sure that there is a valid policy data structure where we can read the
> information from.

Actually there is a way around that.

- Revert one of my patches:
  commit f9f41e3ef99a ("cpufreq: Remove policy create/remove notifiers")

- Use those notifiers in init_cpu_capacity_callback() instead of
  CPUFREQ_NOTIFY and set/reset a local policy pointer.

- And this pointer we can use safely/reliably in
  topology_get_freq_scale(). We may need to use RCU read side
  protection in topology_get_freq_scale() though, to make sure the
  local policy pointer isn't getting updated simultaneously.

- If the policy pointer isn't set, then we can use
  SCHED_CAPACITY_SCALE value instead.
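
A sketch of what that could look like (assuming the
CPUFREQ_CREATE_POLICY/CPUFREQ_REMOVE_POLICY events come back with the
revert; the per-cpu pointer is made up):

static DEFINE_PER_CPU(struct cpufreq_policy __rcu *, cur_policy);

static int policy_lifetime_callback(struct notifier_block *nb,
                                    unsigned long val, void *data)
{
        struct cpufreq_policy *policy = data;
        struct cpufreq_policy *p =
                (val == CPUFREQ_CREATE_POLICY) ? policy : NULL;
        int cpu;

        if (val != CPUFREQ_CREATE_POLICY && val != CPUFREQ_REMOVE_POLICY)
                return 0;

        for_each_cpu(cpu, policy->related_cpus)
                rcu_assign_pointer(per_cpu(cur_policy, cpu), p);

        if (!p)
                synchronize_rcu();      /* wait for readers before removal */

        return 0;
}

unsigned long topology_get_freq_scale(struct sched_domain *sd, int cpu)
{
        struct cpufreq_policy *policy;
        unsigned long scale = SCHED_CAPACITY_SCALE;

        rcu_read_lock();
        policy = rcu_dereference(per_cpu(cur_policy, cpu));
        if (policy && policy->cpuinfo.max_freq)
                scale = (policy->cur << SCHED_CAPACITY_SHIFT) /
                        policy->cpuinfo.max_freq;
        rcu_read_unlock();

        return scale;
}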


> Isn't the policy protected by a lock as well?

There are locks, but you don't need any to read policy->cur.

> Another thing is that I don't think a transition notifier based solution
> or any other solution based on the cur/max ratio is really the right way
> to go for fast-switching platforms. If we can do very frequent frequency
> switching it makes less sense to use the current ratio whenever we
> update the PELT averages as the frequency might have changed multiple
> times since the last update. So it would make more sense to have an
> average ratio instead.

> If the platform has HW counters (e.g. APERF/MPERF) that can provide the
> ratio then we should of course use those, if not, one solution could be
> to let cpufreq track the average frequency for each cpu over a suitable
> time window (around one sched period I think). It should be fairly low
> overhead to maintain. In the topology driver, we would then choose
> whether the scaling factor is provided by the cpufreq average frequency
> ratio or the current transition notifier based approach based on the
> capabilities of the platform.

Hmm, maybe.
Morten Rasmussen June 22, 2017, 9:59 a.m. UTC | #8
On Thu, Jun 22, 2017 at 09:36:43AM +0530, Viresh Kumar wrote:
> On 21-06-17, 17:57, Morten Rasmussen wrote:
> > It is true that this patch relies on the notifiers, but I don't see how
> > that prevents us from adding a non-notifier based solution for
> > fast-switch enabled platforms later?
> 
> No it doesn't, but I thought it would be better to have a single
> solution (if possible) for all the cases here.

Right. As I mentioned further down in my reply, there is no single
solution that fits all. Smart platforms with HW counters, like x86,
would want to use those. IIUC, cpufreq has no idea what the true
delivered performance is anyway on those platforms.

> 
> > > I think this patch doesn't really need to go down the notifiers way.
> > > 
> > > We can do something like this in the implementation of
> > > topology_get_freq_scale():
> > > 
> > >         return (policy->cur << SCHED_CAPACITY_SHIFT) / max;
> > > 
> > > Though, we would be required to take care of policy structure in this
> > > case somehow.
> > 
> > This is exactly what this patch implements. Unfortunately we can't be
> > sure that there is a valid policy data structure where we can read the
> > information from.
> 
> Actually there is a way around that.
> 
> - Revert one of my patches:
>   commit f9f41e3ef99a ("cpufreq: Remove policy create/remove notifiers")
> 
> - Use those notifiers in init_cpu_capacity_callback() instead of
>   CPUFREQ_NOTIFY and set/reset a local policy pointer.
> 
> - And this pointer we can use safely/reliably in
>   topology_get_freq_scale(). We may need to use RCU read side
>   protection in topology_get_freq_scale() though, to make sure the
>   local policy pointer isn't getting updated simultaneously.
> 
> - If the policy pointer isn't set, then we can use
>   SCHED_CAPACITY_SCALE value instead.

IIUC, you are proposing to maintain an RCU protected pointer in the
topology driver to the policy data structure inside cpufreq and keep it
up to date through cpufreq notifiers. So instead of getting notified
when the frequency changes so we can recompute the scaling ratio, we
have to poll the value and recompute the ratio on each access.

If we are modifying cpufreq, why not just make cpufreq responsible for
providing the scaling factor? It seems easier, cleaner, and a lot
less fragile.
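
For example, something like this inside cpufreq (a sketch; these
functions are made up, not an existing interface), called from both the
normal and the fast-switch transition paths:

static DEFINE_PER_CPU(unsigned long, cpufreq_freq_scale) = SCHED_CAPACITY_SCALE;

void cpufreq_set_freq_scale(struct cpufreq_policy *policy,
                            unsigned int new_freq)
{
        unsigned long scale = ((u64)new_freq << SCHED_CAPACITY_SHIFT) /
                              policy->cpuinfo.max_freq;
        int cpu;

        for_each_cpu(cpu, policy->related_cpus)
                per_cpu(cpufreq_freq_scale, cpu) = scale;
}

unsigned long cpufreq_get_freq_scale(int cpu)
{
        return per_cpu(cpufreq_freq_scale, cpu);
}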

> 
> 
> > Isn't the policy protected by a lock as well?
> 
> There are locks, but you don't need any to read policy->cur.

Okay, but you need to rely on notifiers to know when it is valid.

> 
> > Another thing is that I don't think a transition notifier based solution
> > or any other solution based on the cur/max ratio is really the right way
> > to go for fast-switching platforms. If we can do very frequent frequency
> > switching it makes less sense to use the current ratio whenever we
> > update the PELT averages as the frequency might have changed multiple
> > times since the last update. So it would make more sense to have an
> > average ratio instead.
> 
> > If the platform has HW counters (e.g. APERF/MPERF) that can provide the
> > ratio then we should of course use those, if not, one solution could be
> > to let cpufreq track the average frequency for each cpu over a suitable
> > time window (around one sched period I think). It should be fairly low
> > overhead to maintain. In the topology driver, we would then choose
> > whether the scaling factor is provided by the cpufreq average frequency
> > ratio or the current transition notifier based approach based on the
> > capabilities of the platform.
> 
> Hmm, maybe.

You said you wanted a solution that works for fast-switch enabled
platforms ;-)

The cur/max ratio isn't sufficient for those. PeterZ has already
proposed to use APERF/MPERF for x86 to use the average frequency for
PELT updates. I think other fast-switch platforms would want something
similar, as it makes much more sense.
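
For reference, a sketch of the x86 APERF/MPERF variant (APERF counts at
the delivered frequency, MPERF at a fixed reference frequency, so the
delta ratio approximates the average frequency ratio since the last
sample; the sampling point and names are made up):

static DEFINE_PER_CPU(u64, prev_aperf);
static DEFINE_PER_CPU(u64, prev_mperf);

/* called periodically, e.g. from the scheduler tick */
static unsigned long aperf_mperf_freq_scale(void)
{
        u64 aperf, mperf, acnt, mcnt;

        rdmsrl(MSR_IA32_APERF, aperf);
        rdmsrl(MSR_IA32_MPERF, mperf);

        acnt = aperf - this_cpu_read(prev_aperf);
        mcnt = mperf - this_cpu_read(prev_mperf);

        this_cpu_write(prev_aperf, aperf);
        this_cpu_write(prev_mperf, mperf);

        if (!mcnt)
                return SCHED_CAPACITY_SCALE;

        /* can exceed SCHED_CAPACITY_SCALE when running above the
         * reference (e.g. turbo) frequency */
        return div64_u64(acnt << SCHED_CAPACITY_SHIFT, mcnt);
}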

Morten
diff mbox

Patch

diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 272831c89feb..f6f14670bdab 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -24,12 +24,18 @@ 
 
 static DEFINE_MUTEX(cpu_scale_mutex);
 static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
+static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
 
 unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu)
 {
 	return per_cpu(cpu_scale, cpu);
 }
 
+unsigned long topology_get_freq_scale(struct sched_domain *sd, int cpu)
+{
+	return per_cpu(freq_scale, cpu);
+}
+
 void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity)
 {
 	per_cpu(cpu_scale, cpu) = capacity;
@@ -164,6 +170,7 @@  static cpumask_var_t cpus_to_visit;
 static bool cap_parsing_done;
 static void parsing_done_workfn(struct work_struct *work);
 static DECLARE_WORK(parsing_done_work, parsing_done_workfn);
+static DEFINE_PER_CPU(unsigned long, max_freq);
 
 static int
 init_cpu_capacity_callback(struct notifier_block *nb,
@@ -185,6 +192,7 @@  init_cpu_capacity_callback(struct notifier_block *nb,
 			       cpus_to_visit,
 			       policy->related_cpus);
 		for_each_cpu(cpu, policy->related_cpus) {
+			per_cpu(max_freq, cpu) = policy->cpuinfo.max_freq;
 			if (cap_parsing_failed)
 				continue;
 			raw_capacity[cpu] = topology_get_cpu_scale(NULL, cpu) *
@@ -195,8 +203,10 @@  init_cpu_capacity_callback(struct notifier_block *nb,
 			if (!cap_parsing_failed) {
 				topology_normalize_cpu_scale();
 				kfree(raw_capacity);
+				pr_debug("cpu_capacity: parsing done\n");
+			} else {
+				pr_debug("cpu_capacity: max frequency parsing done\n");
 			}
-			pr_debug("cpu_capacity: parsing done\n");
 			cap_parsing_done = true;
 			schedule_work(&parsing_done_work);
 		}
@@ -208,8 +218,38 @@  static struct notifier_block init_cpu_capacity_notifier = {
 	.notifier_call = init_cpu_capacity_callback,
 };
 
+static void set_freq_scale(unsigned int cpu, unsigned long freq)
+{
+	unsigned long max = per_cpu(max_freq, cpu);
+
+	if (!max)
+		return;
+
+	per_cpu(freq_scale, cpu) = (freq << SCHED_CAPACITY_SHIFT) / max;
+}
+
+static int set_freq_scale_callback(struct notifier_block *nb,
+				   unsigned long val,
+				   void *data)
+{
+	struct cpufreq_freqs *freq = data;
+
+	switch (val) {
+	case CPUFREQ_PRECHANGE:
+		set_freq_scale(freq->cpu, freq->new);
+	}
+
+	return 0;
+}
+
+static struct notifier_block set_freq_scale_notifier = {
+	.notifier_call = set_freq_scale_callback,
+};
+
 static int __init register_cpufreq_notifier(void)
 {
+	int ret;
+
 	/*
 	 * on ACPI-based systems we need to use the default cpu capacity
 	 * until we have the necessary code to parse the cpu capacity, so
@@ -225,8 +265,14 @@  static int __init register_cpufreq_notifier(void)
 
 	cpumask_copy(cpus_to_visit, cpu_possible_mask);
 
-	return cpufreq_register_notifier(&init_cpu_capacity_notifier,
-					 CPUFREQ_POLICY_NOTIFIER);
+	ret = cpufreq_register_notifier(&init_cpu_capacity_notifier,
+					CPUFREQ_POLICY_NOTIFIER);
+
+	if (ret)
+		return ret;
+
+	return cpufreq_register_notifier(&set_freq_scale_notifier,
+					 CPUFREQ_TRANSITION_NOTIFIER);
 }
 core_initcall(register_cpufreq_notifier);
 
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index 9af3c174c03a..3fb4d8ccb179 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -12,6 +12,8 @@  int topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu);
 struct sched_domain;
 unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu);
 
+unsigned long topology_get_freq_scale(struct sched_domain *sd, int cpu);
+
 void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity);
 
 #endif /* _LINUX_ARCH_TOPOLOGY_H_ */