diff mbox

[v2,02/10] cpufreq: provide data for frequency-invariant load-tracking support

Message ID c92f1bd0-6611-8eec-51d7-f6d7754a5870@arm.com (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Dietmar Eggemann July 7, 2017, 4:01 p.m. UTC
On 06/07/17 11:40, Viresh Kumar wrote:
> On 06-07-17, 10:49, Dietmar Eggemann wrote:

[...]

>> In case arch_set_freq_scale() is not defined (and because of the
>> pr_debug() drivers/cpufreq/cpufreq.c is not compiled with -DDEBUG)
> 
> The line within () needs to be improved to convey a clear message.

Probably not needed anymore. See below.

[...]

>> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
>> index 9bf97a366029..a04c5886a5ce 100644
>> --- a/drivers/cpufreq/cpufreq.c
>> +++ b/drivers/cpufreq/cpufreq.c
>> @@ -347,6 +347,28 @@ static void __cpufreq_notify_transition(struct cpufreq_policy *policy,
>>  	}
>>  }
>>  
>> +/*********************************************************************
>> + *           FREQUENCY INVARIANT CPU CAPACITY SUPPORT                *
>> + *********************************************************************/
>> +
>> +#ifndef arch_set_freq_scale
>> +static void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
>> +				unsigned long max_freq)
>> +{}
>> +#endif
>> +
>> +static void cpufreq_set_freq_scale(struct cpufreq_policy *policy,
>> +				   struct cpufreq_freqs *freqs)
>> +{
>> +	unsigned long cur_freq = freqs ? freqs->new : policy->cur;
>> +	unsigned long max_freq = policy->cpuinfo.max_freq;
>> +
>> +	pr_debug("cpus %*pbl cur/cur max freq %lu/%lu kHz\n",
>> +		 cpumask_pr_args(policy->related_cpus), cur_freq, max_freq);
>> +
>> +	arch_set_freq_scale(policy->related_cpus, cur_freq, max_freq);
> 
> I am not sure why all these are required to be sent here and will come back to
> it later on after going through other patches.

See below.

>> +}
>> +
>>  /**
>>   * cpufreq_notify_transition - call notifier chain and adjust_jiffies
>>   * on frequency transition.
>> @@ -405,6 +427,8 @@ void cpufreq_freq_transition_begin(struct cpufreq_policy *policy,
>>  
>>  	spin_unlock(&policy->transition_lock);
>>  
>> +	cpufreq_set_freq_scale(policy, freqs);
>> +
> 
> Why do this before even changing the frequency ? We may fail while changing it.
> 
> IMHO, you should call this routine whenever we update policy->cur and that
> happens regularly in __cpufreq_notify_transition() and few other places..

See below.
 
>>  	cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE);
>>  }
>>  EXPORT_SYMBOL_GPL(cpufreq_freq_transition_begin);
>> @@ -2203,6 +2227,8 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
>>  	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
>>  			CPUFREQ_NOTIFY, new_policy);
>>  
>> +	cpufreq_set_freq_scale(new_policy, NULL);
> 
> Why added it here ? To get it initialized ? If yes, then we should do that in
> cpufreq_online() where we first initialize policy->cur.

I agree. This can go away. Initialization is not really needed here. We initialize
the scale values to SCHED_CAPACITY_SCALE at boot-time. 

> Apart from this, you also need to update this in the schedutil governor (if you
> haven't done that in this series later) as that also updates policy->cur in the
> fast path.

So what about I call arch_set_freq_scale() in __cpufreq_notify_transition() in the
CPUFREQ_POSTCHANGE case for slow-switching and in cpufreq_driver_fast_switch() for
fast-switching?

Like this: (only slow-path slightly tested):

Comments

Rafael J. Wysocki July 7, 2017, 4:18 p.m. UTC | #1
On Fri, Jul 7, 2017 at 6:01 PM, Dietmar Eggemann
<dietmar.eggemann@arm.com> wrote:
> On 06/07/17 11:40, Viresh Kumar wrote:
>> On 06-07-17, 10:49, Dietmar Eggemann wrote:
>
> [...]
>
>>> In case arch_set_freq_scale() is not defined (and because of the
>>> pr_debug() drivers/cpufreq/cpufreq.c is not compiled with -DDEBUG)
>>
>> The line within () needs to be improved to convey a clear message.
>
> Probably not needed anymore. See below.
>
> [...]
>
>>> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
>>> index 9bf97a366029..a04c5886a5ce 100644
>>> --- a/drivers/cpufreq/cpufreq.c
>>> +++ b/drivers/cpufreq/cpufreq.c
>>> @@ -347,6 +347,28 @@ static void __cpufreq_notify_transition(struct cpufreq_policy *policy,
>>>      }
>>>  }
>>>
>>> +/*********************************************************************
>>> + *           FREQUENCY INVARIANT CPU CAPACITY SUPPORT                *
>>> + *********************************************************************/
>>> +
>>> +#ifndef arch_set_freq_scale
>>> +static void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
>>> +                            unsigned long max_freq)
>>> +{}
>>> +#endif
>>> +
>>> +static void cpufreq_set_freq_scale(struct cpufreq_policy *policy,
>>> +                               struct cpufreq_freqs *freqs)
>>> +{
>>> +    unsigned long cur_freq = freqs ? freqs->new : policy->cur;
>>> +    unsigned long max_freq = policy->cpuinfo.max_freq;
>>> +
>>> +    pr_debug("cpus %*pbl cur/cur max freq %lu/%lu kHz\n",
>>> +             cpumask_pr_args(policy->related_cpus), cur_freq, max_freq);
>>> +
>>> +    arch_set_freq_scale(policy->related_cpus, cur_freq, max_freq);
>>
>> I am not sure why all these are required to be sent here and will come back to
>> it later on after going through other patches.
>
> See below.
>
>>> +}
>>> +
>>>  /**
>>>   * cpufreq_notify_transition - call notifier chain and adjust_jiffies
>>>   * on frequency transition.
>>> @@ -405,6 +427,8 @@ void cpufreq_freq_transition_begin(struct cpufreq_policy *policy,
>>>
>>>      spin_unlock(&policy->transition_lock);
>>>
>>> +    cpufreq_set_freq_scale(policy, freqs);
>>> +
>>
>> Why do this before even changing the frequency ? We may fail while changing it.
>>
>> IMHO, you should call this routine whenever we update policy->cur and that
>> happens regularly in __cpufreq_notify_transition() and few other places..
>
> See below.
>
>>>      cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE);
>>>  }
>>>  EXPORT_SYMBOL_GPL(cpufreq_freq_transition_begin);
>>> @@ -2203,6 +2227,8 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
>>>      blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
>>>                      CPUFREQ_NOTIFY, new_policy);
>>>
>>> +    cpufreq_set_freq_scale(new_policy, NULL);
>>
>> Why added it here ? To get it initialized ? If yes, then we should do that in
>> cpufreq_online() where we first initialize policy->cur.
>
> I agree. This can go away. Initialization is not really needed here. We initialize
> the scale values to SCHED_CAPACITY_SCALE at boot-time.
>
>> Apart from this, you also need to update this in the schedutil governor (if you
>> haven't done that in this series later) as that also updates policy->cur in the
>> fast path.
>
> So what about I call arch_set_freq_scale() in __cpufreq_notify_transition() in the
> CPUFREQ_POSTCHANGE case for slow-switching and in cpufreq_driver_fast_switch() for
> fast-switching?

Why don't you do this in drivers instead of in the core?

Ultimately, the driver knows what frequency it has requested, so why
can't it call arch_set_freq_scale()?

Thanks,
Rafael
Dietmar Eggemann July 7, 2017, 5:06 p.m. UTC | #2
On 07/07/17 17:18, Rafael J. Wysocki wrote:
> On Fri, Jul 7, 2017 at 6:01 PM, Dietmar Eggemann
> <dietmar.eggemann@arm.com> wrote:
>> On 06/07/17 11:40, Viresh Kumar wrote:
>>> On 06-07-17, 10:49, Dietmar Eggemann wrote:

[...]

>> So what about I call arch_set_freq_scale() in __cpufreq_notify_transition() in the
>> CPUFREQ_POSTCHANGE case for slow-switching and in cpufreq_driver_fast_switch() for
>> fast-switching?
> 
> Why don't you do this in drivers instead of in the core?
> 
> Ultimately, the driver knows what frequency it has requested, so why
> can't it call arch_set_freq_scale()?

That's correct but for arm/arm64 we have a lot of different cpufreq
drivers to deal with. And doing this call to arch_set_freq_scale() once
in the cpufreq core will cover them all.

[...]
Rafael J. Wysocki July 8, 2017, 12:09 p.m. UTC | #3
On Friday, July 07, 2017 06:06:30 PM Dietmar Eggemann wrote:
> On 07/07/17 17:18, Rafael J. Wysocki wrote:
> > On Fri, Jul 7, 2017 at 6:01 PM, Dietmar Eggemann
> > <dietmar.eggemann@arm.com> wrote:
> >> On 06/07/17 11:40, Viresh Kumar wrote:
> >>> On 06-07-17, 10:49, Dietmar Eggemann wrote:
> 
> [...]
> 
> >> So what about I call arch_set_freq_scale() in __cpufreq_notify_transition() in the
> >> CPUFREQ_POSTCHANGE case for slow-switching and in cpufreq_driver_fast_switch() for
> >> fast-switching?
> > 
> > Why don't you do this in drivers instead of in the core?
> > 
> > Ultimately, the driver knows what frequency it has requested, so why
> > can't it call arch_set_freq_scale()?
> 
> That's correct but for arm/arm64 we have a lot of different cpufreq
> drivers to deal with. And doing this call to arch_set_freq_scale() once
> in the cpufreq core will cover them all.
> 
> [...]

I'm sort of wondering how many is "a lot" really.  For instance, do you really
want all of the existing ARM platforms to use the new stuff even though
it may regress things there in principle?

Anyway, if everyone agrees that doing it in the core is the way to go (Peter?),
why don't you introduce a __weak function for setting policy->cur and
override it from your arch so as to call arch_set_freq_scale() from there?

Thanks,
Rafael
Viresh Kumar July 10, 2017, 6:40 a.m. UTC | #4
On 07-07-17, 17:01, Dietmar Eggemann wrote:
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 9bf97a366029..77c4d5e7a598 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -301,6 +301,12 @@ static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci)
>  #endif
>  }
>  
> +#ifndef arch_set_freq_scale
> +static void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
> +                               unsigned long max_freq)
> +{}
> +#endif
> +
>  static void __cpufreq_notify_transition(struct cpufreq_policy *policy,
>                 struct cpufreq_freqs *freqs, unsigned int state)
>  {
> @@ -343,6 +349,8 @@ static void __cpufreq_notify_transition(struct cpufreq_policy *policy,
>                                 CPUFREQ_POSTCHANGE, freqs);
>                 if (likely(policy) && likely(policy->cpu == freqs->cpu))
>                         policy->cur = freqs->new;
> +               arch_set_freq_scale(policy->related_cpus, policy->cur,
> +                                   policy->cpuinfo.max_freq);

This function gets called for-each-cpu-in-policy, so you don't need the first
argument. And the topology code already has max_freq, so not sure if you need
the last parameter as well, specially for the ARM solution.
Viresh Kumar July 10, 2017, 6:54 a.m. UTC | #5
On 08-07-17, 14:09, Rafael J. Wysocki wrote:
> I'm sort of wondering how many is "a lot" really.  For instance, do you really
> want all of the existing ARM platforms to use the new stuff even though
> it may regress things there in principle?

That's a valid question and we must (maybe we already have) have a policy for
such changes. I thought that such changes (which are so closely bound to the
scheduler) must be at least done at the architecture level and not really at
platform level. And so doing it widely (like done in this patch) may be the right
thing to do.

> Anyway, if everyone agrees that doing it in the core is the way to go (Peter?),
> why don't you introduce a __weak function for setting policy->cur and
> override it from your arch so as to call arch_set_freq_scale() from there?

I agree. I wanted to suggest that earlier but somehow forgot to mention this.
Peter Zijlstra July 10, 2017, 9:30 a.m. UTC | #6
On Sat, Jul 08, 2017 at 02:09:37PM +0200, Rafael J. Wysocki wrote:
> Anyway, if everyone agrees that doing it in the core is the way to go (Peter?),
> why don't you introduce a __weak function for setting policy->cur and
> override it from your arch so as to call arch_set_freq_scale() from there?
> 

So I'm terminally backlogged and my recent break didn't help any with
that.

I'm at a total loss as to what is proposed here and why we need it. I
tried reading both the Changelog and patch but came up empty.
Viresh Kumar July 10, 2017, 9:42 a.m. UTC | #7
On 10-07-17, 11:30, Peter Zijlstra wrote:
> On Sat, Jul 08, 2017 at 02:09:37PM +0200, Rafael J. Wysocki wrote:
> > Anyway, if everyone agrees that doing it in the core is the way to go (Peter?),
> > why don't you introduce a __weak function for setting policy->cur and
> > override it from your arch so as to call arch_set_freq_scale() from there?
> > 
> 
> So I'm terminally backlogged and my recent break didn't help any with
> that.
> 
> I'm at a total loss as to what is proposed here and why we need it. I
> tried reading both the Changelog and patch but came up empty.

Dietmar is proposing the implementation of arch_set_freq_scale() for ARM (32/64)
platforms here with following equation in drivers/base/arch_topology.c:

scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq

The only variable part here is "cur_freq" and he is looking for sane ways to get
that value in the arch_topology.c file, so he can use that in the above
equation. He tried to use cpufreq transition notifiers earlier but they block us
from using fast switching.

What he is proposing now is a function:

void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
                         unsigned long max_freq);

which has to be called by someone after the frequency of the CPU is changed.

Dietmar proposed that this be called by cpufreq core and Rafael was wondering if
the cpufreq drivers should call it. Dietmar's argument is that it will be used
for the entire ARM architecture this way and wouldn't lead to redundant code
across drivers.

Hope I didn't confuse you more with this :)
Dietmar Eggemann July 10, 2017, 10:31 a.m. UTC | #8
On 10/07/17 10:42, Viresh Kumar wrote:
> On 10-07-17, 11:30, Peter Zijlstra wrote:
>> On Sat, Jul 08, 2017 at 02:09:37PM +0200, Rafael J. Wysocki wrote:
>>> Anyway, if everyone agrees that doing it in the core is the way to go (Peter?),
>>> why don't you introduce a __weak function for setting policy->cur and
>>> override it from your arch so as to call arch_set_freq_scale() from there?
>>>
>>
>> So I'm terminally backlogged and my recent break didn't help any with
>> that.
>>
>> I'm at a total loss as to what is proposed here and why we need it. I
>> tried reading both the Changelog and patch but came up empty.
> 
> Dietmar is proposing the implementation of arch_set_freq_scale() for ARM (32/64)
> platforms here with following equation in drivers/base/arch_topology.c:
> 
> scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq
> 
> The only variable part here is "cur_freq" and he is looking for sane ways to get
> that value in the arch_topology.c file, so he can use that in the above
> equation. He tried to use cpufreq transition notifiers earlier but they block us
> from using fast switching.
> 
> What he is proposing now is a function:
> 
> void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
>                          unsigned long max_freq);
> 
> which has to be called by someone after the frequency of the CPU is changed.
> 
> Dietmar proposed that this be called by cpufreq core and Rafael was wondering if
> the cpufreq drivers should call it. Dietmar's argument is that it will be used
> for the entire ARM architecture this way and wouldn't lead to redundant core
> across drivers.
> 
> Hope I didn't confuse you more with this :)
>

Perfect summary, thanks Viresh!

This is required for architectures (like arm/arm64) which do not have
any other way to know about the current CPU frequency.

X86 can do the frequency invariance support based on APERF/MPERF already
today so it does not need the support from cpufreq.
Dietmar Eggemann July 10, 2017, 12:02 p.m. UTC | #9
On 08/07/17 13:09, Rafael J. Wysocki wrote:
> On Friday, July 07, 2017 06:06:30 PM Dietmar Eggemann wrote:
>> On 07/07/17 17:18, Rafael J. Wysocki wrote:
>>> On Fri, Jul 7, 2017 at 6:01 PM, Dietmar Eggemann
>>> <dietmar.eggemann@arm.com> wrote:
>>>> On 06/07/17 11:40, Viresh Kumar wrote:
>>>>> On 06-07-17, 10:49, Dietmar Eggemann wrote:
>>
>> [...]
>>
>>>> So what about I call arch_set_freq_scale() in __cpufreq_notify_transition() in the
>>>> CPUFREQ_POSTCHANGE case for slow-switching and in cpufreq_driver_fast_switch() for
>>>> fast-switching?
>>>
>>> Why don't you do this in drivers instead of in the core?
>>>
>>> Ultimately, the driver knows what frequency it has requested, so why
>>> can't it call arch_set_freq_scale()?
>>
>> That's correct but for arm/arm64 we have a lot of different cpufreq
>> drivers to deal with. And doing this call to arch_set_freq_scale() once
>> in the cpufreq core will cover them all.
>>
>> [...]
> 
> I'm sort of wondering how many is "a lot" really.  For instance, do you really
> want all of the existing ARM platforms to use the new stuff even though
> it may regress things there in principle?

Yeah, in mainline we probably only care about a couple of them, I know
about cpufreq-dt.c, mt8173-cpufreq.c and arm_big_little.c.
But a lot of development in arm64 still happens outside mainline and we
would have to inform people to provision their cpufreq drivers with this
functionality.

With a solution in cpufreq.c we could just implement the functionality
in the arch which we then connect to the call in cpufreq.c.

> Anyway, if everyone agrees that doing it in the core is the way to go (Peter?),
> why don't you introduce a __weak function for setting policy->cur and
> override it from your arch so as to call arch_set_freq_scale() from there?

Yes, I will change this. The #define approach is not really necessary
here since we're not in the scheduler hot-path and inlining is not
really required here.

Thanks,

-- Dietmar

[...]
Rafael J. Wysocki July 10, 2017, 12:46 p.m. UTC | #10
On Monday, July 10, 2017 12:24:43 PM Viresh Kumar wrote:
> On 08-07-17, 14:09, Rafael J. Wysocki wrote:
> > I'm sort of wondering how many is "a lot" really.  For instance, do you really
> > want all of the existing ARM platforms to use the new stuff even though
> > it may regress things there in principle?
> 
> That's a valid question and we must (maybe we already have) have a policy for
> such changes.

I don't think it is a matter of policy, as it tends to vary from case to case.

What really matters is the reason to make the changes in this particular case.

If the reason if to help new systems to work better in the first place and the
old ones are affected just by the way, it may be better to avoid affecting them.

On the other hand, if the reason is to improve things for all the new and old
systems altogether, then sure let's do it this way.

> I thought that such changes (which are so closely bound to the
> scheduler) must be at least done at the architecture level and not really at
> platform level. And so doing it widely (like done in this patch) maybe the right
> thing to do.

This particular change is about a new feature, so making it in the core is OK
in two cases IMO: (a) when you actively want everyone to be affected by it and
(b) when the effect of it on the old systems should not be noticeable.

Thanks,
Rafael
Viresh Kumar July 11, 2017, 6:01 a.m. UTC | #11
On 10-07-17, 13:02, Dietmar Eggemann wrote:
> Yes, I will change this. The #define approach is not really necessary
> here since we're not in the scheduler hot-path and inlining is not
> really required here.

It would be part of scheduler hot-path for the fast-switching case, isn't it ?
(I am not arguing against using weak functions, just wanted to correct above
statement).
Viresh Kumar July 11, 2017, 6:39 a.m. UTC | #12
On 10-07-17, 14:46, Rafael J. Wysocki wrote:
> This particular change is about a new feature, so making it in the core is OK
> in two cases IMO: (a) when you actively want everyone to be affected by it and

IMO this change should be done for the whole ARM architecture. And if some
regression happens due to this, then we come back and solve it.

> (b) when the effect of it on the old systems should not be noticeable.

I am not sure about the effects of this on performance really.

@Dietmar: Any inputs for that ?
Rafael J. Wysocki July 11, 2017, 2:59 p.m. UTC | #13
On Tuesday, July 11, 2017 04:06:01 PM Dietmar Eggemann wrote:
> On 11/07/17 07:01, Viresh Kumar wrote:
> > On 10-07-17, 13:02, Dietmar Eggemann wrote:
> >> Yes, I will change this. The #define approach is not really necessary
> >> here since we're not in the scheduler hot-path and inlining is not
> >> really required here.
> > 
> > It would be part of scheduler hot-path for the fast-switching case, isn't it ?
> > (I am not arguing against using weak functions, just wanted to correct above
> > statement).
> 
> Yes you're right here.
> 
> But in the meantime we're convinced that cpufreq_driver_fast_switch() is
> not the right place to call arch_set_freq_scale() since for (future)
> arm/arm64 fast-switch driver, the return value of
> cpufreq_driver->fast_switch() does not give us the information that the
> frequency value did actually change.
> 
> So we probably have to do this soemwhere in the cpufreq driver(s) to
> support fast-switching until we have aperf/mperf like counters.

If that's the case, I'd say call arch_set_freq_scale() from drivers in all
cases or it will get *really* confusing.

Thanks,
Rafael
Dietmar Eggemann July 11, 2017, 3:06 p.m. UTC | #14
On 11/07/17 07:01, Viresh Kumar wrote:
> On 10-07-17, 13:02, Dietmar Eggemann wrote:
>> Yes, I will change this. The #define approach is not really necessary
>> here since we're not in the scheduler hot-path and inlining is not
>> really required here.
> 
> It would be part of scheduler hot-path for the fast-switching case, isn't it ?
> (I am not arguing against using weak functions, just wanted to correct above
> statement).

Yes you're right here.

But in the meantime we're convinced that cpufreq_driver_fast_switch() is
not the right place to call arch_set_freq_scale() since for (future)
arm/arm64 fast-switch driver, the return value of
cpufreq_driver->fast_switch() does not give us the information that the
frequency value did actually change.

So we probably have to do this somewhere in the cpufreq driver(s) to
support fast-switching until we have aperf/mperf like counters.
Dietmar Eggemann July 11, 2017, 3:12 p.m. UTC | #15
On 11/07/17 15:59, Rafael J. Wysocki wrote:
> On Tuesday, July 11, 2017 04:06:01 PM Dietmar Eggemann wrote:
>> On 11/07/17 07:01, Viresh Kumar wrote:
>>> On 10-07-17, 13:02, Dietmar Eggemann wrote:
>>>> Yes, I will change this. The #define approach is not really necessary
>>>> here since we're not in the scheduler hot-path and inlining is not
>>>> really required here.
>>>
>>> It would be part of scheduler hot-path for the fast-switching case, isn't it ?
>>> (I am not arguing against using weak functions, just wanted to correct above
>>> statement).
>>
>> Yes you're right here.
>>
>> But in the meantime we're convinced that cpufreq_driver_fast_switch() is
>> not the right place to call arch_set_freq_scale() since for (future)
>> arm/arm64 fast-switch driver, the return value of
>> cpufreq_driver->fast_switch() does not give us the information that the
>> frequency value did actually change.
>>
>> So we probably have to do this soemwhere in the cpufreq driver(s) to
>> support fast-switching until we have aperf/mperf like counters.
> 
> If that's the case, I'd say call arch_set_freq_scale() from drivers in all
> cases or it will get *really* confusing.

Agreed, we should do it for slow-switching drivers from within the
driver as well in this case.
Dietmar Eggemann July 11, 2017, 3:21 p.m. UTC | #16
On 11/07/17 07:39, Viresh Kumar wrote:
> On 10-07-17, 14:46, Rafael J. Wysocki wrote:
>> This particular change is about a new feature, so making it in the core is OK
>> in two cases IMO: (a) when you actively want everyone to be affected by it and
> 
> IMO this change should be done for the whole ARM architecture. And if some
> regression happens due to this, then we come back and solve it.
> 
>> (b) when the effect of it on the old systems should not be noticeable.
> 
> I am not sure about the effects of this on performance really.
> 
> @Dietmar: Any inputs for that ?

Like I said in the other email, since for (future)
arm/arm64 fast-switch driver, the return value of
cpufreq_driver->fast_switch() does not give us the information that the
frequency value did actually change, we have to implement
arch_set_freq_scale() in the driver.
This means that we probably only implement this in the subset of drivers
which will be used in platforms on which we want to have
frequency-invariant load-tracking.

A future aperf/mperf like counter FIE solution can give us arch-wide
support when those counters are available.
Viresh Kumar July 12, 2017, 4:09 a.m. UTC | #17
On 11-07-17, 16:06, Dietmar Eggemann wrote:
> But in the meantime we're convinced that cpufreq_driver_fast_switch() is
> not the right place to call arch_set_freq_scale() since for (future)
> arm/arm64 fast-switch driver, the return value of
> cpufreq_driver->fast_switch() does not give us the information that the
> frequency value did actually change.

Yeah, I saw your discussion with Peter on #linux-rt IRC and TBH I wasn't aware
that we are going to do fast switching that way. Just trying to get
understanding of that idea a bit..

So we will do fast switching from scheduler's point of view, i.e. we wouldn't
schedule a kthread to change the frequency. But the real hardware still can't do
that without sleeping, like if we have I2C somewhere in between. AFAIU, we will
still have some kind of *software* bottom half to do that work, isn't it? And it
wouldn't be that we have pushed some instructions to the hardware, which it can
do a bit later.

For example, the regulator may be accessed via I2C and we need to program that
before changing the clock. So, it will be done by some software code only.

And now I am wondering on why that would be any better than the kthread in
schedutil. Sorry, I haven't understood the idea completely yet :(
Peter Zijlstra July 12, 2017, 8:31 a.m. UTC | #18
On Wed, Jul 12, 2017 at 09:39:17AM +0530, Viresh Kumar wrote:
> Yeah, I saw your discussion with Peter on #linux-rt IRC and TBH I wasn't aware
> that we are going to do fast switching that way. Just trying to get
> understanding of that idea a bit..
> 
> So we will do fast switching from scheduler's point of view, i.e. we wouldn't
> schedule a kthread to change the frequency. But the real hardware still can't do
> that without sleeping, like if we have I2C somewhere in between. AFAIU, we will
> still have some kind of *software* bottom half to do that work, isn't it? And it
> wouldn't be that we have pushed some instructions to the hardware, which it can
> do a bit later.
> 
> For example, the regulator may be accessed via I2C and we need to program that
> before changing the clock. So, it will be done by some software code only.
> 
> And now I am wondering on why that would be any better than the kthread in
> schedutil. Sorry, I haven't understood the idea completely yet :(

So the problem with the thread is two-fold; on the one hand we like the
scheduler to directly set frequency, but then we need to schedule a task
to change the frequency, which will change the frequency and around we
go.

On the other hand, there's very nasty issues with PI. This thread would
have very high priority (otherwise the SCHED_DEADLINE stuff won't work)
but that then means this thread needs to boost the owner of the i2c
mutex. And that then creates a massive bandwidth accounting hole.


The advantage of using an interrupt driven state machine is that all
those issues go away.

But yes, whichever way around you turn things, its crap. But given the
hardware its the best we can do.
Viresh Kumar July 12, 2017, 9:27 a.m. UTC | #19
On 12-07-17, 10:31, Peter Zijlstra wrote:
> So the problem with the thread is two-fold; one the one hand we like the
> scheduler to directly set frequency, but then we need to schedule a task
> to change the frequency, which will change the frequency and around we
> go.
> 
> On the other hand, there's very nasty issues with PI. This thread would
> have very high priority (otherwise the SCHED_DEADLINE stuff won't work)
> but that then means this thread needs to boost the owner of the i2c
> mutex. And that then creates a massive bandwidth accounting hole.
> 
> 
> The advantage of using an interrupt driven state machine is that all
> those issues go away.
> 
> But yes, whichever way around you turn things, its crap. But given the
> hardware its the best we can do.

Thanks for the explanation Peter.

IIUC, it will take more time to change the frequency eventually with
the interrupt-driven state machine as there may be multiple bottom
halves involved here, for supply, clk, etc, which would run at normal
priorities now. And those were boosted currently due to the high
priority sugov thread. And we are fine with that (from performance
point of view) ?

Coming back to where we started from (where should we call
arch_set_freq_scale() from ?).

I think we would still need some kind of synchronization between
cpufreq core and the cpufreq drivers to make sure we don't start
another freq change before the previous one is complete. Otherwise
the cpufreq drivers would be required to have similar support with
proper locking in place.

And if the core is going to get notified about successful freq changes
(which it should IMHO), then it may still be better to call
arch_set_freq_scale() from the core itself and not from individual
drivers.
Peter Zijlstra July 12, 2017, 11:14 a.m. UTC | #20
On Wed, Jul 12, 2017 at 02:57:55PM +0530, Viresh Kumar wrote:
> On 12-07-17, 10:31, Peter Zijlstra wrote:
> > So the problem with the thread is two-fold; one the one hand we like the
> > scheduler to directly set frequency, but then we need to schedule a task
> > to change the frequency, which will change the frequency and around we
> > go.
> > 
> > On the other hand, there's very nasty issues with PI. This thread would
> > have very high priority (otherwise the SCHED_DEADLINE stuff won't work)
> > but that then means this thread needs to boost the owner of the i2c
> > mutex. And that then creates a massive bandwidth accounting hole.
> > 
> > 
> > The advantage of using an interrupt driven state machine is that all
> > those issues go away.
> > 
> > But yes, whichever way around you turn things, its crap. But given the
> > hardware its the best we can do.
> 
> Thanks for the explanation Peter.
> 
> IIUC, it will take more time to change the frequency eventually with
> the interrupt-driven state machine as there may be multiple bottom
> halves involved here, for supply, clk, etc, which would run at normal
> priorities now. And those were boosted currently due to the high
> priority sugov thread. And we are fine with that (from performance
> point of view) ?

I'm not sure what you mean; bottom halves as in softirq? From what I can
tell an i2c bus does clk_prepare_enable() on registration and from that
point on clk_enable() is usable from atomic contexts. But afaict clk
stuff doesn't do interrupts at all.

(with a note that I absolutely hate the clk locking)

I think the interrupt driven thing can actually be faster than the
'regular' task waiting on the mutex. The regulator message can be
locklessly queued (it only ever makes sense to have 1 such message
pending, any later one will invalidate a prior one).

Then the i2c interrupt can detect the availability of this pending
message and splice it into the transfer queue at an opportune moment.

(of course, the current i2c bits don't support any of that)

> Coming back to where we started from (where should we call
> arch_set_freq_scale() from ?).

The drivers.. the core cpufreq doesn't know when (if any) transition is
completed.

> I think we would still need some kind of synchronization between
> cpufreq core and the cpufreq drivers to make sure we don't start
> another freq change before the previous one is complete. Otherwise
> the cpufreq drivers would be required to have similar support with
> proper locking in place.

Not sure what you mean; also not sure why. On x86 we never know, cannot
know. So why would this stuff be any different.

> And if the core is going to get notified about successful freq changes
> (which it should IMHO), then it may still be better to call
> arch_set_freq_scale() from the core itself and not from individual
> drivers.

I would not involve the core. All we want from the core is a unified
interface towards requesting DVFS changes. Everything that happens after
is not its business.
Rafael J. Wysocki July 12, 2017, 11:13 p.m. UTC | #21
On Wednesday, July 12, 2017 01:14:26 PM Peter Zijlstra wrote:
> On Wed, Jul 12, 2017 at 02:57:55PM +0530, Viresh Kumar wrote:
> > On 12-07-17, 10:31, Peter Zijlstra wrote:
> > > So the problem with the thread is two-fold; on the one hand we like the
> > > scheduler to directly set frequency, but then we need to schedule a task
> > > to change the frequency, which will change the frequency and around we
> > > go.
> > > 
> > > On the other hand, there's very nasty issues with PI. This thread would
> > > have very high priority (otherwise the SCHED_DEADLINE stuff won't work)
> > > but that then means this thread needs to boost the owner of the i2c
> > > mutex. And that then creates a massive bandwidth accounting hole.
> > > 
> > > 
> > > The advantage of using an interrupt driven state machine is that all
> > > those issues go away.
> > > 
> > > But yes, whichever way around you turn things, its crap. But given the
> > > hardware its the best we can do.
> > 
> > Thanks for the explanation Peter.
> > 
> > IIUC, it will take more time to change the frequency eventually with
> > the interrupt-driven state machine as there may be multiple bottom
> > halves involved here, for supply, clk, etc, which would run at normal
> > priorities now. And those were boosted currently due to the high
> > priority sugov thread. And we are fine with that (from performance
> > point of view) ?
> 
> I'm not sure what you mean; bottom halves as in softirq? From what I can
> tell an i2c bus does clk_prepare_enable() on registration and from that
> point on clk_enable() is usable from atomic contexts. But afaict clk
> stuff doesn't do interrupts at all.
> 
> (with a note that I absolutely hate the clk locking)
> 
> I think the interrupt driven thing can actually be faster than the
> 'regular' task waiting on the mutex. The regulator message can be
> locklessly queued (it only ever makes sense to have 1 such message
> pending, any later one will invalidate a prior one).
> 
> Then the i2c interrupt can detect the availability of this pending
> message and splice it into the transfer queue at an opportune moment.
> 
> (of course, the current i2c bits don't support any of that)

I *guess* the concern is that in the new model there is no control over the
time of requesting the frequency change and when the change actually
happens.

IIUC the whole point of making the governor thread an RT or DL one is to
ensure that the change will happen as soon as technically possible, because if
it doesn't happen in a specific time frame, it can very well be skipped entirely.

> > Coming back to where we started from (where should we call
> > arch_set_freq_scale() from ?).
> 
> The drivers.. the core cpufreq doesn't know when (if any) transition is
> completed.

Right.
 
> > I think we would still need some kind of synchronization between
> > cpufreq core and the cpufreq drivers to make sure we don't start
> > another freq change before the previous one is complete. Otherwise
> > the cpufreq drivers would be required to have similar support with
> > proper locking in place.
> 
> Not sure what you mean; also not sure why. On x86 we never know, cannot
> know. So why would this stuff be any different.

We seem to be in violent agreement on this. :-)

Thanks,
Rafael
Peter Zijlstra July 13, 2017, 7:49 a.m. UTC | #22
On Thu, Jul 13, 2017 at 01:13:43AM +0200, Rafael J. Wysocki wrote:

> I *guess* the concern is that in the new model there is no control over the
> time of requesting the frequency change and when the change actually
> happens.

There isn't in any case. If some brilliant hardware designer shares the
regulator's I2C bus with another device that requires 'big' transfers
(say a DSP) we're hosed whichever way around.

> IIUC the whole point of making the governor thread an RT or DL one is to
> ensure that the change will happen as soon as technically possible, because if
> it doesn't happen in a specific time frame, it can very well be skipped entirely.

So I think the interrupt driven thing can actually do better than the
rt-mutex based one, and I cannot think of a case where it would do
worse.

The I2C completion interrupt can splice in the pending request at the
earliest opportunity, even though the mutex owner might still think it
owns things.

And if that is not possible, it'll still be as fast as waiting for the
mutex to be released (or ever so slightly faster, because it will not
quiesce the bus like it would otherwise at the end of a transfer).
Viresh Kumar July 13, 2017, 8:48 a.m. UTC | #23
On 13-07-17, 01:13, Rafael J. Wysocki wrote:
> On Wednesday, July 12, 2017 01:14:26 PM Peter Zijlstra wrote:
> > On Wed, Jul 12, 2017 at 02:57:55PM +0530, Viresh Kumar wrote:

> > > IIUC, it will take more time to change the frequency eventually with
> > > the interrupt-driven state machine as there may be multiple bottom
> > > halves involved here, for supply, clk, etc, which would run at normal
> > > priorities now. And those were boosted currently due to the high
> > > priority sugov thread. And we are fine with that (from performance
> > > point of view) ?
> > 
> > I'm not sure what you mean; bottom halves as in softirq?

Workqueues or normal threads actually. Maybe I am completely wrong,
but this is how I believe things are going to be:

Configuration: Both regulator and clk registers accessible over I2C
bus.

Scheduler calls schedutil, which eventually calls cpufreq driver (w/o
kthread). The cpufreq backend driver will queue a async request with
callback (with regulator core) to update regulator's constraints
(which can sleep as we need to talk over I2C). The callback will be
called once regulator is programmed. And we return right after
submitting the request with regulator core.

Now, I2C transfer will finish (i.e. regulator programmed) and the
driver specific callback will get called. It will try to change the
frequency now and wait (sleep) until it finishes. I hope the regulator
core wouldn't call the driver callback from interrupt context but some
sort of bottom half, maybe workqueue (That's what I was referring to
earlier).

And finally the clk is programmed and the state machine finished.

> > From what I can
> > tell an i2c bus does clk_prepare_enable() on registration and from that
> > point on clk_enable() is usable from atomic contexts.

That assumes that we can access registers of the I2C controller
atomically without sleeping. Not sure how many ARM platforms have I2C
controller connected over a slow bus though.

> > But afaict clk
> > stuff doesn't do interrupts at all.

The clk stuff may not need it if the clock controllers registers can
be accessed atomically. But if (as in my example) the clk controller
is also over the I2C bus, then the interrupt will be provided from I2C
bus and clk routines would return only after transfer is done.

> > (with a note that I absolutely hate the clk locking)

Yeah, that's a beast :)

> > I think the interrupt driven thing can actually be faster than the
> > 'regular' task waiting on the mutex. The regulator message can be
> > locklessly queued (it only ever makes sense to have 1 such message
> > pending, any later one will invalidate a prior one).
> > 
> > Then the i2c interrupt can detect the availability of this pending
> > message and splice it into the transfer queue at an opportune moment.
> > 
> > (of course, the current i2c bits don't support any of that)
> 
> I *guess* the concern is that in the new model there is no control over the
> time of requesting the frequency change and when the change actually
> happens.

Right.

> IIUC the whole point of making the governor thread an RT or DL one is to
> ensure that the change will happen as soon as technically possible, because if
> it doesn't happen in a specific time frame, it can very well be skipped entirely.

Yes, or actually we can have more trouble (below..).

> > > Coming back to where we started from (where should we call
> > > arch_set_freq_scale() from ?).
> > 
> > The drivers.. the core cpufreq doesn't know when (if any) transition is
> > completed.
> 
> Right.
>  
> > > I think we would still need some kind of synchronization between
> > > cpufreq core and the cpufreq drivers to make sure we don't start
> > > another freq change before the previous one is complete. Otherwise
> > > the cpufreq drivers would be required to have similar support with
> > > proper locking in place.
> > 
> > Not sure what you mean; also not sure why. On x86 we never know, cannot
> > know. So why would this stuff be any different.

So as per the above example, the software on ARM requires to program
multiple hardware components (clk, regulator, power-domain, etc) for
changing CPUs frequency. And this has to be non-racy, otherwise we
might have programmed regulator, domain etc, but right before we
change the frequency another request may land which may try to program
the regulator again. We would be screwed badly here.

Its not a problem on X86 because (I believe) most of this is done by
the hardware for you. And you guys don't have to worry for that.

We already take care of these synchronization issues in slow switching
path (cpufreq_freq_transition_begin()), where we guarantee that a new
freq change request doesn't start before the previous one is over.
Peter Zijlstra July 13, 2017, 11:15 a.m. UTC | #24
On Thu, Jul 13, 2017 at 02:18:04PM +0530, Viresh Kumar wrote:
> Workqueues or normal threads actually. Maybe I am completely wrong,
> but this is how I believe things are going to be:
> 
> Configuration: Both regulator and clk registers accessible over I2C
> bus.
> 
> Scheduler calls schedutil, which eventually calls cpufreq driver (w/o
> kthread). The cpufreq backend driver will queue a async request with
> callback (with regulator core) to update regulator's constraints
> (which can sleep as we need to talk over I2C). The callback will be
> called once regulator is programmed. And we return right after
> submitting the request with regulator core.
> 
> Now, I2C transfer will finish (i.e. regulator programmed) and the
> driver specific callback will get called. It will try to change the
> frequency now and wait (sleep) until it finishes. I hope the regulator
> core wouldn't call the driver callback from interrupt context but some
> sort of bottom half, maybe workqueue (That's what I was referring to
> earlier).
> 
> And finally the clk is programmed and the state machine finished.
> 
> > > From what I can
> > > tell an i2c bus does clk_prepare_enable() on registration and from that
> > > point on clk_enable() is usable from atomic contexts.
> 
> That assumes that we can access registers of the I2C controller
> atomically without sleeping. Not sure how many ARM platforms have I2C
> controller connected over a slow bus though.
> 
> > > But afaict clk
> > > stuff doesn't do interrupts at all.
> 
> The clk stuff may not need it if the clock controllers registers can
> be accessed atomically. But if (as in my example) the clk controller
> is also over the I2C bus, then the interrupt will be provided from I2C
> bus and clk routines would return only after transfer is done.

So no, that's not the idea at all.

The one thing we assume is that the I2C bus does indeed have interrupts
(which isn't a given per se, but lacking that we're completely hosed).

For interrupt enabled I2C busses, we must be able to write to their
registers from atomic context, otherwise the interrupts can't very well
work.

With that, you can drive the entire thing from the IRQ.


I don't believe anyone is quite as silly as to put the regulator on a
nested I2C; in which case I think you still _could_ do, but I'll just
class that with failure of using a polling I2C bus for your regulator.


As to the serialization, your driver can easily keep track of that and
not allow a new transition to start while a previous is still in
critical transit.
Sudeep Holla July 13, 2017, 12:40 p.m. UTC | #25
On 11/07/17 16:21, Dietmar Eggemann wrote:
> On 11/07/17 07:39, Viresh Kumar wrote:
>> On 10-07-17, 14:46, Rafael J. Wysocki wrote:
>>> This particular change is about a new feature, so making it in the core is OK
>>> in two cases IMO: (a) when you actively want everyone to be affected by it and
>>
>> IMO this change should be done for the whole ARM architecture. And if some
>> regression happens due to this, then we come back and solve it.
>>
>>> (b) when the effect of it on the old systems should not be noticeable.
>>
>> I am not sure about the effects of this on performance really.
>>
>> @Dietmar: Any inputs for that ?
> 
> Like I said in the other email, since for (future)
> arm/arm64 fast-switch driver, the return value of
> cpufreq_driver->fast_switch() does not give us the information that the
> frequency value did actually change, we have to implement

I was under the impression that we strictly don't care about that
information when I started exploring the fast_switch with the standard
firmware interface on ARM platforms(until if and when ARM provides an
instruction to achieve that).

If f/w failed to change the frequency, will that be not corrected in the
next sample or instance. I would like to know the impact of absence of
such notifications.

> arch_set_freq_scale() in the driver.
> This means that we probably only implement this in the subset of drivers
> which will be used in platforms on which we want to have
> frequency-invariant load-tracking.
> 
> A future aperf/mperf like counter FIE solution can give us arch-wide
> support when those counters are available.
> 

Agreed.
Sudeep Holla July 13, 2017, 12:49 p.m. UTC | #26
On 12/07/17 05:09, Viresh Kumar wrote:
> On 11-07-17, 16:06, Dietmar Eggemann wrote:
>> But in the meantime we're convinced that cpufreq_driver_fast_switch() is
>> not the right place to call arch_set_freq_scale() since for (future)
>> arm/arm64 fast-switch driver, the return value of
>> cpufreq_driver->fast_switch() does not give us the information that the
>> frequency value did actually change.
> 
> Yeah, I saw your discussion with Peter on #linux-rt IRC and TBH I wasn't aware
> that we are going to do fast switching that way. Just trying to get
> understanding of that idea a bit..
> 
> So we will do fast switching from scheduler's point of view, i.e. we wouldn't
> schedule a kthread to change the frequency. But the real hardware still can't do
> that without sleeping, like if we have I2C somewhere in between. AFAIU, we will
> still have some kind of *software* bottom half to do that work, isn't it? And it
> wouldn't be that we have pushed some instructions to the hardware, which it can
> do a bit later.
> 

No the platforms we are considering are only where a standard firmware
interface is provided and the firmware deals with all those I2C/PMIC crap.

> For example, the regulator may be accessed via I2C and we need to program that
> before changing the clock. So, it will be done by some software code only.
> 

Software but just not Linux OSPM but some firmware(remote processors
presumably, can't imagine on the same processor though)
Sudeep Holla July 13, 2017, 12:54 p.m. UTC | #27
On 12/07/17 10:27, Viresh Kumar wrote:
> On 12-07-17, 10:31, Peter Zijlstra wrote:
>> So the problem with the thread is two-fold; on the one hand we like the
>> scheduler to directly set frequency, but then we need to schedule a task
>> to change the frequency, which will change the frequency and around we
>> go.
>>
>> On the other hand, there's very nasty issues with PI. This thread would
>> have very high priority (otherwise the SCHED_DEADLINE stuff won't work)
>> but that then means this thread needs to boost the owner of the i2c
>> mutex. And that then creates a massive bandwidth accounting hole.
>>
>>
>> The advantage of using an interrupt driven state machine is that all
>> those issues go away.
>>
>> But yes, whichever way around you turn things, its crap. But given the
>> hardware its the best we can do.
> 
> Thanks for the explanation Peter.
> 
> IIUC, it will take more time to change the frequency eventually with
> the interrupt-driven state machine as there may be multiple bottom
> halves involved here, for supply, clk, etc, which would run at normal
> priorities now. And those were boosted currently due to the high
> priority sugov thread. And we are fine with that (from performance
> point of view) ?
> 
> Coming back to where we started from (where should we call
> arch_set_freq_scale() from ?).
> 
> I think we would still need some kind of synchronization between
> cpufreq core and the cpufreq drivers to make sure we don't start
> another freq change before the previous one is complete. Otherwise
> the cpufreq drivers would be required to have similar support with
> proper locking in place.
> 

Good point, but with the firmware interface we are considering for
fast-switch, the firmware can override the previous request if it's not
yet started. So I assume that's fine and expected ?

> And if the core is going to get notified about successful freq changes
> (which it should IMHO),

Is that mandatory for even fast-switching ?
Dietmar Eggemann July 13, 2017, 1:08 p.m. UTC | #28
On 13/07/17 13:40, Sudeep Holla wrote:
> 
> 
> On 11/07/17 16:21, Dietmar Eggemann wrote:
>> On 11/07/17 07:39, Viresh Kumar wrote:
>>> On 10-07-17, 14:46, Rafael J. Wysocki wrote:

[...]

>> Like I said in the other email, since for (future)
>> arm/arm64 fast-switch driver, the return value of
>> cpufreq_driver->fast_switch() does not give us the information that the
>> frequency value did actually change, we have to implement
> 
> I was under the impression that we strictly don't care about that
> information when I started exploring the fast_switch with the standard
> firmware interface on ARM platforms(until if and when ARM provides an
> instruction to achieve that).
> 
> If f/w failed to change the frequency, will that be not corrected in the
> next sample or instance. I would like to know the impact of absence of
> such notifications.

In the meantime we agreed that we have to invoke frequency invariance
from within the cpufreq driver.

For a fast-switch driver I would have to put the call to
arch_set_freq_scale() somewhere where I know that the frequency has been
set.

Without a notification (from the firmware) that the frequency has been
set, I would have to call arch_set_freq_scale() somewhere in the
driver::fast_switch() call assuming that the frequency has been actually
set.

[...]
Sudeep Holla July 13, 2017, 2:04 p.m. UTC | #29
On 12/07/17 12:14, Peter Zijlstra wrote:
> On Wed, Jul 12, 2017 at 02:57:55PM +0530, Viresh Kumar wrote:
>> On 12-07-17, 10:31, Peter Zijlstra wrote:
>>> So the problem with the thread is two-fold; on the one hand we like the
>>> scheduler to directly set frequency, but then we need to schedule a task
>>> to change the frequency, which will change the frequency and around we
>>> go.
>>>
>>> On the other hand, there's very nasty issues with PI. This thread would
>>> have very high priority (otherwise the SCHED_DEADLINE stuff won't work)
>>> but that then means this thread needs to boost the owner of the i2c
>>> mutex. And that then creates a massive bandwidth accounting hole.
>>>
>>>
>>> The advantage of using an interrupt driven state machine is that all
>>> those issues go away.
>>>
>>> But yes, whichever way around you turn things, its crap. But given the
>>> hardware its the best we can do.
>>
>> Thanks for the explanation Peter.
>>
>> IIUC, it will take more time to change the frequency eventually with
>> the interrupt-driven state machine as there may be multiple bottom
>> halves involved here, for supply, clk, etc, which would run at normal
>> priorities now. And those were boosted currently due to the high
>> priority sugov thread. And we are fine with that (from performance
>> point of view) ?
> 
> I'm not sure what you mean; bottom halves as in softirq? From what I can
> tell an i2c bus does clk_prepare_enable() on registration and from that
> point on clk_enable() is usable from atomic contexts. But afaict clk
> stuff doesn't do interrupts at all.
> 
> (with a note that I absolutely hate the clk locking)
> 

Agreed. Juri pointed out this as a blocker a while ago and when we
started implementing the new and shiny ARM SCMI specification, I dropped
the whole clock layer interaction for the CPUFreq driver. However, I
still have to deal with some mailbox locking(still experimenting currently)

> I think the interrupt driven thing can actually be faster than the
> 'regular' task waiting on the mutex. The regulator message can be
> locklessly queued (it only ever makes sense to have 1 such message
> pending, any later one will invalidate a prior one).
> 

Ah OK, I just asked the same in the other thread, you have already
answered me. Good we can ignore.

> Then the i2c interrupt can detect the availability of this pending
> message and splice it into the transfer queue at an opportune moment.
> 
> (of course, the current i2c bits don't support any of that)
> 
>> Coming back to where we started from (where should we call
>> arch_set_freq_scale() from ?).
> 
> The drivers.. the core cpufreq doesn't know when (if any) transition is
> completed.
> 
>> I think we would still need some kind of synchronization between
>> cpufreq core and the cpufreq drivers to make sure we don't start
>> another freq change before the previous one is complete. Otherwise
>> the cpufreq drivers would be required to have similar support with
>> proper locking in place.
> 
> Not sure what you mean; also not sure why. On x86 we never know, cannot
> know. So why would this stuff be any different.
> 

Good, I was under the same assumption that it's okay to override the old
request with new.

>> And if the core is going to get notified about successful freq changes
>> (which it should IMHO), then it may still be better to call
>> arch_set_freq_scale() from the core itself and not from individual
>> drivers.
> 
> I would not involve the core. All we want from the core is a unified
> interface towards requesting DVFS changes. Everything that happens after
> is not its business.
> 

The question is whether we *need* to know the completion of frequency
transition. What is the impact of absence of it ? I am considering
platforms which may take up to a ms or more to do the actual transition
in the firmware.
Sudeep Holla July 13, 2017, 2:06 p.m. UTC | #30
On 13/07/17 14:08, Dietmar Eggemann wrote:
> On 13/07/17 13:40, Sudeep Holla wrote:
>>
>>
>> On 11/07/17 16:21, Dietmar Eggemann wrote:
>>> On 11/07/17 07:39, Viresh Kumar wrote:
>>>> On 10-07-17, 14:46, Rafael J. Wysocki wrote:
> 
> [...]
> 
>>> Like I said in the other email, since for (future)
>>> arm/arm64 fast-switch driver, the return value of
>>> cpufreq_driver->fast_switch() does not give us the information that the
>>> frequency value did actually change, we have to implement
>>
>> I was under the impression that we strictly don't care about that
>> information when I started exploring the fast_switch with the standard
>> firmware interface on ARM platforms(until if and when ARM provides an
>> instruction to achieve that).
>>
>> If f/w failed to change the frequency, will that be not corrected in the
>> next sample or instance. I would like to know the impact of absence of
>> such notifications.
> 
> In the meantime we agreed that we have to invoke frequency invariance
> from within the cpufreq driver.
> 
> For a fast-switch driver I would have to put the call to
> arch_set_freq_scale() somewhere where I know that the frequency has been
> set.
> 
> Without a notification (from the firmware) that the frequency has been
> set, I would have to call arch_set_freq_scale() somewhere in the
> driver::fast_switch() call assuming that the frequency has been actually
> set.
> 

Yes, that's what I was thinking too.
Peter Zijlstra July 13, 2017, 2:42 p.m. UTC | #31
On Thu, Jul 13, 2017 at 03:04:09PM +0100, Sudeep Holla wrote:

> The question is whether we *need* to know the completion of frequency
> transition. What is the impact of absence of it ? I am considering
> platforms which may take up to a ms or more to do the actual transition
> in the firmware.

So on x86 we can recover from not knowing by means of the APERF/MPERF
thing, which gives us the average effective frequency over the last
period.

If you lack that you need something to update the actual effective
frequency.

Changing the effective frequency at request time might confuse things --
esp. if the request might not be honoured at all or can take a
significant time to complete. Not to mention that _IF_ you rely on the
effective frequency to set other clocks things can come unstuck.

So unless you go the whole distance and do APERF/MPERF like things, I
think it would be very good to have a notification of completion (and
possibly a read-back of the effective frequency that is now set).
Sudeep Holla July 13, 2017, 3 p.m. UTC | #32
On 13/07/17 15:42, Peter Zijlstra wrote:
> On Thu, Jul 13, 2017 at 03:04:09PM +0100, Sudeep Holla wrote:
> 
>> The question is whether we *need* to know the completion of frequency
>> transition. What is the impact of absence of it ? I am considering
>> platforms which may take up to a ms or more to do the actual transition
>> in the firmware.
> 
> So on x86 we can recover from not knowing by means of the APERF/MPERF
> thing, which gives us the average effective frequency over the last
> period.
> 

Understood, and I agree we *must* head in that direction.

> If you lack that you need something to update the actual effective
> frequency.
> 
> Changing the effective frequency at request time might confuse things --
> esp. if the request might not be honoured at all or can take a
> significant time to complete. Not to mention that _IF_ you rely on the
> effective frequency to set other clocks things can come unstuck.
> 
> So unless you go the whole distance and do APERF/MPERF like things, I
> think it would be very good to have a notification of completion (and
> possibly a read-back of the effective frequency that is now set).
>

Yes I agree, but sadly/unfortunately the new SCMI specification I keep
referring to makes these notifications optional. We even have statistics
collected by the firmware to get the effective frequency, but again
optional and may get updated at slower rate than what we would expect as
they are typically running on slower processors. But IMO it's still
worth exploring the feasibility of fast switch on system with such
standard interface.

I completely agree with you on the old system which Linux has to deal
with I2C and other slow paths. I was under the assumption that we had
already eliminated the use of fast switch on such systems. When Dietmar
referred fast switching, I think he was referring to only systems with
std. firmware interface.
diff mbox

Patch

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 9bf97a366029..77c4d5e7a598 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -301,6 +301,12 @@  static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci)
 #endif
 }
 
+#ifndef arch_set_freq_scale
+static void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
+                               unsigned long max_freq)
+{}
+#endif
+
 static void __cpufreq_notify_transition(struct cpufreq_policy *policy,
                struct cpufreq_freqs *freqs, unsigned int state)
 {
@@ -343,6 +349,8 @@  static void __cpufreq_notify_transition(struct cpufreq_policy *policy,
                                CPUFREQ_POSTCHANGE, freqs);
                if (likely(policy) && likely(policy->cpu == freqs->cpu))
                        policy->cur = freqs->new;
+               arch_set_freq_scale(policy->related_cpus, policy->cur,
+                                   policy->cpuinfo.max_freq);
                break;
        }
 }
@@ -1824,9 +1832,18 @@  EXPORT_SYMBOL(cpufreq_unregister_notifier);
 unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
                                        unsigned int target_freq)
 {
+       unsigned int ret;
+
        target_freq = clamp_val(target_freq, policy->min, policy->max);
 
-       return cpufreq_driver->fast_switch(policy, target_freq);
+       ret = cpufreq_driver->fast_switch(policy, target_freq);
+
+       if (ret != CPUFREQ_ENTRY_INVALID) {
+               arch_set_freq_scale(policy->related_cpus, target_freq,
+                                   policy->cpuinfo.max_freq);
+       }
+
+       return ret;
 }
 EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);