diff mbox series

[1/2] thermal/drivers/cpufreq_cooling: Add platform callback functions

Message ID 1587365320-25222-1-git-send-email-gao.yunxiao6@gmail.com (mailing list archive)
State New, archived
Delegated to: Daniel Lezcano
Headers show
Series [1/2] thermal/drivers/cpufreq_cooling: Add platform callback functions | expand

Commit Message

gao yunxiao April 20, 2020, 6:48 a.m. UTC
From: Jeson Gao <jeson.gao@unisoc.com>

On some platforms the power consumption is so high that the thermal
frequency reduction policy alone cannot hold the desired temperature, and
the platform has to use the CPU hotplug mechanism to mitigate the
temperature rise. Add platform callbacks to support this.

The platform will hotplug out CPUs when the normalised power is lower than
the power corresponding to the minimum frequency limit that is set by the
platform.

Signed-off-by: Jeson Gao <jeson.gao@unisoc.com>
---
 drivers/thermal/cpufreq_cooling.c | 52 +++++++++++++++++++++++++++++++++++++++
 include/linux/cpu_cooling.h       | 30 ++++++++++++++++++++++
 2 files changed, 82 insertions(+)

Comments

Viresh Kumar April 22, 2020, 8:04 a.m. UTC | #1
On 20-04-20, 14:48, gao.yunxiao6@gmail.com wrote:
>  static DEFINE_IDA(cpufreq_ida);
> @@ -313,12 +315,24 @@ static int cpufreq_power2state(struct thermal_cooling_device *cdev,
>  	u32 last_load, normalised_power;
>  	struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
>  	struct cpufreq_policy *policy = cpufreq_cdev->policy;
> +	struct cpufreq_cooling_plat_ops *plat_ops = cpufreq_cdev->plat_ops;
>  
>  	last_load = cpufreq_cdev->last_load ?: 1;
>  	normalised_power = (power * 100) / last_load;
>  	target_freq = cpu_power_to_freq(cpufreq_cdev, normalised_power);
>  
>  	*state = get_level(cpufreq_cdev, target_freq);
> +	if (*state == cpufreq_cdev->max_level &&
> +			plat_ops && plat_ops->cpufreq_plat_min_freq_limit) {
> +		plat_ops->cpufreq_plat_min_freq_limit(policy, &target_freq);
> +		*state = get_level(cpufreq_cdev, target_freq);
> +	}
> +
> +	if (plat_ops && plat_ops->cpufreq_plat_cpu_ctrl)
> +		plat_ops->cpufreq_plat_cpu_ctrl(policy,
> +				last_load, normalised_power,
> +				cpu_freq_to_power(cpufreq_cdev, target_freq));
> +
>  	trace_thermal_power_cpu_limit(policy->related_cpus, target_freq, *state,
>  				      power);
>  	return 0;
> @@ -684,3 +698,41 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
>  	kfree(cpufreq_cdev);
>  }
>  EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister);

Instead of adding such callbacks to constraint the min freq of CPUs,
you can directly use frequency constraints used by QoS framework to
put such limit directly on cpufreq. Look at freq_qos_add_request().
gao yunxiao April 22, 2020, 9:53 a.m. UTC | #2
viresh

Thank you very much for your advice.

In here, only check whether the frequency point given by cpu_cooling
module is the minimum frequency point.

On 22/04/2020, Viresh Kumar <viresh.kumar@linaro.org> wrote:
> On 20-04-20, 14:48, gao.yunxiao6@gmail.com wrote:
>>  static DEFINE_IDA(cpufreq_ida);
>> @@ -313,12 +315,24 @@ static int cpufreq_power2state(struct
>> thermal_cooling_device *cdev,
>>  	u32 last_load, normalised_power;
>>  	struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
>>  	struct cpufreq_policy *policy = cpufreq_cdev->policy;
>> +	struct cpufreq_cooling_plat_ops *plat_ops = cpufreq_cdev->plat_ops;
>>
>>  	last_load = cpufreq_cdev->last_load ?: 1;
>>  	normalised_power = (power * 100) / last_load;
>>  	target_freq = cpu_power_to_freq(cpufreq_cdev, normalised_power);
>>
>>  	*state = get_level(cpufreq_cdev, target_freq);
>> +	if (*state == cpufreq_cdev->max_level &&
>> +			plat_ops && plat_ops->cpufreq_plat_min_freq_limit) {
>> +		plat_ops->cpufreq_plat_min_freq_limit(policy, &target_freq);
>> +		*state = get_level(cpufreq_cdev, target_freq);
>> +	}
>> +
>> +	if (plat_ops && plat_ops->cpufreq_plat_cpu_ctrl)
>> +		plat_ops->cpufreq_plat_cpu_ctrl(policy,
>> +				last_load, normalised_power,
>> +				cpu_freq_to_power(cpufreq_cdev, target_freq));
>> +
>>  	trace_thermal_power_cpu_limit(policy->related_cpus, target_freq,
>> *state,
>>  				      power);
>>  	return 0;
>> @@ -684,3 +698,41 @@ void cpufreq_cooling_unregister(struct
>> thermal_cooling_device *cdev)
>>  	kfree(cpufreq_cdev);
>>  }
>>  EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister);
>
> Instead of adding such callbacks to constraint the min freq of CPUs,
> you can directly use frequency constraints used by QoS framework to
> put such limit directly on cpufreq. Look at freq_qos_add_request().
>
> --
> viresh
>
Viresh Kumar April 22, 2020, 9:54 a.m. UTC | #3
On 22-04-20, 17:53, gao yunxiao wrote:
> viresh
> 
> Thank you very much for your advice.
> 
> In here, only check whether the frequency point given by cpu_cooling
> module is the minimum frequency point.

I am not sure I understood what you are doing here. Please elaborate a
bit.
gao yunxiao April 22, 2020, 11:11 a.m. UTC | #4
On 22/04/2020, gao yunxiao <gao.yunxiao6@gmail.com> wrote:
> viresh
>
> On UNISOC platform, CPU's temperature can not be controlled when
> cpufreq has been limited to the lowest frequency, we have to hotplug
> out CPUS to mitigate temperature rising.
>
> adding platform callback to have a chance to check whether the
> normalised power at power2state() is lower than the power
> corresponding to the lowest frequency. provide an example in another
> patch
> +static int sprd_cpufreq_cpu_ctrl(struct cpufreq_policy *policy,
> +                                u32 load, u32 normalised_power,
> +                                u32 freq_power)
> +{
> +       unsigned int ncpus, online_cpus;
> +
> +       ncpus = cpumask_weight(policy->related_cpus);
> +       online_cpus = cpumask_weight(policy->cpus);
> +
> +       if (normalised_power <  freq_power)
> +               sprd_estimate_down_cpus(policy, load,
> +                                       normalised_power, freq_power);
> +       else if (online_cpus < ncpus)
> +               sprd_estimate_up_cpus(policy, load,
> +                                     normalised_power, freq_power);
> +       else
> +               sprd_keep_cpus(policy);
> +
> +       return 0;
> +}
>
> On 22/04/2020, Viresh Kumar <viresh.kumar@linaro.org> wrote:
>> On 22-04-20, 17:53, gao yunxiao wrote:
>>> viresh
>>>
>>> Thank you very much for your advice.
>>>
>>> In here, only check whether the frequency point given by cpu_cooling
>>> module is the minimum frequency point.
>>
>> I am not sure I understood what you are doing here. Please elaborate a
>> bit.
>>
>> --
>> viresh
>>
>
Daniel Lezcano April 27, 2020, 8:22 p.m. UTC | #5
On 22/04/2020 13:11, gao yunxiao wrote:
> On 22/04/2020, gao yunxiao <gao.yunxiao6@gmail.com> wrote:
>> viresh
>>
>> On UNISOC platform, CPU's temperature can not be controlled when
>> cpufreq has been limited to the lowest frequency, we have to hotplug
>> out CPUS to mitigate temperature rising.
>>
>> adding platform callback to have a chance to check whether the
>> normalised power at power2state() is lower than the power
>> corresponding to the lowest frequency. provide an example in another
>> patch

You can use in addition the cpuidle cooling device if the cpufreq
cooling device fails.

Add two trip points. The first one mitigated by the cpufreq cooling
device and the second one, with a higher temperature, mitigated by the
cpuidle cooling device [1][2].

For my personal information, does the platform support voltage scaling?


[1]
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/driver-api/thermal/cpu-idle-cooling.rst

[2] https://lkml.org/lkml/2020/4/14/1442
gao yunxiao April 28, 2020, 6:51 a.m. UTC | #6
Daniel

Thank you for your suggestion

Yes, the platform can support voltage scaling.
I will porting cpuidle cooling and double check it on our platform.

By the way, I have a question trouble to you
when one cpu is forced into the cpuidle, the running task on it are
stopped or migrated other cpu?



On 28/04/2020, Daniel Lezcano <daniel.lezcano@linaro.org> wrote:
> On 22/04/2020 13:11, gao yunxiao wrote:
>> On 22/04/2020, gao yunxiao <gao.yunxiao6@gmail.com> wrote:
>>> viresh
>>>
>>> On UNISOC platform, CPU's temperature can not be controlled when
>>> cpufreq has been limited to the lowest frequency, we have to hotplug
>>> out CPUS to mitigate temperature rising.
>>>
>>> adding platform callback to have a chance to check whether the
>>> normalised power at power2state() is lower than the power
>>> corresponding to the lowest frequency. provide an example in another
>>> patch
>
> You can use in addition the cpuidle cooling device if the cpufreq
> cooling device fails.
>
> Add two trip points. The first one mitigated by the cpufreq cooling
> device and the second one, with a higher temperature, mitigated by the
> cpuidle cooling device [1][2].
>
> For my personal information, does the platform support voltage scaling?
>
>
> [1]
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/driver-api/thermal/cpu-idle-cooling.rst
>
> [2] https://lkml.org/lkml/2020/4/14/1442
>
>
>
> --
> <http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
>
> Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
> <http://twitter.com/#!/linaroorg> Twitter |
> <http://www.linaro.org/linaro-blog/> Blog
>
Daniel Lezcano April 28, 2020, 7:53 a.m. UTC | #7
Hi,

On 28/04/2020 08:51, gao yunxiao wrote:
> Daniel
> 
> Thank you for your suggestion
> 
> Yes, the platform can support voltage scaling.

Given your issue, I would double check if the voltage scaling is really
effective.

> I will porting cpuidle cooling and double check it on our platform.
> 
> By the way, I have a question trouble to you
> when one cpu is forced into the cpuidle, the running task on it are
> stopped or migrated other cpu?

The task is scheduled out by the idle injection which has a real time
priority. When this one finishes the idle cycle, it schedules itself and
the previous task continue its work. So the short answer is the task is
stopped, the idle injection happens, then the task runs again.

Concerning the migration, that is a scheduler thing and will depend on
the thermal configuration and sensors layout.

Let's assume the platform is 4 x cores (one cluster).

1. The platform has one sensor per core and the configuration sets one
thermal zone with one idle cooling device per core

In this case, the mitigation will insert idle cycles, those will be seen
as chunk of system load cycle and will enter in the CPU load
computation. Thus, when there is an imbalance, the scheduler can migrate
the task to an idle CPU (or less busy CPU).

2. The platform has one sensor per cluster and the configuration sets
one thermal zone with *four* idle cooling devices (one per core)

When the mitigation happens, the idle injection will be on all the cores
at the same time, thus the load will increase on all the CPUs and won't
 enter in the balance computation (well actually it will enter but as
they are the same on all the CPUs, the difference is zero).


In practical, we found the configuration #2, and in order to reach the
temperature limit, all the cores are fully busy, so task migration
depends on what the tasks do and the idle injection has few impact on this.

Does it answer your question ?


> On 28/04/2020, Daniel Lezcano <daniel.lezcano@linaro.org> wrote:
>> On 22/04/2020 13:11, gao yunxiao wrote:
>>> On 22/04/2020, gao yunxiao <gao.yunxiao6@gmail.com> wrote:
>>>> viresh
>>>>
>>>> On UNISOC platform, CPU's temperature can not be controlled when
>>>> cpufreq has been limited to the lowest frequency, we have to hotplug
>>>> out CPUS to mitigate temperature rising.
>>>>
>>>> adding platform callback to have a chance to check whether the
>>>> normalised power at power2state() is lower than the power
>>>> corresponding to the lowest frequency. provide an example in another
>>>> patch
>>
>> You can use in addition the cpuidle cooling device if the cpufreq
>> cooling device fails.
>>
>> Add two trip points. The first one mitigated by the cpufreq cooling
>> device and the second one, with a higher temperature, mitigated by the
>> cpuidle cooling device [1][2].
>>
>> For my personal information, does the platform support voltage scaling?
>>
>>
>> [1]
>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/driver-api/thermal/cpu-idle-cooling.rst
>>
>> [2] https://lkml.org/lkml/2020/4/14/1442
>>
>>
>>
>> --
>> <http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
>>
>> Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
>> <http://twitter.com/#!/linaroorg> Twitter |
>> <http://www.linaro.org/linaro-blog/> Blog
>>
gao yunxiao April 28, 2020, 10:01 a.m. UTC | #8
Yes,thank you for your introduction

another question trouble to you.
I'm worried that the idle cycle is set too long, there may be a
jamming phenomenon for the mobile phone, so I am not sure how to
determine the idle cycle?

On 28/04/2020, Daniel Lezcano <daniel.lezcano@linaro.org> wrote:
>
> Hi,
>
> On 28/04/2020 08:51, gao yunxiao wrote:
>> Daniel
>>
>> Thank you for your suggestion
>>
>> Yes, the platform can support voltage scaling.
>
> Given your issue, I would double check if the voltage scaling is really
> effective.
>
>> I will porting cpuidle cooling and double check it on our platform.
>>
>> By the way, I have a question trouble to you
>> when one cpu is forced into the cpuidle, the running task on it are
>> stopped or migrated other cpu?
>
> The task is scheduled out by the idle injection which has a real time
> priority. When this one finishes the idle cycle, it schedules itself and
> the previous task continue its work. So the short answer is the task is
> stopped, the idle injection happens, then the task runs again.
>
> Concerning the migration, that is a scheduler thing and will depend on
> the thermal configuration and sensors layout.
>
> Let's assume the platform is 4 x cores (one cluster).
>
> 1. The platform has one sensor per core and the configuration sets one
> thermal zone with one idle cooling device per core
>
> In this case, the mitigation will insert idle cycles, those will be seen
> as chunk of system load cycle and will enter in the CPU load
> computation. Thus, when there is an imbalance, the scheduler can migrate
> the task to an idle CPU (or less busy CPU).
>
> 2. The platform has one sensor per cluster and the configuration sets
> one thermal zone with *four* idle cooling devices (one per core)
>
> When the mitigation happens, the idle injection will be on all the cores
> at the same time, thus the load will increase on all the CPUs and won't
>  enter in the balance computation (well actually it will enter but as
> they are the same on all the CPUs, the difference is zero).
>
>
> In practical, we found the configuration #2, and in order to reach the
> temperature limit, all the cores are fully busy, so task migration
> depends on what the tasks do and the idle injection has few impact on this.
>
> Does it answer your question ?
>
>
>> On 28/04/2020, Daniel Lezcano <daniel.lezcano@linaro.org> wrote:
>>> On 22/04/2020 13:11, gao yunxiao wrote:
>>>> On 22/04/2020, gao yunxiao <gao.yunxiao6@gmail.com> wrote:
>>>>> viresh
>>>>>
>>>>> On UNISOC platform, CPU's temperature can not be controlled when
>>>>> cpufreq has been limited to the lowest frequency, we have to hotplug
>>>>> out CPUS to mitigate temperature rising.
>>>>>
>>>>> adding platform callback to have a chance to check whether the
>>>>> normalised power at power2state() is lower than the power
>>>>> corresponding to the lowest frequency. provide an example in another
>>>>> patch
>>>
>>> You can use in addition the cpuidle cooling device if the cpufreq
>>> cooling device fails.
>>>
>>> Add two trip points. The first one mitigated by the cpufreq cooling
>>> device and the second one, with a higher temperature, mitigated by the
>>> cpuidle cooling device [1][2].
>>>
>>> For my personal information, does the platform support voltage scaling?
>>>
>>>
>>> [1]
>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/driver-api/thermal/cpu-idle-cooling.rst
>>>
>>> [2] https://lkml.org/lkml/2020/4/14/1442
>>>
>>>
>>>
>>> --
>>> <http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
>>>
>>> Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
>>> <http://twitter.com/#!/linaroorg> Twitter |
>>> <http://www.linaro.org/linaro-blog/> Blog
>>>
>
>
> --
> <http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
>
> Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
> <http://twitter.com/#!/linaroorg> Twitter |
> <http://www.linaro.org/linaro-blog/> Blog
>
Daniel Lezcano April 28, 2020, 10:20 a.m. UTC | #9
On 28/04/2020 12:01, gao yunxiao wrote:
> Yes,thank you for your introduction
> 
> another question trouble to you.
> I'm worried that the idle cycle is set too long, there may be a
> jamming phenomenon for the mobile phone, so I am not sure how to
> determine the idle cycle?

That will depend on the speed transition of the available idle states.
The DT gives the binding for the idle duration and the exit latency.

The optional exit latency constraint will prevent to choose deep idle
states (but which have better cooling effect).

So choosing the right constraint and the idle duration (which should be
at least greater than target residency of the idle state) is a question
of experimentation.

Concerning the jamming effect, if the phone is failing to cool down with
cpufreq and has to use the idle injection in addition, clearly there is
a problem with the heat dissipation and the latency introduced is not
the biggest problem in this situation.

If you think hotplugging the CPU is better, you still have the
possibility to rely on the hot trip point notification (not sure it
works) to unplug from userspace the CPU [1].

  -- Daniel

[1]
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/thermal/thermal_core.c#n290


> On 28/04/2020, Daniel Lezcano <daniel.lezcano@linaro.org> wrote:
>>
>> Hi,
>>
>> On 28/04/2020 08:51, gao yunxiao wrote:
>>> Daniel
>>>
>>> Thank you for your suggestion
>>>
>>> Yes, the platform can support voltage scaling.
>>
>> Given your issue, I would double check if the voltage scaling is really
>> effective.
>>
>>> I will porting cpuidle cooling and double check it on our platform.
>>>
>>> By the way, I have a question trouble to you
>>> when one cpu is forced into the cpuidle, the running task on it are
>>> stopped or migrated other cpu?
>>
>> The task is scheduled out by the idle injection which has a real time
>> priority. When this one finishes the idle cycle, it schedules itself and
>> the previous task continue its work. So the short answer is the task is
>> stopped, the idle injection happens, then the task runs again.
>>
>> Concerning the migration, that is a scheduler thing and will depend on
>> the thermal configuration and sensors layout.
>>
>> Let's assume the platform is 4 x cores (one cluster).
>>
>> 1. The platform has one sensor per core and the configuration sets one
>> thermal zone with one idle cooling device per core
>>
>> In this case, the mitigation will insert idle cycles, those will be seen
>> as chunk of system load cycle and will enter in the CPU load
>> computation. Thus, when there is an imbalance, the scheduler can migrate
>> the task to an idle CPU (or less busy CPU).
>>
>> 2. The platform has one sensor per cluster and the configuration sets
>> one thermal zone with *four* idle cooling devices (one per core)
>>
>> When the mitigation happens, the idle injection will be on all the cores
>> at the same time, thus the load will increase on all the CPUs and won't
>>  enter in the balance computation (well actually it will enter but as
>> they are the same on all the CPUs, the difference is zero).
>>
>>
>> In practical, we found the configuration #2, and in order to reach the
>> temperature limit, all the cores are fully busy, so task migration
>> depends on what the tasks do and the idle injection has few impact on this.
>>
>> Does it answer your question ?
>>
>>
>>> On 28/04/2020, Daniel Lezcano <daniel.lezcano@linaro.org> wrote:
>>>> On 22/04/2020 13:11, gao yunxiao wrote:
>>>>> On 22/04/2020, gao yunxiao <gao.yunxiao6@gmail.com> wrote:
>>>>>> viresh
>>>>>>
>>>>>> On UNISOC platform, CPU's temperature can not be controlled when
>>>>>> cpufreq has been limited to the lowest frequency, we have to hotplug
>>>>>> out CPUS to mitigate temperature rising.
>>>>>>
>>>>>> adding platform callback to have a chance to check whether the
>>>>>> normalised power at power2state() is lower than the power
>>>>>> corresponding to the lowest frequency. provide an example in another
>>>>>> patch
>>>>
>>>> You can use in addition the cpuidle cooling device if the cpufreq
>>>> cooling device fails.
>>>>
>>>> Add two trip points. The first one mitigated by the cpufreq cooling
>>>> device and the second one, with a higher temperature, mitigated by the
>>>> cpuidle cooling device [1][2].
>>>>
>>>> For my personal information, does the platform support voltage scaling?
>>>>
>>>>
>>>> [1]
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/driver-api/thermal/cpu-idle-cooling.rst
>>>>
>>>> [2] https://lkml.org/lkml/2020/4/14/1442
>>>>
>>>>
>>>>
>>>> --
>>>> <http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
>>>>
>>>> Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
>>>> <http://twitter.com/#!/linaroorg> Twitter |
>>>> <http://www.linaro.org/linaro-blog/> Blog
>>>>
>>
>>
>> --
>> <http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
>>
>> Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
>> <http://twitter.com/#!/linaroorg> Twitter |
>> <http://www.linaro.org/linaro-blog/> Blog
>>
gao yunxiao April 28, 2020, 11:03 a.m. UTC | #10
Daniel, thank you very much

On 28/04/2020, Daniel Lezcano <daniel.lezcano@linaro.org> wrote:
> On 28/04/2020 12:01, gao yunxiao wrote:
>> Yes,thank you for your introduction
>>
>> another question trouble to you.
>> I'm worried that the idle cycle is set too long, there may be a
>> jamming phenomenon for the mobile phone, so I am not sure how to
>> determine the idle cycle?
>
> That will depend on the speed transition of the available idle states.
> The DT gives the binding for the idle duration and the exit latency.
>
> The optional exit latency constraint will prevent to choose deep idle
> states (but which have better cooling effect).
>
> So choosing the right constraint and the idle duration (which should be
> at least greater than target residency of the idle state) is a question
> of experimentation.
>
> Concerning the jamming effect, if the phone is failing to cool down with
> cpufreq and has to use the idle injection in addition, clearly there is
> a problem with the heat dissipation and the latency introduced is not
> the biggest problem in this situation.
>
> If you think hotplugging the CPU is better, you still have the
> possibility to rely on the hot trip point notification (not sure it
> works) to unplug from userspace the CPU [1].
>
>   -- Daniel
>
> [1]
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/thermal/thermal_core.c#n290
>
>
>> On 28/04/2020, Daniel Lezcano <daniel.lezcano@linaro.org> wrote:
>>>
>>> Hi,
>>>
>>> On 28/04/2020 08:51, gao yunxiao wrote:
>>>> Daniel
>>>>
>>>> Thank you for your suggestion
>>>>
>>>> Yes, the platform can support voltage scaling.
>>>
>>> Given your issue, I would double check if the voltage scaling is really
>>> effective.
>>>
>>>> I will porting cpuidle cooling and double check it on our platform.
>>>>
>>>> By the way, I have a question trouble to you
>>>> when one cpu is forced into the cpuidle, the running task on it are
>>>> stopped or migrated other cpu?
>>>
>>> The task is scheduled out by the idle injection which has a real time
>>> priority. When this one finishes the idle cycle, it schedules itself and
>>> the previous task continue its work. So the short answer is the task is
>>> stopped, the idle injection happens, then the task runs again.
>>>
>>> Concerning the migration, that is a scheduler thing and will depend on
>>> the thermal configuration and sensors layout.
>>>
>>> Let's assume the platform is 4 x cores (one cluster).
>>>
>>> 1. The platform has one sensor per core and the configuration sets one
>>> thermal zone with one idle cooling device per core
>>>
>>> In this case, the mitigation will insert idle cycles, those will be seen
>>> as chunk of system load cycle and will enter in the CPU load
>>> computation. Thus, when there is an imbalance, the scheduler can migrate
>>> the task to an idle CPU (or less busy CPU).
>>>
>>> 2. The platform has one sensor per cluster and the configuration sets
>>> one thermal zone with *four* idle cooling devices (one per core)
>>>
>>> When the mitigation happens, the idle injection will be on all the cores
>>> at the same time, thus the load will increase on all the CPUs and won't
>>>  enter in the balance computation (well actually it will enter but as
>>> they are the same on all the CPUs, the difference is zero).
>>>
>>>
>>> In practical, we found the configuration #2, and in order to reach the
>>> temperature limit, all the cores are fully busy, so task migration
>>> depends on what the tasks do and the idle injection has few impact on
>>> this.
>>>
>>> Does it answer your question ?
>>>
>>>
>>>> On 28/04/2020, Daniel Lezcano <daniel.lezcano@linaro.org> wrote:
>>>>> On 22/04/2020 13:11, gao yunxiao wrote:
>>>>>> On 22/04/2020, gao yunxiao <gao.yunxiao6@gmail.com> wrote:
>>>>>>> viresh
>>>>>>>
>>>>>>> On UNISOC platform, CPU's temperature can not be controlled when
>>>>>>> cpufreq has been limited to the lowest frequency, we have to hotplug
>>>>>>> out CPUS to mitigate temperature rising.
>>>>>>>
>>>>>>> adding platform callback to have a chance to check whether the
>>>>>>> normalised power at power2state() is lower than the power
>>>>>>> corresponding to the lowest frequency. provide an example in another
>>>>>>> patch
>>>>>
>>>>> You can use in addition the cpuidle cooling device if the cpufreq
>>>>> cooling device fails.
>>>>>
>>>>> Add two trip points. The first one mitigated by the cpufreq cooling
>>>>> device and the second one, with a higher temperature, mitigated by the
>>>>> cpuidle cooling device [1][2].
>>>>>
>>>>> For my personal information, does the platform support voltage
>>>>> scaling?
>>>>>
>>>>>
>>>>> [1]
>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/driver-api/thermal/cpu-idle-cooling.rst
>>>>>
>>>>> [2] https://lkml.org/lkml/2020/4/14/1442
>>>>>
>>>>>
>>>>>
>>>>> --
>>>>> <http://www.linaro.org/> Linaro.org │ Open source software for ARM
>>>>> SoCs
>>>>>
>>>>> Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
>>>>> <http://twitter.com/#!/linaroorg> Twitter |
>>>>> <http://www.linaro.org/linaro-blog/> Blog
>>>>>
>>>
>>>
>>> --
>>> <http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
>>>
>>> Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
>>> <http://twitter.com/#!/linaroorg> Twitter |
>>> <http://www.linaro.org/linaro-blog/> Blog
>>>
>
>
> --
> <http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
>
> Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
> <http://twitter.com/#!/linaroorg> Twitter |
> <http://www.linaro.org/linaro-blog/> Blog
>
diff mbox series

Patch

diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index e297e13..16cbf58 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -64,6 +64,7 @@  struct time_in_idle {
  * @node: list_head to link all cpufreq_cooling_device together.
  * @idle_time: idle time stats
  * @qos_req: PM QoS contraint to apply
+ * @plat_ops: pointer to the platform callback functions.
  *
  * This structure is required for keeping information of each registered
  * cpufreq_cooling_device.
@@ -78,6 +79,7 @@  struct cpufreq_cooling_device {
 	struct list_head node;
 	struct time_in_idle *idle_time;
 	struct freq_qos_request qos_req;
+	struct cpufreq_cooling_plat_ops *plat_ops;
 };
 
 static DEFINE_IDA(cpufreq_ida);
@@ -313,12 +315,24 @@  static int cpufreq_power2state(struct thermal_cooling_device *cdev,
 	u32 last_load, normalised_power;
 	struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
 	struct cpufreq_policy *policy = cpufreq_cdev->policy;
+	struct cpufreq_cooling_plat_ops *plat_ops = cpufreq_cdev->plat_ops;
 
 	last_load = cpufreq_cdev->last_load ?: 1;
 	normalised_power = (power * 100) / last_load;
 	target_freq = cpu_power_to_freq(cpufreq_cdev, normalised_power);
 
 	*state = get_level(cpufreq_cdev, target_freq);
+	if (*state == cpufreq_cdev->max_level &&
+			plat_ops && plat_ops->cpufreq_plat_min_freq_limit) {
+		plat_ops->cpufreq_plat_min_freq_limit(policy, &target_freq);
+		*state = get_level(cpufreq_cdev, target_freq);
+	}
+
+	if (plat_ops && plat_ops->cpufreq_plat_cpu_ctrl)
+		plat_ops->cpufreq_plat_cpu_ctrl(policy,
+				last_load, normalised_power,
+				cpu_freq_to_power(cpufreq_cdev, target_freq));
+
 	trace_thermal_power_cpu_limit(policy->related_cpus, target_freq, *state,
 				      power);
 	return 0;
@@ -684,3 +698,41 @@  void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
 	kfree(cpufreq_cdev);
 }
 EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister);
+
+/**
+ * cpufreq_cooling_plat_ops_register - register platform callback functions.
+ * @cdev: thermal cooling device pointer.
+ * @plat_ops: platform callback functions to attach to @cdev.
+ * Return: 0 on success, -EINVAL if @cdev, its devdata or @plat_ops is NULL.
+ */
+int cpufreq_cooling_plat_ops_register(struct thermal_cooling_device *cdev,
+			struct cpufreq_cooling_plat_ops *plat_ops)
+{
+	struct cpufreq_cooling_device *cpufreq_cdev;
+
+	if (!cdev || !cdev->devdata || !plat_ops)
+		return -EINVAL;
+
+	cpufreq_cdev = cdev->devdata;
+	cpufreq_cdev->plat_ops = plat_ops;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cpufreq_cooling_plat_ops_register);
+
+/**
+ * cpufreq_cooling_plat_ops_unregister - unregister platform callback function.
+ * @cdev: thermal cooling device pointer.
+ * Return: 0 on success, -EINVAL if @cdev or its devdata is NULL.
+ */
+int cpufreq_cooling_plat_ops_unregister(struct thermal_cooling_device *cdev)
+{
+	struct cpufreq_cooling_device *cpufreq_cdev;
+
+	if (!cdev || !cdev->devdata)
+		return -EINVAL;
+
+	cpufreq_cdev = cdev->devdata;
+	cpufreq_cdev->plat_ops = NULL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cpufreq_cooling_plat_ops_unregister);
diff --git a/include/linux/cpu_cooling.h b/include/linux/cpu_cooling.h
index 65501d8..3934918 100644
--- a/include/linux/cpu_cooling.h
+++ b/include/linux/cpu_cooling.h
@@ -19,6 +19,23 @@ 
 
 struct cpufreq_policy;
 
+/**
+ * struct cpufreq_cooling_plat_ops - platform cpu cooling policy ops
+ *
+ * @cpufreq_plat_cpu_ctrl: this function provides a further core control
+ * policy when the current policies cannot cool down to the expected
+ * temperature value.
+ *
+ * @cpufreq_plat_min_freq_limit: set the cpu frequency limit; cooling devices
+ * are not allowed to adjust the cpu frequency outside of that limit.
+ */
+struct cpufreq_cooling_plat_ops {
+	int (*cpufreq_plat_cpu_ctrl)(struct cpufreq_policy *policy,
+				u32 load, u32 normalised_power, u32 freq_power);
+	void (*cpufreq_plat_min_freq_limit)(struct cpufreq_policy *policy,
+						u32 *target_freq);
+};
+
 #ifdef CONFIG_CPU_FREQ_THERMAL
 /**
  * cpufreq_cooling_register - function to create cpufreq cooling device.
@@ -40,6 +57,19 @@  struct thermal_cooling_device *
 struct thermal_cooling_device *
 of_cpufreq_cooling_register(struct cpufreq_policy *policy);
 
+/**
+ * cpufreq_cooling_plat_ops_register - register platform callback function.
+ * @cdev: thermal cooling device pointer.
+ * @plat_ops: platform callback function pointer.
+ */
+int cpufreq_cooling_plat_ops_register(struct thermal_cooling_device *cdev,
+			struct cpufreq_cooling_plat_ops *plat_ops);
+/**
+ * cpufreq_cooling_plat_ops_unregister - unregister platform callback function.
+ * @cdev: thermal cooling device pointer.
+ */
+int  cpufreq_cooling_plat_ops_unregister(struct thermal_cooling_device *cdev);
+
 #else /* !CONFIG_CPU_FREQ_THERMAL */
 static inline struct thermal_cooling_device *
 cpufreq_cooling_register(struct cpufreq_policy *policy)