diff mbox series

[v2] thermal: devfreq_cooling: Use PM QoS to set frequency limits

Message ID 20200116151219.v2.1.I146403d05b9ec82f48b807efd416a57f545b447a@changeid (mailing list archive)
State New, archived
Delegated to: Daniel Lezcano
Headers show
Series [v2] thermal: devfreq_cooling: Use PM QoS to set frequency limits | expand

Commit Message

Matthias Kaehlcke Jan. 16, 2020, 11:12 p.m. UTC
Now that devfreq supports limiting the frequency range of a device
through PM QoS make use of it instead of disabling OPPs that should
not be used.

The switch from disabling OPPs to PM QoS introduces a subtle behavioral
change in case of conflicting requests (min > max): PM QoS gives
precedence to the MIN_FREQUENCY request, while higher OPPs disabled
with dev_pm_opp_disable() would override MIN_FREQUENCY.

Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
---

Changes in v2:
- added documentation for 'req_max_freq'
- fixed jumps in of_devfreq_cooling_register_power() unwind
- added comment about behavioral change to the commit message

 drivers/thermal/devfreq_cooling.c | 70 ++++++++++---------------------
 1 file changed, 23 insertions(+), 47 deletions(-)

Comments

Chanwoo Choi Jan. 17, 2020, 5:22 a.m. UTC | #1
On 1/17/20 8:12 AM, Matthias Kaehlcke wrote:
> Now that devfreq supports limiting the frequency range of a device
> through PM QoS make use of it instead of disabling OPPs that should
> not be used.
> 
> The switch from disabling OPPs to PM QoS introduces a subtle behavioral
> change in case of conflicting requests (min > max): PM QoS gives
> precedence to the MIN_FREQUENCY request, while higher OPPs disabled
> with dev_pm_opp_disable() would override MIN_FREQUENCY.
> 
> Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
> ---
> 
> Changes in v2:
> - added documentation for 'req_max_freq'
> - fixed jumps in of_devfreq_cooling_register_power() unwind
> - added comment about behavioral change to the commit message
> 
>  drivers/thermal/devfreq_cooling.c | 70 ++++++++++---------------------
>  1 file changed, 23 insertions(+), 47 deletions(-)
> 
> diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c
> index ef59256887ff63..cbbaf5bc425d1a 100644
> --- a/drivers/thermal/devfreq_cooling.c
> +++ b/drivers/thermal/devfreq_cooling.c
> @@ -24,11 +24,13 @@
>  #include <linux/idr.h>
>  #include <linux/slab.h>
>  #include <linux/pm_opp.h>
> +#include <linux/pm_qos.h>
>  #include <linux/thermal.h>
>  
>  #include <trace/events/thermal.h>
>  
> -#define SCALE_ERROR_MITIGATION 100
> +#define HZ_PER_KHZ		1000
> +#define SCALE_ERROR_MITIGATION	100
>  
>  static DEFINE_IDA(devfreq_ida);
>  
> @@ -53,6 +55,8 @@ static DEFINE_IDA(devfreq_ida);
>   *		'utilization' (which is	'busy_time / 'total_time').
>   *		The 'res_util' range is from 100 to (power_table[state] * 100)
>   *		for the corresponding 'state'.
> + * @req_max_freq:	PM QoS request for limiting the maximum frequency
> + *			of the devfreq device.
>   */
>  struct devfreq_cooling_device {
>  	int id;
> @@ -65,49 +69,9 @@ struct devfreq_cooling_device {
>  	struct devfreq_cooling_power *power_ops;
>  	u32 res_util;
>  	int capped_state;
> +	struct dev_pm_qos_request req_max_freq;
>  };
>  
> -/**
> - * partition_enable_opps() - disable all opps above a given state
> - * @dfc:	Pointer to devfreq we are operating on
> - * @cdev_state:	cooling device state we're setting
> - *
> - * Go through the OPPs of the device, enabling all OPPs until
> - * @cdev_state and disabling those frequencies above it.
> - */
> -static int partition_enable_opps(struct devfreq_cooling_device *dfc,
> -				 unsigned long cdev_state)
> -{
> -	int i;
> -	struct device *dev = dfc->devfreq->dev.parent;
> -
> -	for (i = 0; i < dfc->freq_table_size; i++) {
> -		struct dev_pm_opp *opp;
> -		int ret = 0;
> -		unsigned int freq = dfc->freq_table[i];
> -		bool want_enable = i >= cdev_state ? true : false;
> -
> -		opp = dev_pm_opp_find_freq_exact(dev, freq, !want_enable);
> -
> -		if (PTR_ERR(opp) == -ERANGE)
> -			continue;
> -		else if (IS_ERR(opp))
> -			return PTR_ERR(opp);
> -
> -		dev_pm_opp_put(opp);
> -
> -		if (want_enable)
> -			ret = dev_pm_opp_enable(dev, freq);
> -		else
> -			ret = dev_pm_opp_disable(dev, freq);
> -
> -		if (ret)
> -			return ret;
> -	}
> -
> -	return 0;
> -}
> -
>  static int devfreq_cooling_get_max_state(struct thermal_cooling_device *cdev,
>  					 unsigned long *state)
>  {
> @@ -134,7 +98,7 @@ static int devfreq_cooling_set_cur_state(struct thermal_cooling_device *cdev,
>  	struct devfreq_cooling_device *dfc = cdev->devdata;
>  	struct devfreq *df = dfc->devfreq;
>  	struct device *dev = df->dev.parent;
> -	int ret;
> +	unsigned long freq;
>  
>  	if (state == dfc->cooling_state)
>  		return 0;
> @@ -144,9 +108,10 @@ static int devfreq_cooling_set_cur_state(struct thermal_cooling_device *cdev,
>  	if (state >= dfc->freq_table_size)
>  		return -EINVAL;
>  
> -	ret = partition_enable_opps(dfc, state);
> -	if (ret)
> -		return ret;
> +	freq = dfc->freq_table[state];
> +
> +	dev_pm_qos_update_request(&dfc->req_max_freq,
> +				  DIV_ROUND_UP(freq, HZ_PER_KHZ));
>  
>  	dfc->cooling_state = state;
>  
> @@ -529,9 +494,15 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df,
>  	if (err)
>  		goto free_dfc;
>  
> -	err = ida_simple_get(&devfreq_ida, 0, 0, GFP_KERNEL);
> +	err = dev_pm_qos_add_request(df->dev.parent, &dfc->req_max_freq,
> +				     DEV_PM_QOS_MAX_FREQUENCY,
> +				     PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE);
>  	if (err < 0)
>  		goto free_tables;
> +
> +	err = ida_simple_get(&devfreq_ida, 0, 0, GFP_KERNEL);
> +	if (err < 0)
> +		goto remove_qos_req;
>  	dfc->id = err;
>  
>  	snprintf(dev_name, sizeof(dev_name), "thermal-devfreq-%d", dfc->id);
> @@ -552,6 +523,10 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df,
>  
>  release_ida:
>  	ida_simple_remove(&devfreq_ida, dfc->id);
> +
> +remove_qos_req:
> +	dev_pm_qos_remove_request(&dfc->req_max_freq);
> +
>  free_tables:
>  	kfree(dfc->power_table);
>  	kfree(dfc->freq_table);
> @@ -600,6 +575,7 @@ void devfreq_cooling_unregister(struct thermal_cooling_device *cdev)
>  
>  	thermal_cooling_device_unregister(dfc->cdev);
>  	ida_simple_remove(&devfreq_ida, dfc->id);
> +	dev_pm_qos_remove_request(&dfc->req_max_freq);
>  	kfree(dfc->power_table);
>  	kfree(dfc->freq_table);
>  
> 

Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
Matthias Kaehlcke March 12, 2020, 12:35 a.m. UTC | #2
Is any further action needed from my side or can this land?

Thanks

Matthias

On Fri, Jan 17, 2020 at 02:22:02PM +0900, Chanwoo Choi wrote:
> On 1/17/20 8:12 AM, Matthias Kaehlcke wrote:
> > Now that devfreq supports limiting the frequency range of a device
> > through PM QoS make use of it instead of disabling OPPs that should
> > not be used.
> > 
> > The switch from disabling OPPs to PM QoS introduces a subtle behavioral
> > change in case of conflicting requests (min > max): PM QoS gives
> > precedence to the MIN_FREQUENCY request, while higher OPPs disabled
> > with dev_pm_opp_disable() would override MIN_FREQUENCY.
> > 
> > Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
> > ---
> > 
> > Changes in v2:
> > - added documentation for 'req_max_freq'
> > - fixed jumps in of_devfreq_cooling_register_power() unwind
> > - added comment about behavioral change to the commit message
> > 
> >  drivers/thermal/devfreq_cooling.c | 70 ++++++++++---------------------
> >  1 file changed, 23 insertions(+), 47 deletions(-)
> > 
> > diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c
> > index ef59256887ff63..cbbaf5bc425d1a 100644
> > --- a/drivers/thermal/devfreq_cooling.c
> > +++ b/drivers/thermal/devfreq_cooling.c
> > @@ -24,11 +24,13 @@
> >  #include <linux/idr.h>
> >  #include <linux/slab.h>
> >  #include <linux/pm_opp.h>
> > +#include <linux/pm_qos.h>
> >  #include <linux/thermal.h>
> >  
> >  #include <trace/events/thermal.h>
> >  
> > -#define SCALE_ERROR_MITIGATION 100
> > +#define HZ_PER_KHZ		1000
> > +#define SCALE_ERROR_MITIGATION	100
> >  
> >  static DEFINE_IDA(devfreq_ida);
> >  
> > @@ -53,6 +55,8 @@ static DEFINE_IDA(devfreq_ida);
> >   *		'utilization' (which is	'busy_time / 'total_time').
> >   *		The 'res_util' range is from 100 to (power_table[state] * 100)
> >   *		for the corresponding 'state'.
> > + * @req_max_freq:	PM QoS request for limiting the maximum frequency
> > + *			of the devfreq device.
> >   */
> >  struct devfreq_cooling_device {
> >  	int id;
> > @@ -65,49 +69,9 @@ struct devfreq_cooling_device {
> >  	struct devfreq_cooling_power *power_ops;
> >  	u32 res_util;
> >  	int capped_state;
> > +	struct dev_pm_qos_request req_max_freq;
> >  };
> >  
> > -/**
> > - * partition_enable_opps() - disable all opps above a given state
> > - * @dfc:	Pointer to devfreq we are operating on
> > - * @cdev_state:	cooling device state we're setting
> > - *
> > - * Go through the OPPs of the device, enabling all OPPs until
> > - * @cdev_state and disabling those frequencies above it.
> > - */
> > -static int partition_enable_opps(struct devfreq_cooling_device *dfc,
> > -				 unsigned long cdev_state)
> > -{
> > -	int i;
> > -	struct device *dev = dfc->devfreq->dev.parent;
> > -
> > -	for (i = 0; i < dfc->freq_table_size; i++) {
> > -		struct dev_pm_opp *opp;
> > -		int ret = 0;
> > -		unsigned int freq = dfc->freq_table[i];
> > -		bool want_enable = i >= cdev_state ? true : false;
> > -
> > -		opp = dev_pm_opp_find_freq_exact(dev, freq, !want_enable);
> > -
> > -		if (PTR_ERR(opp) == -ERANGE)
> > -			continue;
> > -		else if (IS_ERR(opp))
> > -			return PTR_ERR(opp);
> > -
> > -		dev_pm_opp_put(opp);
> > -
> > -		if (want_enable)
> > -			ret = dev_pm_opp_enable(dev, freq);
> > -		else
> > -			ret = dev_pm_opp_disable(dev, freq);
> > -
> > -		if (ret)
> > -			return ret;
> > -	}
> > -
> > -	return 0;
> > -}
> > -
> >  static int devfreq_cooling_get_max_state(struct thermal_cooling_device *cdev,
> >  					 unsigned long *state)
> >  {
> > @@ -134,7 +98,7 @@ static int devfreq_cooling_set_cur_state(struct thermal_cooling_device *cdev,
> >  	struct devfreq_cooling_device *dfc = cdev->devdata;
> >  	struct devfreq *df = dfc->devfreq;
> >  	struct device *dev = df->dev.parent;
> > -	int ret;
> > +	unsigned long freq;
> >  
> >  	if (state == dfc->cooling_state)
> >  		return 0;
> > @@ -144,9 +108,10 @@ static int devfreq_cooling_set_cur_state(struct thermal_cooling_device *cdev,
> >  	if (state >= dfc->freq_table_size)
> >  		return -EINVAL;
> >  
> > -	ret = partition_enable_opps(dfc, state);
> > -	if (ret)
> > -		return ret;
> > +	freq = dfc->freq_table[state];
> > +
> > +	dev_pm_qos_update_request(&dfc->req_max_freq,
> > +				  DIV_ROUND_UP(freq, HZ_PER_KHZ));
> >  
> >  	dfc->cooling_state = state;
> >  
> > @@ -529,9 +494,15 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df,
> >  	if (err)
> >  		goto free_dfc;
> >  
> > -	err = ida_simple_get(&devfreq_ida, 0, 0, GFP_KERNEL);
> > +	err = dev_pm_qos_add_request(df->dev.parent, &dfc->req_max_freq,
> > +				     DEV_PM_QOS_MAX_FREQUENCY,
> > +				     PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE);
> >  	if (err < 0)
> >  		goto free_tables;
> > +
> > +	err = ida_simple_get(&devfreq_ida, 0, 0, GFP_KERNEL);
> > +	if (err < 0)
> > +		goto remove_qos_req;
> >  	dfc->id = err;
> >  
> >  	snprintf(dev_name, sizeof(dev_name), "thermal-devfreq-%d", dfc->id);
> > @@ -552,6 +523,10 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df,
> >  
> >  release_ida:
> >  	ida_simple_remove(&devfreq_ida, dfc->id);
> > +
> > +remove_qos_req:
> > +	dev_pm_qos_remove_request(&dfc->req_max_freq);
> > +
> >  free_tables:
> >  	kfree(dfc->power_table);
> >  	kfree(dfc->freq_table);
> > @@ -600,6 +575,7 @@ void devfreq_cooling_unregister(struct thermal_cooling_device *cdev)
> >  
> >  	thermal_cooling_device_unregister(dfc->cdev);
> >  	ida_simple_remove(&devfreq_ida, dfc->id);
> > +	dev_pm_qos_remove_request(&dfc->req_max_freq);
> >  	kfree(dfc->power_table);
> >  	kfree(dfc->freq_table);
> >  
> > 
> 
> Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
> 
> -- 
> Best Regards,
> Chanwoo Choi
> Samsung Electronics
Lukasz Luba March 12, 2020, 11:39 a.m. UTC | #3
Hi Matthias,

I just saw this email below the patch. I wasn't aware that you
are working on this. I will have to update my changes...

It looks good to me.
Unfortunately, it does not apply on top of Amit's commit
1b5cb9570670a6277cc0 thermal: devfreq_cooling: Appease the kernel-doc deity

Could you check this?

Other than that

Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>

Regards,
Lukasz

On 3/12/20 12:35 AM, Matthias Kaehlcke wrote:
> Is any further action needed from my side or can this land?
> 
> Thanks
> 
> Matthias
> 
> On Fri, Jan 17, 2020 at 02:22:02PM +0900, Chanwoo Choi wrote:
>> On 1/17/20 8:12 AM, Matthias Kaehlcke wrote:
>>> Now that devfreq supports limiting the frequency range of a device
>>> through PM QoS make use of it instead of disabling OPPs that should
>>> not be used.
>>>
>>> The switch from disabling OPPs to PM QoS introduces a subtle behavioral
>>> change in case of conflicting requests (min > max): PM QoS gives
>>> precedence to the MIN_FREQUENCY request, while higher OPPs disabled
>>> with dev_pm_opp_disable() would override MIN_FREQUENCY.
>>>
>>> Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
>>> ---
>>>
>>> Changes in v2:
>>> - added documentation for 'req_max_freq'
>>> - fixed jumps in of_devfreq_cooling_register_power() unwind
>>> - added comment about behavioral change to the commit message
>>>
>>>   drivers/thermal/devfreq_cooling.c | 70 ++++++++++---------------------
>>>   1 file changed, 23 insertions(+), 47 deletions(-)
>>>
>>> diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c
>>> index ef59256887ff63..cbbaf5bc425d1a 100644
>>> --- a/drivers/thermal/devfreq_cooling.c
>>> +++ b/drivers/thermal/devfreq_cooling.c
>>> @@ -24,11 +24,13 @@
>>>   #include <linux/idr.h>
>>>   #include <linux/slab.h>
>>>   #include <linux/pm_opp.h>
>>> +#include <linux/pm_qos.h>
>>>   #include <linux/thermal.h>
>>>   
>>>   #include <trace/events/thermal.h>
>>>   
>>> -#define SCALE_ERROR_MITIGATION 100
>>> +#define HZ_PER_KHZ		1000
>>> +#define SCALE_ERROR_MITIGATION	100
>>>   
>>>   static DEFINE_IDA(devfreq_ida);
>>>   
>>> @@ -53,6 +55,8 @@ static DEFINE_IDA(devfreq_ida);
>>>    *		'utilization' (which is	'busy_time / 'total_time').
>>>    *		The 'res_util' range is from 100 to (power_table[state] * 100)
>>>    *		for the corresponding 'state'.
>>> + * @req_max_freq:	PM QoS request for limiting the maximum frequency
>>> + *			of the devfreq device.
>>>    */
>>>   struct devfreq_cooling_device {
>>>   	int id;
>>> @@ -65,49 +69,9 @@ struct devfreq_cooling_device {
>>>   	struct devfreq_cooling_power *power_ops;
>>>   	u32 res_util;
>>>   	int capped_state;
>>> +	struct dev_pm_qos_request req_max_freq;
>>>   };
>>>   
>>> -/**
>>> - * partition_enable_opps() - disable all opps above a given state
>>> - * @dfc:	Pointer to devfreq we are operating on
>>> - * @cdev_state:	cooling device state we're setting
>>> - *
>>> - * Go through the OPPs of the device, enabling all OPPs until
>>> - * @cdev_state and disabling those frequencies above it.
>>> - */
>>> -static int partition_enable_opps(struct devfreq_cooling_device *dfc,
>>> -				 unsigned long cdev_state)
>>> -{
>>> -	int i;
>>> -	struct device *dev = dfc->devfreq->dev.parent;
>>> -
>>> -	for (i = 0; i < dfc->freq_table_size; i++) {
>>> -		struct dev_pm_opp *opp;
>>> -		int ret = 0;
>>> -		unsigned int freq = dfc->freq_table[i];
>>> -		bool want_enable = i >= cdev_state ? true : false;
>>> -
>>> -		opp = dev_pm_opp_find_freq_exact(dev, freq, !want_enable);
>>> -
>>> -		if (PTR_ERR(opp) == -ERANGE)
>>> -			continue;
>>> -		else if (IS_ERR(opp))
>>> -			return PTR_ERR(opp);
>>> -
>>> -		dev_pm_opp_put(opp);
>>> -
>>> -		if (want_enable)
>>> -			ret = dev_pm_opp_enable(dev, freq);
>>> -		else
>>> -			ret = dev_pm_opp_disable(dev, freq);
>>> -
>>> -		if (ret)
>>> -			return ret;
>>> -	}
>>> -
>>> -	return 0;
>>> -}
>>> -
>>>   static int devfreq_cooling_get_max_state(struct thermal_cooling_device *cdev,
>>>   					 unsigned long *state)
>>>   {
>>> @@ -134,7 +98,7 @@ static int devfreq_cooling_set_cur_state(struct thermal_cooling_device *cdev,
>>>   	struct devfreq_cooling_device *dfc = cdev->devdata;
>>>   	struct devfreq *df = dfc->devfreq;
>>>   	struct device *dev = df->dev.parent;
>>> -	int ret;
>>> +	unsigned long freq;
>>>   
>>>   	if (state == dfc->cooling_state)
>>>   		return 0;
>>> @@ -144,9 +108,10 @@ static int devfreq_cooling_set_cur_state(struct thermal_cooling_device *cdev,
>>>   	if (state >= dfc->freq_table_size)
>>>   		return -EINVAL;
>>>   
>>> -	ret = partition_enable_opps(dfc, state);
>>> -	if (ret)
>>> -		return ret;
>>> +	freq = dfc->freq_table[state];
>>> +
>>> +	dev_pm_qos_update_request(&dfc->req_max_freq,
>>> +				  DIV_ROUND_UP(freq, HZ_PER_KHZ));
>>>   
>>>   	dfc->cooling_state = state;
>>>   
>>> @@ -529,9 +494,15 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df,
>>>   	if (err)
>>>   		goto free_dfc;
>>>   
>>> -	err = ida_simple_get(&devfreq_ida, 0, 0, GFP_KERNEL);
>>> +	err = dev_pm_qos_add_request(df->dev.parent, &dfc->req_max_freq,
>>> +				     DEV_PM_QOS_MAX_FREQUENCY,
>>> +				     PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE);
>>>   	if (err < 0)
>>>   		goto free_tables;
>>> +
>>> +	err = ida_simple_get(&devfreq_ida, 0, 0, GFP_KERNEL);
>>> +	if (err < 0)
>>> +		goto remove_qos_req;
>>>   	dfc->id = err;
>>>   
>>>   	snprintf(dev_name, sizeof(dev_name), "thermal-devfreq-%d", dfc->id);
>>> @@ -552,6 +523,10 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df,
>>>   
>>>   release_ida:
>>>   	ida_simple_remove(&devfreq_ida, dfc->id);
>>> +
>>> +remove_qos_req:
>>> +	dev_pm_qos_remove_request(&dfc->req_max_freq);
>>> +
>>>   free_tables:
>>>   	kfree(dfc->power_table);
>>>   	kfree(dfc->freq_table);
>>> @@ -600,6 +575,7 @@ void devfreq_cooling_unregister(struct thermal_cooling_device *cdev)
>>>   
>>>   	thermal_cooling_device_unregister(dfc->cdev);
>>>   	ida_simple_remove(&devfreq_ida, dfc->id);
>>> +	dev_pm_qos_remove_request(&dfc->req_max_freq);
>>>   	kfree(dfc->power_table);
>>>   	kfree(dfc->freq_table);
>>>   
>>>
>>
>> Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
>>
>> -- 
>> Best Regards,
>> Chanwoo Choi
>> Samsung Electronics
Matthias Kaehlcke March 12, 2020, 5:57 p.m. UTC | #4
Hi Lukasz,

thanks for the review!

I'll rebase and send v3. Hopefully it doesn't cause too much extra
work for your changes.

Thanks

Matthias

On Thu, Mar 12, 2020 at 11:39:56AM +0000, Lukasz Luba wrote:
> Hi Matthias,
> 
> I just saw this email below the patch. I wasn't aware that you
> are working on this. I will have to update my changes...
> 
> It looks good to me.
> Unfortunately, it does not apply on top of Amit's commit
> 1b5cb9570670a6277cc0 thermal: devfreq_cooling: Appease the kernel-doc deity
> 
> Could you check this?
> 
> Other than that
> 
> Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
> 
> Regards,
> Lukasz
> 
> On 3/12/20 12:35 AM, Matthias Kaehlcke wrote:
> > Is any further action needed from my side or can this land?
> > 
> > Thanks
> > 
> > Matthias
> > 
> > On Fri, Jan 17, 2020 at 02:22:02PM +0900, Chanwoo Choi wrote:
> > > On 1/17/20 8:12 AM, Matthias Kaehlcke wrote:
> > > > Now that devfreq supports limiting the frequency range of a device
> > > > through PM QoS make use of it instead of disabling OPPs that should
> > > > not be used.
> > > > 
> > > > The switch from disabling OPPs to PM QoS introduces a subtle behavioral
> > > > change in case of conflicting requests (min > max): PM QoS gives
> > > > precedence to the MIN_FREQUENCY request, while higher OPPs disabled
> > > > with dev_pm_opp_disable() would override MIN_FREQUENCY.
> > > > 
> > > > Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
> > > > ---
> > > > 
> > > > Changes in v2:
> > > > - added documentation for 'req_max_freq'
> > > > - fixed jumps in of_devfreq_cooling_register_power() unwind
> > > > - added comment about behavioral change to the commit message
> > > > 
> > > >   drivers/thermal/devfreq_cooling.c | 70 ++++++++++---------------------
> > > >   1 file changed, 23 insertions(+), 47 deletions(-)
> > > > 
> > > > diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c
> > > > index ef59256887ff63..cbbaf5bc425d1a 100644
> > > > --- a/drivers/thermal/devfreq_cooling.c
> > > > +++ b/drivers/thermal/devfreq_cooling.c
> > > > @@ -24,11 +24,13 @@
> > > >   #include <linux/idr.h>
> > > >   #include <linux/slab.h>
> > > >   #include <linux/pm_opp.h>
> > > > +#include <linux/pm_qos.h>
> > > >   #include <linux/thermal.h>
> > > >   #include <trace/events/thermal.h>
> > > > -#define SCALE_ERROR_MITIGATION 100
> > > > +#define HZ_PER_KHZ		1000
> > > > +#define SCALE_ERROR_MITIGATION	100
> > > >   static DEFINE_IDA(devfreq_ida);
> > > > @@ -53,6 +55,8 @@ static DEFINE_IDA(devfreq_ida);
> > > >    *		'utilization' (which is	'busy_time / 'total_time').
> > > >    *		The 'res_util' range is from 100 to (power_table[state] * 100)
> > > >    *		for the corresponding 'state'.
> > > > + * @req_max_freq:	PM QoS request for limiting the maximum frequency
> > > > + *			of the devfreq device.
> > > >    */
> > > >   struct devfreq_cooling_device {
> > > >   	int id;
> > > > @@ -65,49 +69,9 @@ struct devfreq_cooling_device {
> > > >   	struct devfreq_cooling_power *power_ops;
> > > >   	u32 res_util;
> > > >   	int capped_state;
> > > > +	struct dev_pm_qos_request req_max_freq;
> > > >   };
> > > > -/**
> > > > - * partition_enable_opps() - disable all opps above a given state
> > > > - * @dfc:	Pointer to devfreq we are operating on
> > > > - * @cdev_state:	cooling device state we're setting
> > > > - *
> > > > - * Go through the OPPs of the device, enabling all OPPs until
> > > > - * @cdev_state and disabling those frequencies above it.
> > > > - */
> > > > -static int partition_enable_opps(struct devfreq_cooling_device *dfc,
> > > > -				 unsigned long cdev_state)
> > > > -{
> > > > -	int i;
> > > > -	struct device *dev = dfc->devfreq->dev.parent;
> > > > -
> > > > -	for (i = 0; i < dfc->freq_table_size; i++) {
> > > > -		struct dev_pm_opp *opp;
> > > > -		int ret = 0;
> > > > -		unsigned int freq = dfc->freq_table[i];
> > > > -		bool want_enable = i >= cdev_state ? true : false;
> > > > -
> > > > -		opp = dev_pm_opp_find_freq_exact(dev, freq, !want_enable);
> > > > -
> > > > -		if (PTR_ERR(opp) == -ERANGE)
> > > > -			continue;
> > > > -		else if (IS_ERR(opp))
> > > > -			return PTR_ERR(opp);
> > > > -
> > > > -		dev_pm_opp_put(opp);
> > > > -
> > > > -		if (want_enable)
> > > > -			ret = dev_pm_opp_enable(dev, freq);
> > > > -		else
> > > > -			ret = dev_pm_opp_disable(dev, freq);
> > > > -
> > > > -		if (ret)
> > > > -			return ret;
> > > > -	}
> > > > -
> > > > -	return 0;
> > > > -}
> > > > -
> > > >   static int devfreq_cooling_get_max_state(struct thermal_cooling_device *cdev,
> > > >   					 unsigned long *state)
> > > >   {
> > > > @@ -134,7 +98,7 @@ static int devfreq_cooling_set_cur_state(struct thermal_cooling_device *cdev,
> > > >   	struct devfreq_cooling_device *dfc = cdev->devdata;
> > > >   	struct devfreq *df = dfc->devfreq;
> > > >   	struct device *dev = df->dev.parent;
> > > > -	int ret;
> > > > +	unsigned long freq;
> > > >   	if (state == dfc->cooling_state)
> > > >   		return 0;
> > > > @@ -144,9 +108,10 @@ static int devfreq_cooling_set_cur_state(struct thermal_cooling_device *cdev,
> > > >   	if (state >= dfc->freq_table_size)
> > > >   		return -EINVAL;
> > > > -	ret = partition_enable_opps(dfc, state);
> > > > -	if (ret)
> > > > -		return ret;
> > > > +	freq = dfc->freq_table[state];
> > > > +
> > > > +	dev_pm_qos_update_request(&dfc->req_max_freq,
> > > > +				  DIV_ROUND_UP(freq, HZ_PER_KHZ));
> > > >   	dfc->cooling_state = state;
> > > > @@ -529,9 +494,15 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df,
> > > >   	if (err)
> > > >   		goto free_dfc;
> > > > -	err = ida_simple_get(&devfreq_ida, 0, 0, GFP_KERNEL);
> > > > +	err = dev_pm_qos_add_request(df->dev.parent, &dfc->req_max_freq,
> > > > +				     DEV_PM_QOS_MAX_FREQUENCY,
> > > > +				     PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE);
> > > >   	if (err < 0)
> > > >   		goto free_tables;
> > > > +
> > > > +	err = ida_simple_get(&devfreq_ida, 0, 0, GFP_KERNEL);
> > > > +	if (err < 0)
> > > > +		goto remove_qos_req;
> > > >   	dfc->id = err;
> > > >   	snprintf(dev_name, sizeof(dev_name), "thermal-devfreq-%d", dfc->id);
> > > > @@ -552,6 +523,10 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df,
> > > >   release_ida:
> > > >   	ida_simple_remove(&devfreq_ida, dfc->id);
> > > > +
> > > > +remove_qos_req:
> > > > +	dev_pm_qos_remove_request(&dfc->req_max_freq);
> > > > +
> > > >   free_tables:
> > > >   	kfree(dfc->power_table);
> > > >   	kfree(dfc->freq_table);
> > > > @@ -600,6 +575,7 @@ void devfreq_cooling_unregister(struct thermal_cooling_device *cdev)
> > > >   	thermal_cooling_device_unregister(dfc->cdev);
> > > >   	ida_simple_remove(&devfreq_ida, dfc->id);
> > > > +	dev_pm_qos_remove_request(&dfc->req_max_freq);
> > > >   	kfree(dfc->power_table);
> > > >   	kfree(dfc->freq_table);
> > > > 
> > > 
> > > Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
> > > 
> > > -- 
> > > Best Regards,
> > > Chanwoo Choi
> > > Samsung Electronics
Lukasz Luba March 12, 2020, 8:26 p.m. UTC | #5
On 3/12/20 5:57 PM, Matthias Kaehlcke wrote:
> Hi Lukasz,
> 
> thanks for the review!
> 
> I'll rebase and send v3. Hopefully it doesn't cause too much extra
> work for your changes.

No worries, your change is really needed. It will simplify the code
that I am working on (can be found here [1]). I don't have to
deal with the race with the devfreq governor now.

Thank you for working on it.

Lukasz

[1] https://lkml.org/lkml/2020/3/9/475
diff mbox series

Patch

diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c
index ef59256887ff63..cbbaf5bc425d1a 100644
--- a/drivers/thermal/devfreq_cooling.c
+++ b/drivers/thermal/devfreq_cooling.c
@@ -24,11 +24,13 @@ 
 #include <linux/idr.h>
 #include <linux/slab.h>
 #include <linux/pm_opp.h>
+#include <linux/pm_qos.h>
 #include <linux/thermal.h>
 
 #include <trace/events/thermal.h>
 
-#define SCALE_ERROR_MITIGATION 100
+#define HZ_PER_KHZ		1000
+#define SCALE_ERROR_MITIGATION	100
 
 static DEFINE_IDA(devfreq_ida);
 
@@ -53,6 +55,8 @@  static DEFINE_IDA(devfreq_ida);
  *		'utilization' (which is	'busy_time / 'total_time').
  *		The 'res_util' range is from 100 to (power_table[state] * 100)
  *		for the corresponding 'state'.
+ * @req_max_freq:	PM QoS request for limiting the maximum frequency
+ *			of the devfreq device.
  */
 struct devfreq_cooling_device {
 	int id;
@@ -65,49 +69,9 @@  struct devfreq_cooling_device {
 	struct devfreq_cooling_power *power_ops;
 	u32 res_util;
 	int capped_state;
+	struct dev_pm_qos_request req_max_freq;
 };
 
-/**
- * partition_enable_opps() - disable all opps above a given state
- * @dfc:	Pointer to devfreq we are operating on
- * @cdev_state:	cooling device state we're setting
- *
- * Go through the OPPs of the device, enabling all OPPs until
- * @cdev_state and disabling those frequencies above it.
- */
-static int partition_enable_opps(struct devfreq_cooling_device *dfc,
-				 unsigned long cdev_state)
-{
-	int i;
-	struct device *dev = dfc->devfreq->dev.parent;
-
-	for (i = 0; i < dfc->freq_table_size; i++) {
-		struct dev_pm_opp *opp;
-		int ret = 0;
-		unsigned int freq = dfc->freq_table[i];
-		bool want_enable = i >= cdev_state ? true : false;
-
-		opp = dev_pm_opp_find_freq_exact(dev, freq, !want_enable);
-
-		if (PTR_ERR(opp) == -ERANGE)
-			continue;
-		else if (IS_ERR(opp))
-			return PTR_ERR(opp);
-
-		dev_pm_opp_put(opp);
-
-		if (want_enable)
-			ret = dev_pm_opp_enable(dev, freq);
-		else
-			ret = dev_pm_opp_disable(dev, freq);
-
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
 static int devfreq_cooling_get_max_state(struct thermal_cooling_device *cdev,
 					 unsigned long *state)
 {
@@ -134,7 +98,7 @@  static int devfreq_cooling_set_cur_state(struct thermal_cooling_device *cdev,
 	struct devfreq_cooling_device *dfc = cdev->devdata;
 	struct devfreq *df = dfc->devfreq;
 	struct device *dev = df->dev.parent;
-	int ret;
+	unsigned long freq;
 
 	if (state == dfc->cooling_state)
 		return 0;
@@ -144,9 +108,10 @@  static int devfreq_cooling_set_cur_state(struct thermal_cooling_device *cdev,
 	if (state >= dfc->freq_table_size)
 		return -EINVAL;
 
-	ret = partition_enable_opps(dfc, state);
-	if (ret)
-		return ret;
+	freq = dfc->freq_table[state];
+
+	dev_pm_qos_update_request(&dfc->req_max_freq,
+				  DIV_ROUND_UP(freq, HZ_PER_KHZ));
 
 	dfc->cooling_state = state;
 
@@ -529,9 +494,15 @@  of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df,
 	if (err)
 		goto free_dfc;
 
-	err = ida_simple_get(&devfreq_ida, 0, 0, GFP_KERNEL);
+	err = dev_pm_qos_add_request(df->dev.parent, &dfc->req_max_freq,
+				     DEV_PM_QOS_MAX_FREQUENCY,
+				     PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE);
 	if (err < 0)
 		goto free_tables;
+
+	err = ida_simple_get(&devfreq_ida, 0, 0, GFP_KERNEL);
+	if (err < 0)
+		goto remove_qos_req;
 	dfc->id = err;
 
 	snprintf(dev_name, sizeof(dev_name), "thermal-devfreq-%d", dfc->id);
@@ -552,6 +523,10 @@  of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df,
 
 release_ida:
 	ida_simple_remove(&devfreq_ida, dfc->id);
+
+remove_qos_req:
+	dev_pm_qos_remove_request(&dfc->req_max_freq);
+
 free_tables:
 	kfree(dfc->power_table);
 	kfree(dfc->freq_table);
@@ -600,6 +575,7 @@  void devfreq_cooling_unregister(struct thermal_cooling_device *cdev)
 
 	thermal_cooling_device_unregister(dfc->cdev);
 	ida_simple_remove(&devfreq_ida, dfc->id);
+	dev_pm_qos_remove_request(&dfc->req_max_freq);
 	kfree(dfc->power_table);
 	kfree(dfc->freq_table);