
[v3,3/6] cpufreq: qcom-cpufreq-hw: Add dcvs interrupt support

Message ID: 20210708120656.663851-4-thara.gopinath@linaro.org
State: Not Applicable, archived
Series: Introduce LMh driver for Qualcomm SoCs

Commit Message

Thara Gopinath July 8, 2021, 12:06 p.m. UTC
Add interrupt support to notify the kernel of h/w initiated frequency
throttling by LMh. Convey this to scheduler via thermal pressure
interface.

Signed-off-by: Thara Gopinath <thara.gopinath@linaro.org>
---

v2->v3:
	- Cosmetic fixes from review comments on the list.
	- Moved all LMh initializations to qcom_cpufreq_hw_lmh_init.
	- Added freeing of LMh interrupt and cancelling the polling worker to
	  qcom_cpufreq_hw_cpu_exit as per Viresh's suggestion.
	- LMh interrupts are now tied to cpu dev and not cpufreq dev. This will be
	  useful for future generations of SoCs where the same interrupt signals
	  multiple cpu clusters.

v1->v2:
	- Introduced qcom_cpufreq_hw_lmh_init to consolidate LMh related initializations
	  as per Viresh's review comment.
	- Moved the piece of code restarting polling/re-enabling LMh interrupt to
	  qcom_lmh_dcvs_notify thereby simplifying isr and timer callback as per Viresh's
	  suggestion.
	- Dropped cpus from qcom_cpufreq_data and instead using cpus from cpufreq_policy in
	  qcom_lmh_dcvs_notify as per Viresh's review comment.
	- Dropped dt property qcom,support-lmh as per Bjorn's suggestion.
	- Other minor/cosmetic fixes

 drivers/cpufreq/qcom-cpufreq-hw.c | 118 ++++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)

Comments

Viresh Kumar July 9, 2021, 6:46 a.m. UTC | #1
On 08-07-21, 08:06, Thara Gopinath wrote:
>  static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
>  {
>  	struct platform_device *pdev = cpufreq_get_driver_data();
> @@ -370,6 +480,10 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
>  			dev_warn(cpu_dev, "failed to enable boost: %d\n", ret);
>  	}
>  
> +	ret = qcom_cpufreq_hw_lmh_init(policy, index);

You missed unregistering EM here (which is also missing from exit,
which you need to fix first in a separate patch).

> +	if (ret)
> +		goto error;
> +
>  	return 0;
>  error:
>  	kfree(data);
> @@ -389,6 +503,10 @@ static int qcom_cpufreq_hw_cpu_exit(struct cpufreq_policy *policy)
>  
>  	dev_pm_opp_remove_all_dynamic(cpu_dev);
>  	dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
> +	if (data->lmh_dcvs_irq > 0) {
> +		devm_free_irq(cpu_dev, data->lmh_dcvs_irq, data);

Why use devm variants here and while requesting the irq?

> +		cancel_delayed_work_sync(&data->lmh_dcvs_poll_work);
> +	}

Please move this to qcom_cpufreq_hw_lmh_exit() or something.
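
(A sketch of such a helper, assuming the non-devm free suggested elsewhere
in this review; the name follows Viresh's suggestion, and the ordering race
he describes next still applies:)

static void qcom_cpufreq_hw_lmh_exit(struct qcom_cpufreq_data *data)
{
	/* Naive teardown order; see the race discussed below */
	free_irq(data->lmh_dcvs_irq, data);
	cancel_delayed_work_sync(&data->lmh_dcvs_poll_work);
}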

Now with sequence of disabling interrupt, etc, I see a potential
problem.

CPU0                                    CPU1

qcom_cpufreq_hw_cpu_exit()
-> devm_free_irq();
                                        qcom_lmh_dcvs_poll()
                                        -> qcom_lmh_dcvs_notify()
                                          -> enable_irq()

-> cancel_delayed_work_sync();


What will happen if enable_irq() gets called after freeing the irq ?
Not sure, but it looks like you will hit this then from manage.c:

WARN(!desc->irq_data.chip, KERN_ERR "enable_irq before
                                     setup/request_irq: irq %u\n", irq))

?

You got a chicken n egg problem :)
Thara Gopinath July 9, 2021, 3:37 p.m. UTC | #2
On 7/9/21 2:46 AM, Viresh Kumar wrote:
> On 08-07-21, 08:06, Thara Gopinath wrote:
>>   static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
>>   {
>>   	struct platform_device *pdev = cpufreq_get_driver_data();
>> @@ -370,6 +480,10 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
>>   			dev_warn(cpu_dev, "failed to enable boost: %d\n", ret);
>>   	}
>>   
>> +	ret = qcom_cpufreq_hw_lmh_init(policy, index);
> 
> You missed unregistering EM here (which is also missing from exit,
> which you need to fix first in a separate patch).

Hi!

So how exactly do you do this? I checked other users of the api and I do 
not see any free. I would say if needed, it should be a separate patch 
and outside of this series.

> 
>> +	if (ret)
>> +		goto error;
>> +
>>   	return 0;
>>   error:
>>   	kfree(data);
>> @@ -389,6 +503,10 @@ static int qcom_cpufreq_hw_cpu_exit(struct cpufreq_policy *policy)
>>   
>>   	dev_pm_opp_remove_all_dynamic(cpu_dev);
>>   	dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
>> +	if (data->lmh_dcvs_irq > 0) {
>> +		devm_free_irq(cpu_dev, data->lmh_dcvs_irq, data);
> 
> Why use devm variants here and while requesting the irq?
> 
>> +		cancel_delayed_work_sync(&data->lmh_dcvs_poll_work);
>> +	}
> 
> Please move this to qcom_cpufreq_hw_lmh_exit() or something.

Ok.

> 
> Now with sequence of disabling interrupt, etc, I see a potential
> problem.
> 
> CPU0                                    CPU1
> 
> qcom_cpufreq_hw_cpu_exit()
> -> devm_free_irq();
>                                          qcom_lmh_dcvs_poll()
>                                          -> qcom_lmh_dcvs_notify()
>                                            -> enable_irq()
> 
> -> cancel_delayed_work_sync();
> 
> 
> What will happen if enable_irq() gets called after freeing the irq ?
> Not sure, but it looks like you will hit this then from manage.c:
> 
> WARN(!desc->irq_data.chip, KERN_ERR "enable_irq before
>                                       setup/request_irq: irq %u\n", irq))
> 
> ?
> 
> You got a chicken n egg problem :)

Yes indeed! But it is also a very rare chicken and egg problem.
The scenario here is that the CPUs are busy running a load that causes a
thermal overrun and LMh is engaged. At the same time, for this issue to
be hit, the CPU is trying to exit/disable cpufreq. Calling
cancel_delayed_work_sync() first could solve this issue, right?
cancel_delayed_work_sync() guarantees the work is not pending even if
it requeues itself on return. So once the delayed work is cancelled, the
interrupt can be safely disabled. Thoughts?


>
Bjorn Andersson July 10, 2021, 4:57 a.m. UTC | #3
On Thu 08 Jul 07:06 CDT 2021, Thara Gopinath wrote:

> Add interrupt support to notify the kernel of h/w initiated frequency
> throttling by LMh. Convey this to scheduler via thermal pressure
> interface.
> 
> Signed-off-by: Thara Gopinath <thara.gopinath@linaro.org>
> ---
> 
> v2->v3:
> 	- Cosmetic fixes from review comments on the list.
> 	- Moved all LMh initializations to qcom_cpufreq_hw_lmh_init.
> 	- Added freeing of LMh interrupt and cancelling the polling worker to
> 	  qcom_cpufreq_hw_cpu_exit as per Viresh's suggestion.
> 	- LMh interrupts are now tied to cpu dev and not cpufreq dev. This will be
> 	  useful for future generations of SoCs where the same interrupt signals
> 	  multiple cpu clusters.
> 
> v1->v2:
> 	- Introduced qcom_cpufreq_hw_lmh_init to consolidate LMh related initializations
> 	  as per Viresh's review comment.
> 	- Moved the piece of code restarting polling/re-enabling LMh interrupt to
> 	  qcom_lmh_dcvs_notify thereby simplifying isr and timer callback as per Viresh's
> 	  suggestion.
> 	- Dropped cpus from qcom_cpufreq_data and instead using cpus from cpufreq_policy in
> 	  qcom_lmh_dcvs_notify as per Viresh's review comment.
> 	- Dropped dt property qcom,support-lmh as per Bjorn's suggestion.
> 	- Other minor/cosmetic fixes
> 
>  drivers/cpufreq/qcom-cpufreq-hw.c | 118 ++++++++++++++++++++++++++++++
>  1 file changed, 118 insertions(+)
> 
> diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
> index f86859bf76f1..bb5fc700d913 100644
> --- a/drivers/cpufreq/qcom-cpufreq-hw.c
> +++ b/drivers/cpufreq/qcom-cpufreq-hw.c
> @@ -7,6 +7,7 @@
>  #include <linux/cpufreq.h>
>  #include <linux/init.h>
>  #include <linux/interconnect.h>
> +#include <linux/interrupt.h>
>  #include <linux/kernel.h>
>  #include <linux/module.h>
>  #include <linux/of_address.h>
> @@ -22,10 +23,13 @@
>  #define CLK_HW_DIV			2
>  #define LUT_TURBO_IND			1
>  
> +#define HZ_PER_KHZ			1000
> +
>  struct qcom_cpufreq_soc_data {
>  	u32 reg_enable;
>  	u32 reg_freq_lut;
>  	u32 reg_volt_lut;
> +	u32 reg_current_vote;
>  	u32 reg_perf_state;
>  	u8 lut_row_size;
>  };
> @@ -33,7 +37,10 @@ struct qcom_cpufreq_soc_data {
>  struct qcom_cpufreq_data {
>  	void __iomem *base;
>  	struct resource *res;
> +	struct delayed_work lmh_dcvs_poll_work;

How about dropping "lmh" from this variable name?

Perhaps "throttle_work" or something like that?

>  	const struct qcom_cpufreq_soc_data *soc_data;
> +	struct cpufreq_policy *policy;
> +	int lmh_dcvs_irq;

throttle_irq ?

>  };
>  
>  static unsigned long cpu_hw_rate, xo_rate;
> @@ -251,10 +258,84 @@ static void qcom_get_related_cpus(int index, struct cpumask *m)
>  	}
>  }
>  
> +static inline unsigned long qcom_lmh_vote_to_freq(u32 val)
> +{
> +	return (val & 0x3FF) * 19200;
> +}
> +
> +static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data)
> +{
> +	struct cpufreq_policy *policy = data->policy;
> +	struct dev_pm_opp *opp;
> +	struct device *dev;
> +	unsigned long max_capacity, capacity, freq_hz, throttled_freq;
> +	unsigned int val, freq;
> +
> +	/*
> +	 * Get the h/w throttled frequency, normalize it using the
> +	 * registered opp table and use it to calculate thermal pressure.
> +	 */
> +	val = readl_relaxed(data->base + data->soc_data->reg_current_vote);

I would find it cleaner to move the readl() into the helper function, as
you don't care about the register value, only the resulting frequency.
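
(A minimal sketch of that refactor; the helper name is illustrative, not
from the posted patch:)

static unsigned long qcom_lmh_get_throttled_freq(struct qcom_cpufreq_data *data)
{
	u32 val = readl_relaxed(data->base + data->soc_data->reg_current_vote);

	/* The low 10 bits encode the vote in 19.2 MHz steps; result is in kHz */
	return (val & 0x3FF) * 19200;
}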

> +	freq = qcom_lmh_vote_to_freq(val);
> +	freq_hz = freq * HZ_PER_KHZ;
> +
> +	dev = get_cpu_device(cpumask_first(policy->cpus));
> +	opp = dev_pm_opp_find_freq_floor(dev, &freq_hz);
> +	if (IS_ERR(opp) && PTR_ERR(opp) == -ERANGE)
> +		opp = dev_pm_opp_find_freq_ceil(dev, &freq_hz);
> +
> +	throttled_freq = freq_hz / HZ_PER_KHZ;
> +
> +	/* Update thermal pressure */
> +
> +	max_capacity = arch_scale_cpu_capacity(cpumask_first(policy->cpus));
> +	capacity = throttled_freq * max_capacity;
> +	capacity /= policy->cpuinfo.max_freq;

Perhaps, to avoid overflows if this is ever used on a 32-bit platform
use:

	mult_frac(max_capacity, throttled_freq, policy->cpuinfo.max_freq)
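
(With that, the capacity computation would read, as a sketch:

	capacity = mult_frac(max_capacity, throttled_freq,
			     policy->cpuinfo.max_freq);

mult_frac() divides before multiplying, keeping the intermediate products
small enough that the multiplication cannot overflow here.)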

> +
> +	/* Don't pass boost capacity to scheduler */
> +	if (capacity > max_capacity)
> +		capacity = max_capacity;
> +
> +	arch_set_thermal_pressure(policy->cpus, max_capacity - capacity);
> +
> +	/*
> +	 * If h/w throttled frequency is higher than what cpufreq has requested for, stop
> +	 * polling and switch back to interrupt mechanism
> +	 */
> +
> +	if (throttled_freq >= qcom_cpufreq_hw_get(cpumask_first(policy->cpus)))
> +		/* Clear the existing interrupts and enable it back */
> +		enable_irq(data->lmh_dcvs_irq);
> +	else
> +		mod_delayed_work(system_highpri_wq, &data->lmh_dcvs_poll_work,
> +				 msecs_to_jiffies(10));
> +}
> +
> +static void qcom_lmh_dcvs_poll(struct work_struct *work)
> +{
> +	struct qcom_cpufreq_data *data;
> +
> +	data = container_of(work, struct qcom_cpufreq_data, lmh_dcvs_poll_work.work);
> +
> +	qcom_lmh_dcvs_notify(data);
> +}
> +
> +static irqreturn_t qcom_lmh_dcvs_handle_irq(int irq, void *data)
> +{
> +	struct qcom_cpufreq_data *c_data = data;
> +
> +	/* Disable interrupt and enable polling */
> +	disable_irq_nosync(c_data->lmh_dcvs_irq);
> +	qcom_lmh_dcvs_notify(c_data);
> +
> +	return 0;
> +}
> +
>  static const struct qcom_cpufreq_soc_data qcom_soc_data = {
>  	.reg_enable = 0x0,
>  	.reg_freq_lut = 0x110,
>  	.reg_volt_lut = 0x114,
> +	.reg_current_vote = 0x704,
>  	.reg_perf_state = 0x920,
>  	.lut_row_size = 32,
>  };
> @@ -274,6 +355,35 @@ static const struct of_device_id qcom_cpufreq_hw_match[] = {
>  };
>  MODULE_DEVICE_TABLE(of, qcom_cpufreq_hw_match);
>  
> +static int qcom_cpufreq_hw_lmh_init(struct cpufreq_policy *policy, int index)
> +{
> +	struct qcom_cpufreq_data *data = policy->driver_data;
> +	struct platform_device *pdev = cpufreq_get_driver_data();
> +	struct device *cpu_dev = get_cpu_device(policy->cpu);
> +	char irq_name[15];
> +	int ret;
> +
> +	/*
> +	 * Look for LMh interrupt. If no interrupt line is specified /
> +	 * if there is an error, allow cpufreq to be enabled as usual.
> +	 */
> +	data->lmh_dcvs_irq = platform_get_irq(pdev, index);
> +	if (data->lmh_dcvs_irq <= 0)
> +		return data->lmh_dcvs_irq == -EPROBE_DEFER ? -EPROBE_DEFER : 0;
> +
> +	snprintf(irq_name, sizeof(irq_name), "dcvsh-irq-%u", policy->cpu);
> +	ret = devm_request_irq(cpu_dev, data->lmh_dcvs_irq, qcom_lmh_dcvs_handle_irq,
> +			       0, irq_name, data);
> +	if (ret) {
> +		dev_err(&pdev->dev, "Error %d registering irq %x\n", ret, data->lmh_dcvs_irq);

The irq number here won't have any meaning, and %x wouldn't be suitable.

How about ..."Error registering %s: %d\n", irq_name, ret); ?

> +		return 0;

This sounds like a problem, wouldn't it be suitable to treat it as a
problem?

> +	}
> +	data->policy = policy;

Afaict, no one is going to access data->policy unless devm_request_irq()
succeeds and if it does and the interrupt fires immediately it would be
too late to set it here. So better move it earlier.

> +	INIT_DEFERRABLE_WORK(&data->lmh_dcvs_poll_work, qcom_lmh_dcvs_poll);

What if the interrupt fires before you initialize the work? Better move
this higher up.
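
(Folding in the ordering and error-string comments above, the init sequence
could look roughly like this sketch; request_irq() stands in for the devm
variant questioned later in the thread:)

	data->policy = policy;
	INIT_DEFERRABLE_WORK(&data->lmh_dcvs_poll_work, qcom_lmh_dcvs_poll);

	/*
	 * Note: request_irq() keeps the name pointer, so irq_name would need
	 * to live in struct qcom_cpufreq_data rather than on the stack.
	 */
	snprintf(irq_name, sizeof(irq_name), "dcvsh-irq-%u", policy->cpu);
	ret = request_irq(data->lmh_dcvs_irq, qcom_lmh_dcvs_handle_irq,
			  0, irq_name, data);
	if (ret) {
		dev_err(&pdev->dev, "Error registering %s: %d\n", irq_name, ret);
		return 0;
	}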

> +
> +	return 0;
> +}
> +
>  static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
>  {
>  	struct platform_device *pdev = cpufreq_get_driver_data();
> @@ -370,6 +480,10 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
>  			dev_warn(cpu_dev, "failed to enable boost: %d\n", ret);
>  	}
>  
> +	ret = qcom_cpufreq_hw_lmh_init(policy, index);
> +	if (ret)
> +		goto error;
> +
>  	return 0;
>  error:
>  	kfree(data);
> @@ -389,6 +503,10 @@ static int qcom_cpufreq_hw_cpu_exit(struct cpufreq_policy *policy)
>  
>  	dev_pm_opp_remove_all_dynamic(cpu_dev);
>  	dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
> +	if (data->lmh_dcvs_irq > 0) {
> +		devm_free_irq(cpu_dev, data->lmh_dcvs_irq, data);

As init/exit are called multiple times you should avoid the devm
variants.

Regards,
Bjorn

> +		cancel_delayed_work_sync(&data->lmh_dcvs_poll_work);
> +	}
>  	kfree(policy->freq_table);
>  	kfree(data);
>  	iounmap(base);
> -- 
> 2.25.1
>
Viresh Kumar July 12, 2021, 4:35 a.m. UTC | #4
+Lukasz,

On 09-07-21, 11:37, Thara Gopinath wrote:
> On 7/9/21 2:46 AM, Viresh Kumar wrote:
> > On 08-07-21, 08:06, Thara Gopinath wrote:
> > >   static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
> > >   {
> > >   	struct platform_device *pdev = cpufreq_get_driver_data();
> > > @@ -370,6 +480,10 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
> > >   			dev_warn(cpu_dev, "failed to enable boost: %d\n", ret);
> > >   	}
> > > +	ret = qcom_cpufreq_hw_lmh_init(policy, index);
> > 
> > You missed unregistering EM here (which is also missing from exit,
> > which you need to fix first in a separate patch).
> 
> Hi!
> 
> So how exactly do you do this? I checked other users of the api and I do not
> see any free.

Lukasz,

I don't see the cpufreq drivers ever calling dev_pm_opp_of_unregister_em(), and
even if they did, it would translate to em_dev_unregister_perf_domain(),
which has this:

	if (_is_cpu_device(dev))
		return;

I am not sure what's going on here, can you help ?
Viresh Kumar July 12, 2021, 4:41 a.m. UTC | #5
On 09-07-21, 11:37, Thara Gopinath wrote:
> On 7/9/21 2:46 AM, Viresh Kumar wrote:
> > > @@ -389,6 +503,10 @@ static int qcom_cpufreq_hw_cpu_exit(struct cpufreq_policy *policy)
> > >   	dev_pm_opp_remove_all_dynamic(cpu_dev);
> > >   	dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
> > > +	if (data->lmh_dcvs_irq > 0) {
> > > +		devm_free_irq(cpu_dev, data->lmh_dcvs_irq, data);
> > 
> > Why use devm variants here and while requesting the irq?

Missed this one ?

> > 
> > > +		cancel_delayed_work_sync(&data->lmh_dcvs_poll_work);
> > > +	}
> > 
> > Please move this to qcom_cpufreq_hw_lmh_exit() or something.
> 
> Ok.
> 
> > 
> > Now with sequence of disabling interrupt, etc, I see a potential
> > problem.
> > 
> > CPU0                                    CPU1
> > 
> > qcom_cpufreq_hw_cpu_exit()
> > -> devm_free_irq();
> >                                          qcom_lmh_dcvs_poll()
> >                                          -> qcom_lmh_dcvs_notify()
> >                                            -> enable_irq()
> > 
> > -> cancel_delayed_work_sync();
> > 
> > 
> > What will happen if enable_irq() gets called after freeing the irq ?
> > Not sure, but it looks like you will hit this then from manage.c:
> > 
> > WARN(!desc->irq_data.chip, KERN_ERR "enable_irq before
> >                                       setup/request_irq: irq %u\n", irq))
> > 
> > ?
> > 
> > You got a chicken n egg problem :)
> 
> Yes indeed! But it is also a very rare chicken and egg problem.
> The scenario here is that the CPUs are busy running a load that causes a
> thermal overrun and LMh is engaged. At the same time, for this issue to
> be hit, the CPU is trying to exit/disable cpufreq.

Yes, it is a very specific case but it needs to be resolved anyway. You don't
want to get this ever :)

> Calling
> cancel_delayed_work_sync() first could solve this issue, right?
> cancel_delayed_work_sync() guarantees the work is not pending even if
> it requeues itself on return. So once the delayed work is cancelled, the
> interrupt can be safely disabled. Thoughts?

I don't think even that would provide such guarantees to you here, as there is
a chance the work gets queued again because of an interrupt that triggers right
after you cancel the work.

The basic way of solving such issues is that once you cancel something, you need
to guarantee that it doesn't get triggered again, no matter what.

The problem I see here is with the design itself: the delayed work and the
irq can each re-enable the other, so no matter which one you disable first,
it won't be sufficient. You need to fix that design somehow.
Thara Gopinath July 13, 2021, 1:09 a.m. UTC | #6
On 7/10/21 12:57 AM, Bjorn Andersson wrote:
> On Thu 08 Jul 07:06 CDT 2021, Thara Gopinath wrote:
> 
>> Add interrupt support to notify the kernel of h/w initiated frequency
>> throttling by LMh. Convey this to scheduler via thermal pressure
>> interface.
>>
>> Signed-off-by: Thara Gopinath <thara.gopinath@linaro.org>
>> ---
>>
>> v2->v3:
>> 	- Cosmetic fixes from review comments on the list.
>> 	- Moved all LMh initializations to qcom_cpufreq_hw_lmh_init.
>> 	- Added freeing of LMh interrupt and cancelling the polling worker to
>> 	  qcom_cpufreq_hw_cpu_exit as per Viresh's suggestion.
>> 	- LMh interrupts are now tied to cpu dev and not cpufreq dev. This will be
>> 	  useful for future generations of SoCs where the same interrupt signals
>> 	  multiple cpu clusters.
>>
>> v1->v2:
>> 	- Introduced qcom_cpufreq_hw_lmh_init to consolidate LMh related initializations
>> 	  as per Viresh's review comment.
>> 	- Moved the piece of code restarting polling/re-enabling LMh interrupt to
>> 	  qcom_lmh_dcvs_notify thereby simplifying isr and timer callback as per Viresh's
>> 	  suggestion.
>> 	- Dropped cpus from qcom_cpufreq_data and instead using cpus from cpufreq_policy in
>> 	  qcom_lmh_dcvs_notify as per Viresh's review comment.
>> 	- Dropped dt property qcom,support-lmh as per Bjorn's suggestion.
>> 	- Other minor/cosmetic fixes
>>
>>   drivers/cpufreq/qcom-cpufreq-hw.c | 118 ++++++++++++++++++++++++++++++
>>   1 file changed, 118 insertions(+)
>>
>> diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
>> index f86859bf76f1..bb5fc700d913 100644
>> --- a/drivers/cpufreq/qcom-cpufreq-hw.c
>> +++ b/drivers/cpufreq/qcom-cpufreq-hw.c
>> @@ -7,6 +7,7 @@
>>   #include <linux/cpufreq.h>
>>   #include <linux/init.h>
>>   #include <linux/interconnect.h>
>> +#include <linux/interrupt.h>
>>   #include <linux/kernel.h>
>>   #include <linux/module.h>
>>   #include <linux/of_address.h>
>> @@ -22,10 +23,13 @@
>>   #define CLK_HW_DIV			2
>>   #define LUT_TURBO_IND			1
>>   
>> +#define HZ_PER_KHZ			1000
>> +
>>   struct qcom_cpufreq_soc_data {
>>   	u32 reg_enable;
>>   	u32 reg_freq_lut;
>>   	u32 reg_volt_lut;
>> +	u32 reg_current_vote;
>>   	u32 reg_perf_state;
>>   	u8 lut_row_size;
>>   };
>> @@ -33,7 +37,10 @@ struct qcom_cpufreq_soc_data {
>>   struct qcom_cpufreq_data {
>>   	void __iomem *base;
>>   	struct resource *res;
>> +	struct delayed_work lmh_dcvs_poll_work;
> 
> How about dropping "lmh" from this variable name?
> 
> Perhaps "throttle_work" or something like that?
> 
>>   	const struct qcom_cpufreq_soc_data *soc_data;
>> +	struct cpufreq_policy *policy;
>> +	int lmh_dcvs_irq;
> 
> throttle_irq ?

sounds good!

> 
>>   };
>>   
>>   static unsigned long cpu_hw_rate, xo_rate;
>> @@ -251,10 +258,84 @@ static void qcom_get_related_cpus(int index, struct cpumask *m)
>>   	}
>>   }
>>   
>> +static inline unsigned long qcom_lmh_vote_to_freq(u32 val)
>> +{
>> +	return (val & 0x3FF) * 19200;
>> +}
>> +
>> +static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data)
>> +{
>> +	struct cpufreq_policy *policy = data->policy;
>> +	struct dev_pm_opp *opp;
>> +	struct device *dev;
>> +	unsigned long max_capacity, capacity, freq_hz, throttled_freq;
>> +	unsigned int val, freq;
>> +
>> +	/*
>> +	 * Get the h/w throttled frequency, normalize it using the
>> +	 * registered opp table and use it to calculate thermal pressure.
>> +	 */
>> +	val = readl_relaxed(data->base + data->soc_data->reg_current_vote);
> 
> I would find it cleaner to move the readl() into the helper function, as
> you don't care about the register value, only the resulting frequency.

Ok..

> 
>> +	freq = qcom_lmh_vote_to_freq(val);
>> +	freq_hz = freq * HZ_PER_KHZ;
>> +
>> +	dev = get_cpu_device(cpumask_first(policy->cpus));
>> +	opp = dev_pm_opp_find_freq_floor(dev, &freq_hz);
>> +	if (IS_ERR(opp) && PTR_ERR(opp) == -ERANGE)
>> +		opp = dev_pm_opp_find_freq_ceil(dev, &freq_hz);
>> +
>> +	throttled_freq = freq_hz / HZ_PER_KHZ;
>> +
>> +	/* Update thermal pressure */
>> +
>> +	max_capacity = arch_scale_cpu_capacity(cpumask_first(policy->cpus));
>> +	capacity = throttled_freq * max_capacity;
>> +	capacity /= policy->cpuinfo.max_freq;
> 
> Perhaps, to avoid overflows if this is ever used on a 32-bit platform
> use:
> 
> 	mult_frac(max_capacity, throttled_freq, policy->cpuinfo.max_freq)

yep. sounds good.

> 
>> +
>> +	/* Don't pass boost capacity to scheduler */
>> +	if (capacity > max_capacity)
>> +		capacity = max_capacity;
>> +
>> +	arch_set_thermal_pressure(policy->cpus, max_capacity - capacity);
>> +
>> +	/*
>> +	 * If h/w throttled frequency is higher than what cpufreq has requested for, stop
>> +	 * polling and switch back to interrupt mechanism
>> +	 */
>> +
>> +	if (throttled_freq >= qcom_cpufreq_hw_get(cpumask_first(policy->cpus)))
>> +		/* Clear the existing interrupts and enable it back */
>> +		enable_irq(data->lmh_dcvs_irq);
>> +	else
>> +		mod_delayed_work(system_highpri_wq, &data->lmh_dcvs_poll_work,
>> +				 msecs_to_jiffies(10));
>> +}
>> +
>> +static void qcom_lmh_dcvs_poll(struct work_struct *work)
>> +{
>> +	struct qcom_cpufreq_data *data;
>> +
>> +	data = container_of(work, struct qcom_cpufreq_data, lmh_dcvs_poll_work.work);
>> +
>> +	qcom_lmh_dcvs_notify(data);
>> +}
>> +
>> +static irqreturn_t qcom_lmh_dcvs_handle_irq(int irq, void *data)
>> +{
>> +	struct qcom_cpufreq_data *c_data = data;
>> +
>> +	/* Disable interrupt and enable polling */
>> +	disable_irq_nosync(c_data->lmh_dcvs_irq);
>> +	qcom_lmh_dcvs_notify(c_data);
>> +
>> +	return 0;
>> +}
>> +
>>   static const struct qcom_cpufreq_soc_data qcom_soc_data = {
>>   	.reg_enable = 0x0,
>>   	.reg_freq_lut = 0x110,
>>   	.reg_volt_lut = 0x114,
>> +	.reg_current_vote = 0x704,
>>   	.reg_perf_state = 0x920,
>>   	.lut_row_size = 32,
>>   };
>> @@ -274,6 +355,35 @@ static const struct of_device_id qcom_cpufreq_hw_match[] = {
>>   };
>>   MODULE_DEVICE_TABLE(of, qcom_cpufreq_hw_match);
>>   
>> +static int qcom_cpufreq_hw_lmh_init(struct cpufreq_policy *policy, int index)
>> +{
>> +	struct qcom_cpufreq_data *data = policy->driver_data;
>> +	struct platform_device *pdev = cpufreq_get_driver_data();
>> +	struct device *cpu_dev = get_cpu_device(policy->cpu);
>> +	char irq_name[15];
>> +	int ret;
>> +
>> +	/*
>> +	 * Look for LMh interrupt. If no interrupt line is specified /
>> +	 * if there is an error, allow cpufreq to be enabled as usual.
>> +	 */
>> +	data->lmh_dcvs_irq = platform_get_irq(pdev, index);
>> +	if (data->lmh_dcvs_irq <= 0)
>> +		return data->lmh_dcvs_irq == -EPROBE_DEFER ? -EPROBE_DEFER : 0;
>> +
>> +	snprintf(irq_name, sizeof(irq_name), "dcvsh-irq-%u", policy->cpu);
>> +	ret = devm_request_irq(cpu_dev, data->lmh_dcvs_irq, qcom_lmh_dcvs_handle_irq,
>> +			       0, irq_name, data);
>> +	if (ret) {
>> +		dev_err(&pdev->dev, "Error %d registering irq %x\n", ret, data->lmh_dcvs_irq);
> 
> The irq number here won't have any meaning, and %x wouldn't be suitable.
> 
> How about ..."Error registering %s: %d\n", irq_name, ret); ?

ok.

> 
>> +		return 0;
> 
> This sounds like a problem, wouldn't it be suitable to treat it as a
> problem?

I thought a lot about this. My point is that even if LMh does not get
enabled for some reason, cpufreq should still be enabled. If I return an
error back from here, cpufreq will be disabled.


> 
>> +	}
>> +	data->policy = policy;
> 
> Afaict, no one is going to access data->policy unless devm_request_irq()
> succeeds and if it does and the interrupt fires immediately it would be
> too late to set it here. So better move it earlier.
> 
>> +	INIT_DEFERRABLE_WORK(&data->lmh_dcvs_poll_work, qcom_lmh_dcvs_poll);
> 
> What if the interrupt fires before you initialize the work? Better move
> this higher up.

I will move this and the data->policy = policy above before requesting 
the interrupt.

> 
>> +
>> +	return 0;
>> +}
>> +
>>   static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
>>   {
>>   	struct platform_device *pdev = cpufreq_get_driver_data();
>> @@ -370,6 +480,10 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
>>   			dev_warn(cpu_dev, "failed to enable boost: %d\n", ret);
>>   	}
>>   
>> +	ret = qcom_cpufreq_hw_lmh_init(policy, index);
>> +	if (ret)
>> +		goto error;
>> +
>>   	return 0;
>>   error:
>>   	kfree(data);
>> @@ -389,6 +503,10 @@ static int qcom_cpufreq_hw_cpu_exit(struct cpufreq_policy *policy)
>>   
>>   	dev_pm_opp_remove_all_dynamic(cpu_dev);
>>   	dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
>> +	if (data->lmh_dcvs_irq > 0) {
>> +		devm_free_irq(cpu_dev, data->lmh_dcvs_irq, data);
> 
> As init/exit are called multiple times you should avoid the devm
> variants.

Yes. I think Viresh was also mentioning this. I will move to the non-devm
version.

> 
> Regards,
> Bjorn
> 
>> +		cancel_delayed_work_sync(&data->lmh_dcvs_poll_work);
>> +	}
>>   	kfree(policy->freq_table);
>>   	kfree(data);
>>   	iounmap(base);
>> -- 
>> 2.25.1
>>
Thara Gopinath July 13, 2021, 1:18 a.m. UTC | #7
On 7/12/21 12:41 AM, Viresh Kumar wrote:
> On 09-07-21, 11:37, Thara Gopinath wrote:
>> On 7/9/21 2:46 AM, Viresh Kumar wrote:
>>>> @@ -389,6 +503,10 @@ static int qcom_cpufreq_hw_cpu_exit(struct cpufreq_policy *policy)
>>>>    	dev_pm_opp_remove_all_dynamic(cpu_dev);
>>>>    	dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
>>>> +	if (data->lmh_dcvs_irq > 0) {
>>>> +		devm_free_irq(cpu_dev, data->lmh_dcvs_irq, data);
>>>
>>> Why use devm variants here and while requesting the irq?
> 
> Missed this one ?

Yep. I just replied to Bjorn's email on this. I will move to the non-devm
version.

> 
>>>
>>>> +		cancel_delayed_work_sync(&data->lmh_dcvs_poll_work);
>>>> +	}
>>>
>>> Please move this to qcom_cpufreq_hw_lmh_exit() or something.
>>
>> Ok.
>>
>>>
>>> Now with sequence of disabling interrupt, etc, I see a potential
>>> problem.
>>>
>>> CPU0                                    CPU1
>>>
>>> qcom_cpufreq_hw_cpu_exit()
>>> -> devm_free_irq();
>>>                                           qcom_lmh_dcvs_poll()
>>>                                           -> qcom_lmh_dcvs_notify()
>>>                                             -> enable_irq()
>>>
>>> -> cancel_delayed_work_sync();
>>>
>>>
>>> What will happen if enable_irq() gets called after freeing the irq ?
>>> Not sure, but it looks like you will hit this then from manage.c:
>>>
>>> WARN(!desc->irq_data.chip, KERN_ERR "enable_irq before
>>>                                        setup/request_irq: irq %u\n", irq))
>>>
>>> ?
>>>
>>> You got a chicken n egg problem :)
>>
>> Yes indeed! But it is also a very rare chicken and egg problem.
>> The scenario here is that the CPUs are busy running a load that causes a
>> thermal overrun and LMh is engaged. At the same time, for this issue to
>> be hit, the CPU is trying to exit/disable cpufreq.
> 
> Yes, it is a very specific case but it needs to be resolved anyway. You don't
> want to get this ever :)
> 
>> Calling
>> cancel_delayed_work_sync() first could solve this issue, right?
>> cancel_delayed_work_sync() guarantees the work is not pending even if
>> it requeues itself on return. So once the delayed work is cancelled, the
>> interrupt can be safely disabled. Thoughts?
> 
> I don't think even that would provide such guarantees to you here, as there is
> a chance the work gets queued again because of an interrupt that triggers right
> after you cancel the work.
> 
> The basic way of solving such issues is that once you cancel something, you need
> to guarantee that it doesn't get triggered again, no matter what.
> 
> The problem I see here is with the design itself: the delayed work and the
> irq can each re-enable the other, so no matter which one you disable first,
> it won't be sufficient. You need to fix that design somehow.

So I really need the interrupt to fire and then the timer to kick in and 
take up the monitoring. I can think of introducing a variable 
is_disabled which is updated and read under a spinlock. 
qcom_cpufreq_hw_cpu_exit can hold the spinlock and set is_disabled to 
true prior to cancelling the work queue or disabling the interrupt. 
Before re-enabling the interrupt or re-queuing the work in 
qcom_lmh_dcvs_notify, is_disabled can be read and checked.

But does this problem not exist in target_index, fast_switch, etc. also?
One CPU can be disabling and the other one can be updating the target,
right?

>
Viresh Kumar July 13, 2021, 3:18 a.m. UTC | #8
On 12-07-21, 21:18, Thara Gopinath wrote:
> So I really need the interrupt to fire and then the timer to kick in and
> take up the monitoring. I can think of introducing a variable is_disabled
> which is updated and read under a spinlock. qcom_cpufreq_hw_cpu_exit can
> hold the spinlock and set is_disabled to true prior to cancelling the work
> queue or disabling the interrupt. Before re-enabling the interrupt or
> re-queuing the work in qcom_lmh_dcvs_notify, is_disabled can be read and
> checked.

Or you can make the lmh_dcvs_poll_work item a pointer and mark it NULL in exit,
with proper locking etc.

> But does this problem not exist in target_index, fast_switch, etc. also? One
> CPU can be disabling and the other one can be updating the target, right?

The race doesn't happen there as cpufreq_unregister_driver() takes care of
stopping everything before removing the policy. To be more precise, governor's
->stop() function is responsible for making sure that frequency won't be updated
any further.
Thara Gopinath July 14, 2021, 12:37 p.m. UTC | #9
On 7/12/21 11:18 PM, Viresh Kumar wrote:
> On 12-07-21, 21:18, Thara Gopinath wrote:
>> So I really need the interrupt to fire and then the timer to kick in and
>> take up the monitoring. I can think of introducing a variable is_disabled
>> which is updated and read under a spinlock. qcom_cpufreq_hw_cpu_exit can
>> hold the spinlock and set is_disabled to true prior to cancelling the work
>> queue or disabling the interrupt. Before re-enabling the interrupt or
>> re-queuing the work in qcom_lmh_dcvs_notify, is_disabled can be read and
>> checked.
> 
> Or you can make the lmh_dcvs_poll_work item a pointer and mark it NULL in exit,
> with proper locking etc.

Yes it could work. I will spin the next version with either this or 
introducing a new variable with locking.

> 
>> But does this problem not exist in target_index, fast_switch, etc. also? One
>> CPU can be disabling and the other one can be updating the target, right?
> 
> The race doesn't happen there as cpufreq_unregister_driver() takes care of
> stopping everything before removing the policy. To be more precise, governor's
> ->stop() function is responsible for making sure that frequency won't be updated
> any further.
>
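
(A sketch of the direction agreed above: a cancel flag read and written
under a lock, so that neither the interrupt handler nor the poll worker can
re-arm the other once exit has started. The names cancel_throttle,
throttle_lock and qcom_lmh_dcvs_rearm are illustrative, not from this
series, and assume the spinlock is initialized in qcom_cpufreq_hw_lmh_init():)

/* Called from the tail of qcom_lmh_dcvs_notify() instead of re-arming
 * directly; bails out without re-arming if teardown has begun.
 */
static void qcom_lmh_dcvs_rearm(struct qcom_cpufreq_data *data,
				unsigned long throttled_freq)
{
	struct cpufreq_policy *policy = data->policy;
	unsigned long flags;

	spin_lock_irqsave(&data->throttle_lock, flags);
	if (data->cancel_throttle)
		goto out;

	if (throttled_freq >= qcom_cpufreq_hw_get(cpumask_first(policy->cpus)))
		enable_irq(data->lmh_dcvs_irq);
	else
		mod_delayed_work(system_highpri_wq, &data->lmh_dcvs_poll_work,
				 msecs_to_jiffies(10));
out:
	spin_unlock_irqrestore(&data->throttle_lock, flags);
}

static void qcom_cpufreq_hw_lmh_exit(struct qcom_cpufreq_data *data)
{
	unsigned long flags;

	/* Stop both re-arm paths before tearing anything down */
	spin_lock_irqsave(&data->throttle_lock, flags);
	data->cancel_throttle = true;
	spin_unlock_irqrestore(&data->throttle_lock, flags);

	cancel_delayed_work_sync(&data->lmh_dcvs_poll_work);
	free_irq(data->lmh_dcvs_irq, data);
}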

Patch

diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
index f86859bf76f1..bb5fc700d913 100644
--- a/drivers/cpufreq/qcom-cpufreq-hw.c
+++ b/drivers/cpufreq/qcom-cpufreq-hw.c
@@ -7,6 +7,7 @@ 
 #include <linux/cpufreq.h>
 #include <linux/init.h>
 #include <linux/interconnect.h>
+#include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/of_address.h>
@@ -22,10 +23,13 @@ 
 #define CLK_HW_DIV			2
 #define LUT_TURBO_IND			1
 
+#define HZ_PER_KHZ			1000
+
 struct qcom_cpufreq_soc_data {
 	u32 reg_enable;
 	u32 reg_freq_lut;
 	u32 reg_volt_lut;
+	u32 reg_current_vote;
 	u32 reg_perf_state;
 	u8 lut_row_size;
 };
@@ -33,7 +37,10 @@  struct qcom_cpufreq_soc_data {
 struct qcom_cpufreq_data {
 	void __iomem *base;
 	struct resource *res;
+	struct delayed_work lmh_dcvs_poll_work;
 	const struct qcom_cpufreq_soc_data *soc_data;
+	struct cpufreq_policy *policy;
+	int lmh_dcvs_irq;
 };
 
 static unsigned long cpu_hw_rate, xo_rate;
@@ -251,10 +258,84 @@  static void qcom_get_related_cpus(int index, struct cpumask *m)
 	}
 }
 
+static inline unsigned long qcom_lmh_vote_to_freq(u32 val)
+{
+	return (val & 0x3FF) * 19200;
+}
+
+static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data)
+{
+	struct cpufreq_policy *policy = data->policy;
+	struct dev_pm_opp *opp;
+	struct device *dev;
+	unsigned long max_capacity, capacity, freq_hz, throttled_freq;
+	unsigned int val, freq;
+
+	/*
+	 * Get the h/w throttled frequency, normalize it using the
+	 * registered opp table and use it to calculate thermal pressure.
+	 */
+	val = readl_relaxed(data->base + data->soc_data->reg_current_vote);
+	freq = qcom_lmh_vote_to_freq(val);
+	freq_hz = freq * HZ_PER_KHZ;
+
+	dev = get_cpu_device(cpumask_first(policy->cpus));
+	opp = dev_pm_opp_find_freq_floor(dev, &freq_hz);
+	if (IS_ERR(opp) && PTR_ERR(opp) == -ERANGE)
+		opp = dev_pm_opp_find_freq_ceil(dev, &freq_hz);
+
+	throttled_freq = freq_hz / HZ_PER_KHZ;
+
+	/* Update thermal pressure */
+
+	max_capacity = arch_scale_cpu_capacity(cpumask_first(policy->cpus));
+	capacity = throttled_freq * max_capacity;
+	capacity /= policy->cpuinfo.max_freq;
+
+	/* Don't pass boost capacity to scheduler */
+	if (capacity > max_capacity)
+		capacity = max_capacity;
+
+	arch_set_thermal_pressure(policy->cpus, max_capacity - capacity);
+
+	/*
+	 * If h/w throttled frequency is higher than what cpufreq has requested for, stop
+	 * polling and switch back to interrupt mechanism
+	 */
+
+	if (throttled_freq >= qcom_cpufreq_hw_get(cpumask_first(policy->cpus)))
+		/* Clear the existing interrupts and enable it back */
+		enable_irq(data->lmh_dcvs_irq);
+	else
+		mod_delayed_work(system_highpri_wq, &data->lmh_dcvs_poll_work,
+				 msecs_to_jiffies(10));
+}
+
+static void qcom_lmh_dcvs_poll(struct work_struct *work)
+{
+	struct qcom_cpufreq_data *data;
+
+	data = container_of(work, struct qcom_cpufreq_data, lmh_dcvs_poll_work.work);
+
+	qcom_lmh_dcvs_notify(data);
+}
+
+static irqreturn_t qcom_lmh_dcvs_handle_irq(int irq, void *data)
+{
+	struct qcom_cpufreq_data *c_data = data;
+
+	/* Disable interrupt and enable polling */
+	disable_irq_nosync(c_data->lmh_dcvs_irq);
+	qcom_lmh_dcvs_notify(c_data);
+
+	return 0;
+}
+
 static const struct qcom_cpufreq_soc_data qcom_soc_data = {
 	.reg_enable = 0x0,
 	.reg_freq_lut = 0x110,
 	.reg_volt_lut = 0x114,
+	.reg_current_vote = 0x704,
 	.reg_perf_state = 0x920,
 	.lut_row_size = 32,
 };
@@ -274,6 +355,35 @@  static const struct of_device_id qcom_cpufreq_hw_match[] = {
 };
 MODULE_DEVICE_TABLE(of, qcom_cpufreq_hw_match);
 
+static int qcom_cpufreq_hw_lmh_init(struct cpufreq_policy *policy, int index)
+{
+	struct qcom_cpufreq_data *data = policy->driver_data;
+	struct platform_device *pdev = cpufreq_get_driver_data();
+	struct device *cpu_dev = get_cpu_device(policy->cpu);
+	char irq_name[15];
+	int ret;
+
+	/*
+	 * Look for LMh interrupt. If no interrupt line is specified /
+	 * if there is an error, allow cpufreq to be enabled as usual.
+	 */
+	data->lmh_dcvs_irq = platform_get_irq(pdev, index);
+	if (data->lmh_dcvs_irq <= 0)
+		return data->lmh_dcvs_irq == -EPROBE_DEFER ? -EPROBE_DEFER : 0;
+
+	snprintf(irq_name, sizeof(irq_name), "dcvsh-irq-%u", policy->cpu);
+	ret = devm_request_irq(cpu_dev, data->lmh_dcvs_irq, qcom_lmh_dcvs_handle_irq,
+			       0, irq_name, data);
+	if (ret) {
+		dev_err(&pdev->dev, "Error %d registering irq %x\n", ret, data->lmh_dcvs_irq);
+		return 0;
+	}
+	data->policy = policy;
+	INIT_DEFERRABLE_WORK(&data->lmh_dcvs_poll_work, qcom_lmh_dcvs_poll);
+
+	return 0;
+}
+
 static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
 {
 	struct platform_device *pdev = cpufreq_get_driver_data();
@@ -370,6 +480,10 @@  static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
 			dev_warn(cpu_dev, "failed to enable boost: %d\n", ret);
 	}
 
+	ret = qcom_cpufreq_hw_lmh_init(policy, index);
+	if (ret)
+		goto error;
+
 	return 0;
 error:
 	kfree(data);
@@ -389,6 +503,10 @@  static int qcom_cpufreq_hw_cpu_exit(struct cpufreq_policy *policy)
 
 	dev_pm_opp_remove_all_dynamic(cpu_dev);
 	dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
+	if (data->lmh_dcvs_irq > 0) {
+		devm_free_irq(cpu_dev, data->lmh_dcvs_irq, data);
+		cancel_delayed_work_sync(&data->lmh_dcvs_poll_work);
+	}
 	kfree(policy->freq_table);
 	kfree(data);
 	iounmap(base);