diff mbox

[v4,5/7] cpufreq: Calculate number of busy CPUs

Message ID 1371661969-7660-6-git-send-email-l.majewski@samsung.com (mailing list archive)
State Changes Requested, archived
Headers show

Commit Message

Lukasz Majewski June 19, 2013, 5:12 p.m. UTC
In the core governor code, per cpu load value is calculated. This patch
uses it to mark processor as a "busy" one, when load value is higher than
90%.

New cpufreq sysfs attribute is created (busy_cpus). It is read only
and provides information about number of actually busy CPU.

Signed-off-by: Lukasz Majewski <l.majewski@samsung.com>
Signed-off-by: Myungjoo Ham <myungjoo.ham@samsung.com>

Changes for v4:
- New patch
---
 drivers/cpufreq/cpufreq.c          |   31 ++++++++++++++++++++++++++++++-
 drivers/cpufreq/cpufreq_governor.c |    1 +
 include/linux/cpufreq.h            |    3 +++
 3 files changed, 34 insertions(+), 1 deletion(-)

Comments

dirk.brandewie@gmail.com June 19, 2013, 6:01 p.m. UTC | #1
On 06/19/2013 10:12 AM, Lukasz Majewski wrote:
> In the core governor code, per cpu load value is calculated. This patch
> uses it to mark processor as a "busy" one, when load value is higher than
> 90%.
>
> New cpufreq sysfs attribute is created (busy_cpus). It is read only
> and provides information about number of actually busy CPU.
>

What is the intended use of this interface?

For drivers that have internal governors it will be misleading/wrong

--Dirk
> Signed-off-by: Lukasz Majewski <l.majewski@samsung.com>
> Signed-off-by: Myungjoo Ham <myungjoo.ham@samsung.com>
>
> Changes for v4:
> - New patch
> ---
>   drivers/cpufreq/cpufreq.c          |   31 ++++++++++++++++++++++++++++++-
>   drivers/cpufreq/cpufreq_governor.c |    1 +
>   include/linux/cpufreq.h            |    3 +++
>   3 files changed, 34 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 9141d33..f785273 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -48,6 +48,7 @@ static DEFINE_PER_CPU(char[CPUFREQ_NAME_LEN], cpufreq_cpu_governor);
>   #endif
>   static DEFINE_RWLOCK(cpufreq_driver_lock);
>   static LIST_HEAD(cpufreq_policy_list);
> +static cpumask_t cpufreq_busy_cpus;
>
>   /*
>    * cpu_policy_rwsem is a per CPU reader-writer semaphore designed to cure
> @@ -317,6 +318,13 @@ EXPORT_SYMBOL_GPL(cpufreq_notify_transition);
>   /*********************************************************************
>    *                          SYSFS INTERFACE                          *
>    *********************************************************************/
> +ssize_t show_busy_cpus(struct kobject *kobj,
> +				 struct attribute *attr, char *buf)
> +{
> +	return sprintf(buf, "%d\n", cpufreq_num_busy_cpu());
> +}
> +define_one_global_ro(busy_cpus);
> +
>   ssize_t show_boost(struct kobject *kobj,
>   				 struct attribute *attr, char *buf)
>   {
> @@ -1980,6 +1988,17 @@ int cpufreq_boost_enabled(void)
>   	return boost_enabled;
>   }
>
> +int cpufreq_num_busy_cpu(void)
> +{
> +	return cpumask_weight(&cpufreq_busy_cpus);
> +}
> +
> +void cpufreq_set_busy_cpu(int cpu, int val)
> +{
> +	val ? cpumask_set_cpu(cpu, &cpufreq_busy_cpus) :
> +		cpumask_clear_cpu(cpu, &cpufreq_busy_cpus);
> +}
> +
>   /*********************************************************************
>    *               REGISTER / UNREGISTER CPUFREQ DRIVER                *
>    *********************************************************************/
> @@ -2019,6 +2038,13 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
>   	cpufreq_driver = driver_data;
>   	write_unlock_irqrestore(&cpufreq_driver_lock, flags);
>
> +	ret = cpufreq_sysfs_create_file(&(busy_cpus.attr));
> +	if (ret) {
> +		pr_err("%s: cannot register global busy_cpus sysfs file\n",
> +		       __func__);
> +		goto err_null_driver;
> +	}
> +
>   	if (!cpufreq_driver->boost_supported)
>   		boost.attr.mode = 0444;
>
> @@ -2026,7 +2052,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
>   	if (ret) {
>   		pr_err("%s: cannot register global boost sysfs file\n",
>   		       __func__);
> -		goto err_null_driver;
> +		goto err_busy_idle_unreg;
>   	}
>
>   	ret = subsys_interface_register(&cpufreq_interface);
> @@ -2058,6 +2084,8 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
>   	return 0;
>   err_if_unreg:
>   	subsys_interface_unregister(&cpufreq_interface);
> +err_busy_idle_unreg:
> +	cpufreq_sysfs_remove_file(&(busy_cpus.attr));
>   err_null_driver:
>   	write_lock_irqsave(&cpufreq_driver_lock, flags);
>   	cpufreq_driver = NULL;
> @@ -2086,6 +2114,7 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver)
>
>   	subsys_interface_unregister(&cpufreq_interface);
>
> +	cpufreq_sysfs_remove_file(&(busy_cpus.attr));
>   	cpufreq_sysfs_remove_file(&(boost.attr));
>   	unregister_hotcpu_notifier(&cpufreq_cpu_notifier);
>
> diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
> index 077cea7..3402533 100644
> --- a/drivers/cpufreq/cpufreq_governor.c
> +++ b/drivers/cpufreq/cpufreq_governor.c
> @@ -148,6 +148,7 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu)
>   			continue;
>
>   		load = 100 * (wall_time - idle_time) / wall_time;
> +		cpufreq_set_busy_cpu(j, load > 90 ? 1 : 0);
>
>   		if (dbs_data->cdata->governor == GOV_ONDEMAND) {
>   			int freq_avg = __cpufreq_driver_getavg(policy, j);
> diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
> index 4783c4c..536abfc 100644
> --- a/include/linux/cpufreq.h
> +++ b/include/linux/cpufreq.h
> @@ -436,6 +436,9 @@ int cpufreq_frequency_table_target(struct cpufreq_policy *policy,
>   int cpufreq_boost_trigger_state(int state);
>   int cpufreq_boost_supported(void);
>   int cpufreq_boost_enabled(void);
> +
> +void cpufreq_set_busy_cpu(int cpu, int val);
> +int cpufreq_num_busy_cpu(void);
>   /* the following 3 funtions are for cpufreq core use only */
>   struct cpufreq_frequency_table *cpufreq_frequency_get_table(unsigned int cpu);
>
>

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lukasz Majewski June 19, 2013, 8:58 p.m. UTC | #2
On Wed, 19 Jun 2013 11:01:07 -0700
Dirk Brandewie <dirk.brandewie@gmail.com> wrote:

> On 06/19/2013 10:12 AM, Lukasz Majewski wrote:
> > In the core governor code, per cpu load value is calculated. This
> > patch uses it to mark processor as a "busy" one, when load value is
> > higher than 90%.
> >
> > New cpufreq sysfs attribute is created (busy_cpus). It is read only
> > and provides information about number of actually busy CPU.
> >
> 
> What is the intended use of this interface?

The kernel API would be handy with boost managed by software (like it is
done with exynos) and with LAB governor.

The intention is to have two save valves for boost:

1. Enable SW controlled boost only when we have just one busy CPU.

2. Use the Thermal subsystem to switch off SW managed boost when it
detects that SoC is overheating.

The problem with 2 is that, the boost code is compiled in to the cpufreq
core (no CONFIG_BOOST flag). Thereof we cannot guarantee, that thermal
would be always enabled (the KConfig select option). 
I think that thermal subsystem is a suitable place to disable SW boost
at emergency. However in my opinion this is not enough and precaution
defined at 1 is needed. 

I can remove exporting the "busy_cpu" sysfs attribute if you think, that
it would confuse userspace. Its purpose is mainly informational.

> 
> For drivers that have internal governors it will be misleading/wrong

Would you be so kind and give me an example of such an internal
governor?

Maybe I've overlooked some important information/usecase.

> 
> --Dirk
> > Signed-off-by: Lukasz Majewski <l.majewski@samsung.com>
> > Signed-off-by: Myungjoo Ham <myungjoo.ham@samsung.com>
> >
> > Changes for v4:
> > - New patch
> > ---
> >   drivers/cpufreq/cpufreq.c          |   31
> > ++++++++++++++++++++++++++++++- drivers/cpufreq/cpufreq_governor.c
> > |    1 + include/linux/cpufreq.h            |    3 +++
> >   3 files changed, 34 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> > index 9141d33..f785273 100644
> > --- a/drivers/cpufreq/cpufreq.c
> > +++ b/drivers/cpufreq/cpufreq.c
> > @@ -48,6 +48,7 @@ static DEFINE_PER_CPU(char[CPUFREQ_NAME_LEN],
> > cpufreq_cpu_governor); #endif
> >   static DEFINE_RWLOCK(cpufreq_driver_lock);
> >   static LIST_HEAD(cpufreq_policy_list);
> > +static cpumask_t cpufreq_busy_cpus;
> >
> >   /*
> >    * cpu_policy_rwsem is a per CPU reader-writer semaphore designed
> > to cure @@ -317,6 +318,13 @@
> > EXPORT_SYMBOL_GPL(cpufreq_notify_transition); /*********************************************************************
> >    *                          SYSFS
> > INTERFACE                          *
> > *********************************************************************/
> > +ssize_t show_busy_cpus(struct kobject *kobj,
> > +				 struct attribute *attr, char *buf)
> > +{
> > +	return sprintf(buf, "%d\n", cpufreq_num_busy_cpu());
> > +}
> > +define_one_global_ro(busy_cpus);
> > +
> >   ssize_t show_boost(struct kobject *kobj,
> >   				 struct attribute *attr, char
> > *buf) {
> > @@ -1980,6 +1988,17 @@ int cpufreq_boost_enabled(void)
> >   	return boost_enabled;
> >   }
> >
> > +int cpufreq_num_busy_cpu(void)
> > +{
> > +	return cpumask_weight(&cpufreq_busy_cpus);
> > +}
> > +
> > +void cpufreq_set_busy_cpu(int cpu, int val)
> > +{
> > +	val ? cpumask_set_cpu(cpu, &cpufreq_busy_cpus) :
> > +		cpumask_clear_cpu(cpu, &cpufreq_busy_cpus);
> > +}
> > +
> >   /*********************************************************************
> >    *               REGISTER / UNREGISTER CPUFREQ
> > DRIVER                *
> > *********************************************************************/
> > @@ -2019,6 +2038,13 @@ int cpufreq_register_driver(struct
> > cpufreq_driver *driver_data) cpufreq_driver = driver_data;
> > write_unlock_irqrestore(&cpufreq_driver_lock, flags);
> >
> > +	ret = cpufreq_sysfs_create_file(&(busy_cpus.attr));
> > +	if (ret) {
> > +		pr_err("%s: cannot register global busy_cpus sysfs
> > file\n",
> > +		       __func__);
> > +		goto err_null_driver;
> > +	}
> > +
> >   	if (!cpufreq_driver->boost_supported)
> >   		boost.attr.mode = 0444;
> >
> > @@ -2026,7 +2052,7 @@ int cpufreq_register_driver(struct
> > cpufreq_driver *driver_data) if (ret) {
> >   		pr_err("%s: cannot register global boost sysfs
> > file\n", __func__);
> > -		goto err_null_driver;
> > +		goto err_busy_idle_unreg;
> >   	}
> >
> >   	ret = subsys_interface_register(&cpufreq_interface);
> > @@ -2058,6 +2084,8 @@ int cpufreq_register_driver(struct
> > cpufreq_driver *driver_data) return 0;
> >   err_if_unreg:
> >   	subsys_interface_unregister(&cpufreq_interface);
> > +err_busy_idle_unreg:
> > +	cpufreq_sysfs_remove_file(&(busy_cpus.attr));
> >   err_null_driver:
> >   	write_lock_irqsave(&cpufreq_driver_lock, flags);
> >   	cpufreq_driver = NULL;
> > @@ -2086,6 +2114,7 @@ int cpufreq_unregister_driver(struct
> > cpufreq_driver *driver)
> >
> >   	subsys_interface_unregister(&cpufreq_interface);
> >
> > +	cpufreq_sysfs_remove_file(&(busy_cpus.attr));
> >   	cpufreq_sysfs_remove_file(&(boost.attr));
> >   	unregister_hotcpu_notifier(&cpufreq_cpu_notifier);
> >
> > diff --git a/drivers/cpufreq/cpufreq_governor.c
> > b/drivers/cpufreq/cpufreq_governor.c index 077cea7..3402533 100644
> > --- a/drivers/cpufreq/cpufreq_governor.c
> > +++ b/drivers/cpufreq/cpufreq_governor.c
> > @@ -148,6 +148,7 @@ void dbs_check_cpu(struct dbs_data *dbs_data,
> > int cpu) continue;
> >
> >   		load = 100 * (wall_time - idle_time) / wall_time;
> > +		cpufreq_set_busy_cpu(j, load > 90 ? 1 : 0);
> >
> >   		if (dbs_data->cdata->governor == GOV_ONDEMAND) {
> >   			int freq_avg =
> > __cpufreq_driver_getavg(policy, j); diff --git
> > a/include/linux/cpufreq.h b/include/linux/cpufreq.h index
> > 4783c4c..536abfc 100644 --- a/include/linux/cpufreq.h
> > +++ b/include/linux/cpufreq.h
> > @@ -436,6 +436,9 @@ int cpufreq_frequency_table_target(struct
> > cpufreq_policy *policy, int cpufreq_boost_trigger_state(int state);
> >   int cpufreq_boost_supported(void);
> >   int cpufreq_boost_enabled(void);
> > +
> > +void cpufreq_set_busy_cpu(int cpu, int val);
> > +int cpufreq_num_busy_cpu(void);
> >   /* the following 3 funtions are for cpufreq core use only */
> >   struct cpufreq_frequency_table
> > *cpufreq_frequency_get_table(unsigned int cpu);
> >
> >
> 
Best regards,

Lukasz Majewski
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael Wysocki June 19, 2013, 10:26 p.m. UTC | #3
On Wednesday, June 19, 2013 10:58:32 PM Lukasz Majewski wrote:
> On Wed, 19 Jun 2013 11:01:07 -0700
> Dirk Brandewie <dirk.brandewie@gmail.com> wrote:
> 
> > On 06/19/2013 10:12 AM, Lukasz Majewski wrote:
> > > In the core governor code, per cpu load value is calculated. This
> > > patch uses it to mark processor as a "busy" one, when load value is
> > > higher than 90%.
> > >
> > > New cpufreq sysfs attribute is created (busy_cpus). It is read only
> > > and provides information about number of actually busy CPU.
> > >
> > 
> > What is the intended use of this interface?
> 
> The kernel API would be handy with boost managed by software (like it is
> done with exynos) and with LAB governor.
> 
> The intention is to have two save valves for boost:
> 
> 1. Enable SW controlled boost only when we have just one busy CPU.
> 
> 2. Use the Thermal subsystem to switch off SW managed boost when it
> detects that SoC is overheating.
> 
> The problem with 2 is that, the boost code is compiled in to the cpufreq
> core (no CONFIG_BOOST flag). Thereof we cannot guarantee, that thermal
> would be always enabled (the KConfig select option). 
> I think that thermal subsystem is a suitable place to disable SW boost
> at emergency. However in my opinion this is not enough and precaution
> defined at 1 is needed. 
> 
> I can remove exporting the "busy_cpu" sysfs attribute if you think, that
> it would confuse userspace. Its purpose is mainly informational.
> 
> > 
> > For drivers that have internal governors it will be misleading/wrong
> 
> Would you be so kind and give me an example of such an internal
> governor?
> 
> Maybe I've overlooked some important information/usecase.

Please look at intel_pstate.c.

Thanks,
Rafael
Viresh Kumar June 27, 2013, 9:36 a.m. UTC | #4
On 19 June 2013 22:42, Lukasz Majewski <l.majewski@samsung.com> wrote:
> In the core governor code, per cpu load value is calculated. This patch
> uses it to mark processor as a "busy" one, when load value is higher than
> 90%.

How can we take this assumption. What about a cpu which is only 70-80%
busy? So, we would be running at around 70% of policy->max.. Now at
this point you want to enable boost frequency and so that will happen for
all cores... Wouldn't that burn your chip?

> New cpufreq sysfs attribute is created (busy_cpus). It is read only
> and provides information about number of actually busy CPU.

Not required.

> diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
> index 077cea7..3402533 100644
> --- a/drivers/cpufreq/cpufreq_governor.c
> +++ b/drivers/cpufreq/cpufreq_governor.c
> @@ -148,6 +148,7 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu)
>                         continue;
>
>                 load = 100 * (wall_time - idle_time) / wall_time;
> +               cpufreq_set_busy_cpu(j, load > 90 ? 1 : 0);

do this only when boost is enabled.

>                 if (dbs_data->cdata->governor == GOV_ONDEMAND) {
>                         int freq_avg = __cpufreq_driver_getavg(policy, j);
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lukasz Majewski June 27, 2013, 10:58 a.m. UTC | #5
On Thu, 27 Jun 2013 15:06:43 +0530, Viresh Kumar wrote:
> On 19 June 2013 22:42, Lukasz Majewski <l.majewski@samsung.com> wrote:
> > In the core governor code, per cpu load value is calculated. This
> > patch uses it to mark processor as a "busy" one, when load value is
> > higher than 90%.
> 
> How can we take this assumption. What about a cpu which is only 70-80%
> busy [*]? 

Do you have any idea of how to precisely set the load threshold?

As a side note:

I've thought about this patch for some time and for me it looks like we
are mixing policy (number of busy CPUs) with abilities, which shall be
provided by the driver (boost). 
Additionally, we can only roughly "estimate" [*] when boost shall run
and when it shall be turned off.

I think that, we shall leave this management [*] to the thermal
framework. This framework is designed exactly to protect from over
heating (it uses the same freq_table for passive CPU cooling) with
several trip points -> e.g. 40 deg (disable boost), 75 deg (impose max
freq as 1.0 GHz to cool down, 90 deg (shutdown immediately).
Please refer to PATH v4 7/7.

Unfortunately, since we dropped Kconfig flag for BOOST we cannot
impose "select THERMAL_FRAMEWORK", when flag for BOOST is enabled at
Kconfig.

Ideally kernel shall not even build when CONFIG_CPUFREQ_BOOST Kconfig
flag is set and thermal for target architecture is not correctly
configured (including proper trip points).




> So, we would be running at around 70% of policy->max.. Now at
> this point you want to enable boost frequency and so that will happen
> for all cores... Wouldn't that burn your chip?

As I've written above, we can only estimate here. Even with the same
manufacturer at one SoC revision BOOST will work with 95% of policy max
but the other revision will work stable only with 70%.

> 
> > New cpufreq sysfs attribute is created (busy_cpus). It is read only
> > and provides information about number of actually busy CPU.
> 
> Not required.

This attribute seems helpful for debug.

> 
> > diff --git a/drivers/cpufreq/cpufreq_governor.c
> > b/drivers/cpufreq/cpufreq_governor.c index 077cea7..3402533 100644
> > --- a/drivers/cpufreq/cpufreq_governor.c
> > +++ b/drivers/cpufreq/cpufreq_governor.c
> > @@ -148,6 +148,7 @@ void dbs_check_cpu(struct dbs_data *dbs_data,
> > int cpu) continue;
> >
> >                 load = 100 * (wall_time - idle_time) / wall_time;
> > +               cpufreq_set_busy_cpu(j, load > 90 ? 1 : 0);
> 
> do this only when boost is enabled.

Please read above side note.

> 
> >                 if (dbs_data->cdata->governor == GOV_ONDEMAND) {
> >                         int freq_avg =
> > __cpufreq_driver_getavg(policy, j);
Viresh Kumar June 27, 2013, 11:16 a.m. UTC | #6
@Rafael: We need you to jump into this discussion now, I don't
have a good idea about what we should do :)

On 27 June 2013 16:28, Lukasz Majewski <l.majewski@samsung.com> wrote:
> Do you have any idea of how to precisely set the load threshold?

I thought we are talking about cpu being in idle state.

> As a side note:
>
> I've thought about this patch for some time and for me it looks like we
> are mixing policy (number of busy CPUs) with abilities, which shall be
> provided by the driver (boost).
> Additionally, we can only roughly "estimate" [*] when boost shall run
> and when it shall be turned off.

This is another problem in the patch you sent. User would simply
enable or disable boost feature from userspace only once.

Now, if you disable it at high temperatures then its responsibility
to enable it again. Which you are missing.

> I think that, we shall leave this management [*] to the thermal
> framework. This framework is designed exactly to protect from over
> heating (it uses the same freq_table for passive CPU cooling) with
> several trip points -> e.g. 40 deg (disable boost), 75 deg (impose max
> freq as 1.0 GHz to cool down, 90 deg (shutdown immediately).
> Please refer to PATH v4 7/7.

There might be platforms where overheating isn't a issue with boost,
if it is only enabled while only one cpu is in use.

> Unfortunately, since we dropped Kconfig flag for BOOST we cannot
> impose "select THERMAL_FRAMEWORK", when flag for BOOST is enabled at
> Kconfig.

Not a big deal, we can get that in if required.

> Ideally kernel shall not even build when CONFIG_CPUFREQ_BOOST Kconfig
> flag is set and thermal for target architecture is not correctly
> configured (including proper trip points).
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lukasz Majewski June 27, 2013, 2:42 p.m. UTC | #7
On Thu, 27 Jun 2013 16:46:44 +0530, Viresh Kumar wrote:
> @Rafael: We need you to jump into this discussion now, I don't
> have a good idea about what we should do :)
> 
> On 27 June 2013 16:28, Lukasz Majewski <l.majewski@samsung.com> wrote:
> > Do you have any idea of how to precisely set the load threshold?
> 
> I thought we are talking about cpu being in idle state.

If we _drop_ the idea with thermal subsystem to disable the boost,
the logic as far as I've understood shall here be as follow:

Only enable BOOST when one CPU load > THRESHOLD_MAX and other CPUs <
THRESHOLD_MIN

THRESHOLD_MIN & THRESHOLD_MAX are SoC specific. 

In my opinion the above constrain imposes policy to the cpufreq driver.

> 
> > As a side note:
> >
> > I've thought about this patch for some time and for me it looks
> > like we are mixing policy (number of busy CPUs) with abilities,
> > which shall be provided by the driver (boost).
> > Additionally, we can only roughly "estimate" [*] when boost shall
> > run and when it shall be turned off.
> 
> This is another problem in the patch you sent. User would simply
> enable or disable boost feature from userspace only once.

The above statement is definitely true for Intel. There HW manage the
frequency.

> 
> Now, if you disable it at high temperatures then its responsibility
> to enable it again. Which you are missing.

So thermal or "other solution" [*] shall disable boost when overheated
and enable it back when things cool down.

[*] @ Viresh & Rafael do you have any idea about the "other solution"
here?


> 
> > I think that, we shall leave this management [*] to the thermal
> > framework. This framework is designed exactly to protect from over
> > heating (it uses the same freq_table for passive CPU cooling) with
> > several trip points -> e.g. 40 deg (disable boost), 75 deg (impose
> > max freq as 1.0 GHz to cool down, 90 deg (shutdown immediately).
> > Please refer to PATH v4 7/7.
> 
> There might be platforms where overheating isn't a issue with boost,
> if it is only enabled while only one cpu is in use.

Could you elaborate more on this?

I thought, that with multi core one needs to keep itself inside
power/thermal dissipation envelope. 

> 
> > Unfortunately, since we dropped Kconfig flag for BOOST we cannot
> > impose "select THERMAL_FRAMEWORK", when flag for BOOST is enabled at
> > Kconfig.
> 
> Not a big deal, we can get that in if required.
> 
> > Ideally kernel shall not even build when CONFIG_CPUFREQ_BOOST
> > Kconfig flag is set and thermal for target architecture is not
> > correctly configured (including proper trip points).

This would prevent situation when somebody made a mistake and
had enabled boost, but for some reason had forgotten to
configure/enable thermal subsystem.

Moreover Kconfig's CONFIG_CPUFREQ_BOOST flag would indicate that user
enabled boost for some reason and he/she (presumably) knows what is
doing.
Viresh Kumar June 28, 2013, 3:50 a.m. UTC | #8
On 27 June 2013 20:12, Lukasz Majewski <l.majewski@samsung.com> wrote:
> On Thu, 27 Jun 2013 16:46:44 +0530, Viresh Kumar wrote:
>> @Rafael: We need you to jump into this discussion now, I don't
>> have a good idea about what we should do :)
>>
>> On 27 June 2013 16:28, Lukasz Majewski <l.majewski@samsung.com> wrote:
>> > Do you have any idea of how to precisely set the load threshold?
>>
>> I thought we are talking about cpu being in idle state.
>
> If we _drop_ the idea with thermal subsystem to disable the boost,
> the logic as far as I've understood shall here be as follow:
>
> Only enable BOOST when one CPU load > THRESHOLD_MAX and other CPUs <
> THRESHOLD_MIN

Again, I thought that we are talking about cpus being completely idle.
i.e. in WFI (wait for interrupt) or deeper states.

> THRESHOLD_MIN & THRESHOLD_MAX are SoC specific.
>
> In my opinion the above constrain imposes policy to the cpufreq driver.

Hmm.

> So thermal or "other solution" [*] shall disable boost when overheated
> and enable it back when things cool down.

yeah..

> [*] @ Viresh & Rafael do you have any idea about the "other solution"
> here?

Not really sure :)

>> There might be platforms where overheating isn't a issue with boost,
>> if it is only enabled while only one cpu is in use.
>
> Could you elaborate more on this?

I meant platforms where chip doesn't heat up much when only one core
is in use and is using boost frequency. So, they may not require support
for thermal layer at all.. But I am not aware of what the ground reality is. If
such systems can be possible or not.

> This would prevent situation when somebody made a mistake and
> had enabled boost, but for some reason had forgotten to
> configure/enable thermal subsystem.
>
> Moreover Kconfig's CONFIG_CPUFREQ_BOOST flag would indicate that user
> enabled boost for some reason and he/she (presumably) knows what is
> doing.

Yeah.. And drivers like ACPI cpufreq and exynos can simply do a select
from their Kconfig entries so that user isn't required to select them.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lukasz Majewski June 28, 2013, 6:54 a.m. UTC | #9
On Fri, 28 Jun 2013 09:20:40 +0530, Viresh Kumar wrote:
> On 27 June 2013 20:12, Lukasz Majewski <l.majewski@samsung.com> wrote:
> > On Thu, 27 Jun 2013 16:46:44 +0530, Viresh Kumar wrote:
> >> @Rafael: We need you to jump into this discussion now, I don't
> >> have a good idea about what we should do :)
> >>
> >> On 27 June 2013 16:28, Lukasz Majewski <l.majewski@samsung.com>
> >> wrote:
> >> > Do you have any idea of how to precisely set the load threshold?
> >>
> >> I thought we are talking about cpu being in idle state.
> >
> > If we _drop_ the idea with thermal subsystem to disable the boost,
> > the logic as far as I've understood shall here be as follow:
> >
> > Only enable BOOST when one CPU load > THRESHOLD_MAX and other CPUs <
> > THRESHOLD_MIN
> 
> Again, I thought that we are talking about cpus being completely idle.
> i.e. in WFI (wait for interrupt) or deeper states.
> 
> > THRESHOLD_MIN & THRESHOLD_MAX are SoC specific.
> >
> > In my opinion the above constrain imposes policy to the cpufreq
> > driver.
> 
> Hmm.
> 
> > So thermal or "other solution" [*] shall disable boost when
> > overheated and enable it back when things cool down.
> 
> yeah..

For me thermal is a good candidate to enable boost again. I only need
to find a proper place for it.

> 
> > [*] @ Viresh & Rafael do you have any idea about the "other
> > solution" here?
> 
> Not really sure :)

Not any single one? Then I would like to propose thermal.

> 
> >> There might be platforms where overheating isn't a issue with
> >> boost, if it is only enabled while only one cpu is in use.
> >
> > Could you elaborate more on this?
> 
> I meant platforms where chip doesn't heat up much when only one core
> is in use and is using boost frequency. So, they may not require
> support for thermal layer at all.. But I am not aware of what the
> ground reality is. If such systems can be possible or not.

Ok.

> 
> > This would prevent situation when somebody made a mistake and
> > had enabled boost, but for some reason had forgotten to
> > configure/enable thermal subsystem.
> >
> > Moreover Kconfig's CONFIG_CPUFREQ_BOOST flag would indicate that
> > user enabled boost for some reason and he/she (presumably) knows
> > what is doing.
> 
> Yeah.. And drivers like ACPI cpufreq and exynos can simply do a select
> from their Kconfig entries so that user isn't required to select them.

Automatic select is not a good option. My goal would be here to define
BOOST as NO by default (at lease for SW managed ones). And allow user
to enable it explicitly.
Lukasz Majewski July 1, 2013, 8:15 a.m. UTC | #10
On Fri, 28 Jun 2013 08:54:57 +0200, Lukasz Majewski wrote:
> > > So thermal or "other solution" [*] shall disable boost when
> > > overheated and enable it back when things cool down.  
> > 
> > yeah..  
> 
> For me thermal is a good candidate to enable boost again. I only need
> to find a proper place for it.

Unfortunately, after careful investigation it turned out, that thermal
is suited to disable boost when overheating is detected. (since it uses
thermal subsystem interrupt to take action).

The problem is with enabling boost again, since thermal (at least at my
board) only reacts on TMU (thermal) interrupt. Thermal thread may be
started, but I don't regard this as a good solution (to extra bloat
thermal).

> 
> >   
> > > [*] @ Viresh & Rafael do you have any idea about the "other
> > > solution" here?  
> > 
> > Not really sure :)  
> 
> Not any single one? Then I would like to propose thermal.
> 

Does anybody have any idea here? As written above, thermal is suitable
to disable boost.

I'd like to bring those three options under discussion:

1. boost attr is always exported -> do not enable boost automatically
   when disabled by thermal (as it was proposed at v4).

2. boost attr is always exported -> find a way to enable boost after
   emergency disablement when thermal detects overheating (newest
   proposition). 

3. boost attr only exported at x86 (when supported)
   boost attr NOT exported via sysfs for SW controlled boost (e.g.
   Exynos ARM).

   Then we only enable/disable boost at kernel and don't need to take
   care of the user space interaction. This scenario is my use case. I
   hadn't planned to expose boost to userspace and use it with LAB as a
   kernel API.
Viresh Kumar July 4, 2013, 5:06 a.m. UTC | #11
Hi Lukasz,

Sorry for being late. Actually I didn't had an answer to your mail
and wanted to go through it with some fresh mind. This is my
first mail this morning, lets see if I can bring something good
into the discussion.

On 1 July 2013 13:45, Lukasz Majewski <l.majewski@samsung.com> wrote:
> Does anybody have any idea here? As written above, thermal is suitable
> to disable boost.

See, one thing is very clear. User space applications aren't responsible
for enabling boost again and again. There has to be a internal mechanism
inside kernel for that.

> I'd like to bring those three options under discussion:
>
> 1. boost attr is always exported -> do not enable boost automatically
>    when disabled by thermal (as it was proposed at v4).

So, that's a problem. I see one more solution to that.
- Create another Macro in cpufreq.c which would contain the time
after which we will autoenable boost.
- So, suppose thermal disabled it due to high temperature (Lets not
change value of sysfs variable boost_enable, but create another
variable like: skip_boost: which means skip boost temporarily).
- Thermal would enable this variable skip_boost.
- Then we will continue to get requests for next frequency and will
check eveytime if we have exceeded time for autoenabling boost.
- If yes, then we disable this variable and start boosting again..
- Then thermal can disable it again later.

This variable (time for autoenable) looks to be more platform
dependent for now, but lets don't make it like that unless somebody
needs it.

> 2. boost attr is always exported -> find a way to enable boost after
>    emergency disablement when thermal detects overheating (newest
>    proposition).

My solution above probably.

> 3. boost attr only exported at x86 (when supported)
>    boost attr NOT exported via sysfs for SW controlled boost (e.g.
>    Exynos ARM).
>
>    Then we only enable/disable boost at kernel and don't need to take
>    care of the user space interaction. This scenario is my use case. I
>    hadn't planned to expose boost to userspace and use it with LAB as a
>    kernel API.

Userspace must have control of this feature after kernel is built. That kernel
image may run for ever without changing in a product.

@Rafael: How crazy do you think my solution is? :)
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lukasz Majewski July 4, 2013, 5:43 a.m. UTC | #12
On Thu, 04 Jul 2013 10:36:53 +0530, Viresh Kumar wrote:
> Hi Lukasz,
> 
> Sorry for being late. Actually I didn't had an answer to your mail
> and wanted to go through it with some fresh mind. This is my
> first mail this morning, lets see if I can bring something good
> into the discussion.
> 
> On 1 July 2013 13:45, Lukasz Majewski <l.majewski@samsung.com> wrote:
> > Does anybody have any idea here? As written above, thermal is
> > suitable to disable boost.
> 
> See, one thing is very clear. User space applications aren't
> responsible for enabling boost again and again. There has to be a
> internal mechanism inside kernel for that.
> 
> > I'd like to bring those three options under discussion:
> >
> > 1. boost attr is always exported -> do not enable boost
> > automatically when disabled by thermal (as it was proposed at v4).
> 
> So, that's a problem. I see one more solution to that.
> - Create another Macro in cpufreq.c which would contain the time
> after which we will autoenable boost.
> - So, suppose thermal disabled it due to high temperature (Lets not
> change value of sysfs variable boost_enable, but create another
> variable like: skip_boost: which means skip boost temporarily).
> - Thermal would enable this variable skip_boost.
> - Then we will continue to get requests for next frequency and will
> check eveytime if we have exceeded time for autoenabling boost.
> - If yes, then we disable this variable and start boosting again..
> - Then thermal can disable it again later.
> 
> This variable (time for autoenable) looks to be more platform
> dependent for now, but lets don't make it like that unless somebody
> needs it.
> 
> > 2. boost attr is always exported -> find a way to enable boost after
> >    emergency disablement when thermal detects overheating (newest
> >    proposition).
> 
> My solution above probably.

This is a possible solution, but I've already modified thermal code a
bit and found a solution for the problem.

I use thermal workqueue (which is already in place anyway) to enable the
boost again.
Due to that I can provide behaviour similar to HW controlled boost.

Patches with this solution are already prepared. I will post them in a
few hours. Ok?

> 
> > 3. boost attr only exported at x86 (when supported)
> >    boost attr NOT exported via sysfs for SW controlled boost (e.g.
> >    Exynos ARM).
> >
> >    Then we only enable/disable boost at kernel and don't need to
> > take care of the user space interaction. This scenario is my use
> > case. I hadn't planned to expose boost to userspace and use it with
> > LAB as a kernel API.
> 
> Userspace must have control of this feature after kernel is built.
> That kernel image may run for ever without changing in a product.
> 
> @Rafael: How crazy do you think my solution is? :)
Viresh Kumar July 4, 2013, 6:28 a.m. UTC | #13
On 4 July 2013 11:13, Lukasz Majewski <l.majewski@samsung.com> wrote:
> This is a possible solution, but I've already modified thermal code a
> bit and found a solution for the problem.
>
> I use thermal workqueue (which is already in place anyway) to enable the
> boost again.
> Due to that I can provide behaviour similar to HW controlled boost.
>
> Patches with this solution are already prepared. I will post them in a
> few hours. Ok?

I don't see any harm in checking them out :)
Please send them.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lukasz Majewski July 4, 2013, 6:49 a.m. UTC | #14
On Thu, 04 Jul 2013 11:58:53 +0530, Viresh Kumar wrote:

> On 4 July 2013 11:13, Lukasz Majewski <l.majewski@samsung.com> wrote:
> > This is a possible solution, but I've already modified thermal code
> > a bit and found a solution for the problem.
> >
> > I use thermal workqueue (which is already in place anyway) to
> > enable the boost again.
> > Due to that I can provide behaviour similar to HW controlled boost.
> >
> > Patches with this solution are already prepared. I will post them
> > in a few hours. Ok?
> 
> I don't see any harm in checking them out :)
> Please send them.

:-)
Rafael Wysocki July 4, 2013, 12:50 p.m. UTC | #15
On Thursday, July 04, 2013 10:36:53 AM Viresh Kumar wrote:
> Hi Lukasz,
> 
> Sorry for being late. Actually I didn't had an answer to your mail
> and wanted to go through it with some fresh mind. This is my
> first mail this morning, lets see if I can bring something good
> into the discussion.
> 
> On 1 July 2013 13:45, Lukasz Majewski <l.majewski@samsung.com> wrote:
> > Does anybody have any idea here? As written above, thermal is suitable
> > to disable boost.
> 
> See, one thing is very clear. User space applications aren't responsible
> for enabling boost again and again. There has to be a internal mechanism
> inside kernel for that.
> 
> > I'd like to bring those three options under discussion:
> >
> > 1. boost attr is always exported -> do not enable boost automatically
> >    when disabled by thermal (as it was proposed at v4).
> 
> So, that's a problem. I see one more solution to that.
> - Create another Macro in cpufreq.c which would contain the time
> after which we will autoenable boost.

Well, how can we tell what time is correct here?  It may depend on factors not
under our control or even variable, like the ambient temperature, so surely
it might not be a hard coded value.

> - So, suppose thermal disabled it due to high temperature (Lets not
> change value of sysfs variable boost_enable, but create another
> variable like: skip_boost: which means skip boost temporarily).

What about "block_boost"?  Other than that, this particular item is a good idea
I think.

> - Thermal would enable this variable skip_boost.
> - Then we will continue to get requests for next frequency and will
> check eveytime if we have exceeded time for autoenabling boost.
> - If yes, then we disable this variable and start boosting again..

I wonder if we could use thermal watermarks instead?  Such that a high
watermark would cause block_boost to be set and a low watermark would cause
it to be reset?  And the thermal layer would be in control of both watermarks?

> - Then thermal can disable it again later.

Thanks,
Rafael
diff mbox

Patch

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 9141d33..f785273 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -48,6 +48,7 @@  static DEFINE_PER_CPU(char[CPUFREQ_NAME_LEN], cpufreq_cpu_governor);
 #endif
 static DEFINE_RWLOCK(cpufreq_driver_lock);
 static LIST_HEAD(cpufreq_policy_list);
+static cpumask_t cpufreq_busy_cpus;
 
 /*
  * cpu_policy_rwsem is a per CPU reader-writer semaphore designed to cure
@@ -317,6 +318,13 @@  EXPORT_SYMBOL_GPL(cpufreq_notify_transition);
 /*********************************************************************
  *                          SYSFS INTERFACE                          *
  *********************************************************************/
+ssize_t show_busy_cpus(struct kobject *kobj,
+				 struct attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", cpufreq_num_busy_cpu());
+}
+define_one_global_ro(busy_cpus);
+
 ssize_t show_boost(struct kobject *kobj,
 				 struct attribute *attr, char *buf)
 {
@@ -1980,6 +1988,17 @@  int cpufreq_boost_enabled(void)
 	return boost_enabled;
 }
 
+int cpufreq_num_busy_cpu(void)
+{
+	return cpumask_weight(&cpufreq_busy_cpus);
+}
+
+void cpufreq_set_busy_cpu(int cpu, int val)
+{
+	val ? cpumask_set_cpu(cpu, &cpufreq_busy_cpus) :
+		cpumask_clear_cpu(cpu, &cpufreq_busy_cpus);
+}
+
 /*********************************************************************
  *               REGISTER / UNREGISTER CPUFREQ DRIVER                *
  *********************************************************************/
@@ -2019,6 +2038,13 @@  int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 	cpufreq_driver = driver_data;
 	write_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
+	ret = cpufreq_sysfs_create_file(&(busy_cpus.attr));
+	if (ret) {
+		pr_err("%s: cannot register global busy_cpus sysfs file\n",
+		       __func__);
+		goto err_null_driver;
+	}
+
 	if (!cpufreq_driver->boost_supported)
 		boost.attr.mode = 0444;
 
@@ -2026,7 +2052,7 @@  int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 	if (ret) {
 		pr_err("%s: cannot register global boost sysfs file\n",
 		       __func__);
-		goto err_null_driver;
+		goto err_busy_idle_unreg;
 	}
 
 	ret = subsys_interface_register(&cpufreq_interface);
@@ -2058,6 +2084,8 @@  int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 	return 0;
 err_if_unreg:
 	subsys_interface_unregister(&cpufreq_interface);
+err_busy_idle_unreg:
+	cpufreq_sysfs_remove_file(&(busy_cpus.attr));
 err_null_driver:
 	write_lock_irqsave(&cpufreq_driver_lock, flags);
 	cpufreq_driver = NULL;
@@ -2086,6 +2114,7 @@  int cpufreq_unregister_driver(struct cpufreq_driver *driver)
 
 	subsys_interface_unregister(&cpufreq_interface);
 
+	cpufreq_sysfs_remove_file(&(busy_cpus.attr));
 	cpufreq_sysfs_remove_file(&(boost.attr));
 	unregister_hotcpu_notifier(&cpufreq_cpu_notifier);
 
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 077cea7..3402533 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -148,6 +148,7 @@  void dbs_check_cpu(struct dbs_data *dbs_data, int cpu)
 			continue;
 
 		load = 100 * (wall_time - idle_time) / wall_time;
+		cpufreq_set_busy_cpu(j, load > 90 ? 1 : 0);
 
 		if (dbs_data->cdata->governor == GOV_ONDEMAND) {
 			int freq_avg = __cpufreq_driver_getavg(policy, j);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 4783c4c..536abfc 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -436,6 +436,9 @@  int cpufreq_frequency_table_target(struct cpufreq_policy *policy,
 int cpufreq_boost_trigger_state(int state);
 int cpufreq_boost_supported(void);
 int cpufreq_boost_enabled(void);
+
+void cpufreq_set_busy_cpu(int cpu, int val);
+int cpufreq_num_busy_cpu(void);
 /* the following 3 funtions are for cpufreq core use only */
 struct cpufreq_frequency_table *cpufreq_frequency_get_table(unsigned int cpu);