diff mbox series

[2/2] cpufreq: Specify default governor on command line

Message ID 20200615165554.228063-3-qperret@google.com (mailing list archive)
State Superseded, archived
Headers show
Series cpufreq: Specify the default governor on command line | expand

Commit Message

Quentin Perret June 15, 2020, 4:55 p.m. UTC
Currently, the only way to specify the default CPUfreq governor is via
Kconfig options, which suits users who can build the kernel themselves
perfectly.

However, for those who use a distro-like kernel (such as Android, with
the Generic Kernel Image project), the only way to use a different
default is to boot to userspace, and to then switch using the sysfs
interface. Being able to specify the default governor on the command
line, like is the case for cpuidle, would enable those users to specify
their governor of choice earlier on, and to simplify slighlty the
userspace boot procedure.

To support this use-case, add a kernel command line parameter enabling
to specify a default governor for CPUfreq, which takes precedence over
the builtin default.

This implementation has one notable limitation: the default governor
must be registered before the driver. This is solved for builtin
governors and drivers using appropriate *_initcall() functions. And in
the modular case, this must be reflected as a constraint on the module
loading order.

Signed-off-by: Quentin Perret <qperret@google.com>
---
 .../admin-guide/kernel-parameters.txt         |  5 +++
 Documentation/admin-guide/pm/cpufreq.rst      |  6 ++--
 drivers/cpufreq/cpufreq.c                     | 34 ++++++++++++++++---
 3 files changed, 37 insertions(+), 8 deletions(-)

Comments

Quentin Perret June 15, 2020, 5:41 p.m. UTC | #1
On Monday 15 Jun 2020 at 17:55:54 (+0100), Quentin Perret wrote:
>  static int cpufreq_init_governor(struct cpufreq_policy *policy)
>  {
>  	int ret;
> @@ -2701,6 +2721,8 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
>  
>  	if (driver_data->setpolicy)
>  		driver_data->flags |= CPUFREQ_CONST_LOOPS;
> +	else
> +		cpufreq_get_default_governor();

Looking at this again, it appears that the comment above
cpufreq_parse_governor() confused me a bit -- this needs doing
unconditionally I think.

I'll fix it in v2.

Thanks,
Quentin
Viresh Kumar June 16, 2020, 4:31 a.m. UTC | #2
On 15-06-20, 17:55, Quentin Perret wrote:
> +static void cpufreq_get_default_governor(void)
> +{
> +	default_governor = cpufreq_parse_governor(cpufreq_param_governor);
> +	if (!default_governor) {
> +		if (*cpufreq_param_governor)
> +			pr_warn("Failed to find %s\n", cpufreq_param_governor);
> +		default_governor = cpufreq_default_governor();

A module_get() never happened for this case and so maybe a
module_put() should never get called.

> +	}
> +}
> +
> +static void cpufreq_put_default_governor(void)
> +{
> +	if (!default_governor)
> +		return;
> +	module_put(default_governor->owner);
> +	default_governor = NULL;
> +}
> +
>  static int cpufreq_init_governor(struct cpufreq_policy *policy)
>  {
>  	int ret;
> @@ -2701,6 +2721,8 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
>  
>  	if (driver_data->setpolicy)
>  		driver_data->flags |= CPUFREQ_CONST_LOOPS;
> +	else
> +		cpufreq_get_default_governor();
>  
>  	if (cpufreq_boost_supported()) {
>  		ret = create_boost_sysfs_file();
> @@ -2769,6 +2791,7 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver)
>  	subsys_interface_unregister(&cpufreq_interface);
>  	remove_boost_sysfs_file();
>  	cpuhp_remove_state_nocalls_cpuslocked(hp_online);
> +	cpufreq_put_default_governor();
>  
>  	write_lock_irqsave(&cpufreq_driver_lock, flags);
>  
> @@ -2792,4 +2815,5 @@ static int __init cpufreq_core_init(void)
>  	return 0;
>  }

And since this is a per boot thing, there is perhaps no need of doing
these at driver register/unregister, I would rather do it at:
cpufreq_core_init() time itself and so we will never need to run
cpufreq_put_default_governor() and so can be removed.

And another thing I am not able to understand (despite you commenting
about that in the commit log) is what happens if the default governor
chosen is built as a module ?
Quentin Perret June 16, 2020, 8:31 a.m. UTC | #3
Hey Viresh,

On Tuesday 16 Jun 2020 at 10:01:43 (+0530), Viresh Kumar wrote:
> On 15-06-20, 17:55, Quentin Perret wrote:
> > +static void cpufreq_get_default_governor(void)
> > +{
> > +	default_governor = cpufreq_parse_governor(cpufreq_param_governor);
> > +	if (!default_governor) {
> > +		if (*cpufreq_param_governor)
> > +			pr_warn("Failed to find %s\n", cpufreq_param_governor);
> > +		default_governor = cpufreq_default_governor();
> 
> A module_get() never happened for this case and so maybe a
> module_put() should never get called.

Correct, however cpufreq_default_governor() being a weak function, we're
basically guaranteed the governor we get from there is builtin, so
gov->owner is NULL. That is, module_put() is not actively useful, but it
doesn't harm. So I figured that should be fine. That could definitely
use a comment, though :)

> > +	}
> > +}
> > +
> > +static void cpufreq_put_default_governor(void)
> > +{
> > +	if (!default_governor)
> > +		return;
> > +	module_put(default_governor->owner);
> > +	default_governor = NULL;
> > +}
> > +
> >  static int cpufreq_init_governor(struct cpufreq_policy *policy)
> >  {
> >  	int ret;
> > @@ -2701,6 +2721,8 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
> >  
> >  	if (driver_data->setpolicy)
> >  		driver_data->flags |= CPUFREQ_CONST_LOOPS;
> > +	else
> > +		cpufreq_get_default_governor();
> >  
> >  	if (cpufreq_boost_supported()) {
> >  		ret = create_boost_sysfs_file();
> > @@ -2769,6 +2791,7 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver)
> >  	subsys_interface_unregister(&cpufreq_interface);
> >  	remove_boost_sysfs_file();
> >  	cpuhp_remove_state_nocalls_cpuslocked(hp_online);
> > +	cpufreq_put_default_governor();
> >  
> >  	write_lock_irqsave(&cpufreq_driver_lock, flags);
> >  
> > @@ -2792,4 +2815,5 @@ static int __init cpufreq_core_init(void)
> >  	return 0;
> >  }
> 
> And since this is a per boot thing, there is perhaps no need of doing
> these at driver register/unregister, I would rather do it at:
> cpufreq_core_init() time itself and so we will never need to run
> cpufreq_put_default_governor() and so can be removed.

Right, so the reason I avoided cpufreq_core_init() was because it is
called at core_initcall() time, which means I can't really assume the
governors have been loaded by that time. By waiting for the driver to
probe before detecting the default gov, we get that nice ordering. But
yes, it feels odd to have it here :/

Thinking about it more, the natural fit for this would rather be the
register/unregister path for governors directly. If that sounds good to
you (?) I'll try to move it there in v2.

> And another thing I am not able to understand (despite you commenting
> about that in the commit log) is what happens if the default governor
> chosen is built as a module ?

So the answer is 'it depends'. If the driver is built as a module too,
then you should load the governor module first, and then the driver
module, and everything will work just fine.

But in the case where the governor is loaded _after_ the driver (either
because we got the module ordering wrong, or because the driver is
builtin), then the policies will be initialized with the builtin
default, and nothing special will happen when the governor module is
loaded.

That behaviour very much is open for discussion, though. A possible
alternative would be to automatically switch all policies to the default
governor upon loading. That would have the nice benefit or removing the
ordering dependency, but that is more involved and I didn't have a
use-case for it, so I went for the simpler option ('the-default
governor-needs-to-be-registered-before-the-policies-are-created').

Thoughts?

Thanks,
Quentin
Viresh Kumar June 16, 2020, 9:27 a.m. UTC | #4
On 16-06-20, 09:31, Quentin Perret wrote:
> Right, so the reason I avoided cpufreq_core_init() was because it is
> called at core_initcall() time, which means I can't really assume the
> governors have been loaded by that time. By waiting for the driver to
> probe before detecting the default gov, we get that nice ordering. But
> yes, it feels odd to have it here :/
> 
> Thinking about it more, the natural fit for this would rather be the
> register/unregister path for governors directly. If that sounds good to
> you (?) I'll try to move it there in v2.

There is another problem here which we need to look at. Any governor
which is built as a module and isn't currently used, should be allowed
to unload. And this needs to be tested by you as well, should be easy
enough.

With the current implementation, you take a reference to the default
governor when the driver is registered and drop it only when the
driver goes away. Which means we won't be able to unload the module of
the governor even if it isn't used. Which is wrong. The solution I
proposed had the same issue as well.

You need to figure out a way where we don't need to keep holding the
module hostage even when it isn't used. I see two ways at least for
the same:

- Do that from the existing place: cpufreq_init_policy().

- And I think this can be done from governor-register/unregister as
  well.

Second one sounds good, if it is feasible to do that.
Quentin Perret June 16, 2020, 9:48 a.m. UTC | #5
On Tuesday 16 Jun 2020 at 14:57:59 (+0530), Viresh Kumar wrote:
> There is another problem here which we need to look at. Any governor
> which is built as a module and isn't currently used, should be allowed
> to unload. And this needs to be tested by you as well, should be easy
> enough.
> 
> With the current implementation, you take a reference to the default
> governor when the driver is registered and drop it only when the
> driver goes away. Which means we won't be able to unload the module of
> the governor even if it isn't used. Which is wrong. The solution I
> proposed had the same issue as well.
> 
> You need to figure out a way where we don't need to keep holding the
> module hostage even when it isn't used. I see two ways at least for
> the same:
> 
> - Do that from the existing place: cpufreq_init_policy().
> 
> - And I think this can be done from governor-register/unregister as
>   well.
> 
> Second one sounds good, if it is feasible to do that.

Good point.

I'm thinking something along the lines of:

---8<---
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 0f05caedc320..a9219404e07f 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2340,6 +2340,11 @@ int cpufreq_register_governor(struct cpufreq_governor *governor)
 		list_add(&governor->governor_list, &cpufreq_governor_list);
 	}
 
+	if (!strncasecmp(cpufreq_param_governor, governor->name, CPUFREQ_NAME_LEN))
+		default_governor = governor;
+	else if (!default_governor && cpufreq_default_governor() == governor)
+		default_governor = cpufreq_default_governor();
+
 	mutex_unlock(&cpufreq_governor_mutex);
 	return err;
 }
@@ -2368,6 +2373,8 @@ void cpufreq_unregister_governor(struct cpufreq_governor *governor)
 
 	mutex_lock(&cpufreq_governor_mutex);
 	list_del(&governor->governor_list);
+	if (governor == default_governor)
+		default_governor = cpufreq_default_governor();
 	mutex_unlock(&cpufreq_governor_mutex);
 }
 EXPORT_SYMBOL_GPL(cpufreq_unregister_governor);
--->8---

should do the trick. That removes the unnecessary reference count, and
feels like a good place to hook things -- that is how cpuidle does it
too IIRC.

I'll double check the locking/synchronization, but that shouldn't be too
bad (famous last words).

Cheers,
Quentin
Viresh Kumar June 16, 2020, 9:54 a.m. UTC | #6
On 16-06-20, 10:48, Quentin Perret wrote:
> ---8<---
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 0f05caedc320..a9219404e07f 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -2340,6 +2340,11 @@ int cpufreq_register_governor(struct cpufreq_governor *governor)
>  		list_add(&governor->governor_list, &cpufreq_governor_list);
>  	}
>  
> +	if (!strncasecmp(cpufreq_param_governor, governor->name, CPUFREQ_NAME_LEN))
> +		default_governor = governor;
> +	else if (!default_governor && cpufreq_default_governor() == governor)
> +		default_governor = cpufreq_default_governor();

Instead of the else part here, maybe just do this from
cpufreq_core_init() only once, and so we will always have
default_governor set.

> +
>  	mutex_unlock(&cpufreq_governor_mutex);
>  	return err;
>  }
> @@ -2368,6 +2373,8 @@ void cpufreq_unregister_governor(struct cpufreq_governor *governor)
>  
>  	mutex_lock(&cpufreq_governor_mutex);
>  	list_del(&governor->governor_list);
> +	if (governor == default_governor)
> +		default_governor = cpufreq_default_governor();
>  	mutex_unlock(&cpufreq_governor_mutex);
>  }
>  EXPORT_SYMBOL_GPL(cpufreq_unregister_governor);
> --->8---
> 
> should do the trick. That removes the unnecessary reference count, and
> feels like a good place to hook things -- that is how cpuidle does it
> too IIRC.
> 
> I'll double check the locking/synchronization, but that shouldn't be too
> bad (famous last words).
> 
> Cheers,
> Quentin
Quentin Perret June 16, 2020, 9:56 a.m. UTC | #7
On Tuesday 16 Jun 2020 at 15:24:38 (+0530), Viresh Kumar wrote:
> On 16-06-20, 10:48, Quentin Perret wrote:
> > ---8<---
> > diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> > index 0f05caedc320..a9219404e07f 100644
> > --- a/drivers/cpufreq/cpufreq.c
> > +++ b/drivers/cpufreq/cpufreq.c
> > @@ -2340,6 +2340,11 @@ int cpufreq_register_governor(struct cpufreq_governor *governor)
> >  		list_add(&governor->governor_list, &cpufreq_governor_list);
> >  	}
> >  
> > +	if (!strncasecmp(cpufreq_param_governor, governor->name, CPUFREQ_NAME_LEN))
> > +		default_governor = governor;
> > +	else if (!default_governor && cpufreq_default_governor() == governor)
> > +		default_governor = cpufreq_default_governor();
> 
> Instead of the else part here, maybe just do this from
> cpufreq_core_init() only once, and so we will always have
> default_governor set.

Sounds good.

Thanks!
Quentin
Quentin Perret June 23, 2020, 9:25 a.m. UTC | #8
Hi,

Thanks for the report.

On Monday 22 Jun 2020 at 08:54:57 (+0800), kernel test robot wrote:
> Greeting,
> 
> FYI, we noticed the following commit (built with gcc-9):
> 
> commit: d83f959b5e7a6378a4afbff23de2a2d064d95749 ("[PATCH 2/2] cpufreq: Specify default governor on command line")
> url: https://github.com/0day-ci/linux/commits/Quentin-Perret/cpufreq-Specify-the-default-governor-on-command-line/20200616-005920
> base: https://git.kernel.org/cgit/linux/kernel/git/rafael/linux-pm.git linux-next
> 
> in testcase: kernel-selftests
> with following parameters:
> 
> 	group: kselftests-x86
> 	ucode: 0xdc
> 
> test-description: The kernel contains a set of "self tests" under the tools/testing/selftests/ directory. These are intended to be small unit tests to exercise individual code paths in the kernel.
> test-url: https://www.kernel.org/doc/Documentation/kselftest.txt
> 
> 
> on test machine: 8 threads Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz with 16G memory
> 
> caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):
> 
> 
> 
> 
> If you fix the issue, kindly add following tag
> Reported-by: kernel test robot <rong.a.chen@intel.com>
> 
> 
> 
> [    8.715369] intel_pstate: Intel P-state driver initializing
> [    8.721146] cpufreq: cpufreq_online: Failed to initialize policy for cpu: 0 (-61)
> [    8.728900] cpufreq: cpufreq_online: Failed to initialize policy for cpu: 1 (-61)
> [    8.736615] cpufreq: cpufreq_online: Failed to initialize policy for cpu: 2 (-61)
> [    8.744400] cpufreq: cpufreq_online: Failed to initialize policy for cpu: 3 (-61)
> [    8.752222] cpufreq: cpufreq_online: Failed to initialize policy for cpu: 4 (-61)
> [    8.760010] cpufreq: cpufreq_online: Failed to initialize policy for cpu: 5 (-61)
> [    8.768077] cpufreq: cpufreq_online: Failed to initialize policy for cpu: 6 (-61)
> [    8.775891] cpufreq: cpufreq_online: Failed to initialize policy for cpu: 7 (-61)

That, I think, is because of the issue I reported here:

    https://lore.kernel.org/lkml/20200615174141.GA235811@google.com/

The v2 (to be posted shortly) will address this.

Thanks,
Quentin
diff mbox series

Patch

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index fb95fad81c79..5fd3c9f187eb 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -703,6 +703,11 @@ 
 	cpufreq.off=1	[CPU_FREQ]
 			disable the cpufreq sub-system
 
+	cpufreq.default_governor=
+			[CPU_FREQ] Name of the default cpufreq governor to use.
+			This governor must be registered in the kernel before
+			the cpufreq driver probes.
+
 	cpu_init_udelay=N
 			[X86] Delay for N microsec between assert and de-assert
 			of APIC INIT to start processors.  This delay occurs
diff --git a/Documentation/admin-guide/pm/cpufreq.rst b/Documentation/admin-guide/pm/cpufreq.rst
index 0c74a7784964..368e612145d2 100644
--- a/Documentation/admin-guide/pm/cpufreq.rst
+++ b/Documentation/admin-guide/pm/cpufreq.rst
@@ -147,9 +147,9 @@  CPUs in it.
 
 The next major initialization step for a new policy object is to attach a
 scaling governor to it (to begin with, that is the default scaling governor
-determined by the kernel configuration, but it may be changed later
-via ``sysfs``).  First, a pointer to the new policy object is passed to the
-governor's ``->init()`` callback which is expected to initialize all of the
+determined by the kernel command line or configuration, but it may be changed
+later via ``sysfs``).  First, a pointer to the new policy object is passed to
+the governor's ``->init()`` callback which is expected to initialize all of the
 data structures necessary to handle the given policy and, possibly, to add
 a governor ``sysfs`` interface to it.  Next, the governor is started by
 invoking its ``->start()`` callback.
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 0128de3603df..0f05caedc320 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -50,6 +50,9 @@  static LIST_HEAD(cpufreq_governor_list);
 #define for_each_governor(__governor)				\
 	list_for_each_entry(__governor, &cpufreq_governor_list, governor_list)
 
+static char cpufreq_param_governor[CPUFREQ_NAME_LEN];
+static struct cpufreq_governor *default_governor;
+
 /**
  * The "cpufreq driver" - the arch- or hardware-dependent low
  * level driver of CPUFreq support, and its spinlock. This lock
@@ -1055,7 +1058,6 @@  __weak struct cpufreq_governor *cpufreq_default_governor(void)
 
 static int cpufreq_init_policy(struct cpufreq_policy *policy)
 {
-	struct cpufreq_governor *def_gov = cpufreq_default_governor();
 	struct cpufreq_governor *gov = NULL;
 	unsigned int pol = CPUFREQ_POLICY_UNKNOWN;
 
@@ -1065,8 +1067,8 @@  static int cpufreq_init_policy(struct cpufreq_policy *policy)
 		if (gov) {
 			pr_debug("Restoring governor %s for cpu %d\n",
 				 policy->governor->name, policy->cpu);
-		} else if (def_gov) {
-			gov = def_gov;
+		} else if (default_governor) {
+			gov = default_governor;
 		} else {
 			return -ENODATA;
 		}
@@ -1074,8 +1076,8 @@  static int cpufreq_init_policy(struct cpufreq_policy *policy)
 		/* Use the default policy if there is no last_policy. */
 		if (policy->last_policy) {
 			pol = policy->last_policy;
-		} else if (def_gov) {
-			pol = cpufreq_parse_policy(def_gov->name);
+		} else if (default_governor) {
+			pol = cpufreq_parse_policy(default_governor->name);
 			/*
 			 * In case the default governor is neiter "performance"
 			 * nor "powersave", fall back to the initial policy
@@ -2196,6 +2198,24 @@  __weak struct cpufreq_governor *cpufreq_fallback_governor(void)
 	return NULL;
 }
 
+static void cpufreq_get_default_governor(void)
+{
+	default_governor = cpufreq_parse_governor(cpufreq_param_governor);
+	if (!default_governor) {
+		if (*cpufreq_param_governor)
+			pr_warn("Failed to find %s\n", cpufreq_param_governor);
+		default_governor = cpufreq_default_governor();
+	}
+}
+
+static void cpufreq_put_default_governor(void)
+{
+	if (!default_governor)
+		return;
+	module_put(default_governor->owner);
+	default_governor = NULL;
+}
+
 static int cpufreq_init_governor(struct cpufreq_policy *policy)
 {
 	int ret;
@@ -2701,6 +2721,8 @@  int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 
 	if (driver_data->setpolicy)
 		driver_data->flags |= CPUFREQ_CONST_LOOPS;
+	else
+		cpufreq_get_default_governor();
 
 	if (cpufreq_boost_supported()) {
 		ret = create_boost_sysfs_file();
@@ -2769,6 +2791,7 @@  int cpufreq_unregister_driver(struct cpufreq_driver *driver)
 	subsys_interface_unregister(&cpufreq_interface);
 	remove_boost_sysfs_file();
 	cpuhp_remove_state_nocalls_cpuslocked(hp_online);
+	cpufreq_put_default_governor();
 
 	write_lock_irqsave(&cpufreq_driver_lock, flags);
 
@@ -2792,4 +2815,5 @@  static int __init cpufreq_core_init(void)
 	return 0;
 }
 module_param(off, int, 0444);
+module_param_string(default_governor, cpufreq_param_governor, CPUFREQ_NAME_LEN, 0444);
 core_initcall(cpufreq_core_init);