
[v10,15/15] OPTIONAL: cpufreq: dt: Register an Energy Model

Message ID 20181203095628.11858-16-quentin.perret@arm.com (mailing list archive)
State Not Applicable, archived
Series Energy Aware Scheduling

Commit Message

Quentin Perret Dec. 3, 2018, 9:56 a.m. UTC
*******************************************************************
* This patch illustrates the usage of the newly introduced Energy *
* Model framework and isn't supposed to be merged as-is.          *
*******************************************************************

The Energy Model framework provides an API to register the active power
of CPUs. Call this API from the cpufreq-dt driver with an estimate
of the power as P = C * V^2 * f, where C, V, and f are respectively the
capacitance of the CPU and the voltage and frequency of the OPP.

The CPU capacitance is read from the "dynamic-power-coefficient" DT
binding (originally introduced for thermal/IPA), and the voltage and
frequency values from PM_OPP.
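
For example (purely illustrative numbers, not from any real platform):
with a coefficient of 100 uW/MHz/V^2 from DT and an OPP at 1100 mV and
1500 MHz, the estimate is P = 100 * 1100^2 * 1500 / 10^9 ~= 181 mW,
which is the integer arithmetic performed by of_est_power() below.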

Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Quentin Perret <quentin.perret@arm.com>
---
 drivers/cpufreq/cpufreq-dt.c | 48 +++++++++++++++++++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)

Comments

Matthias Kaehlcke Jan. 8, 2019, 8:38 p.m. UTC | #1
Hi Quentin,

On Mon, Dec 03, 2018 at 09:56:28AM +0000, Quentin Perret wrote:
> *******************************************************************
> * This patch illustrates the usage of the newly introduced Energy *
> * Model framework and isn't supposed to be merged as-is.          *
> *******************************************************************
> 
> The Energy Model framework provides an API to register the active power
> of CPUs. Call this API from the cpufreq-dt driver with an estimate
> of the power as P = C * V^2 * f, where C, V, and f are respectively the
> capacitance of the CPU and the voltage and frequency of the OPP.
> 
> The CPU capacitance is read from the "dynamic-power-coefficient" DT
> binding (originally introduced for thermal/IPA), and the voltage and
> frequency values from PM_OPP.
> 
> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
> Cc: Viresh Kumar <viresh.kumar@linaro.org>
> Signed-off-by: Quentin Perret <quentin.perret@arm.com>
> ---
>  drivers/cpufreq/cpufreq-dt.c | 48 +++++++++++++++++++++++++++++++++++-
>  1 file changed, 47 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
> index e58bfcb1169e..4cfef5554d86 100644
> --- a/drivers/cpufreq/cpufreq-dt.c
> +++ b/drivers/cpufreq/cpufreq-dt.c
> @@ -16,6 +16,7 @@
>  #include <linux/cpu_cooling.h>
>  #include <linux/cpufreq.h>
>  #include <linux/cpumask.h>
> +#include <linux/energy_model.h>
>  #include <linux/err.h>
>  #include <linux/module.h>
>  #include <linux/of.h>
> @@ -150,8 +151,50 @@ static int resources_available(void)
>  	return 0;
>  }
>  
> +static int __maybe_unused of_est_power(unsigned long *mW, unsigned long *KHz,
> +				       int cpu)
> +{
> +	unsigned long mV, Hz, MHz;
> +	struct device *cpu_dev;
> +	struct dev_pm_opp *opp;
> +	struct device_node *np;
> +	u32 cap;
> +	u64 tmp;
> +
> +	cpu_dev = get_cpu_device(cpu);
> +	if (!cpu_dev)
> +		return -ENODEV;
> +
> +	np = of_node_get(cpu_dev->of_node);
> +	if (!np)
> +		return -EINVAL;
> +
> +	if (of_property_read_u32(np, "dynamic-power-coefficient", &cap))
> +		return -EINVAL;
> +
> +	Hz = *KHz * 1000;
> +	opp = dev_pm_opp_find_freq_ceil(cpu_dev, &Hz);
> +	if (IS_ERR(opp))
> +		return -EINVAL;
> +
> +	mV = dev_pm_opp_get_voltage(opp) / 1000;
> +	dev_pm_opp_put(opp);
> +	if (!mV)
> +		return -EINVAL;
> +
> +	MHz = Hz / 1000000;
> +	tmp = (u64)cap * mV * mV * MHz;
> +	do_div(tmp, 1000000000);
> +
> +	*mW = (unsigned long)tmp;
> +	*KHz = Hz / 1000;
> +
> +	return 0;
> +}
> +
>  static int cpufreq_init(struct cpufreq_policy *policy)
>  {
> +	struct em_data_callback em_cb = EM_DATA_CB(of_est_power);
>  	struct cpufreq_frequency_table *freq_table;
>  	struct opp_table *opp_table = NULL;
>  	struct private_data *priv;
> @@ -160,7 +203,7 @@ static int cpufreq_init(struct cpufreq_policy *policy)
>  	unsigned int transition_latency;
>  	bool fallback = false;
>  	const char *name;
> -	int ret;
> +	int ret, nr_opp;
>  
>  	cpu_dev = get_cpu_device(policy->cpu);
>  	if (!cpu_dev) {
> @@ -237,6 +280,7 @@ static int cpufreq_init(struct cpufreq_policy *policy)
>  		ret = -EPROBE_DEFER;
>  		goto out_free_opp;
>  	}
> +	nr_opp = ret;
>  
>  	if (fallback) {
>  		cpumask_setall(policy->cpus);
> @@ -280,6 +324,8 @@ static int cpufreq_init(struct cpufreq_policy *policy)
>  	policy->cpuinfo.transition_latency = transition_latency;
>  	policy->dvfs_possible_from_any_cpu = true;
>  
> +	em_register_perf_domain(policy->cpus, nr_opp, &em_cb);

Shouldn't there also be a function to unregister a perf domain to be
called from cpufreq_exit()?

->exit() is called on suspend when the CPUs are taken offline, and
->init() on resume, hence em_register_perf_domain() is called multiple
times, but without doing any cleanup.

Cheers

Matthias
Quentin Perret Jan. 9, 2019, 10:57 a.m. UTC | #2
Hi Matthias,

On Tuesday 08 Jan 2019 at 12:38:13 (-0800), Matthias Kaehlcke wrote:
> >  static int cpufreq_init(struct cpufreq_policy *policy)
> >  {
> > +	struct em_data_callback em_cb = EM_DATA_CB(of_est_power);
> >  	struct cpufreq_frequency_table *freq_table;
> >  	struct opp_table *opp_table = NULL;
> >  	struct private_data *priv;
> > @@ -160,7 +203,7 @@ static int cpufreq_init(struct cpufreq_policy *policy)
> >  	unsigned int transition_latency;
> >  	bool fallback = false;
> >  	const char *name;
> > -	int ret;
> > +	int ret, nr_opp;
> >  
> >  	cpu_dev = get_cpu_device(policy->cpu);
> >  	if (!cpu_dev) {
> > @@ -237,6 +280,7 @@ static int cpufreq_init(struct cpufreq_policy *policy)
> >  		ret = -EPROBE_DEFER;
> >  		goto out_free_opp;
> >  	}
> > +	nr_opp = ret;
> >  
> >  	if (fallback) {
> >  		cpumask_setall(policy->cpus);
> > @@ -280,6 +324,8 @@ static int cpufreq_init(struct cpufreq_policy *policy)
> >  	policy->cpuinfo.transition_latency = transition_latency;
> >  	policy->dvfs_possible_from_any_cpu = true;
> >  
> > +	em_register_perf_domain(policy->cpus, nr_opp, &em_cb);
> 
> Shouldn't there also be a function to unregister a perf domain to be
> called from cpufreq_exit()?
> 
> ->exit() is called on suspend when the CPUs are taken offline, and
> ->init() on resume, hence em_register_perf_domain() is called multiple
> times, but without doing any cleanup.

Right, but this is safe to do as em_register_perf_domain() checks for
the existence of the domain straight away. So the second time you call
it, em_register_perf_domain() should just bail out. Are you seeing an
issue with this?

As of now, the EM framework is really simple -- it just allocates the
pd tables once during boot, and they stay in memory forever. While
arguably less than optimal, that makes things a whole lot easier to
manage on the client side (i.e. the scheduler) w/o needing RCU
protection or similar.
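
For reference, the existence check looks roughly like this in the EM
framework patch earlier in the series (paraphrased, see the actual
patch for details):

	for_each_cpu(cpu, span) {
		/* Don't register again an existing domain. */
		if (READ_ONCE(per_cpu(em_data, cpu)))
			return -EEXIST;
		...
	}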

Thanks,
Quentin
Matthias Kaehlcke Jan. 9, 2019, 6:14 p.m. UTC | #3
On Wed, Jan 09, 2019 at 10:57:57AM +0000, Quentin Perret wrote:
> Hi Matthias,
> 
> On Tuesday 08 Jan 2019 at 12:38:13 (-0800), Matthias Kaehlcke wrote:
> > >  static int cpufreq_init(struct cpufreq_policy *policy)
> > >  {
> > > +	struct em_data_callback em_cb = EM_DATA_CB(of_est_power);
> > >  	struct cpufreq_frequency_table *freq_table;
> > >  	struct opp_table *opp_table = NULL;
> > >  	struct private_data *priv;
> > > @@ -160,7 +203,7 @@ static int cpufreq_init(struct cpufreq_policy *policy)
> > >  	unsigned int transition_latency;
> > >  	bool fallback = false;
> > >  	const char *name;
> > > -	int ret;
> > > +	int ret, nr_opp;
> > >  
> > >  	cpu_dev = get_cpu_device(policy->cpu);
> > >  	if (!cpu_dev) {
> > > @@ -237,6 +280,7 @@ static int cpufreq_init(struct cpufreq_policy *policy)
> > >  		ret = -EPROBE_DEFER;
> > >  		goto out_free_opp;
> > >  	}
> > > +	nr_opp = ret;
> > >  
> > >  	if (fallback) {
> > >  		cpumask_setall(policy->cpus);
> > > @@ -280,6 +324,8 @@ static int cpufreq_init(struct cpufreq_policy *policy)
> > >  	policy->cpuinfo.transition_latency = transition_latency;
> > >  	policy->dvfs_possible_from_any_cpu = true;
> > >  
> > > +	em_register_perf_domain(policy->cpus, nr_opp, &em_cb);
> > 
> > Shouldn't there also be a function to unregister a perf domain to be
> > called from cpufreq_exit()?
> > 
> > ->exit() is called on suspend when the CPUs are taken offline, and
> > ->init() on resume, hence em_register_perf_domain() is called multiple
> > times, but without doing any cleanup.
> 
> Right, but this is safe to do as em_register_perf_domain() checks for
> the existence of the domain straight away. So the second time you call
> it, em_register_perf_domain() should just bail out. Are you seeing an
> issue with this?
> 
> As of now, the EM framework is really simple -- it just allocates the
> pd tables once during boot, and they stay in memory forever. While
> arguably less than optimal, that makes things a whole lot easier to
> manage on the client side (i.e. the scheduler) w/o needing RCU
> protection or similar.

I think registering the perf domain only once is fine, since the info
isn't supposed to change and will likely be used again after
->exit(). However, since we have em_cpu_get() I'd suggest using it and
only calling em_register_perf_domain() if no perf domain is registered
yet for the CPU. This makes it more evident that the registration is
only done once and simplifies error handling (currently not done at
all), since it's not necessary to check for the special case -EEXIST.
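
Something along these lines, perhaps (untested, just to illustrate the
idea; it assumes the em_cpu_get() and em_register_perf_domain()
interfaces introduced earlier in this series):

	/*
	 * Register an Energy Model only if no perf domain exists yet
	 * for this CPU, so repeated ->init()/->exit() cycles (e.g.
	 * across suspend/resume) don't rely on the -EEXIST bail-out.
	 */
	if (!em_cpu_get(policy->cpu) &&
	    em_register_perf_domain(policy->cpus, nr_opp, &em_cb))
		dev_dbg(cpu_dev, "failed to register an Energy Model\n");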

Cheers

Matthias
Quentin Perret Jan. 10, 2019, 9:08 a.m. UTC | #4
On Wednesday 09 Jan 2019 at 10:14:51 (-0800), Matthias Kaehlcke wrote:
> I think registering the perf domain only once is fine, since the info
> isn't supposed to change and will likely be used again after
> ->exit(). However, since we have em_cpu_get() I'd suggest using it and
> only calling em_register_perf_domain() if no perf domain is registered
> yet for the CPU. This makes it more evident that the registration is
> only done once and simplifies error handling (currently not done at
> all), since it's not necessary to check for the special case -EEXIST.

Right, a check on em_cpu_get() on the driver side shouldn't hurt. We
don't actually have upstream drivers using that API yet but I intend to
change that soon. I guess we'll need to have that discussion with each
individual CPUFreq driver maintainer but that hopefully shouldn't be a
problem.

Thanks for the feedback,
Quentin

Patch

diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index e58bfcb1169e..4cfef5554d86 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -16,6 +16,7 @@ 
 #include <linux/cpu_cooling.h>
 #include <linux/cpufreq.h>
 #include <linux/cpumask.h>
+#include <linux/energy_model.h>
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/of.h>
@@ -150,8 +151,50 @@  static int resources_available(void)
 	return 0;
 }
 
+static int __maybe_unused of_est_power(unsigned long *mW, unsigned long *KHz,
+				       int cpu)
+{
+	unsigned long mV, Hz, MHz;
+	struct device *cpu_dev;
+	struct dev_pm_opp *opp;
+	struct device_node *np;
+	u32 cap;
+	u64 tmp;
+
+	cpu_dev = get_cpu_device(cpu);
+	if (!cpu_dev)
+		return -ENODEV;
+
+	np = of_node_get(cpu_dev->of_node);
+	if (!np)
+		return -EINVAL;
+
+	if (of_property_read_u32(np, "dynamic-power-coefficient", &cap))
+		return -EINVAL;
+
+	Hz = *KHz * 1000;
+	opp = dev_pm_opp_find_freq_ceil(cpu_dev, &Hz);
+	if (IS_ERR(opp))
+		return -EINVAL;
+
+	mV = dev_pm_opp_get_voltage(opp) / 1000;
+	dev_pm_opp_put(opp);
+	if (!mV)
+		return -EINVAL;
+
+	MHz = Hz / 1000000;
+	tmp = (u64)cap * mV * mV * MHz;
+	do_div(tmp, 1000000000);
+
+	*mW = (unsigned long)tmp;
+	*KHz = Hz / 1000;
+
+	return 0;
+}
+
 static int cpufreq_init(struct cpufreq_policy *policy)
 {
+	struct em_data_callback em_cb = EM_DATA_CB(of_est_power);
 	struct cpufreq_frequency_table *freq_table;
 	struct opp_table *opp_table = NULL;
 	struct private_data *priv;
@@ -160,7 +203,7 @@  static int cpufreq_init(struct cpufreq_policy *policy)
 	unsigned int transition_latency;
 	bool fallback = false;
 	const char *name;
-	int ret;
+	int ret, nr_opp;
 
 	cpu_dev = get_cpu_device(policy->cpu);
 	if (!cpu_dev) {
@@ -237,6 +280,7 @@  static int cpufreq_init(struct cpufreq_policy *policy)
 		ret = -EPROBE_DEFER;
 		goto out_free_opp;
 	}
+	nr_opp = ret;
 
 	if (fallback) {
 		cpumask_setall(policy->cpus);
@@ -280,6 +324,8 @@  static int cpufreq_init(struct cpufreq_policy *policy)
 	policy->cpuinfo.transition_latency = transition_latency;
 	policy->dvfs_possible_from_any_cpu = true;
 
+	em_register_perf_domain(policy->cpus, nr_opp, &em_cb);
+
 	return 0;
 
 out_free_cpufreq_table: