diff mbox

[RFC,v4,12/12] OPTIONAL: cpufreq: dt: Register an Energy Model

Message ID 20180628114043.24724-13-quentin.perret@arm.com (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Quentin Perret June 28, 2018, 11:40 a.m. UTC
*******************************************************************
* This patch illustrates the usage of the newly introduced Energy *
* Model framework and isn't supposed to be merged as-is.          *
*******************************************************************

The Energy Model framework provides an API to register the active power
of CPUs. Call this API from the cpufreq-dt driver with an estimation
of the power as P = C * V^2 * f with C, V, and f respectively the
capacitance of the CPU and the voltage and frequency of the OPP.

The CPU capacitance is read from the "dynamic-power-coefficient" DT
binding (originally introduced for thermal/IPA), and the voltage and
frequency values from PM_OPP.

Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Quentin Perret <quentin.perret@arm.com>
---
 drivers/cpufreq/cpufreq-dt.c | 45 +++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

Comments

Vincent Guittot July 6, 2018, 10:10 a.m. UTC | #1
On Thu, 28 Jun 2018 at 13:41, Quentin Perret <quentin.perret@arm.com> wrote:
>
> *******************************************************************
> * This patch illustrates the usage of the newly introduced Energy *
> * Model framework and isn't supposed to be merged as-is.          *
> *******************************************************************
>
> The Energy Model framework provides an API to register the active power
> of CPUs. Call this API from the cpufreq-dt driver with an estimation
> of the power as P = C * V^2 * f with C, V, and f respectively the
> capacitance of the CPU and the voltage and frequency of the OPP.
>
> The CPU capacitance is read from the "dynamic-power-coefficient" DT
> binding (originally introduced for thermal/IPA), and the voltage and
> frequency values from PM_OPP.
>
> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
> Cc: Viresh Kumar <viresh.kumar@linaro.org>
> Signed-off-by: Quentin Perret <quentin.perret@arm.com>
> ---
>  drivers/cpufreq/cpufreq-dt.c | 45 +++++++++++++++++++++++++++++++++++-
>  1 file changed, 44 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
> index 190ea0dccb79..5a0747f73121 100644
> --- a/drivers/cpufreq/cpufreq-dt.c
> +++ b/drivers/cpufreq/cpufreq-dt.c
> @@ -16,6 +16,7 @@
>  #include <linux/cpu_cooling.h>
>  #include <linux/cpufreq.h>
>  #include <linux/cpumask.h>
> +#include <linux/energy_model.h>
>  #include <linux/err.h>
>  #include <linux/module.h>
>  #include <linux/of.h>
> @@ -149,8 +150,47 @@ static int resources_available(void)
>         return 0;
>  }
>
> +static int of_est_power(unsigned long *mW, unsigned long *KHz, int cpu)
> +{
> +       unsigned long mV, Hz, MHz;
> +       struct device *cpu_dev;
> +       struct dev_pm_opp *opp;
> +       struct device_node *np;
> +       u32 cap;
> +       u64 tmp;
> +
> +       cpu_dev = get_cpu_device(cpu);
> +       if (!cpu_dev)
> +               return -ENODEV;
> +
> +       np = of_node_get(cpu_dev->of_node);
> +       if (!np)
> +               return -EINVAL;
> +
> +       if (of_property_read_u32(np, "dynamic-power-coefficient", &cap))
> +               return -EINVAL;
> +
> +       Hz = *KHz * 1000;
> +       opp = dev_pm_opp_find_freq_ceil(cpu_dev, &Hz);
> +       if (IS_ERR(opp))
> +               return -EINVAL;
> +
> +       mV = dev_pm_opp_get_voltage(opp) / 1000;
> +       dev_pm_opp_put(opp);
> +
> +       MHz = Hz / 1000000;
> +       tmp = (u64)cap * mV * mV * MHz;
> +       do_div(tmp, 1000000000);

Could you explain the formula above ? and especially the 1000000000 it
seems related to the use of mV and mW instead of uV and uW ...

Can't you just optimize that into
tmp = (u64)cap * mV * mV * Hz;
do_div(tmp, 1000);

> +
> +       *mW = (unsigned long)tmp;
> +       *KHz = Hz / 1000;
> +
> +       return 0;
> +}
> +
>  static int cpufreq_init(struct cpufreq_policy *policy)
>  {
> +       struct em_data_callback em_cb = EM_DATA_CB(of_est_power);
>         struct cpufreq_frequency_table *freq_table;
>         struct opp_table *opp_table = NULL;
>         struct private_data *priv;
> @@ -159,7 +199,7 @@ static int cpufreq_init(struct cpufreq_policy *policy)
>         unsigned int transition_latency;
>         bool fallback = false;
>         const char *name;
> -       int ret;
> +       int ret, nr_opp;
>
>         cpu_dev = get_cpu_device(policy->cpu);
>         if (!cpu_dev) {
> @@ -226,6 +266,7 @@ static int cpufreq_init(struct cpufreq_policy *policy)
>                 ret = -EPROBE_DEFER;
>                 goto out_free_opp;
>         }
> +       nr_opp = ret;
>
>         if (fallback) {
>                 cpumask_setall(policy->cpus);
> @@ -278,6 +319,8 @@ static int cpufreq_init(struct cpufreq_policy *policy)
>         policy->cpuinfo.transition_latency = transition_latency;
>         policy->dvfs_possible_from_any_cpu = true;
>
> +       em_register_freq_domain(policy->cpus, nr_opp, &em_cb);
> +
>         return 0;
>
>  out_free_cpufreq_table:
> --
> 2.17.1
>
Quentin Perret July 6, 2018, 10:18 a.m. UTC | #2
On Friday 06 Jul 2018 at 12:10:02 (+0200), Vincent Guittot wrote:
> On Thu, 28 Jun 2018 at 13:41, Quentin Perret <quentin.perret@arm.com> wrote:
> > +static int of_est_power(unsigned long *mW, unsigned long *KHz, int cpu)
> > +{
> > +       unsigned long mV, Hz, MHz;
> > +       struct device *cpu_dev;
> > +       struct dev_pm_opp *opp;
> > +       struct device_node *np;
> > +       u32 cap;
> > +       u64 tmp;
> > +
> > +       cpu_dev = get_cpu_device(cpu);
> > +       if (!cpu_dev)
> > +               return -ENODEV;
> > +
> > +       np = of_node_get(cpu_dev->of_node);
> > +       if (!np)
> > +               return -EINVAL;
> > +
> > +       if (of_property_read_u32(np, "dynamic-power-coefficient", &cap))
> > +               return -EINVAL;
> > +
> > +       Hz = *KHz * 1000;
> > +       opp = dev_pm_opp_find_freq_ceil(cpu_dev, &Hz);
> > +       if (IS_ERR(opp))
> > +               return -EINVAL;
> > +
> > +       mV = dev_pm_opp_get_voltage(opp) / 1000;
> > +       dev_pm_opp_put(opp);
> > +
> > +       MHz = Hz / 1000000;
> > +       tmp = (u64)cap * mV * mV * MHz;
> > +       do_div(tmp, 1000000000);
> 
> Could you explain the formula above ? and especially the 1000000000 it
> seems related to the use of mV and mW instead of uV and uW ...

That's correct, this is just a matter of units. I simply tried to
reproduce here what is currently done for IPA:

https://elixir.bootlin.com/linux/latest/source/drivers/thermal/cpu_cooling.c#L252

> 
> Can't you just optimize that into
> tmp = (u64)cap * mV * mV * Hz;
> do_div(tmp, 1000);

I don't think that'll work. If you use Hz instead of MHz you'll need to
divide tmp by 1000000000000000 to get milli-watts, as specified by the
EM framework.

FYI, I don't consider this patch to be ready to be merged. I kept it
self-contained and simple for the example but I actually think that this
of_est_power function should probably be factored-out somewhere else.
scpi-cpufreq could use it as well. Anyway, I guess we can discuss that
later once the APIs of the EM framework are agreed :-)

Thanks,
Quentin
Vincent Guittot July 30, 2018, 3:53 p.m. UTC | #3
Hi Quentin,

I have noticed the new version but prefer to continue on this thread
to keep the history of the discussion.

On Fri, 6 Jul 2018 at 12:19, Quentin Perret <quentin.perret@arm.com> wrote:
>
> On Friday 06 Jul 2018 at 12:10:02 (+0200), Vincent Guittot wrote:
> > On Thu, 28 Jun 2018 at 13:41, Quentin Perret <quentin.perret@arm.com> wrote:
> > > +static int of_est_power(unsigned long *mW, unsigned long *KHz, int cpu)
> > > +{
> > > +       unsigned long mV, Hz, MHz;
> > > +       struct device *cpu_dev;
> > > +       struct dev_pm_opp *opp;
> > > +       struct device_node *np;
> > > +       u32 cap;
> > > +       u64 tmp;
> > > +
> > > +       cpu_dev = get_cpu_device(cpu);
> > > +       if (!cpu_dev)
> > > +               return -ENODEV;
> > > +
> > > +       np = of_node_get(cpu_dev->of_node);
> > > +       if (!np)
> > > +               return -EINVAL;
> > > +
> > > +       if (of_property_read_u32(np, "dynamic-power-coefficient", &cap))
> > > +               return -EINVAL;
> > > +
> > > +       Hz = *KHz * 1000;
> > > +       opp = dev_pm_opp_find_freq_ceil(cpu_dev, &Hz);
> > > +       if (IS_ERR(opp))
> > > +               return -EINVAL;
> > > +
> > > +       mV = dev_pm_opp_get_voltage(opp) / 1000;
> > > +       dev_pm_opp_put(opp);
> > > +
> > > +       MHz = Hz / 1000000;
> > > +       tmp = (u64)cap * mV * mV * MHz;
> > > +       do_div(tmp, 1000000000);
> >
> > Could you explain the formula above ? and especially the 1000000000 it
> > seems related to the use of mV and mW instead of uV and uW ...
>
> That's correct, this is just a matter of units. I simply tried to
> reproduce here what is currently done for IPA:
>
> https://elixir.bootlin.com/linux/latest/source/drivers/thermal/cpu_cooling.c#L252

ok, so you copy/paste what is done in cpu cooling device ?

Nevertheless I still have some concerns with the formula used here and
in cpu cooling device:

https://elixir.bootlin.com/linux/latest/source/Documentation/devicetree/bindings/arm/cpus.txt#L273
The documentation says
Pdyn = dynamic-power-coefficient * V^2 * f where voltage is in uV,
frequency is in MHz.
 and  dynamic power coefficient in units of mW/MHz/uV^2.

This implies that the results is in mW and we can summarize the formula with :

mW = C * (uV)^2 * Mhz with C = dynamic-power-coefficient
then uV = 1000*mV
mW = C * (1000*mV)^2 * Mhz
mW = C * 1000000 * mV * mV * Mhz

Which is different from what is used here and in cpu cooling device.
I may have done something wrong with the formula but i can't catch
it... My brain is not yet fully back from vacations

Mhz = Hz / 1000000

mW = C * mV * mV * 1000000 * Hz / 1000000
mW = C * mV * mV * Hz

Let take a simple example with
C = 1
Voltage = 1000 uV = 1 mV
freq = 2 Mhz = 2000000 Hz

mW = C * (uV)^2 * Mhz = 1 * (1000)^2 * 2 = 2000000
mW = C * mV * mV * Hz = 1 * 1 * 1 * 2000000 = 2000000
mW = C * mV * mV * MHz / 1000000000 = 1 * 1 * 1 * 2 / 1000000000 = 0

so there is a mismatch between the formula in the documentation and
the formula used in cpu_cooling.c (and in this patch).

That being said, I can understand that we can easily overflow the
result if we use some kind of realistic values like the one for
hikey960:
dynamic-power-coefficient  for cpu4 is : 550
for highest OPP of cpu4:
opp-hz = /bits/ 64 <2362000000>;
opp-microvolt = <1100000>;

Which gives with the formula in the documentation:
mW = 550 * (1100000)^2 * 2362 = 1,571911×10^18 which is quite huge
even for a big core running at max freq
and with the formula in cpu_cooling.c:
mW = 550 * 1100 * 1100 * 2362 / 1000000000 = 1571 which seems to be a
bit more reasonable

So it seems that the dynamic-power-coefficient unit is not mW/MHz/uV^2
but mW/Ghz/V^2
And given that we use integer, we provide Voltage and frequency by
(mV/1000) and (Mhz/1000) and factorize all /1000

May be Javi who added the formula for cpu cooling can confirm on this ?

>
> >
> > Can't you just optimize that into
> > tmp = (u64)cap * mV * mV * Hz;
> > do_div(tmp, 1000);

In fact the do_div is not needed, I thought that dyn coef unit was
uW/Mhz/uV^2 whereas it's written mW/MHz/uV^2

>
> I don't think that'll work. If you use Hz instead of MHz you'll need to
> divide tmp by 1000000000000000 to get milli-watts, as specified by the
> EM framework.
>
> FYI, I don't consider this patch to be ready to be merged. I kept it
> self-contained and simple for the example but I actually think that this
> of_est_power function should probably be factored-out somewhere else.
> scpi-cpufreq could use it as well. Anyway, I guess we can discuss that
> later once the APIs of the EM framework are agreed :-)

It's just that the tests results that you provided in the cover letter
has used this patch and running tests with wrong formula (it seems
that's the documentation is wrong) doesn't give much confidence on the
results

Thanks
Vincent
>
> Thanks,
> Quentin
Quentin Perret July 30, 2018, 4:20 p.m. UTC | #4
Hi Vincent,

On Monday 30 Jul 2018 at 17:53:23 (+0200), Vincent Guittot wrote:
[...]
> ok, so you copy/paste what is done in cpu cooling device ?
> 
> Nevertheless I still have some concerns with the formula used here and
> in cpu cooling device:
> 
> https://elixir.bootlin.com/linux/latest/source/Documentation/devicetree/bindings/arm/cpus.txt#L273
> The documentation says
> Pdyn = dynamic-power-coefficient * V^2 * f where voltage is in uV,
> frequency is in MHz.
>  and  dynamic power coefficient in units of mW/MHz/uV^2.
> 
> This implies that the results is in mW and we can summarize the formula with :
> 
> mW = C * (uV)^2 * Mhz with C = dynamic-power-coefficient
> then uV = 1000*mV
> mW = C * (1000*mV)^2 * Mhz
> mW = C * 1000000 * mV * mV * Mhz
> 
> Which is different from what is used here and in cpu cooling device.
> I may have done something wrong with the formula but i can't catch
> it... My brain is not yet fully back from vacations

I think your brain works fine ;) The doc does seem to be incorrect ...
I also believe that this issue has been discussed recently in another
thread:

http://archive.armlinux.org.uk/lurker/message/20180720.141530.2bf75d5c.en.html

> Which gives with the formula in the documentation:
> mW = 550 * (1100000)^2 * 2362 = 1,571911×10^18 which is quite huge
> even for a big core running at max freq
> and with the formula in cpu_cooling.c:
> mW = 550 * 1100 * 1100 * 2362 / 1000000000 = 1571 which seems to be a
> bit more reasonable

Indeed, 1.5W for a big CPU at max OPP is in the right ballpark I
believe.

FYI, this is the power values I have in the EM for all OPPs on H960:

        +--------+-------+--------+--------+
        |      A53s      |       A73s      |
        +--------+-------+--------+--------+
        |  MHz   |  mW   |  MHz   |  mW    |
        +--------+-------+--------+--------+
        |  533   |  28   |  903   |  243   |
        |  999   |  70   |  1421  |  500   |
        |  1402  |  124  |  1805  |  804   |
        |  1709  |  187  |  2112  |  1161  |
        |  1844  |  245  |  2362  |  1571  |
        +--------+-------+--------+--------+

[...]

> It's just that the tests results that you provided in the cover letter
> has used this patch and running tests with wrong formula (it seems
> that's the documentation is wrong) doesn't give much confidence on the
> results

I can understand that the mismatch between the code and the
documentation can be slightly confusing. However, as you said yourself
the code _is_ correct, so the test results are valid.

Moreover, the actual unit for the power costs doesn't make any
difference with EAS as long as that doesn't cause precision issues. EAS
works in relative terms, so even if the unit was in, say, uW instead of
mW, the test results would still be valid. And again, that's not the
case, the code does the right thing here.

Thanks,
Quentin
diff mbox

Patch

diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index 190ea0dccb79..5a0747f73121 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -16,6 +16,7 @@ 
 #include <linux/cpu_cooling.h>
 #include <linux/cpufreq.h>
 #include <linux/cpumask.h>
+#include <linux/energy_model.h>
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/of.h>
@@ -149,8 +150,47 @@  static int resources_available(void)
 	return 0;
 }
 
+static int of_est_power(unsigned long *mW, unsigned long *KHz, int cpu)
+{
+	unsigned long mV, Hz, MHz;
+	struct device *cpu_dev;
+	struct dev_pm_opp *opp;
+	struct device_node *np;
+	u32 cap;
+	u64 tmp;
+
+	cpu_dev = get_cpu_device(cpu);
+	if (!cpu_dev)
+		return -ENODEV;
+
+	np = of_node_get(cpu_dev->of_node);
+	if (!np)
+		return -EINVAL;
+
+	if (of_property_read_u32(np, "dynamic-power-coefficient", &cap))
+		return -EINVAL;
+
+	Hz = *KHz * 1000;
+	opp = dev_pm_opp_find_freq_ceil(cpu_dev, &Hz);
+	if (IS_ERR(opp))
+		return -EINVAL;
+
+	mV = dev_pm_opp_get_voltage(opp) / 1000;
+	dev_pm_opp_put(opp);
+
+	MHz = Hz / 1000000;
+	tmp = (u64)cap * mV * mV * MHz;
+	do_div(tmp, 1000000000);
+
+	*mW = (unsigned long)tmp;
+	*KHz = Hz / 1000;
+
+	return 0;
+}
+
 static int cpufreq_init(struct cpufreq_policy *policy)
 {
+	struct em_data_callback em_cb = EM_DATA_CB(of_est_power);
 	struct cpufreq_frequency_table *freq_table;
 	struct opp_table *opp_table = NULL;
 	struct private_data *priv;
@@ -159,7 +199,7 @@  static int cpufreq_init(struct cpufreq_policy *policy)
 	unsigned int transition_latency;
 	bool fallback = false;
 	const char *name;
-	int ret;
+	int ret, nr_opp;
 
 	cpu_dev = get_cpu_device(policy->cpu);
 	if (!cpu_dev) {
@@ -226,6 +266,7 @@  static int cpufreq_init(struct cpufreq_policy *policy)
 		ret = -EPROBE_DEFER;
 		goto out_free_opp;
 	}
+	nr_opp = ret;
 
 	if (fallback) {
 		cpumask_setall(policy->cpus);
@@ -278,6 +319,8 @@  static int cpufreq_init(struct cpufreq_policy *policy)
 	policy->cpuinfo.transition_latency = transition_latency;
 	policy->dvfs_possible_from_any_cpu = true;
 
+	em_register_freq_domain(policy->cpus, nr_opp, &em_cb);
+
 	return 0;
 
 out_free_cpufreq_table: