diff mbox series

[v5,09/23] PM: EM: Use runtime modified EM for CPUs energy estimation in EAS

Message ID 20231129110853.94344-10-lukasz.luba@arm.com (mailing list archive)
State Changes Requested, archived
Headers show
Series Introduce runtime modifiable Energy Model | expand

Commit Message

Lukasz Luba Nov. 29, 2023, 11:08 a.m. UTC
The new Energy Model (EM) supports runtime modification of the performance
state table to better model the power used by the SoC. Use this new
feature to improve energy estimation and therefore task placement in
Energy Aware Scheduler (EAS).

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
---
 include/linux/energy_model.h | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

Comments

Qais Yousef Dec. 17, 2023, 5:59 p.m. UTC | #1
On 11/29/23 11:08, Lukasz Luba wrote:
> The new Energy Model (EM) supports runtime modification of the performance
> state table to better model the power used by the SoC. Use this new
> feature to improve energy estimation and therefore task placement in
> Energy Aware Scheduler (EAS).

nit: you moved the code to use the new runtime em table instead of the one
parsed at boot.

> 
> Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
> ---
>  include/linux/energy_model.h | 16 ++++++++++++----
>  1 file changed, 12 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
> index 1e618e431cac..94a77a813724 100644
> --- a/include/linux/energy_model.h
> +++ b/include/linux/energy_model.h
> @@ -238,6 +238,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
>  				unsigned long max_util, unsigned long sum_util,
>  				unsigned long allowed_cpu_cap)
>  {
> +	struct em_perf_table *runtime_table;
>  	unsigned long freq, scale_cpu;
>  	struct em_perf_state *ps;
>  	int cpu, i;
> @@ -255,7 +256,14 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
>  	 */
>  	cpu = cpumask_first(to_cpumask(pd->cpus));
>  	scale_cpu = arch_scale_cpu_capacity(cpu);
> -	ps = &pd->table[pd->nr_perf_states - 1];
> +
> +	/*
> +	 * No rcu_read_lock() since it's already called by task scheduler.
> +	 * The runtime_table is always there for CPUs, so we don't check.
> +	 */

WARN_ON(rcu_read_lock_held()) instead?


Cheers

--
Qais Yousef

> +	runtime_table = rcu_dereference(pd->runtime_table);
> +
> +	ps = &runtime_table->state[pd->nr_perf_states - 1];
>  
>  	max_util = map_util_perf(max_util);
>  	max_util = min(max_util, allowed_cpu_cap);
> @@ -265,9 +273,9 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
>  	 * Find the lowest performance state of the Energy Model above the
>  	 * requested frequency.
>  	 */
> -	i = em_pd_get_efficient_state(pd->table, pd->nr_perf_states, freq,
> -				      pd->flags);
> -	ps = &pd->table[i];
> +	i = em_pd_get_efficient_state(runtime_table->state, pd->nr_perf_states,
> +				      freq, pd->flags);
> +	ps = &runtime_table->state[i];
>  
>  	/*
>  	 * The capacity of a CPU in the domain at the performance state (ps)
> -- 
> 2.25.1
>
Xuewen Yan Dec. 19, 2023, 4:03 a.m. UTC | #2
On Mon, Dec 18, 2023 at 1:59 AM Qais Yousef <qyousef@layalina.io> wrote:
>
> On 11/29/23 11:08, Lukasz Luba wrote:
> > The new Energy Model (EM) supports runtime modification of the performance
> > state table to better model the power used by the SoC. Use this new
> > feature to improve energy estimation and therefore task placement in
> > Energy Aware Scheduler (EAS).
>
> nit: you moved the code to use the new runtime em table instead of the one
> parsed at boot.
>
> >
> > Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
> > ---
> >  include/linux/energy_model.h | 16 ++++++++++++----
> >  1 file changed, 12 insertions(+), 4 deletions(-)
> >
> > diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
> > index 1e618e431cac..94a77a813724 100644
> > --- a/include/linux/energy_model.h
> > +++ b/include/linux/energy_model.h
> > @@ -238,6 +238,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
> >                               unsigned long max_util, unsigned long sum_util,
> >                               unsigned long allowed_cpu_cap)
> >  {
> > +     struct em_perf_table *runtime_table;
> >       unsigned long freq, scale_cpu;
> >       struct em_perf_state *ps;
> >       int cpu, i;
> > @@ -255,7 +256,14 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
> >        */
> >       cpu = cpumask_first(to_cpumask(pd->cpus));
> >       scale_cpu = arch_scale_cpu_capacity(cpu);
> > -     ps = &pd->table[pd->nr_perf_states - 1];
> > +
> > +     /*
> > +      * No rcu_read_lock() since it's already called by task scheduler.
> > +      * The runtime_table is always there for CPUs, so we don't check.
> > +      */
>
> WARN_ON(rcu_read_lock_held()) instead?

I agree, or SCHED_WARN_ON(!rcu_read_lock_held()) ?

>
>
> Cheers
>
> --
> Qais Yousef
>
> > +     runtime_table = rcu_dereference(pd->runtime_table);
> > +
> > +     ps = &runtime_table->state[pd->nr_perf_states - 1];
> >
> >       max_util = map_util_perf(max_util);
> >       max_util = min(max_util, allowed_cpu_cap);
> > @@ -265,9 +273,9 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
> >        * Find the lowest performance state of the Energy Model above the
> >        * requested frequency.
> >        */
> > -     i = em_pd_get_efficient_state(pd->table, pd->nr_perf_states, freq,
> > -                                   pd->flags);
> > -     ps = &pd->table[i];
> > +     i = em_pd_get_efficient_state(runtime_table->state, pd->nr_perf_states,
> > +                                   freq, pd->flags);
> > +     ps = &runtime_table->state[i];
> >
> >       /*
> >        * The capacity of a CPU in the domain at the performance state (ps)
> > --
> > 2.25.1
> >
>
Lukasz Luba Dec. 19, 2023, 8:32 a.m. UTC | #3
Hi Qais and Xuewen,

On 12/19/23 04:03, Xuewen Yan wrote:
> On Mon, Dec 18, 2023 at 1:59 AM Qais Yousef <qyousef@layalina.io> wrote:
>>
>> On 11/29/23 11:08, Lukasz Luba wrote:
>>> The new Energy Model (EM) supports runtime modification of the performance
>>> state table to better model the power used by the SoC. Use this new
>>> feature to improve energy estimation and therefore task placement in
>>> Energy Aware Scheduler (EAS).
>>
>> nit: you moved the code to use the new runtime em table instead of the one
>> parsed at boot.
>>
>>>
>>> Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
>>> ---
>>>   include/linux/energy_model.h | 16 ++++++++++++----
>>>   1 file changed, 12 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
>>> index 1e618e431cac..94a77a813724 100644
>>> --- a/include/linux/energy_model.h
>>> +++ b/include/linux/energy_model.h
>>> @@ -238,6 +238,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
>>>                                unsigned long max_util, unsigned long sum_util,
>>>                                unsigned long allowed_cpu_cap)
>>>   {
>>> +     struct em_perf_table *runtime_table;
>>>        unsigned long freq, scale_cpu;
>>>        struct em_perf_state *ps;
>>>        int cpu, i;
>>> @@ -255,7 +256,14 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
>>>         */
>>>        cpu = cpumask_first(to_cpumask(pd->cpus));
>>>        scale_cpu = arch_scale_cpu_capacity(cpu);
>>> -     ps = &pd->table[pd->nr_perf_states - 1];
>>> +
>>> +     /*
>>> +      * No rcu_read_lock() since it's already called by task scheduler.
>>> +      * The runtime_table is always there for CPUs, so we don't check.
>>> +      */
>>
>> WARN_ON(rcu_read_lock_held()) instead?
> 
> I agree, or SCHED_WARN_ON(!rcu_read_lock_held()) ?

I disagree here. This is a sched function in hot path and as comment
says:

-----------------------
  * This function must be used only for CPU devices. There is no validation,
  * i.e. if the EM is a CPU type and has cpumask allocated. It is called 
from
  * the scheduler code quite frequently and that is why there is not checks.
-----------------------

We don't have to put the checks or warnings everywhere in the kernel
functions. Especially hot one like this one.

As you might not notice, we don't even check if the pd->cpus is not NULL

Regards,
Lukasz
Qais Yousef Dec. 28, 2023, 5:32 p.m. UTC | #4
On 12/19/23 08:32, Lukasz Luba wrote:
> Hi Qais and Xuewen,
> 
> On 12/19/23 04:03, Xuewen Yan wrote:
> > On Mon, Dec 18, 2023 at 1:59 AM Qais Yousef <qyousef@layalina.io> wrote:
> > > 
> > > On 11/29/23 11:08, Lukasz Luba wrote:
> > > > The new Energy Model (EM) supports runtime modification of the performance
> > > > state table to better model the power used by the SoC. Use this new
> > > > feature to improve energy estimation and therefore task placement in
> > > > Energy Aware Scheduler (EAS).
> > > 
> > > nit: you moved the code to use the new runtime em table instead of the one
> > > parsed at boot.
> > > 
> > > > 
> > > > Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
> > > > ---
> > > >   include/linux/energy_model.h | 16 ++++++++++++----
> > > >   1 file changed, 12 insertions(+), 4 deletions(-)
> > > > 
> > > > diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
> > > > index 1e618e431cac..94a77a813724 100644
> > > > --- a/include/linux/energy_model.h
> > > > +++ b/include/linux/energy_model.h
> > > > @@ -238,6 +238,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
> > > >                                unsigned long max_util, unsigned long sum_util,
> > > >                                unsigned long allowed_cpu_cap)
> > > >   {
> > > > +     struct em_perf_table *runtime_table;
> > > >        unsigned long freq, scale_cpu;
> > > >        struct em_perf_state *ps;
> > > >        int cpu, i;
> > > > @@ -255,7 +256,14 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
> > > >         */
> > > >        cpu = cpumask_first(to_cpumask(pd->cpus));
> > > >        scale_cpu = arch_scale_cpu_capacity(cpu);
> > > > -     ps = &pd->table[pd->nr_perf_states - 1];
> > > > +
> > > > +     /*
> > > > +      * No rcu_read_lock() since it's already called by task scheduler.
> > > > +      * The runtime_table is always there for CPUs, so we don't check.
> > > > +      */
> > > 
> > > WARN_ON(rcu_read_lock_held()) instead?
> > 
> > I agree, or SCHED_WARN_ON(!rcu_read_lock_held()) ?
> 
> I disagree here. This is a sched function in hot path and as comment

WARN_ON() is not a sched function.

> says:
> 
> -----------------------
>  * This function must be used only for CPU devices. There is no validation,
>  * i.e. if the EM is a CPU type and has cpumask allocated. It is called from
>  * the scheduler code quite frequently and that is why there is not checks.
> -----------------------
> 
> We don't have to put the checks or warnings everywhere in the kernel
> functions. Especially hot one like this one.

When checks are necessary, there are ways even for hot paths.

> 
> As you might not notice, we don't even check if the pd->cpus is not NULL

rcu_read_lock_held() is only enabled for lockdebug build and it's the standard
way to document and add verification to ensure locking rules are honoured. On
non lockdebug build this will be compiled out.

You had to put a long comment to ensure locking rules are correct, why not
use existing infrastructure instead to provide better checks and inherent
documentation?

We had a bug recently where the rcu_read_lock() was moved and this broke some
function buried down in the call stack. So subtle code shuffles elsewhere can
cause unwanted side effects; and it's hard to catch these bugs.

	https://lore.kernel.org/stable/20231009130130.210024505@linuxfoundation.org/
Lukasz Luba Jan. 2, 2024, 11:17 a.m. UTC | #5
On 12/28/23 17:32, Qais Yousef wrote:
> On 12/19/23 08:32, Lukasz Luba wrote:
>> Hi Qais and Xuewen,
>>
>> On 12/19/23 04:03, Xuewen Yan wrote:
>>> On Mon, Dec 18, 2023 at 1:59 AM Qais Yousef <qyousef@layalina.io> wrote:
>>>>
>>>> On 11/29/23 11:08, Lukasz Luba wrote:
>>>>> The new Energy Model (EM) supports runtime modification of the performance
>>>>> state table to better model the power used by the SoC. Use this new
>>>>> feature to improve energy estimation and therefore task placement in
>>>>> Energy Aware Scheduler (EAS).
>>>>
>>>> nit: you moved the code to use the new runtime em table instead of the one
>>>> parsed at boot.
>>>>
>>>>>
>>>>> Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
>>>>> ---
>>>>>    include/linux/energy_model.h | 16 ++++++++++++----
>>>>>    1 file changed, 12 insertions(+), 4 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
>>>>> index 1e618e431cac..94a77a813724 100644
>>>>> --- a/include/linux/energy_model.h
>>>>> +++ b/include/linux/energy_model.h
>>>>> @@ -238,6 +238,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
>>>>>                                 unsigned long max_util, unsigned long sum_util,
>>>>>                                 unsigned long allowed_cpu_cap)
>>>>>    {
>>>>> +     struct em_perf_table *runtime_table;
>>>>>         unsigned long freq, scale_cpu;
>>>>>         struct em_perf_state *ps;
>>>>>         int cpu, i;
>>>>> @@ -255,7 +256,14 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
>>>>>          */
>>>>>         cpu = cpumask_first(to_cpumask(pd->cpus));
>>>>>         scale_cpu = arch_scale_cpu_capacity(cpu);
>>>>> -     ps = &pd->table[pd->nr_perf_states - 1];
>>>>> +
>>>>> +     /*
>>>>> +      * No rcu_read_lock() since it's already called by task scheduler.
>>>>> +      * The runtime_table is always there for CPUs, so we don't check.
>>>>> +      */
>>>>
>>>> WARN_ON(rcu_read_lock_held()) instead?
>>>
>>> I agree, or SCHED_WARN_ON(!rcu_read_lock_held()) ?
>>
>> I disagree here. This is a sched function in hot path and as comment
> 
> WARN_ON() is not a sched function.

I was referring to em_cpu_energy() being sched function. No one else
should call it. That's the old contract also put into the doc of
that function.

> 
>> says:
>>
>> -----------------------
>>   * This function must be used only for CPU devices. There is no validation,
>>   * i.e. if the EM is a CPU type and has cpumask allocated. It is called from
>>   * the scheduler code quite frequently and that is why there is not checks.
>> -----------------------
>>
>> We don't have to put the checks or warnings everywhere in the kernel
>> functions. Especially hot one like this one.
> 
> When checks are necessary, there are ways even for hot paths.

We have that function called from feec() where the RCU must be hold,
otherwise the whole EAS would be unstable.

> 
>>
>> As you might not notice, we don't even check if the pd->cpus is not NULL
> 
> rcu_read_lock_held() is only enabled for lockdebug build and it's the standard
> way to document and add verification to ensure locking rules are honoured. On
> non lockdebug build this will be compiled out.
> 
> You had to put a long comment to ensure locking rules are correct, why not
> use existing infrastructure instead to provide better checks and inherent
> documentation?

I didn't want to add any more overhead in this hot path.

> 
> We had a bug recently where the rcu_read_lock() was moved and this broke some
> function buried down in the call stack. So subtle code shuffles elsewhere can
> cause unwanted side effects; and it's hard to catch these bugs.
> 
> 	https://lore.kernel.org/stable/20231009130130.210024505@linuxfoundation.org/

OK, let me check that w/ and w/o lockdebug build and the
SCHED_WARN_ON(!rcu_read_lock_held())

Although, it would be only a safety net for accidental use of
em_cpu_energy() from code path other than feec()...
Which actually might bring some value.
diff mbox series

Patch

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 1e618e431cac..94a77a813724 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -238,6 +238,7 @@  static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 				unsigned long max_util, unsigned long sum_util,
 				unsigned long allowed_cpu_cap)
 {
+	struct em_perf_table *runtime_table;
 	unsigned long freq, scale_cpu;
 	struct em_perf_state *ps;
 	int cpu, i;
@@ -255,7 +256,14 @@  static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	 */
 	cpu = cpumask_first(to_cpumask(pd->cpus));
 	scale_cpu = arch_scale_cpu_capacity(cpu);
-	ps = &pd->table[pd->nr_perf_states - 1];
+
+	/*
+	 * No rcu_read_lock() since it's already called by task scheduler.
+	 * The runtime_table is always there for CPUs, so we don't check.
+	 */
+	runtime_table = rcu_dereference(pd->runtime_table);
+
+	ps = &runtime_table->state[pd->nr_perf_states - 1];
 
 	max_util = map_util_perf(max_util);
 	max_util = min(max_util, allowed_cpu_cap);
@@ -265,9 +273,9 @@  static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	 * Find the lowest performance state of the Energy Model above the
 	 * requested frequency.
 	 */
-	i = em_pd_get_efficient_state(pd->table, pd->nr_perf_states, freq,
-				      pd->flags);
-	ps = &pd->table[i];
+	i = em_pd_get_efficient_state(runtime_table->state, pd->nr_perf_states,
+				      freq, pd->flags);
+	ps = &runtime_table->state[i];
 
 	/*
 	 * The capacity of a CPU in the domain at the performance state (ps)