diff mbox series

[1/2] Modify cpupower to schedule itself on cores it is reading MSRs from

Message ID 20190918163445.129103-1-Janakarajan.Natarajan@amd.com (mailing list archive)
State Changes Requested
Delegated to: Shuah Khan
Headers show
Series [1/2] Modify cpupower to schedule itself on cores it is reading MSRs from | expand

Commit Message

Janakarajan Natarajan Sept. 18, 2019, 4:34 p.m. UTC
Modify cpupower to schedule itself on each of the cpus in the system and
then get the APERF/MPERF register values.

This is advantageous because an IPI is not generated when a read_msr() is
executed on the local logical CPU thereby reducing the chance of having
APERF and MPERF being out of sync.

Signed-off-by: Janakarajan Natarajan <Janakarajan.Natarajan@amd.com>
---
 .../utils/idle_monitor/mperf_monitor.c        | 38 ++++++++++++++-----
 1 file changed, 28 insertions(+), 10 deletions(-)

Comments

Janakarajan Natarajan Sept. 27, 2019, 4:07 p.m. UTC | #1
On 9/18/2019 11:34 AM, Natarajan, Janakarajan wrote:
> Modify cpupower to schedule itself on each of the cpus in the system and
> then get the APERF/MPERF register values.
>
> This is advantageous because an IPI is not generated when a read_msr() is
> executed on the local logical CPU thereby reducing the chance of having
> APERF and MPERF being out of sync.
>
> Signed-off-by: Janakarajan Natarajan <Janakarajan.Natarajan@amd.com>


Any concerns regarding this patchset?


Thanks.


> ---
>   .../utils/idle_monitor/mperf_monitor.c        | 38 ++++++++++++++-----
>   1 file changed, 28 insertions(+), 10 deletions(-)
>
> diff --git a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
> index 44806a6dae11..8b072e39c897 100644
> --- a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
> +++ b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
> @@ -10,6 +10,7 @@
>   #include <stdlib.h>
>   #include <string.h>
>   #include <limits.h>
> +#include <sched.h>
>   
>   #include <cpufreq.h>
>   
> @@ -86,15 +87,33 @@ static int mperf_get_tsc(unsigned long long *tsc)
>   	return ret;
>   }
>   
> +static int get_aperf_mperf(int cpu, unsigned long long *aval,
> +			   unsigned long long *mval)
> +{
> +	cpu_set_t set;
> +	int ret;
> +
> +	CPU_ZERO(&set);
> +	CPU_SET(cpu, &set);
> +	if (sched_setaffinity(getpid(), sizeof(set), &set) == -1) {
> +		dprint("Could not migrate to cpu: %d\n", cpu);
> +		return 1;
> +	}
> +
> +	ret = read_msr(cpu, MSR_APERF, aval);
> +	ret |= read_msr(cpu, MSR_MPERF, mval);
> +
> +	return ret;
> +}
> +
>   static int mperf_init_stats(unsigned int cpu)
>   {
> -	unsigned long long val;
> +	unsigned long long aval, mval;
>   	int ret;
>   
> -	ret = read_msr(cpu, MSR_APERF, &val);
> -	aperf_previous_count[cpu] = val;
> -	ret |= read_msr(cpu, MSR_MPERF, &val);
> -	mperf_previous_count[cpu] = val;
> +	ret = get_aperf_mperf(cpu, &aval, &mval);
> +	aperf_previous_count[cpu] = aval;
> +	mperf_previous_count[cpu] = mval;
>   	is_valid[cpu] = !ret;
>   
>   	return 0;
> @@ -102,13 +121,12 @@ static int mperf_init_stats(unsigned int cpu)
>   
>   static int mperf_measure_stats(unsigned int cpu)
>   {
> -	unsigned long long val;
> +	unsigned long long aval, mval;
>   	int ret;
>   
> -	ret = read_msr(cpu, MSR_APERF, &val);
> -	aperf_current_count[cpu] = val;
> -	ret |= read_msr(cpu, MSR_MPERF, &val);
> -	mperf_current_count[cpu] = val;
> +	ret = get_aperf_mperf(cpu, &aval, &mval);
> +	aperf_current_count[cpu] = aval;
> +	mperf_current_count[cpu] = mval;
>   	is_valid[cpu] = !ret;
>   
>   	return 0;
shuah Sept. 27, 2019, 6:59 p.m. UTC | #2
On 9/18/19 10:34 AM, Natarajan, Janakarajan wrote:
> Modify cpupower to schedule itself on each of the cpus in the system and
> then get the APERF/MPERF register values.
> 
> This is advantageous because an IPI is not generated when a read_msr() is
> executed on the local logical CPU thereby reducing the chance of having
> APERF and MPERF being out of sync.

Somehow this doesn't read right. Is this that you are trying to avoid
APERF and MPERF being out of sync with this change?

This description is rather confusing.

> 
> Signed-off-by: Janakarajan Natarajan <Janakarajan.Natarajan@amd.com>
> ---
>   .../utils/idle_monitor/mperf_monitor.c        | 38 ++++++++++++++-----
>   1 file changed, 28 insertions(+), 10 deletions(-)
> 
> diff --git a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
> index 44806a6dae11..8b072e39c897 100644
> --- a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
> +++ b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
> @@ -10,6 +10,7 @@
>   #include <stdlib.h>
>   #include <string.h>
>   #include <limits.h>
> +#include <sched.h>
>   
>   #include <cpufreq.h>
>   
> @@ -86,15 +87,33 @@ static int mperf_get_tsc(unsigned long long *tsc)
>   	return ret;
>   }
>   
> +static int get_aperf_mperf(int cpu, unsigned long long *aval,
> +			   unsigned long long *mval)
> +{
> +	cpu_set_t set;
> +	int ret;
> +
> +	CPU_ZERO(&set);
> +	CPU_SET(cpu, &set);
> +	if (sched_setaffinity(getpid(), sizeof(set), &set) == -1) {
> +		dprint("Could not migrate to cpu: %d\n", cpu);
> +		return 1;
> +	}
> +
> +	ret = read_msr(cpu, MSR_APERF, aval);
> +	ret |= read_msr(cpu, MSR_MPERF, mval);
> +
> +	return ret;
> +}
> +
>   static int mperf_init_stats(unsigned int cpu)
>   {
> -	unsigned long long val;
> +	unsigned long long aval, mval;
>   	int ret;
>   
> -	ret = read_msr(cpu, MSR_APERF, &val);
> -	aperf_previous_count[cpu] = val;
> -	ret |= read_msr(cpu, MSR_MPERF, &val);
> -	mperf_previous_count[cpu] = val;
> +	ret = get_aperf_mperf(cpu, &aval, &mval);

get_aperf_mperf() could return error right? It returns 1 when
sched_setaffinity() fails. Shouldn't the return value checked,
instead of using aval and mval?

> +	aperf_previous_count[cpu] = aval;
> +	mperf_previous_count[cpu] = mval;
>   	is_valid[cpu] = !ret;
>   
>   	return 0;
> @@ -102,13 +121,12 @@ static int mperf_init_stats(unsigned int cpu)
>   
>   static int mperf_measure_stats(unsigned int cpu)
>   {
> -	unsigned long long val;
> +	unsigned long long aval, mval;
>   	int ret;
>   
> -	ret = read_msr(cpu, MSR_APERF, &val);
> -	aperf_current_count[cpu] = val;
> -	ret |= read_msr(cpu, MSR_MPERF, &val);
> -	mperf_current_count[cpu] = val;
> +	ret = get_aperf_mperf(cpu, &aval, &mval);

Same comments as above here.

> +	aperf_current_count[cpu] = aval;
> +	mperf_current_count[cpu] = mval;
>   	is_valid[cpu] = !ret;
>   
>   	return 0;
> 

thanks,
-- Shuah
Thomas Renninger Sept. 27, 2019, 9:48 p.m. UTC | #3
On Friday, September 27, 2019 6:07:56 PM CEST  Natarajan, Janakarajan  wrote:
> On 9/18/2019 11:34 AM, Natarajan, Janakarajan wrote:

> > This is advantageous because an IPI is not generated when a read_msr() is
> > executed on the local logical CPU thereby reducing the chance of having
> > APERF and MPERF being out of sync.
> > +	if (sched_setaffinity(getpid(), sizeof(set), &set) == -1) {
> > +		dprint("Could not migrate to cpu: %d\n", cpu);
> > +		return 1;

On a 80 core cpu the process would be pushed around through the
system quite a lot. 
This might affect what you are measuring or the other measure values?
Otherwise it's the kernel's MSR read only, not the whole cpupower process,
right? No idea about the exact overhead, though. Others in CC list should
know.

Afaik msr reads through msr module should be avoided anyway?
Those which are worth it are abstracted through sysfs nowadays?

For aperf/mperf it might make sense to define a sysfs file where you
can read both, as this is what you always need?

It would take a while, but could be a longterm solution which is also
usable in secure boot or without msr module case.

      Thomas
Janakarajan Natarajan Sept. 30, 2019, 3:34 p.m. UTC | #4
On 9/27/2019 1:59 PM, shuah wrote:
> On 9/18/19 10:34 AM, Natarajan, Janakarajan wrote:
>> Modify cpupower to schedule itself on each of the cpus in the system and
>> then get the APERF/MPERF register values.
>>
>> This is advantageous because an IPI is not generated when a 
>> read_msr() is
>> executed on the local logical CPU thereby reducing the chance of having
>> APERF and MPERF being out of sync.
>
> Somehow this doesn't read right. Is this that you are trying to avoid
> APERF and MPERF being out of sync with this change?
>
> This description is rather confusing.


We are trying to avoid a separate IPI for APERF and MPERF. We have seen 
that a separate IPI for

each can cause APERF and MPERF to go out of sync.


If the cpupower is already executing on the core it wants to get the 
APERF/MPERF from, then

generic_exec_single() does not generate the IPI to execute the rdmsr(). 
Rather it just executes

on the current cpu.


I can change the description to add more detail.


>
>>
>> Signed-off-by: Janakarajan Natarajan <Janakarajan.Natarajan@amd.com>
>> ---
>>   .../utils/idle_monitor/mperf_monitor.c        | 38 ++++++++++++++-----
>>   1 file changed, 28 insertions(+), 10 deletions(-)
>>
>> diff --git a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c 
>> b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
>> index 44806a6dae11..8b072e39c897 100644
>> --- a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
>> +++ b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
>> @@ -10,6 +10,7 @@
>>   #include <stdlib.h>
>>   #include <string.h>
>>   #include <limits.h>
>> +#include <sched.h>
>>     #include <cpufreq.h>
>>   @@ -86,15 +87,33 @@ static int mperf_get_tsc(unsigned long long *tsc)
>>       return ret;
>>   }
>>   +static int get_aperf_mperf(int cpu, unsigned long long *aval,
>> +               unsigned long long *mval)
>> +{
>> +    cpu_set_t set;
>> +    int ret;
>> +
>> +    CPU_ZERO(&set);
>> +    CPU_SET(cpu, &set);
>> +    if (sched_setaffinity(getpid(), sizeof(set), &set) == -1) {
>> +        dprint("Could not migrate to cpu: %d\n", cpu);
>> +        return 1;
>> +    }
>> +
>> +    ret = read_msr(cpu, MSR_APERF, aval);
>> +    ret |= read_msr(cpu, MSR_MPERF, mval);
>> +
>> +    return ret;
>> +}
>> +
>>   static int mperf_init_stats(unsigned int cpu)
>>   {
>> -    unsigned long long val;
>> +    unsigned long long aval, mval;
>>       int ret;
>>   -    ret = read_msr(cpu, MSR_APERF, &val);
>> -    aperf_previous_count[cpu] = val;
>> -    ret |= read_msr(cpu, MSR_MPERF, &val);
>> -    mperf_previous_count[cpu] = val;
>> +    ret = get_aperf_mperf(cpu, &aval, &mval);
>
> get_aperf_mperf() could return error right? It returns 1 when
> sched_setaffinity() fails. Shouldn't the return value checked,
> instead of using aval and mval?


We set the is_valid[cpu] to the return value. Later on the is_valid[cpu]

is checked before proceeding to calculate the effective freq.


Thanks.


>
>> +    aperf_previous_count[cpu] = aval;
>> +    mperf_previous_count[cpu] = mval;
>>       is_valid[cpu] = !ret;
>>         return 0;
>> @@ -102,13 +121,12 @@ static int mperf_init_stats(unsigned int cpu)
>>     static int mperf_measure_stats(unsigned int cpu)
>>   {
>> -    unsigned long long val;
>> +    unsigned long long aval, mval;
>>       int ret;
>>   -    ret = read_msr(cpu, MSR_APERF, &val);
>> -    aperf_current_count[cpu] = val;
>> -    ret |= read_msr(cpu, MSR_MPERF, &val);
>> -    mperf_current_count[cpu] = val;
>> +    ret = get_aperf_mperf(cpu, &aval, &mval);
>
> Same comments as above here.
>
>> +    aperf_current_count[cpu] = aval;
>> +    mperf_current_count[cpu] = mval;
>>       is_valid[cpu] = !ret;
>>         return 0;
>>
>
> thanks,
> -- Shuah
Janakarajan Natarajan Oct. 2, 2019, 2:45 p.m. UTC | #5
On 9/27/19 4:48 PM, Thomas Renninger wrote:
> On Friday, September 27, 2019 6:07:56 PM CEST  Natarajan, Janakarajan  wrote:
>> On 9/18/2019 11:34 AM, Natarajan, Janakarajan wrote:
>>> This is advantageous because an IPI is not generated when a read_msr() is
>>> executed on the local logical CPU thereby reducing the chance of having
>>> APERF and MPERF being out of sync.
>>> +	if (sched_setaffinity(getpid(), sizeof(set), &set) == -1) {
>>> +		dprint("Could not migrate to cpu: %d\n", cpu);
>>> +		return 1;
> On a 80 core cpu the process would be pushed around through the
> system quite a lot.
> This might affect what you are measuring or the other measure values?
> Otherwise it's the kernel's MSR read only, not the whole cpupower process,
> right? No idea about the exact overhead, though. Others in CC list should
> know.


On a 256 logical-cpu Rome system we see C0 value from cpupower output go 
from 0.01 to ~(0.1 to 1.00)

for all cpus with the 1st patch.

However, this goes down to ~0.01 when we use the RDPRU instruction 
(which can be used to get

APERF/MPERF from CPL > 0) and avoid using the msr module (patch 2).


> Afaik msr reads through msr module should be avoided anyway?
> Those which are worth it are abstracted through sysfs nowadays?
>
> For aperf/mperf it might make sense to define a sysfs file where you
> can read both, as this is what you always need?
>
> It would take a while, but could be a longterm solution which is also
> usable in secure boot or without msr module case.


Yes. That is a good long term idea. An interface which could be used to 
query APERF, MPERF

for a logical cpu in one go.

However, for systems that provide an instruction  to get register values 
from userspace, would a

command-line parameter be acceptable?

i.e. p: precise measurement.

When this is set, the cpupower process can migrate to each cpu and ,if 
an instruction is available

which can get the APERF/MPERF from CPL > 0, use it. That would cut out 
the msr module and the

overhead can be reduced.


Thanks.


>        Thomas
>
>
>
Thomas Renninger Oct. 5, 2019, 12:40 p.m. UTC | #6
Hi,

On Wednesday, October 2, 2019 4:45:03 PM CEST Natarajan, Janakarajan wrote:
> On 9/27/19 4:48 PM, Thomas Renninger wrote:
> 
> > On Friday, September 27, 2019 6:07:56 PM CEST  Natarajan, Janakarajan 
> > wrote:
> 
> >> On 9/18/2019 11:34 AM, Natarajan, Janakarajan wrote:
 
> On a 256 logical-cpu Rome system we see C0 value from cpupower output go 
> from 0.01 to ~(0.1 to 1.00)
> 
> for all cpus with the 1st patch.
> 
> However, this goes down to ~0.01 when we use the RDPRU instruction 
> (which can be used to get
> 
> APERF/MPERF from CPL > 0) and avoid using the msr module (patch 2).

And this one only exists on latest AMD cpus, right?

> However, for systems that provide an instruction  to get register values 
> from userspace, would a command-line parameter be acceptable?

Parameter sounds like a good idea. In fact, there already is such a paramter.
cpupower monitor --help
       -c
           Schedule  the  process  on every core before starting and ending 
measuring.  This could be needed for the Idle_Stats monitor when no other MSR 
based monitor (has to be run on the core that is measured) is run in parallel.  
This is to wake up the processors from deeper sleep states and let the kernel 
reaccount its cpuidle (C-state) information before reading the cpuidle timings 
from sysfs.

Best is you exchange the order of your patches. The 2nd looks rather straight
forward and you can add my reviewed-by.

If you still need adjustings with -c param, they can be discussed separately.
It would also be nice to mention in which case it makes sense to use it in the 
manpage or advantages/drawbacks if you don't.

Thanks!

    Thomas
Janakarajan Natarajan Oct. 7, 2019, 9:11 p.m. UTC | #7
On 10/5/2019 7:40 AM, Thomas Renninger wrote:
> Hi,
>
> On Wednesday, October 2, 2019 4:45:03 PM CEST Natarajan, Janakarajan wrote:
>> On 9/27/19 4:48 PM, Thomas Renninger wrote:
>>
>>> On Friday, September 27, 2019 6:07:56 PM CEST  Natarajan, Janakarajan
>>> wrote:
>>>> On 9/18/2019 11:34 AM, Natarajan, Janakarajan wrote:
>   
>> On a 256 logical-cpu Rome system we see C0 value from cpupower output go
>> from 0.01 to ~(0.1 to 1.00)
>>
>> for all cpus with the 1st patch.
>>
>> However, this goes down to ~0.01 when we use the RDPRU instruction
>> (which can be used to get
>>
>> APERF/MPERF from CPL > 0) and avoid using the msr module (patch 2).
> And this one only exists on latest AMD cpus, right?


Yes. The RDPRU instruction exists only on AMD cpus.


>
>> However, for systems that provide an instruction  to get register values
>> from userspace, would a command-line parameter be acceptable?
> Parameter sounds like a good idea. In fact, there already is such a paramter.
> cpupower monitor --help
>         -c
>             Schedule  the  process  on every core before starting and ending
> measuring.  This could be needed for the Idle_Stats monitor when no other MSR
> based monitor (has to be run on the core that is measured) is run in parallel.
> This is to wake up the processors from deeper sleep states and let the kernel
> reaccount its cpuidle (C-state) information before reading the cpuidle timings
> from sysfs.
>
> Best is you exchange the order of your patches. The 2nd looks rather straight
> forward and you can add my reviewed-by.


The RDPRU instruction reads the APERF/MPERF of the cpu on which it is 
running. If we do

not schedule it on each cpu specifically, it will read the APERF/MPERF 
of the cpu in which it runs/might

happen to run on, which will not be the correct behavior.


>
> If you still need adjustings with -c param, they can be discussed separately.


The -c parameter causes cpupower to schedule itself on each of the cpus 
of the system in a loop.

After the loop the cpupower starts the measurement of APERF/MPERF of 
each cpu.

This doesn't offer the behavior needed to use RDPRU, which requires 
cpupower to execute on the cpu

whose APERF/MPERF values we are interested in.


Thanks,

Janak


> It would also be nice to mention in which case it makes sense to use it in the
> manpage or advantages/drawbacks if you don't.
>
> Thanks!
>
>      Thomas
>
>
>
Thomas Renninger Oct. 10, 2019, 11:22 a.m. UTC | #8
On Monday, October 7, 2019 11:11:30 PM CEST Natarajan, Janakarajan wrote:
> On 10/5/2019 7:40 AM, Thomas Renninger wrote:
> 
...
> >>
> >> APERF/MPERF from CPL > 0) and avoid using the msr module (patch 2).
> > 
> > And this one only exists on latest AMD cpus, right?
> 
> Yes. The RDPRU instruction exists only on AMD cpus.
> >
> >> However, for systems that provide an instruction  to get register values
> >> from userspace, would a command-line parameter be acceptable?
> > 
> > Parameter sounds like a good idea. In fact, there already is such a
> > paramter.
 cpupower monitor --help
> > 
> >         -c
> >         
> >             Schedule  the  process  on every core before starting and
> >             ending
> > 
> > measuring.  This could be needed for the Idle_Stats monitor when no other
> > MSR based monitor (has to be run on the core that is measured) is run in
> > parallel. This is to wake up the processors from deeper sleep states and
> > let the kernel reaccount its cpuidle (C-state) information before reading
> > the cpuidle timings from sysfs.
> >
> > Best is you exchange the order of your patches. The 2nd looks rather
> > straight forward and you can add my reviewed-by.
> 
> The RDPRU instruction reads the APERF/MPERF of the cpu on which it is 
> running. If we do not schedule it on each cpu specifically, it will read the APERF/MPERF
> of the cpu in which it runs/might happen to run on, which will not be the correct behavior.

Got it. And I also didn't fully read -c. I now remember.. For C-states accounting
you want to have each CPU woken up at measure start and end for accurate measuring.

It's a pity that the monitors do the per_cpu calls themselves.
So a general idle-monitor param is not possible or can only done by for example by
adding a flag to the cpuidle_monitor struct:

struct cpuidle_monitor

unsigned int needs_root:1
unsigned int per_cpu_schedule:1

not sure whether a:
struct {
  unsigned int needs_root:1
  unsigned int per_cpu_schedule:1
} flags

should/must be put around in a separate cleanup patch (and needs_root users adjusted).

You (and other monitors for which this might make sense) can then implement
the per_cpu_schedule flag. In AMD case you might want (you have to)
directly set it.

All around a -b/-u (--bind-measure-to-cpu, --unbind-measure-to-cpu)
parameter could be added at some point of time if it matters. And monitors
having this could bind or not.
This possibly could nuke out -c param. Or at least the idle state counter
monitor could do it itself. But don't mind about this.

What do you think?

And you should be able to re-use the bind_cpu function used in -c case?

       Thomas
Janakarajan Natarajan Oct. 11, 2019, 4:58 p.m. UTC | #9
On 10/10/2019 6:22 AM, Thomas Renninger wrote:
> On Monday, October 7, 2019 11:11:30 PM CEST Natarajan, Janakarajan wrote:
>> On 10/5/2019 7:40 AM, Thomas Renninger wrote:
>>
> ...
>>>> APERF/MPERF from CPL > 0) and avoid using the msr module (patch 2).
>>> And this one only exists on latest AMD cpus, right?
>> Yes. The RDPRU instruction exists only on AMD cpus.
>>>> However, for systems that provide an instruction  to get register values
>>>> from userspace, would a command-line parameter be acceptable?
>>> Parameter sounds like a good idea. In fact, there already is such a
>>> paramter.
>   cpupower monitor --help
>>>          -c
>>>          
>>>              Schedule  the  process  on every core before starting and
>>>              ending
>>>
>>> measuring.  This could be needed for the Idle_Stats monitor when no other
>>> MSR based monitor (has to be run on the core that is measured) is run in
>>> parallel. This is to wake up the processors from deeper sleep states and
>>> let the kernel reaccount its cpuidle (C-state) information before reading
>>> the cpuidle timings from sysfs.
>>>
>>> Best is you exchange the order of your patches. The 2nd looks rather
>>> straight forward and you can add my reviewed-by.
>> The RDPRU instruction reads the APERF/MPERF of the cpu on which it is
>> running. If we do not schedule it on each cpu specifically, it will read the APERF/MPERF
>> of the cpu in which it runs/might happen to run on, which will not be the correct behavior.
> Got it. And I also didn't fully read -c. I now remember.. For C-states accounting
> you want to have each CPU woken up at measure start and end for accurate measuring.
>
> It's a pity that the monitors do the per_cpu calls themselves.
> So a general idle-monitor param is not possible or can only done by for example by
> adding a flag to the cpuidle_monitor struct:
>
> struct cpuidle_monitor
>
> unsigned int needs_root:1
> unsigned int per_cpu_schedule:1
>
> not sure whether a:
> struct {
>    unsigned int needs_root:1
>    unsigned int per_cpu_schedule:1
> } flags
>
> should/must be put around in a separate cleanup patch (and needs_root users adjusted).
>
> You (and other monitors for which this might make sense) can then implement
> the per_cpu_schedule flag. In AMD case you might want (you have to)
> directly set it.
>
> All around a -b/-u (--bind-measure-to-cpu, --unbind-measure-to-cpu)
> parameter could be added at some point of time if it matters. And monitors
> having this could bind or not.
> This possibly could nuke out -c param. Or at least the idle state counter
> monitor could do it itself. But don't mind about this.
>
> What do you think?


This is a good suggestion. I can submit a v2 with:

a) a patch to readjust the needs_root variable

b) a patch to introduce and use the per_cpu_schedule

c) a patch to introduce and use the RDPRU instruction


>
> And you should be able to re-use the bind_cpu function used in -c case?


Yes. I noticed that bind_cpu() is doing what I need. I will use that.


Thanks,

Janak


>
>         Thomas
>
>
>
diff mbox series

Patch

diff --git a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
index 44806a6dae11..8b072e39c897 100644
--- a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
+++ b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
@@ -10,6 +10,7 @@ 
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
+#include <sched.h>
 
 #include <cpufreq.h>
 
@@ -86,15 +87,33 @@  static int mperf_get_tsc(unsigned long long *tsc)
 	return ret;
 }
 
+static int get_aperf_mperf(int cpu, unsigned long long *aval,
+			   unsigned long long *mval)
+{
+	cpu_set_t set;
+	int ret;
+
+	CPU_ZERO(&set);
+	CPU_SET(cpu, &set);
+	if (sched_setaffinity(getpid(), sizeof(set), &set) == -1) {
+		dprint("Could not migrate to cpu: %d\n", cpu);
+		return 1;
+	}
+
+	ret = read_msr(cpu, MSR_APERF, aval);
+	ret |= read_msr(cpu, MSR_MPERF, mval);
+
+	return ret;
+}
+
 static int mperf_init_stats(unsigned int cpu)
 {
-	unsigned long long val;
+	unsigned long long aval, mval;
 	int ret;
 
-	ret = read_msr(cpu, MSR_APERF, &val);
-	aperf_previous_count[cpu] = val;
-	ret |= read_msr(cpu, MSR_MPERF, &val);
-	mperf_previous_count[cpu] = val;
+	ret = get_aperf_mperf(cpu, &aval, &mval);
+	aperf_previous_count[cpu] = aval;
+	mperf_previous_count[cpu] = mval;
 	is_valid[cpu] = !ret;
 
 	return 0;
@@ -102,13 +121,12 @@  static int mperf_init_stats(unsigned int cpu)
 
 static int mperf_measure_stats(unsigned int cpu)
 {
-	unsigned long long val;
+	unsigned long long aval, mval;
 	int ret;
 
-	ret = read_msr(cpu, MSR_APERF, &val);
-	aperf_current_count[cpu] = val;
-	ret |= read_msr(cpu, MSR_MPERF, &val);
-	mperf_current_count[cpu] = val;
+	ret = get_aperf_mperf(cpu, &aval, &mval);
+	aperf_current_count[cpu] = aval;
+	mperf_current_count[cpu] = mval;
 	is_valid[cpu] = !ret;
 
 	return 0;