diff mbox

drivers/perf: arm_pmu: save/restore cpu cycle counter in cpu_pm_pmu_notify

Message ID 1510813648-45090-1-git-send-email-hejianet@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jia He Nov. 16, 2017, 6:27 a.m. UTC
Sometimes userspace needs a high resolution cycle counter by reading
pmccntr_el0.

In commit da4e4f18afe0 ("drivers/perf: arm_pmu: implement CPU_PM
notifier"), it resets all the counters even when the pmcr_el0.E and
pmcntenset_el0.C are both 1. That is incorrect.

We need to save the registers and counter before CPU_PM_ENTER and
restore them after CPU_PM_EXIT.

Fixes: da4e4f18afe0 ("drivers/perf: arm_pmu: implement CPU_PM notifier")
Signed-off-by: Jia He <jia.he@hxt-semitech.com>
---
 drivers/perf/arm_pmu.c | 72 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 66 insertions(+), 6 deletions(-)

Comments

Mark Rutland Nov. 20, 2017, 12:32 p.m. UTC | #1
Hi,

On Thu, Nov 16, 2017 at 06:27:28AM +0000, Jia He wrote:
> Sometimes userspace need a high resolution cycle counter by reading
> pmccntr_el0.
> 
> In commit da4e4f18afe0 ("drivers/perf: arm_pmu: implement CPU_PM
> notifier"), it resets all the counters even when the pmcr_el0.E and
> pmcntenset_el0.C are both 1 . That is incorrect.

I appreciate that you may wish to make use of the cycle counter from
userspace, but this is the intended behaviour kernel-side. Direct
userspace counter access is not supported.

In power states where context is lost, any perf events are
saved/restored by cpu_pm_pmu_setup(). So we certainly shouldn't be
modifying the counter registers in any other PM code.

We *could* expose counters to userspace on homogeneous systems, so long
as users stuck to the usual perf data page interface. However, this
comes with a number of subtle problems, and no-one's done the work to
enable this.

Even then, perf may modify counters at any point in time, and
monotonicity (and/or presence) of counters is not guaranteed.

> We need to save the registers and counter before CPU_PM_ENTER and
> restore them after CPU_PM_EXIT.
> 
> Fixes: da4e4f18afe0 ("drivers/perf: arm_pmu: implement CPU_PM notifier")

As above, this patch is not a fix, and is not currently necessary.

Thanks,
Mark.

> Signed-off-by: Jia He <jia.he@hxt-semitech.com>
> ---
>  drivers/perf/arm_pmu.c | 72 +++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 66 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
> index 7bc5eee..cf55c91 100644
> --- a/drivers/perf/arm_pmu.c
> +++ b/drivers/perf/arm_pmu.c
> @@ -26,6 +26,12 @@
>  
>  #include <asm/irq_regs.h>
>  
> +#ifdef CONFIG_CPU_PM
> +DEFINE_PER_CPU(u32, saved_pmcr_el0);
> +DEFINE_PER_CPU(u32, saved_pmcntenset_el0);
> +DEFINE_PER_CPU(u64, saved_cycle_cnter);
> +#endif
> +
>  static int
>  armpmu_map_cache_event(const unsigned (*cache_map)
>  				      [PERF_COUNT_HW_CACHE_MAX]
> @@ -719,6 +725,15 @@ static void cpu_pm_pmu_setup(struct arm_pmu *armpmu, unsigned long cmd)
>  	}
>  }
>  
> +static int pmc_cycle_counter_enabled(void)
> +{
> +	if ((read_sysreg(pmcr_el0) & ARMV8_PMU_PMCR_E) &&
> +				read_sysreg(pmcntenset_el0) & 1<<31)
> +		return 1;
> +
> +	return 0;
> +}
> +
>  static int cpu_pm_pmu_notify(struct notifier_block *b, unsigned long cmd,
>  			     void *v)
>  {
> @@ -729,16 +744,53 @@ static int cpu_pm_pmu_notify(struct notifier_block *b, unsigned long cmd,
>  	if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
>  		return NOTIFY_DONE;
>  
> -	/*
> -	 * Always reset the PMU registers on power-up even if
> -	 * there are no events running.
> -	 */
> -	if (cmd == CPU_PM_EXIT && armpmu->reset)
> -		armpmu->reset(armpmu);
> +	if (cmd == CPU_PM_EXIT) {
> +		/*
> +		 * Always reset the PMU registers on power-up even if
> +		 * there are no events running.
> +		 */
> +		if (armpmu->reset)
> +			armpmu->reset(armpmu);
> +
> +		/*
> +		 * Restore the saved pmcr_el0 and pmcntenset_el0
> +		 * if pmc cycle counter is enabled, restore the counter
> +		 */
> +		write_sysreg(get_cpu_var(saved_pmcr_el0), pmcr_el0);
> +		write_sysreg(get_cpu_var(saved_pmcntenset_el0),
> +				pmcntenset_el0);
> +
> +		if (pmc_cycle_counter_enabled()) {
> +			write_sysreg(get_cpu_var(saved_cycle_cnter),
> +					pmccntr_el0);
> +			put_cpu_var(saved_cycle_cnter);
> +		}
> +		put_cpu_var(saved_pmcntenset_el0);
> +		put_cpu_var(saved_pmcr_el0);
> +	}
> +
> +	if (cmd == CPU_PM_ENTER) {
> +		/* If currently pmc cycle counter is enabled,
> +		 * save the counter to percpu section
> +		 */
> +		if (pmc_cycle_counter_enabled()) {
> +			get_cpu_var(saved_cycle_cnter) = read_sysreg(
> +							pmccntr_el0);
> +			put_cpu_var(saved_cycle_cnter);
> +		}
> +
> +		get_cpu_var(saved_pmcr_el0) = read_sysreg(pmcr_el0);
> +
> +		get_cpu_var(saved_pmcntenset_el0) = read_sysreg(
> +							pmcntenset_el0);
> +		put_cpu_var(saved_pmcntenset_el0);
> +		put_cpu_var(saved_pmcr_el0);
> +	}
>  
>  	if (!enabled)
>  		return NOTIFY_OK;
>  
> +	/* if any hw_events is used */
>  	switch (cmd) {
>  	case CPU_PM_ENTER:
>  		armpmu->stop(armpmu);
> @@ -758,7 +810,15 @@ static int cpu_pm_pmu_notify(struct notifier_block *b, unsigned long cmd,
>  
>  static int cpu_pm_pmu_register(struct arm_pmu *cpu_pmu)
>  {
> +	int i;
>  	cpu_pmu->cpu_pm_nb.notifier_call = cpu_pm_pmu_notify;
> +
> +	for_each_possible_cpu(i) {
> +		per_cpu(saved_pmcr_el0, i) = 0;
> +		per_cpu(saved_pmcntenset_el0, i) = 0;
> +		per_cpu(saved_cycle_cnter, i) = 0;
> +	}
> +
>  	return cpu_pm_register_notifier(&cpu_pmu->cpu_pm_nb);
>  }
>  
> -- 
> 2.7.4
>
Jia He Nov. 20, 2017, 1:50 p.m. UTC | #2
Hi Mark

Thanks for your review.


On 11/20/2017 8:32 PM, Mark Rutland Wrote:
> Hi,
>
> On Thu, Nov 16, 2017 at 06:27:28AM +0000, Jia He wrote:
>> Sometimes userspace need a high resolution cycle counter by reading
>> pmccntr_el0.
>>
>> In commit da4e4f18afe0 ("drivers/perf: arm_pmu: implement CPU_PM
>> notifier"), it resets all the counters even when the pmcr_el0.E and
>> pmcntenset_el0.C are both 1 . That is incorrect.
> I appreciate that you may wish to make use of the cycle counter from
> userspace, but this is the intended behaviour kernel-side. Direct
> userspace counter acceess is not supported.
Sorry for my previous description. Not only in user space, but also in
kernel space, any pmccntr_el0 reading returns 0, except through the perf
infrastructure.

>
> In power states where context is lost, any perf events are
> saved/restored by cpu_pm_pmu_setup(). So we certainly shouldn't be
> modifying the counter registers in any other PM code.
>
> We *could* expose counters to userspace on homogeneous systems, so long
> as users stuck to the usual perf data page interface. However, this
> comes with a number of subtle problems, and no-one's done the work to
> enable this.
>
> Even then, perf may modify counters at any point in time, and
> monotonicity (and/or presence) of counters is not guaranteed.

Yes, the cycle counter register pmccntr_el0 will be impacted by perf
usage. But do you think pmccntr_el0 is intended for perf only? Currently,
many user space/kernel space programs will use pmccntr_el0 as a
timestamp (just like the rdtsc() on X86). After commit da4e4f18afe0, all
the cycle counter reading is 0 because of CPU PM enter/exit state
changes in armv8pmu_reset.
Mark Rutland Nov. 20, 2017, 2:04 p.m. UTC | #3
On Mon, Nov 20, 2017 at 09:50:15PM +0800, Jia He wrote:
> On 11/20/2017 8:32 PM, Mark Rutland Wrote:
> > On Thu, Nov 16, 2017 at 06:27:28AM +0000, Jia He wrote:
> > > Sometimes userspace need a high resolution cycle counter by reading
> > > pmccntr_el0.
> > > 
> > > In commit da4e4f18afe0 ("drivers/perf: arm_pmu: implement CPU_PM
> > > notifier"), it resets all the counters even when the pmcr_el0.E and
> > > pmcntenset_el0.C are both 1 . That is incorrect.
> > I appreciate that you may wish to make use of the cycle counter from
> > userspace, but this is the intended behaviour kernel-side. Direct
> > userspace counter acceess is not supported.
> Sorry for my previous description. Not only user space, but also any
> pmccntr_el0 reading
> in kernel space is 0 except perf infrastructure.

The only user of PMCCNTR in the kernel is the perf infrastructure.

If you want to perform in-kernel counts, please use
perf_event_create_kernel_counter() and related functions.

> > In power states where context is lost, any perf events are
> > saved/restored by cpu_pm_pmu_setup(). So we certainly shouldn't be
> > modifying the counter registers in any other PM code.
> > 
> > We *could* expose counters to userspace on homogeneous systems, so long
> > as users stuck to the usual perf data page interface. However, this
> > comes with a number of subtle problems, and no-one's done the work to
> > enable this.
> > 
> > Even then, perf may modify counters at any point in time, and
> > monotonicity (and/or presence) of counters is not guaranteed.
> 
> Yes, the cycle counter register pmccntr_el0 will be impacted by perf
> usage.But do you think pmccntr_el0 is intented for perf only? 

We only intend for the in-kernel perf infrastructure to access
pmccntr_el0; nothing else should touch it.

> Currently, many user space/kernel space programs will use pmccntr_el0
> as a timestamp( just likethe rdtsc() on X86).

This is not supported, and would return completely unusable numbers if
it were. The counter will be randomly reset/modified, CPUs aren't in
lock-step, so this isn't monotonic, etc.

If you want a timestamp, the architecture provides a counter that is
monotonic and uniform across all CPUs. We expose that to userspace via
the VDSO (e.g. clock_gettime with CLOCK_MONOTONIC_RAW) and it can be
read in-kernel through the usual clocksource APIs.

PMCCNTR is *not* usable in this manner.

> After commit da4e4f18afe0, all the cycle counter readingis 0 because
> of CPU PM enter/exit state changes in armv8pmu_reset.

Previously, had a CPU been reset, the counter would hold an UNKNOWN
value (i.e. it would be reset to junk), and may or may not have been
accessible depending on what CNTKCTL reset to.

So if you tried to read PMCCNTR in userspace, and happened to have been
granted access, the values would have been bogus and unusable.

Commit da4e4f18afe0 fixed things and made the behaviour consistent, as
intended.

Thanks,
Mark.
Jia He Nov. 20, 2017, 2:15 p.m. UTC | #4
Thanks, got it

Cheers,

Jia


On 11/20/2017 10:04 PM, Mark Rutland Wrote:
> On Mon, Nov 20, 2017 at 09:50:15PM +0800, Jia He wrote:
>> On 11/20/2017 8:32 PM, Mark Rutland Wrote:
>>> On Thu, Nov 16, 2017 at 06:27:28AM +0000, Jia He wrote:
>>>> Sometimes userspace need a high resolution cycle counter by reading
>>>> pmccntr_el0.
>>>>
>>>> In commit da4e4f18afe0 ("drivers/perf: arm_pmu: implement CPU_PM
>>>> notifier"), it resets all the counters even when the pmcr_el0.E and
>>>> pmcntenset_el0.C are both 1 . That is incorrect.
>>> I appreciate that you may wish to make use of the cycle counter from
>>> userspace, but this is the intended behaviour kernel-side. Direct
>>> userspace counter acceess is not supported.
>> Sorry for my previous description. Not only user space, but also any
>> pmccntr_el0 reading
>> in kernel space is 0 except perf infrastructure.
> The only user of PMCCNTR in the kernel is the perf infrastructure.
>
> If you want to perform in-kernel counts, please use
> perf_event_create_kernel_counter() and related functions.
>
>>> In power states where context is lost, any perf events are
>>> saved/restored by cpu_pm_pmu_setup(). So we certainly shouldn't be
>>> modifying the counter registers in any other PM code.
>>>
>>> We *could* expose counters to userspace on homogeneous systems, so long
>>> as users stuck to the usual perf data page interface. However, this
>>> comes with a number of subtle problems, and no-one's done the work to
>>> enable this.
>>>
>>> Even then, perf may modify counters at any point in time, and
>>> monotonicity (and/or presence) of counters is not guaranteed.
>> Yes, the cycle counter register pmccntr_el0 will be impacted by perf
>> usage.But do you think pmccntr_el0 is intented for perf only?
> We only intend for the in-kernel perf infrastructure to access
> pmccntr_el0; nothing else should touch it.
>
>> Currently, many user space/kernel space programs will use pmccntr_el0
>> as a timestamp( just likethe rdtsc() on X86).
> This is not supported, and will return completely unusable numbers if
> it were. This will be randomly reset/modified, CPUs aren't in lock-step,
> so this isn't monotonic, etc.
>
> If you want a timestamp, the architecture provides an counter that is
> monotonic and uniform across all CPUs. We expose that to userspace via
> the VDSO (e.g. clock_gettime with CLOCK_MONOTONIC_RAW) and it can be
> read in-kernel through the usual clocksource APIs.
>
> PMCCNTR is *not* usable in this manner.
>
>> After commit da4e4f18afe0, all the cycle counter readingis 0 because
>> of CPU PM enter/exit state changes in armv8pmu_reset.
> Previously, had a CPU been reset, the counter would hold an UNKNOWN
> value (i.e. it would be reset to junk), and may or may not have been
> accessible depending on what CNTKCTL reset to.
>
> So if you tried to read PMCCNTR in userspace, and happened to have been
> granted access, the values would have been bogus and unusable.
>
> Commit da4e4f18afe0 fixed things and made the behaviour consistent, as
> intended.
>
> Thanks,
> Mark.
>
diff mbox

Patch

diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 7bc5eee..cf55c91 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -26,6 +26,12 @@ 
 
 #include <asm/irq_regs.h>
 
+#ifdef CONFIG_CPU_PM
+DEFINE_PER_CPU(u32, saved_pmcr_el0);
+DEFINE_PER_CPU(u32, saved_pmcntenset_el0);
+DEFINE_PER_CPU(u64, saved_cycle_cnter);
+#endif
+
 static int
 armpmu_map_cache_event(const unsigned (*cache_map)
 				      [PERF_COUNT_HW_CACHE_MAX]
@@ -719,6 +725,15 @@  static void cpu_pm_pmu_setup(struct arm_pmu *armpmu, unsigned long cmd)
 	}
 }
 
+static int pmc_cycle_counter_enabled(void)
+{
+	if ((read_sysreg(pmcr_el0) & ARMV8_PMU_PMCR_E) &&
+				read_sysreg(pmcntenset_el0) & 1<<31)
+		return 1;
+
+	return 0;
+}
+
 static int cpu_pm_pmu_notify(struct notifier_block *b, unsigned long cmd,
 			     void *v)
 {
@@ -729,16 +744,53 @@  static int cpu_pm_pmu_notify(struct notifier_block *b, unsigned long cmd,
 	if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
 		return NOTIFY_DONE;
 
-	/*
-	 * Always reset the PMU registers on power-up even if
-	 * there are no events running.
-	 */
-	if (cmd == CPU_PM_EXIT && armpmu->reset)
-		armpmu->reset(armpmu);
+	if (cmd == CPU_PM_EXIT) {
+		/*
+		 * Always reset the PMU registers on power-up even if
+		 * there are no events running.
+		 */
+		if (armpmu->reset)
+			armpmu->reset(armpmu);
+
+		/*
+		 * Restore the saved pmcr_el0 and pmcntenset_el0
+		 * if pmc cycle counter is enabled, restore the counter
+		 */
+		write_sysreg(get_cpu_var(saved_pmcr_el0), pmcr_el0);
+		write_sysreg(get_cpu_var(saved_pmcntenset_el0),
+				pmcntenset_el0);
+
+		if (pmc_cycle_counter_enabled()) {
+			write_sysreg(get_cpu_var(saved_cycle_cnter),
+					pmccntr_el0);
+			put_cpu_var(saved_cycle_cnter);
+		}
+		put_cpu_var(saved_pmcntenset_el0);
+		put_cpu_var(saved_pmcr_el0);
+	}
+
+	if (cmd == CPU_PM_ENTER) {
+		/* If currently pmc cycle counter is enabled,
+		 * save the counter to percpu section
+		 */
+		if (pmc_cycle_counter_enabled()) {
+			get_cpu_var(saved_cycle_cnter) = read_sysreg(
+							pmccntr_el0);
+			put_cpu_var(saved_cycle_cnter);
+		}
+
+		get_cpu_var(saved_pmcr_el0) = read_sysreg(pmcr_el0);
+
+		get_cpu_var(saved_pmcntenset_el0) = read_sysreg(
+							pmcntenset_el0);
+		put_cpu_var(saved_pmcntenset_el0);
+		put_cpu_var(saved_pmcr_el0);
+	}
 
 	if (!enabled)
 		return NOTIFY_OK;
 
+	/* if any hw_events is used */
 	switch (cmd) {
 	case CPU_PM_ENTER:
 		armpmu->stop(armpmu);
@@ -758,7 +810,15 @@  static int cpu_pm_pmu_notify(struct notifier_block *b, unsigned long cmd,
 
 static int cpu_pm_pmu_register(struct arm_pmu *cpu_pmu)
 {
+	int i;
 	cpu_pmu->cpu_pm_nb.notifier_call = cpu_pm_pmu_notify;
+
+	for_each_possible_cpu(i) {
+		per_cpu(saved_pmcr_el0, i) = 0;
+		per_cpu(saved_pmcntenset_el0, i) = 0;
+		per_cpu(saved_cycle_cnter, i) = 0;
+	}
+
 	return cpu_pm_register_notifier(&cpu_pmu->cpu_pm_nb);
 }