
[v3,1/9] arm64: perf: avoid PMXEV* indirection

Message ID 1562596377-33196-2-git-send-email-julien.thierry@arm.com (mailing list archive)
State: New, archived
Series: arm_pmu: Use NMI for perf interrupt

Commit Message

Julien Thierry July 8, 2019, 2:32 p.m. UTC
From: Mark Rutland <mark.rutland@arm.com>

Currently we access the counter registers and their respective type
registers indirectly. This requires us to write to PMSELR, issue an ISB,
then access the relevant PMXEV* registers.

This is unfortunate, because:

* Under virtualization, accessing one register requires two traps to
  the hypervisor, even though we could access the register directly with
  a single trap.

* We have to issue an ISB, the cost of which we could otherwise avoid.

* When we use NMIs, the NMI handler will have to save/restore the select
  register in case the code it preempted was attempting to access a
  counter or its type register.

We can avoid these issues by directly accessing the relevant registers.
This patch adds helpers to do so.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
[Julien T.: Don't inline read/write functions to avoid big code-size
	increase, remove unused read_pmevtypern function,
	fix counter index issue.]
Signed-off-by: Julien Thierry <julien.thierry@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/perf_event.c | 96 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 83 insertions(+), 13 deletions(-)

--
1.9.1
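
For readers unfamiliar with the indirection being removed: the "before" path selects a counter through PMSELR_EL0 and then goes via the shared PMXEV* registers, while the "after" path touches the per-counter register directly. A minimal sketch, based on the removed armv8pmu_select_counter()/armv8pmu_read_evcntr() and the new read_pmevcntrn() helper in the diff below (the function names in this sketch are illustrative, not part of the patch):

/* Before: select the counter, synchronise, then read the shared register. */
static inline u32 read_evcntr_indirect(u32 counter)
{
	write_sysreg(counter, pmselr_el0);	/* trap #1 under virtualization */
	isb();					/* required before any PMXEV* access */
	return read_sysreg(pmxevcntr_el0);	/* trap #2 */
}

/* After: one direct access to the per-counter register, no ISB needed. */
static inline u32 read_evcntr_direct(u32 counter)
{
	return read_pmevcntrn(counter);		/* helper added by this patch */
}

The same shape applies to the write and event-type paths, and it is also what lets an NMI handler avoid saving/restoring PMSELR_EL0 around the code it interrupts.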

Comments

Mark Rutland July 8, 2019, 3:03 p.m. UTC | #1
On Mon, Jul 08, 2019 at 03:32:49PM +0100, Julien Thierry wrote:
> From: Mark Rutland <mark.rutland@arm.com>
> 
> Currently we access the counter registers and their respective type
> registers indirectly. This requires us to write to PMSELR, issue an ISB,
> then access the relevant PMXEV* registers.
> 
> This is unfortunate, because:
> 
> * Under virtualization, accessing one registers requires two traps to
>   the hypervisor, even though we could access the register directly with
>   a single trap.
> 
> * We have to issue an ISB which we could otherwise avoid the cost of.
> 
> * When we use NMIs, the NMI handler will have to save/restore the select
>   register in case the code it preempted was attempting to access a
>   counter or its type register.
> 
> We can avoid these issues by directly accessing the relevant registers.
> This patch adds helpers to do so.
> 
> Signed-off-by: Mark Rutland <mark.rutland@arm.com>
> [Julien T.: Don't inline read/write functions to avoid big code-size
> 	increase, remove unused read_pmevtypern function,
> 	fix counter index issue.]
> Signed-off-by: Julien Thierry <julien.thierry@arm.com>
> Cc: Will Deacon <will.deacon@arm.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
> Cc: Jiri Olsa <jolsa@redhat.com>
> Cc: Namhyung Kim <namhyung@kernel.org>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> ---
>  arch/arm64/kernel/perf_event.c | 96 ++++++++++++++++++++++++++++++++++++------
>  1 file changed, 83 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
> index 96e90e2..7759f8a 100644
> --- a/arch/arm64/kernel/perf_event.c
> +++ b/arch/arm64/kernel/perf_event.c
> @@ -369,6 +369,77 @@ static inline bool armv8pmu_event_is_chained(struct perf_event *event)
>  #define	ARMV8_IDX_TO_COUNTER(x)	\
>  	(((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK)
> 
> +/*
> + * This code is really good
> + */
> +
> +#define PMEVN_CASE(n, case_macro) \
> +	case n: case_macro(n); break
> +
> +#define PMEVN_SWITCH(x, case_macro)				\
> +	do {							\
> +		switch (x) {					\
> +		PMEVN_CASE(0,  case_macro);			\
> +		PMEVN_CASE(1,  case_macro);			\
> +		PMEVN_CASE(2,  case_macro);			\
> +		PMEVN_CASE(3,  case_macro);			\
> +		PMEVN_CASE(4,  case_macro);			\
> +		PMEVN_CASE(5,  case_macro);			\
> +		PMEVN_CASE(6,  case_macro);			\
> +		PMEVN_CASE(7,  case_macro);			\
> +		PMEVN_CASE(8,  case_macro);			\
> +		PMEVN_CASE(9,  case_macro);			\
> +		PMEVN_CASE(10, case_macro);			\
> +		PMEVN_CASE(11, case_macro);			\
> +		PMEVN_CASE(12, case_macro);			\
> +		PMEVN_CASE(13, case_macro);			\
> +		PMEVN_CASE(14, case_macro);			\
> +		PMEVN_CASE(15, case_macro);			\
> +		PMEVN_CASE(16, case_macro);			\
> +		PMEVN_CASE(17, case_macro);			\
> +		PMEVN_CASE(18, case_macro);			\
> +		PMEVN_CASE(19, case_macro);			\
> +		PMEVN_CASE(21, case_macro);			\
> +		PMEVN_CASE(22, case_macro);			\
> +		PMEVN_CASE(23, case_macro);			\
> +		PMEVN_CASE(24, case_macro);			\
> +		PMEVN_CASE(25, case_macro);			\
> +		PMEVN_CASE(26, case_macro);			\
> +		PMEVN_CASE(27, case_macro);			\
> +		PMEVN_CASE(28, case_macro);			\
> +		PMEVN_CASE(29, case_macro);			\
> +		PMEVN_CASE(30, case_macro);			\
> +		default: WARN(1, "Inavlid PMEV* index");	\

Nit: s/inavlid/invalid/

> +		}						\
> +	} while (0)
> +
> +#define RETURN_READ_PMEVCNTRN(n) \
> +	return read_sysreg(pmevcntr##n##_el0);
> +static unsigned long read_pmevcntrn(int n)
> +{
> +	PMEVN_SWITCH(n, RETURN_READ_PMEVCNTRN);
> +	return 0;
> +}
> +#undef RETURN_READ_PMEVCNTRN
> +
> +#define WRITE_PMEVCNTRN(n) \
> +	write_sysreg(val, pmevcntr##n##_el0);
> +static void write_pmevcntrn(int n, unsigned long val)
> +{
> +	PMEVN_SWITCH(n, WRITE_PMEVCNTRN);
> +}
> +#undef WRITE_PMEVCNTRN
> +
> +#define WRITE_PMEVTYPERN(n) \
> +	write_sysreg(val, pmevtyper##n##_el0);
> +static void write_pmevtypern(int n, unsigned long val)
> +{
> +	PMEVN_SWITCH(n, WRITE_PMEVTYPERN);
> +}
> +#undef WRITE_PMEVTYPERN
> +
> +#undef PMEVN_SWITCH

I think we can drop the undefs. These are local to this C file, and the
names are sufficiently unique to avoid collision. Note that we missed
the undef for PMEVN_CASE, and I imagine keeping that sane will be messy
in future.

Other than that, I haven't come up with a nicer way of writing the
above, so that looks good to me.
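
For what it's worth, after preprocessing each helper collapses into a plain switch where every case is a direct access to a fixed PMEVCNTR<n>_EL0 register; since the helpers are deliberately kept out of line, the switch is a run-time dispatch on n. A rough sketch of what read_pmevcntrn() expands to (do/while wrapper omitted for brevity):

static unsigned long read_pmevcntrn(int n)
{
	switch (n) {
	case 0: return read_sysreg(pmevcntr0_el0); break;
	case 1: return read_sysreg(pmevcntr1_el0); break;
	/* ... one case per event counter, up to pmevcntr30_el0 ... */
	default: WARN(1, "Invalid PMEV* index");
	}
	return 0;
}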

> +
>  static inline u32 armv8pmu_pmcr_read(void)
>  {
>  	return read_sysreg(pmcr_el0);
> @@ -397,17 +468,11 @@ static inline int armv8pmu_counter_has_overflowed(u32 pmnc, int idx)
>  	return pmnc & BIT(ARMV8_IDX_TO_COUNTER(idx));
>  }
> 
> -static inline void armv8pmu_select_counter(int idx)
> +static inline u32 armv8pmu_read_evcntr(int idx)
>  {
>  	u32 counter = ARMV8_IDX_TO_COUNTER(idx);
> -	write_sysreg(counter, pmselr_el0);
> -	isb();
> -}
> 
> -static inline u32 armv8pmu_read_evcntr(int idx)
> -{
> -	armv8pmu_select_counter(idx);
> -	return read_sysreg(pmxevcntr_el0);
> +	return read_pmevcntrn(counter);
>  }
> 
>  static inline u64 armv8pmu_read_hw_counter(struct perf_event *event)
> @@ -441,8 +506,9 @@ static u64 armv8pmu_read_counter(struct perf_event *event)
> 
>  static inline void armv8pmu_write_evcntr(int idx, u32 value)
>  {
> -	armv8pmu_select_counter(idx);
> -	write_sysreg(value, pmxevcntr_el0);
> +	u32 counter = ARMV8_IDX_TO_COUNTER(idx);
> +
> +	write_pmevcntrn(counter, value);
>  }
> 
>  static inline void armv8pmu_write_hw_counter(struct perf_event *event,
> @@ -483,9 +549,10 @@ static void armv8pmu_write_counter(struct perf_event *event, u64 value)
> 
>  static inline void armv8pmu_write_evtype(int idx, u32 val)
>  {
> -	armv8pmu_select_counter(idx);
> +	u32 counter = ARMV8_IDX_TO_COUNTER(idx);
> +
>  	val &= ARMV8_PMU_EVTYPE_MASK;
> -	write_sysreg(val, pmxevtyper_el0);
> +	write_pmevtypern(counter, val);
>  }
> 
>  static inline void armv8pmu_write_event_type(struct perf_event *event)
> @@ -505,7 +572,10 @@ static inline void armv8pmu_write_event_type(struct perf_event *event)
>  		armv8pmu_write_evtype(idx - 1, hwc->config_base);
>  		armv8pmu_write_evtype(idx, chain_evt);
>  	} else {
> -		armv8pmu_write_evtype(idx, hwc->config_base);
> +		if (idx == ARMV8_IDX_CYCLE_COUNTER)
> +			write_sysreg(hwc->config_base, pmccfiltr_el0);
> +		else
> +			armv8pmu_write_evtype(idx, hwc->config_base);
>  	}
>  }

... and this all looks sound.

With the typo fixed and undefs dropped:

Acked-by: Mark Rutland <mark.rutland@arm.com>

Thanks,
Mark.

> 
> --
> 1.9.1
Steven Price July 10, 2019, 10:57 a.m. UTC | #2
On 08/07/2019 15:32, Julien Thierry wrote:
> From: Mark Rutland <mark.rutland@arm.com>
> 
> Currently we access the counter registers and their respective type
> registers indirectly. This requires us to write to PMSELR, issue an ISB,
> then access the relevant PMXEV* registers.
> 
> This is unfortunate, because:
> 
> * Under virtualization, accessing one registers requires two traps to
>   the hypervisor, even though we could access the register directly with
>   a single trap.
> 
> * We have to issue an ISB which we could otherwise avoid the cost of.
> 
> * When we use NMIs, the NMI handler will have to save/restore the select
>   register in case the code it preempted was attempting to access a
>   counter or its type register.
> 
> We can avoid these issues by directly accessing the relevant registers.
> This patch adds helpers to do so.
> 
> Signed-off-by: Mark Rutland <mark.rutland@arm.com>
> [Julien T.: Don't inline read/write functions to avoid big code-size
> 	increase, remove unused read_pmevtypern function,
> 	fix counter index issue.]
> Signed-off-by: Julien Thierry <julien.thierry@arm.com>
> Cc: Will Deacon <will.deacon@arm.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
> Cc: Jiri Olsa <jolsa@redhat.com>
> Cc: Namhyung Kim <namhyung@kernel.org>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> ---
>  arch/arm64/kernel/perf_event.c | 96 ++++++++++++++++++++++++++++++++++++------
>  1 file changed, 83 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
> index 96e90e2..7759f8a 100644
> --- a/arch/arm64/kernel/perf_event.c
> +++ b/arch/arm64/kernel/perf_event.c
> @@ -369,6 +369,77 @@ static inline bool armv8pmu_event_is_chained(struct perf_event *event)
>  #define	ARMV8_IDX_TO_COUNTER(x)	\
>  	(((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK)
> 
> +/*
> + * This code is really good
> + */
> +
> +#define PMEVN_CASE(n, case_macro) \
> +	case n: case_macro(n); break
> +
> +#define PMEVN_SWITCH(x, case_macro)				\
> +	do {							\
> +		switch (x) {					\
> +		PMEVN_CASE(0,  case_macro);			\
> +		PMEVN_CASE(1,  case_macro);			\
> +		PMEVN_CASE(2,  case_macro);			\
> +		PMEVN_CASE(3,  case_macro);			\
> +		PMEVN_CASE(4,  case_macro);			\
> +		PMEVN_CASE(5,  case_macro);			\
> +		PMEVN_CASE(6,  case_macro);			\
> +		PMEVN_CASE(7,  case_macro);			\
> +		PMEVN_CASE(8,  case_macro);			\
> +		PMEVN_CASE(9,  case_macro);			\
> +		PMEVN_CASE(10, case_macro);			\
> +		PMEVN_CASE(11, case_macro);			\
> +		PMEVN_CASE(12, case_macro);			\
> +		PMEVN_CASE(13, case_macro);			\
> +		PMEVN_CASE(14, case_macro);			\
> +		PMEVN_CASE(15, case_macro);			\
> +		PMEVN_CASE(16, case_macro);			\
> +		PMEVN_CASE(17, case_macro);			\
> +		PMEVN_CASE(18, case_macro);			\
> +		PMEVN_CASE(19, case_macro);			\

Is 20 missing on purpose?

> +		PMEVN_CASE(21, case_macro);			\
> +		PMEVN_CASE(22, case_macro);			\
> +		PMEVN_CASE(23, case_macro);			\
> +		PMEVN_CASE(24, case_macro);			\
> +		PMEVN_CASE(25, case_macro);			\
> +		PMEVN_CASE(26, case_macro);			\
> +		PMEVN_CASE(27, case_macro);			\
> +		PMEVN_CASE(28, case_macro);			\
> +		PMEVN_CASE(29, case_macro);			\
> +		PMEVN_CASE(30, case_macro);			\
> +		default: WARN(1, "Inavlid PMEV* index");	\
				  ^^^^^^^ Invalid

Steve

> +		}						\
> +	} while (0)
> +
> +#define RETURN_READ_PMEVCNTRN(n) \
> +	return read_sysreg(pmevcntr##n##_el0);
> +static unsigned long read_pmevcntrn(int n)
> +{
> +	PMEVN_SWITCH(n, RETURN_READ_PMEVCNTRN);
> +	return 0;
> +}
> +#undef RETURN_READ_PMEVCNTRN
> +
> +#define WRITE_PMEVCNTRN(n) \
> +	write_sysreg(val, pmevcntr##n##_el0);
> +static void write_pmevcntrn(int n, unsigned long val)
> +{
> +	PMEVN_SWITCH(n, WRITE_PMEVCNTRN);
> +}
> +#undef WRITE_PMEVCNTRN
> +
> +#define WRITE_PMEVTYPERN(n) \
> +	write_sysreg(val, pmevtyper##n##_el0);
> +static void write_pmevtypern(int n, unsigned long val)
> +{
> +	PMEVN_SWITCH(n, WRITE_PMEVTYPERN);
> +}
> +#undef WRITE_PMEVTYPERN
> +
> +#undef PMEVN_SWITCH
> +
>  static inline u32 armv8pmu_pmcr_read(void)
>  {
>  	return read_sysreg(pmcr_el0);
> @@ -397,17 +468,11 @@ static inline int armv8pmu_counter_has_overflowed(u32 pmnc, int idx)
>  	return pmnc & BIT(ARMV8_IDX_TO_COUNTER(idx));
>  }
> 
> -static inline void armv8pmu_select_counter(int idx)
> +static inline u32 armv8pmu_read_evcntr(int idx)
>  {
>  	u32 counter = ARMV8_IDX_TO_COUNTER(idx);
> -	write_sysreg(counter, pmselr_el0);
> -	isb();
> -}
> 
> -static inline u32 armv8pmu_read_evcntr(int idx)
> -{
> -	armv8pmu_select_counter(idx);
> -	return read_sysreg(pmxevcntr_el0);
> +	return read_pmevcntrn(counter);
>  }
> 
>  static inline u64 armv8pmu_read_hw_counter(struct perf_event *event)
> @@ -441,8 +506,9 @@ static u64 armv8pmu_read_counter(struct perf_event *event)
> 
>  static inline void armv8pmu_write_evcntr(int idx, u32 value)
>  {
> -	armv8pmu_select_counter(idx);
> -	write_sysreg(value, pmxevcntr_el0);
> +	u32 counter = ARMV8_IDX_TO_COUNTER(idx);
> +
> +	write_pmevcntrn(counter, value);
>  }
> 
>  static inline void armv8pmu_write_hw_counter(struct perf_event *event,
> @@ -483,9 +549,10 @@ static void armv8pmu_write_counter(struct perf_event *event, u64 value)
> 
>  static inline void armv8pmu_write_evtype(int idx, u32 val)
>  {
> -	armv8pmu_select_counter(idx);
> +	u32 counter = ARMV8_IDX_TO_COUNTER(idx);
> +
>  	val &= ARMV8_PMU_EVTYPE_MASK;
> -	write_sysreg(val, pmxevtyper_el0);
> +	write_pmevtypern(counter, val);
>  }
> 
>  static inline void armv8pmu_write_event_type(struct perf_event *event)
> @@ -505,7 +572,10 @@ static inline void armv8pmu_write_event_type(struct perf_event *event)
>  		armv8pmu_write_evtype(idx - 1, hwc->config_base);
>  		armv8pmu_write_evtype(idx, chain_evt);
>  	} else {
> -		armv8pmu_write_evtype(idx, hwc->config_base);
> +		if (idx == ARMV8_IDX_CYCLE_COUNTER)
> +			write_sysreg(hwc->config_base, pmccfiltr_el0);
> +		else
> +			armv8pmu_write_evtype(idx, hwc->config_base);
>  	}
>  }
> 
> --
> 1.9.1
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>
Julien Thierry July 10, 2019, 11:01 a.m. UTC | #3
On 10/07/2019 11:57, Steven Price wrote:
> On 08/07/2019 15:32, Julien Thierry wrote:
>> From: Mark Rutland <mark.rutland@arm.com>
>>
>> Currently we access the counter registers and their respective type
>> registers indirectly. This requires us to write to PMSELR, issue an ISB,
>> then access the relevant PMXEV* registers.
>>
>> This is unfortunate, because:
>>
>> * Under virtualization, accessing one registers requires two traps to
>>   the hypervisor, even though we could access the register directly with
>>   a single trap.
>>
>> * We have to issue an ISB which we could otherwise avoid the cost of.
>>
>> * When we use NMIs, the NMI handler will have to save/restore the select
>>   register in case the code it preempted was attempting to access a
>>   counter or its type register.
>>
>> We can avoid these issues by directly accessing the relevant registers.
>> This patch adds helpers to do so.
>>
>> Signed-off-by: Mark Rutland <mark.rutland@arm.com>
>> [Julien T.: Don't inline read/write functions to avoid big code-size
>> 	increase, remove unused read_pmevtypern function,
>> 	fix counter index issue.]
>> Signed-off-by: Julien Thierry <julien.thierry@arm.com>
>> Cc: Will Deacon <will.deacon@arm.com>
>> Cc: Peter Zijlstra <peterz@infradead.org>
>> Cc: Ingo Molnar <mingo@redhat.com>
>> Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
>> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
>> Cc: Jiri Olsa <jolsa@redhat.com>
>> Cc: Namhyung Kim <namhyung@kernel.org>
>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>> ---
>>  arch/arm64/kernel/perf_event.c | 96 ++++++++++++++++++++++++++++++++++++------
>>  1 file changed, 83 insertions(+), 13 deletions(-)
>>
>> diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
>> index 96e90e2..7759f8a 100644
>> --- a/arch/arm64/kernel/perf_event.c
>> +++ b/arch/arm64/kernel/perf_event.c
>> @@ -369,6 +369,77 @@ static inline bool armv8pmu_event_is_chained(struct perf_event *event)
>>  #define	ARMV8_IDX_TO_COUNTER(x)	\
>>  	(((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK)
>>
>> +/*
>> + * This code is really good
>> + */
>> +
>> +#define PMEVN_CASE(n, case_macro) \
>> +	case n: case_macro(n); break
>> +
>> +#define PMEVN_SWITCH(x, case_macro)				\
>> +	do {							\
>> +		switch (x) {					\
>> +		PMEVN_CASE(0,  case_macro);			\
>> +		PMEVN_CASE(1,  case_macro);			\
>> +		PMEVN_CASE(2,  case_macro);			\
>> +		PMEVN_CASE(3,  case_macro);			\
>> +		PMEVN_CASE(4,  case_macro);			\
>> +		PMEVN_CASE(5,  case_macro);			\
>> +		PMEVN_CASE(6,  case_macro);			\
>> +		PMEVN_CASE(7,  case_macro);			\
>> +		PMEVN_CASE(8,  case_macro);			\
>> +		PMEVN_CASE(9,  case_macro);			\
>> +		PMEVN_CASE(10, case_macro);			\
>> +		PMEVN_CASE(11, case_macro);			\
>> +		PMEVN_CASE(12, case_macro);			\
>> +		PMEVN_CASE(13, case_macro);			\
>> +		PMEVN_CASE(14, case_macro);			\
>> +		PMEVN_CASE(15, case_macro);			\
>> +		PMEVN_CASE(16, case_macro);			\
>> +		PMEVN_CASE(17, case_macro);			\
>> +		PMEVN_CASE(18, case_macro);			\
>> +		PMEVN_CASE(19, case_macro);			\
> 
> Is 20 missing on purpose?
> 

That would have been fun to debug. Well spotted!

I'll fix it in the next version.

Thanks,
Shijith Thotton July 16, 2019, 10:33 a.m. UTC | #4
On 7/10/19 4:01 AM, Julien Thierry wrote:
> 
> 
> On 10/07/2019 11:57, Steven Price wrote:
>> On 08/07/2019 15:32, Julien Thierry wrote:
>>> From: Mark Rutland <mark.rutland@arm.com>
>>>
>>> Currently we access the counter registers and their respective type
>>> registers indirectly. This requires us to write to PMSELR, issue an ISB,
>>> then access the relevant PMXEV* registers.
>>>
>>> This is unfortunate, because:
>>>
>>> * Under virtualization, accessing one registers requires two traps to
>>>    the hypervisor, even though we could access the register directly with
>>>    a single trap.
>>>
>>> * We have to issue an ISB which we could otherwise avoid the cost of.
>>>
>>> * When we use NMIs, the NMI handler will have to save/restore the select
>>>    register in case the code it preempted was attempting to access a
>>>    counter or its type register.
>>>
>>> We can avoid these issues by directly accessing the relevant registers.
>>> This patch adds helpers to do so.
>>>
>>> Signed-off-by: Mark Rutland <mark.rutland@arm.com>
>>> [Julien T.: Don't inline read/write functions to avoid big code-size
>>> 	increase, remove unused read_pmevtypern function,
>>> 	fix counter index issue.]
>>> Signed-off-by: Julien Thierry <julien.thierry@arm.com>
>>> Cc: Will Deacon <will.deacon@arm.com>
>>> Cc: Peter Zijlstra <peterz@infradead.org>
>>> Cc: Ingo Molnar <mingo@redhat.com>
>>> Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
>>> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
>>> Cc: Jiri Olsa <jolsa@redhat.com>
>>> Cc: Namhyung Kim <namhyung@kernel.org>
>>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>>> ---
>>>   arch/arm64/kernel/perf_event.c | 96 ++++++++++++++++++++++++++++++++++++------
>>>   1 file changed, 83 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
>>> index 96e90e2..7759f8a 100644
>>> --- a/arch/arm64/kernel/perf_event.c
>>> +++ b/arch/arm64/kernel/perf_event.c
>>> @@ -369,6 +369,77 @@ static inline bool armv8pmu_event_is_chained(struct perf_event *event)
>>>   #define	ARMV8_IDX_TO_COUNTER(x)	\
>>>   	(((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK)
>>>
>>> +/*
>>> + * This code is really good
>>> + */
>>> +
>>> +#define PMEVN_CASE(n, case_macro) \
>>> +	case n: case_macro(n); break
>>> +
>>> +#define PMEVN_SWITCH(x, case_macro)				\
>>> +	do {							\
>>> +		switch (x) {					\
>>> +		PMEVN_CASE(0,  case_macro);			\
>>> +		PMEVN_CASE(1,  case_macro);			\
>>> +		PMEVN_CASE(2,  case_macro);			\
>>> +		PMEVN_CASE(3,  case_macro);			\
>>> +		PMEVN_CASE(4,  case_macro);			\
>>> +		PMEVN_CASE(5,  case_macro);			\
>>> +		PMEVN_CASE(6,  case_macro);			\
>>> +		PMEVN_CASE(7,  case_macro);			\
>>> +		PMEVN_CASE(8,  case_macro);			\
>>> +		PMEVN_CASE(9,  case_macro);			\
>>> +		PMEVN_CASE(10, case_macro);			\
>>> +		PMEVN_CASE(11, case_macro);			\
>>> +		PMEVN_CASE(12, case_macro);			\
>>> +		PMEVN_CASE(13, case_macro);			\
>>> +		PMEVN_CASE(14, case_macro);			\
>>> +		PMEVN_CASE(15, case_macro);			\
>>> +		PMEVN_CASE(16, case_macro);			\
>>> +		PMEVN_CASE(17, case_macro);			\
>>> +		PMEVN_CASE(18, case_macro);			\
>>> +		PMEVN_CASE(19, case_macro);			\
>>
>> Is 20 missing on purpose?
>>
> 
> That would have been fun to debug. Well spotted!
> 
> I'll fix it in the next version.
> 
> Thanks,
> 

Tried perf top/record on this patch and both are working fine.
Output of perf record on "iperf -c 127.0.0.1 -t 30" is below. (single core)

With Pseudo-NMI:
     20.35%  [k] lock_acquire
     16.95%  [k] lock_release
     11.02%  [k] __arch_copy_from_user
      7.78%  [k] lock_is_held_type
      2.12%  [k] ipt_do_table
      1.34%  [k] kmem_cache_free
      1.25%  [k] _raw_spin_unlock_irqrestore
      1.21%  [k] __nf_conntrack_find_get
      1.06%  [k] get_page_from_freelist
      1.04%  [k] ktime_get
      1.03%  [k] kfree
      1.00%  [k] nf_conntrack_tcp_packet
      0.96%  [k] tcp_sendmsg_locked
      0.87%  [k] __softirqentry_text_start
      0.87%  [k] process_backlog
      0.76%  [k] __local_bh_enable_ip
      0.75%  [k] ip_finish_output2
      0.68%  [k] __tcp_transmit_skb
      0.62%  [k] enqueue_to_backlog
      0.60%  [k] __lock_acquire.isra.17
      0.58%  [k] __free_pages_ok
      0.54%  [k] nf_conntrack_in

With IRQ:
     16.49%  [k] __arch_copy_from_user
     12.38%  [k] _raw_spin_unlock_irqrestore
      9.41%  [k] lock_acquire
      6.92%  [k] lock_release
      3.71%  [k] lock_is_held_type
      2.80%  [k] ipt_do_table
      2.06%  [k] get_page_from_freelist
      1.82%  [k] ktime_get
      1.73%  [k] process_backlog
      1.27%  [k] nf_conntrack_tcp_packet
      1.21%  [k] enqueue_to_backlog
      1.17%  [k] __tcp_transmit_skb
      1.12%  [k] ip_finish_output2
      1.11%  [k] tcp_sendmsg_locked
      1.06%  [k] __free_pages_ok
      1.05%  [k] tcp_ack
      0.99%  [k] __netif_receive_skb_core
      0.88%  [k] __nf_conntrack_find_get
      0.71%  [k] nf_conntrack_in
      0.61%  [k] kmem_cache_free
      0.59%  [k] kfree
      0.57%  [k] __alloc_pages_nodemask

Thanks Julien and Wei,
Tested-by: Shijith Thotton <sthotton@marvell.com>
Julien Thierry July 16, 2019, 10:54 a.m. UTC | #5
Hi Shijith,

On 16/07/2019 11:33, Shijith Thotton wrote:
>
>
> On 7/10/19 4:01 AM, Julien Thierry wrote:
>>
>>
>> On 10/07/2019 11:57, Steven Price wrote:
>>> On 08/07/2019 15:32, Julien Thierry wrote:
>>>> From: Mark Rutland <mark.rutland@arm.com>
>>>>
>>>> Currently we access the counter registers and their respective type
>>>> registers indirectly. This requires us to write to PMSELR, issue an ISB,
>>>> then access the relevant PMXEV* registers.
>>>>
>>>> This is unfortunate, because:
>>>>
>>>> * Under virtualization, accessing one registers requires two traps to
>>>>    the hypervisor, even though we could access the register directly with
>>>>    a single trap.
>>>>
>>>> * We have to issue an ISB which we could otherwise avoid the cost of.
>>>>
>>>> * When we use NMIs, the NMI handler will have to save/restore the select
>>>>    register in case the code it preempted was attempting to access a
>>>>    counter or its type register.
>>>>
>>>> We can avoid these issues by directly accessing the relevant registers.
>>>> This patch adds helpers to do so.
>>>>
>>>> Signed-off-by: Mark Rutland <mark.rutland@arm.com>
>>>> [Julien T.: Don't inline read/write functions to avoid big code-size
>>>>    increase, remove unused read_pmevtypern function,
>>>>    fix counter index issue.]
>>>> Signed-off-by: Julien Thierry <julien.thierry@arm.com>
>>>> Cc: Will Deacon <will.deacon@arm.com>
>>>> Cc: Peter Zijlstra <peterz@infradead.org>
>>>> Cc: Ingo Molnar <mingo@redhat.com>
>>>> Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
>>>> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
>>>> Cc: Jiri Olsa <jolsa@redhat.com>
>>>> Cc: Namhyung Kim <namhyung@kernel.org>
>>>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>>>> ---
>>>>   arch/arm64/kernel/perf_event.c | 96 ++++++++++++++++++++++++++++++++++++------
>>>>   1 file changed, 83 insertions(+), 13 deletions(-)
>>>>
>>>> diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
>>>> index 96e90e2..7759f8a 100644
>>>> --- a/arch/arm64/kernel/perf_event.c
>>>> +++ b/arch/arm64/kernel/perf_event.c
>>>> @@ -369,6 +369,77 @@ static inline bool armv8pmu_event_is_chained(struct perf_event *event)
>>>>   #define  ARMV8_IDX_TO_COUNTER(x) \
>>>>    (((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK)
>>>>
>>>> +/*
>>>> + * This code is really good
>>>> + */
>>>> +
>>>> +#define PMEVN_CASE(n, case_macro) \
>>>> +  case n: case_macro(n); break
>>>> +
>>>> +#define PMEVN_SWITCH(x, case_macro)                               \
>>>> +  do {                                                    \
>>>> +          switch (x) {                                    \
>>>> +          PMEVN_CASE(0,  case_macro);                     \
>>>> +          PMEVN_CASE(1,  case_macro);                     \
>>>> +          PMEVN_CASE(2,  case_macro);                     \
>>>> +          PMEVN_CASE(3,  case_macro);                     \
>>>> +          PMEVN_CASE(4,  case_macro);                     \
>>>> +          PMEVN_CASE(5,  case_macro);                     \
>>>> +          PMEVN_CASE(6,  case_macro);                     \
>>>> +          PMEVN_CASE(7,  case_macro);                     \
>>>> +          PMEVN_CASE(8,  case_macro);                     \
>>>> +          PMEVN_CASE(9,  case_macro);                     \
>>>> +          PMEVN_CASE(10, case_macro);                     \
>>>> +          PMEVN_CASE(11, case_macro);                     \
>>>> +          PMEVN_CASE(12, case_macro);                     \
>>>> +          PMEVN_CASE(13, case_macro);                     \
>>>> +          PMEVN_CASE(14, case_macro);                     \
>>>> +          PMEVN_CASE(15, case_macro);                     \
>>>> +          PMEVN_CASE(16, case_macro);                     \
>>>> +          PMEVN_CASE(17, case_macro);                     \
>>>> +          PMEVN_CASE(18, case_macro);                     \
>>>> +          PMEVN_CASE(19, case_macro);                     \
>>>
>>> Is 20 missing on purpose?
>>>
>>
>> That would have been fun to debug. Well spotted!
>>
>> I'll fix it in the next version.
>>
>> Thanks,
>>
>
> Tried perf top/record on this patch and are working fine.
> Output of perf record on "iperf -c 127.0.0.1 -t 30" is below. (single core)
>
> With Pseudo-NMI:
>      20.35%  [k] lock_acquire
>      16.95%  [k] lock_release
>      11.02%  [k] __arch_copy_from_user
>       7.78%  [k] lock_is_held_type
>       2.12%  [k] ipt_do_table
>       1.34%  [k] kmem_cache_free
>       1.25%  [k] _raw_spin_unlock_irqrestore
>       1.21%  [k] __nf_conntrack_find_get
>       1.06%  [k] get_page_from_freelist
>       1.04%  [k] ktime_get
>       1.03%  [k] kfree
>       1.00%  [k] nf_conntrack_tcp_packet
>       0.96%  [k] tcp_sendmsg_locked
>       0.87%  [k] __softirqentry_text_start
>       0.87%  [k] process_backlog
>       0.76%  [k] __local_bh_enable_ip
>       0.75%  [k] ip_finish_output2
>       0.68%  [k] __tcp_transmit_skb
>       0.62%  [k] enqueue_to_backlog
>       0.60%  [k] __lock_acquire.isra.17
>       0.58%  [k] __free_pages_ok
>       0.54%  [k] nf_conntrack_in
>
> With IRQ:
>      16.49%  [k] __arch_copy_from_user
>      12.38%  [k] _raw_spin_unlock_irqrestore
>       9.41%  [k] lock_acquire
>       6.92%  [k] lock_release
>       3.71%  [k] lock_is_held_type
>       2.80%  [k] ipt_do_table
>       2.06%  [k] get_page_from_freelist
>       1.82%  [k] ktime_get
>       1.73%  [k] process_backlog
>       1.27%  [k] nf_conntrack_tcp_packet
>       1.21%  [k] enqueue_to_backlog
>       1.17%  [k] __tcp_transmit_skb
>       1.12%  [k] ip_finish_output2
>       1.11%  [k] tcp_sendmsg_locked
>       1.06%  [k] __free_pages_ok
>       1.05%  [k] tcp_ack
>       0.99%  [k] __netif_receive_skb_core
>       0.88%  [k] __nf_conntrack_find_get
>       0.71%  [k] nf_conntrack_in
>       0.61%  [k] kmem_cache_free
>       0.59%  [k] kfree
>       0.57%  [k] __alloc_pages_nodemask
>
> Thanks Juilen and Wei,
> Tested-by: Shijith Thotton <sthotton@marvell.com>
>

Thanks for testing this and confirming the improvement.

I'm gonna post a new version soon. Is it alright if I apply this tag for
the other arm64 patches that enable the use of Pseudo-NMI for the PMU?
(I'm mostly thinking of patches 8 and 9, since there haven't been
comments on them and they won't have behavioural changes in the next version).

Thanks,

--
Julien Thierry
Shijith Thotton July 17, 2019, 4:45 a.m. UTC | #6
Hi Julien,

On 7/16/19 3:54 AM, Julien Thierry wrote:
> On 16/07/2019 11:33, Shijith Thotton wrote:
>> On 7/10/19 4:01 AM, Julien Thierry wrote:
>>> On 10/07/2019 11:57, Steven Price wrote:
>>>> On 08/07/2019 15:32, Julien Thierry wrote:
>>>>> From: Mark Rutland <mark.rutland@arm.com>
>>>>>
>>>>> Currently we access the counter registers and their respective type
>>>>> registers indirectly. This requires us to write to PMSELR, issue an ISB,
>>>>> then access the relevant PMXEV* registers.
>>>>>
>>>>> This is unfortunate, because:
>>>>>
>>>>> * Under virtualization, accessing one registers requires two traps to
>>>>>     the hypervisor, even though we could access the register directly with
>>>>>     a single trap.
>>>>>
>>>>> * We have to issue an ISB which we could otherwise avoid the cost of.
>>>>>
>>>>> * When we use NMIs, the NMI handler will have to save/restore the select
>>>>>     register in case the code it preempted was attempting to access a
>>>>>     counter or its type register.
>>>>>
>>>>> We can avoid these issues by directly accessing the relevant registers.
>>>>> This patch adds helpers to do so.
>>>>>
>>>>> Signed-off-by: Mark Rutland <mark.rutland@arm.com>
>>>>> [Julien T.: Don't inline read/write functions to avoid big code-size
>>>>>     increase, remove unused read_pmevtypern function,
>>>>>     fix counter index issue.]
>>>>> Signed-off-by: Julien Thierry <julien.thierry@arm.com>
>>>>> Cc: Will Deacon <will.deacon@arm.com>
>>>>> Cc: Peter Zijlstra <peterz@infradead.org>
>>>>> Cc: Ingo Molnar <mingo@redhat.com>
>>>>> Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
>>>>> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
>>>>> Cc: Jiri Olsa <jolsa@redhat.com>
>>>>> Cc: Namhyung Kim <namhyung@kernel.org>
>>>>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>>>>> ---
>>>>>    arch/arm64/kernel/perf_event.c | 96 ++++++++++++++++++++++++++++++++++++------
>>>>>    1 file changed, 83 insertions(+), 13 deletions(-)
>>>>>
>>>>> diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
>>>>> index 96e90e2..7759f8a 100644
>>>>> --- a/arch/arm64/kernel/perf_event.c
>>>>> +++ b/arch/arm64/kernel/perf_event.c
>>>>> @@ -369,6 +369,77 @@ static inline bool armv8pmu_event_is_chained(struct perf_event *event)
>>>>>    #define  ARMV8_IDX_TO_COUNTER(x) \
>>>>>     (((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK)
>>>>>
>>>>> +/*
>>>>> + * This code is really good
>>>>> + */
>>>>> +
>>>>> +#define PMEVN_CASE(n, case_macro) \
>>>>> +  case n: case_macro(n); break
>>>>> +
>>>>> +#define PMEVN_SWITCH(x, case_macro)                               \
>>>>> +  do {                                                    \
>>>>> +          switch (x) {                                    \
>>>>> +          PMEVN_CASE(0,  case_macro);                     \
>>>>> +          PMEVN_CASE(1,  case_macro);                     \
>>>>> +          PMEVN_CASE(2,  case_macro);                     \
>>>>> +          PMEVN_CASE(3,  case_macro);                     \
>>>>> +          PMEVN_CASE(4,  case_macro);                     \
>>>>> +          PMEVN_CASE(5,  case_macro);                     \
>>>>> +          PMEVN_CASE(6,  case_macro);                     \
>>>>> +          PMEVN_CASE(7,  case_macro);                     \
>>>>> +          PMEVN_CASE(8,  case_macro);                     \
>>>>> +          PMEVN_CASE(9,  case_macro);                     \
>>>>> +          PMEVN_CASE(10, case_macro);                     \
>>>>> +          PMEVN_CASE(11, case_macro);                     \
>>>>> +          PMEVN_CASE(12, case_macro);                     \
>>>>> +          PMEVN_CASE(13, case_macro);                     \
>>>>> +          PMEVN_CASE(14, case_macro);                     \
>>>>> +          PMEVN_CASE(15, case_macro);                     \
>>>>> +          PMEVN_CASE(16, case_macro);                     \
>>>>> +          PMEVN_CASE(17, case_macro);                     \
>>>>> +          PMEVN_CASE(18, case_macro);                     \
>>>>> +          PMEVN_CASE(19, case_macro);                     \
>>>>
>>>> Is 20 missing on purpose?
>>>>
>>>
>>> That would have been fun to debug. Well spotted!
>>>
>>> I'll fix it in the next version.
>>>
>>> Thanks,
>>>
>>
>> Tried perf top/record on this patch and are working fine.
>> Output of perf record on "iperf -c 127.0.0.1 -t 30" is below. (single core)
>>
>> With Pseudo-NMI:
>>       20.35%  [k] lock_acquire
>>       16.95%  [k] lock_release
>>       11.02%  [k] __arch_copy_from_user
>>        7.78%  [k] lock_is_held_type
>>        2.12%  [k] ipt_do_table
>>        1.34%  [k] kmem_cache_free
>>        1.25%  [k] _raw_spin_unlock_irqrestore
>>        1.21%  [k] __nf_conntrack_find_get
>>        1.06%  [k] get_page_from_freelist
>>        1.04%  [k] ktime_get
>>        1.03%  [k] kfree
>>        1.00%  [k] nf_conntrack_tcp_packet
>>        0.96%  [k] tcp_sendmsg_locked
>>        0.87%  [k] __softirqentry_text_start
>>        0.87%  [k] process_backlog
>>        0.76%  [k] __local_bh_enable_ip
>>        0.75%  [k] ip_finish_output2
>>        0.68%  [k] __tcp_transmit_skb
>>        0.62%  [k] enqueue_to_backlog
>>        0.60%  [k] __lock_acquire.isra.17
>>        0.58%  [k] __free_pages_ok
>>        0.54%  [k] nf_conntrack_in
>>
>> With IRQ:
>>       16.49%  [k] __arch_copy_from_user
>>       12.38%  [k] _raw_spin_unlock_irqrestore
>>        9.41%  [k] lock_acquire
>>        6.92%  [k] lock_release
>>        3.71%  [k] lock_is_held_type
>>        2.80%  [k] ipt_do_table
>>        2.06%  [k] get_page_from_freelist
>>        1.82%  [k] ktime_get
>>        1.73%  [k] process_backlog
>>        1.27%  [k] nf_conntrack_tcp_packet
>>        1.21%  [k] enqueue_to_backlog
>>        1.17%  [k] __tcp_transmit_skb
>>        1.12%  [k] ip_finish_output2
>>        1.11%  [k] tcp_sendmsg_locked
>>        1.06%  [k] __free_pages_ok
>>        1.05%  [k] tcp_ack
>>        0.99%  [k] __netif_receive_skb_core
>>        0.88%  [k] __nf_conntrack_find_get
>>        0.71%  [k] nf_conntrack_in
>>        0.61%  [k] kmem_cache_free
>>        0.59%  [k] kfree
>>        0.57%  [k] __alloc_pages_nodemask
>>
>> Thanks Juilen and Wei,
>> Tested-by: Shijith Thotton <sthotton@marvell.com>
>>
> 
> Thanks for testing this and confirming the improvement.
> 
> I'm gonna post a new version soon. Is it alright if I apply this tag for
> the other arm64 patches that enable the use of Pseudo-NMI for the PMU?
> (I'm mostly thinking of patches 8 and 9 since there haven't been
> comments on them and won't have behavioural changes in the next version).
> 

Yes please.

Thanks,
Shijith

Patch

diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 96e90e2..7759f8a 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -369,6 +369,77 @@  static inline bool armv8pmu_event_is_chained(struct perf_event *event)
 #define	ARMV8_IDX_TO_COUNTER(x)	\
 	(((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK)

+/*
+ * This code is really good
+ */
+
+#define PMEVN_CASE(n, case_macro) \
+	case n: case_macro(n); break
+
+#define PMEVN_SWITCH(x, case_macro)				\
+	do {							\
+		switch (x) {					\
+		PMEVN_CASE(0,  case_macro);			\
+		PMEVN_CASE(1,  case_macro);			\
+		PMEVN_CASE(2,  case_macro);			\
+		PMEVN_CASE(3,  case_macro);			\
+		PMEVN_CASE(4,  case_macro);			\
+		PMEVN_CASE(5,  case_macro);			\
+		PMEVN_CASE(6,  case_macro);			\
+		PMEVN_CASE(7,  case_macro);			\
+		PMEVN_CASE(8,  case_macro);			\
+		PMEVN_CASE(9,  case_macro);			\
+		PMEVN_CASE(10, case_macro);			\
+		PMEVN_CASE(11, case_macro);			\
+		PMEVN_CASE(12, case_macro);			\
+		PMEVN_CASE(13, case_macro);			\
+		PMEVN_CASE(14, case_macro);			\
+		PMEVN_CASE(15, case_macro);			\
+		PMEVN_CASE(16, case_macro);			\
+		PMEVN_CASE(17, case_macro);			\
+		PMEVN_CASE(18, case_macro);			\
+		PMEVN_CASE(19, case_macro);			\
+		PMEVN_CASE(21, case_macro);			\
+		PMEVN_CASE(22, case_macro);			\
+		PMEVN_CASE(23, case_macro);			\
+		PMEVN_CASE(24, case_macro);			\
+		PMEVN_CASE(25, case_macro);			\
+		PMEVN_CASE(26, case_macro);			\
+		PMEVN_CASE(27, case_macro);			\
+		PMEVN_CASE(28, case_macro);			\
+		PMEVN_CASE(29, case_macro);			\
+		PMEVN_CASE(30, case_macro);			\
+		default: WARN(1, "Inavlid PMEV* index");	\
+		}						\
+	} while (0)
+
+#define RETURN_READ_PMEVCNTRN(n) \
+	return read_sysreg(pmevcntr##n##_el0);
+static unsigned long read_pmevcntrn(int n)
+{
+	PMEVN_SWITCH(n, RETURN_READ_PMEVCNTRN);
+	return 0;
+}
+#undef RETURN_READ_PMEVCNTRN
+
+#define WRITE_PMEVCNTRN(n) \
+	write_sysreg(val, pmevcntr##n##_el0);
+static void write_pmevcntrn(int n, unsigned long val)
+{
+	PMEVN_SWITCH(n, WRITE_PMEVCNTRN);
+}
+#undef WRITE_PMEVCNTRN
+
+#define WRITE_PMEVTYPERN(n) \
+	write_sysreg(val, pmevtyper##n##_el0);
+static void write_pmevtypern(int n, unsigned long val)
+{
+	PMEVN_SWITCH(n, WRITE_PMEVTYPERN);
+}
+#undef WRITE_PMEVTYPERN
+
+#undef PMEVN_SWITCH
+
 static inline u32 armv8pmu_pmcr_read(void)
 {
 	return read_sysreg(pmcr_el0);
@@ -397,17 +468,11 @@  static inline int armv8pmu_counter_has_overflowed(u32 pmnc, int idx)
 	return pmnc & BIT(ARMV8_IDX_TO_COUNTER(idx));
 }

-static inline void armv8pmu_select_counter(int idx)
+static inline u32 armv8pmu_read_evcntr(int idx)
 {
 	u32 counter = ARMV8_IDX_TO_COUNTER(idx);
-	write_sysreg(counter, pmselr_el0);
-	isb();
-}

-static inline u32 armv8pmu_read_evcntr(int idx)
-{
-	armv8pmu_select_counter(idx);
-	return read_sysreg(pmxevcntr_el0);
+	return read_pmevcntrn(counter);
 }

 static inline u64 armv8pmu_read_hw_counter(struct perf_event *event)
@@ -441,8 +506,9 @@  static u64 armv8pmu_read_counter(struct perf_event *event)

 static inline void armv8pmu_write_evcntr(int idx, u32 value)
 {
-	armv8pmu_select_counter(idx);
-	write_sysreg(value, pmxevcntr_el0);
+	u32 counter = ARMV8_IDX_TO_COUNTER(idx);
+
+	write_pmevcntrn(counter, value);
 }

 static inline void armv8pmu_write_hw_counter(struct perf_event *event,
@@ -483,9 +549,10 @@  static void armv8pmu_write_counter(struct perf_event *event, u64 value)

 static inline void armv8pmu_write_evtype(int idx, u32 val)
 {
-	armv8pmu_select_counter(idx);
+	u32 counter = ARMV8_IDX_TO_COUNTER(idx);
+
 	val &= ARMV8_PMU_EVTYPE_MASK;
-	write_sysreg(val, pmxevtyper_el0);
+	write_pmevtypern(counter, val);
 }

 static inline void armv8pmu_write_event_type(struct perf_event *event)
@@ -505,7 +572,10 @@  static inline void armv8pmu_write_event_type(struct perf_event *event)
 		armv8pmu_write_evtype(idx - 1, hwc->config_base);
 		armv8pmu_write_evtype(idx, chain_evt);
 	} else {
-		armv8pmu_write_evtype(idx, hwc->config_base);
+		if (idx == ARMV8_IDX_CYCLE_COUNTER)
+			write_sysreg(hwc->config_base, pmccfiltr_el0);
+		else
+			armv8pmu_write_evtype(idx, hwc->config_base);
 	}
 }