Message ID | 1562596377-33196-2-git-send-email-julien.thierry@arm.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | arm_pmu: Use NMI for perf interrupt | expand |
On Mon, Jul 08, 2019 at 03:32:49PM +0100, Julien Thierry wrote: > From: Mark Rutland <mark.rutland@arm.com> > > Currently we access the counter registers and their respective type > registers indirectly. This requires us to write to PMSELR, issue an ISB, > then access the relevant PMXEV* registers. > > This is unfortunate, because: > > * Under virtualization, accessing one registers requires two traps to > the hypervisor, even though we could access the register directly with > a single trap. > > * We have to issue an ISB which we could otherwise avoid the cost of. > > * When we use NMIs, the NMI handler will have to save/restore the select > register in case the code it preempted was attempting to access a > counter or its type register. > > We can avoid these issues by directly accessing the relevant registers. > This patch adds helpers to do so. > > Signed-off-by: Mark Rutland <mark.rutland@arm.com> > [Julien T.: Don't inline read/write functions to avoid big code-size > increase, remove unused read_pmevtypern function, > fix counter index issue.] > Signed-off-by: Julien Thierry <julien.thierry@arm.com> > Cc: Will Deacon <will.deacon@arm.com> > Cc: Peter Zijlstra <peterz@infradead.org> > Cc: Ingo Molnar <mingo@redhat.com> > Cc: Arnaldo Carvalho de Melo <acme@kernel.org> > Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> > Cc: Jiri Olsa <jolsa@redhat.com> > Cc: Namhyung Kim <namhyung@kernel.org> > Cc: Catalin Marinas <catalin.marinas@arm.com> > --- > arch/arm64/kernel/perf_event.c | 96 ++++++++++++++++++++++++++++++++++++------ > 1 file changed, 83 insertions(+), 13 deletions(-) > > diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c > index 96e90e2..7759f8a 100644 > --- a/arch/arm64/kernel/perf_event.c > +++ b/arch/arm64/kernel/perf_event.c > @@ -369,6 +369,77 @@ static inline bool armv8pmu_event_is_chained(struct perf_event *event) > #define ARMV8_IDX_TO_COUNTER(x) \ > (((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK) > > +/* > + * This code is really good > + */ > + > +#define PMEVN_CASE(n, case_macro) \ > + case n: case_macro(n); break > + > +#define PMEVN_SWITCH(x, case_macro) \ > + do { \ > + switch (x) { \ > + PMEVN_CASE(0, case_macro); \ > + PMEVN_CASE(1, case_macro); \ > + PMEVN_CASE(2, case_macro); \ > + PMEVN_CASE(3, case_macro); \ > + PMEVN_CASE(4, case_macro); \ > + PMEVN_CASE(5, case_macro); \ > + PMEVN_CASE(6, case_macro); \ > + PMEVN_CASE(7, case_macro); \ > + PMEVN_CASE(8, case_macro); \ > + PMEVN_CASE(9, case_macro); \ > + PMEVN_CASE(10, case_macro); \ > + PMEVN_CASE(11, case_macro); \ > + PMEVN_CASE(12, case_macro); \ > + PMEVN_CASE(13, case_macro); \ > + PMEVN_CASE(14, case_macro); \ > + PMEVN_CASE(15, case_macro); \ > + PMEVN_CASE(16, case_macro); \ > + PMEVN_CASE(17, case_macro); \ > + PMEVN_CASE(18, case_macro); \ > + PMEVN_CASE(19, case_macro); \ > + PMEVN_CASE(21, case_macro); \ > + PMEVN_CASE(22, case_macro); \ > + PMEVN_CASE(23, case_macro); \ > + PMEVN_CASE(24, case_macro); \ > + PMEVN_CASE(25, case_macro); \ > + PMEVN_CASE(26, case_macro); \ > + PMEVN_CASE(27, case_macro); \ > + PMEVN_CASE(28, case_macro); \ > + PMEVN_CASE(29, case_macro); \ > + PMEVN_CASE(30, case_macro); \ > + default: WARN(1, "Inavlid PMEV* index"); \ Nit: s/inavlid/invalid/ > + } \ > + } while (0) > + > +#define RETURN_READ_PMEVCNTRN(n) \ > + return read_sysreg(pmevcntr##n##_el0); > +static unsigned long read_pmevcntrn(int n) > +{ > + PMEVN_SWITCH(n, RETURN_READ_PMEVCNTRN); > + return 0; > +} > +#undef RETURN_READ_PMEVCNTRN > + > +#define WRITE_PMEVCNTRN(n) \ > + write_sysreg(val, pmevcntr##n##_el0); > +static void write_pmevcntrn(int n, unsigned long val) > +{ > + PMEVN_SWITCH(n, WRITE_PMEVCNTRN); > +} > +#undef WRITE_PMEVCNTRN > + > +#define WRITE_PMEVTYPERN(n) \ > + write_sysreg(val, pmevtyper##n##_el0); > +static void write_pmevtypern(int n, unsigned long val) > +{ > + PMEVN_SWITCH(n, WRITE_PMEVTYPERN); > +} > +#undef WRITE_PMEVTYPERN > + > +#undef PMEVN_SWITCH I think we can drop the undefs. These are local to this C file, and the names are sufficiently unique to avoid collision. Note that we missed the undef for PMEVN_CASE, and I imagine keeping that sane will be messy in future. Other than that, I haven't come up with a nicer way of writing the above, so that looks good to me. > + > static inline u32 armv8pmu_pmcr_read(void) > { > return read_sysreg(pmcr_el0); > @@ -397,17 +468,11 @@ static inline int armv8pmu_counter_has_overflowed(u32 pmnc, int idx) > return pmnc & BIT(ARMV8_IDX_TO_COUNTER(idx)); > } > > -static inline void armv8pmu_select_counter(int idx) > +static inline u32 armv8pmu_read_evcntr(int idx) > { > u32 counter = ARMV8_IDX_TO_COUNTER(idx); > - write_sysreg(counter, pmselr_el0); > - isb(); > -} > > -static inline u32 armv8pmu_read_evcntr(int idx) > -{ > - armv8pmu_select_counter(idx); > - return read_sysreg(pmxevcntr_el0); > + return read_pmevcntrn(counter); > } > > static inline u64 armv8pmu_read_hw_counter(struct perf_event *event) > @@ -441,8 +506,9 @@ static u64 armv8pmu_read_counter(struct perf_event *event) > > static inline void armv8pmu_write_evcntr(int idx, u32 value) > { > - armv8pmu_select_counter(idx); > - write_sysreg(value, pmxevcntr_el0); > + u32 counter = ARMV8_IDX_TO_COUNTER(idx); > + > + write_pmevcntrn(counter, value); > } > > static inline void armv8pmu_write_hw_counter(struct perf_event *event, > @@ -483,9 +549,10 @@ static void armv8pmu_write_counter(struct perf_event *event, u64 value) > > static inline void armv8pmu_write_evtype(int idx, u32 val) > { > - armv8pmu_select_counter(idx); > + u32 counter = ARMV8_IDX_TO_COUNTER(idx); > + > val &= ARMV8_PMU_EVTYPE_MASK; > - write_sysreg(val, pmxevtyper_el0); > + write_pmevtypern(counter, val); > } > > static inline void armv8pmu_write_event_type(struct perf_event *event) > @@ -505,7 +572,10 @@ static inline void armv8pmu_write_event_type(struct perf_event *event) > armv8pmu_write_evtype(idx - 1, hwc->config_base); > armv8pmu_write_evtype(idx, chain_evt); > } else { > - armv8pmu_write_evtype(idx, hwc->config_base); > + if (idx == ARMV8_IDX_CYCLE_COUNTER) > + write_sysreg(hwc->config_base, pmccfiltr_el0); > + else > + armv8pmu_write_evtype(idx, hwc->config_base); > } > } ... and this all looks sound. With the typo fixed and undefs dropped: Acked-by: Mark Rutland <mark.rutland@arm.com> Thanks, Mark. > > -- > 1.9.1
On 08/07/2019 15:32, Julien Thierry wrote: > From: Mark Rutland <mark.rutland@arm.com> > > Currently we access the counter registers and their respective type > registers indirectly. This requires us to write to PMSELR, issue an ISB, > then access the relevant PMXEV* registers. > > This is unfortunate, because: > > * Under virtualization, accessing one registers requires two traps to > the hypervisor, even though we could access the register directly with > a single trap. > > * We have to issue an ISB which we could otherwise avoid the cost of. > > * When we use NMIs, the NMI handler will have to save/restore the select > register in case the code it preempted was attempting to access a > counter or its type register. > > We can avoid these issues by directly accessing the relevant registers. > This patch adds helpers to do so. > > Signed-off-by: Mark Rutland <mark.rutland@arm.com> > [Julien T.: Don't inline read/write functions to avoid big code-size > increase, remove unused read_pmevtypern function, > fix counter index issue.] > Signed-off-by: Julien Thierry <julien.thierry@arm.com> > Cc: Will Deacon <will.deacon@arm.com> > Cc: Peter Zijlstra <peterz@infradead.org> > Cc: Ingo Molnar <mingo@redhat.com> > Cc: Arnaldo Carvalho de Melo <acme@kernel.org> > Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> > Cc: Jiri Olsa <jolsa@redhat.com> > Cc: Namhyung Kim <namhyung@kernel.org> > Cc: Catalin Marinas <catalin.marinas@arm.com> > --- > arch/arm64/kernel/perf_event.c | 96 ++++++++++++++++++++++++++++++++++++------ > 1 file changed, 83 insertions(+), 13 deletions(-) > > diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c > index 96e90e2..7759f8a 100644 > --- a/arch/arm64/kernel/perf_event.c > +++ b/arch/arm64/kernel/perf_event.c > @@ -369,6 +369,77 @@ static inline bool armv8pmu_event_is_chained(struct perf_event *event) > #define ARMV8_IDX_TO_COUNTER(x) \ > (((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK) > > +/* > + * This code is really good > + */ > + > +#define PMEVN_CASE(n, case_macro) \ > + case n: case_macro(n); break > + > +#define PMEVN_SWITCH(x, case_macro) \ > + do { \ > + switch (x) { \ > + PMEVN_CASE(0, case_macro); \ > + PMEVN_CASE(1, case_macro); \ > + PMEVN_CASE(2, case_macro); \ > + PMEVN_CASE(3, case_macro); \ > + PMEVN_CASE(4, case_macro); \ > + PMEVN_CASE(5, case_macro); \ > + PMEVN_CASE(6, case_macro); \ > + PMEVN_CASE(7, case_macro); \ > + PMEVN_CASE(8, case_macro); \ > + PMEVN_CASE(9, case_macro); \ > + PMEVN_CASE(10, case_macro); \ > + PMEVN_CASE(11, case_macro); \ > + PMEVN_CASE(12, case_macro); \ > + PMEVN_CASE(13, case_macro); \ > + PMEVN_CASE(14, case_macro); \ > + PMEVN_CASE(15, case_macro); \ > + PMEVN_CASE(16, case_macro); \ > + PMEVN_CASE(17, case_macro); \ > + PMEVN_CASE(18, case_macro); \ > + PMEVN_CASE(19, case_macro); \ Is 20 missing on purpose? > + PMEVN_CASE(21, case_macro); \ > + PMEVN_CASE(22, case_macro); \ > + PMEVN_CASE(23, case_macro); \ > + PMEVN_CASE(24, case_macro); \ > + PMEVN_CASE(25, case_macro); \ > + PMEVN_CASE(26, case_macro); \ > + PMEVN_CASE(27, case_macro); \ > + PMEVN_CASE(28, case_macro); \ > + PMEVN_CASE(29, case_macro); \ > + PMEVN_CASE(30, case_macro); \ > + default: WARN(1, "Inavlid PMEV* index"); \ ^^^^^^^ Invalid Steve > + } \ > + } while (0) > + > +#define RETURN_READ_PMEVCNTRN(n) \ > + return read_sysreg(pmevcntr##n##_el0); > +static unsigned long read_pmevcntrn(int n) > +{ > + PMEVN_SWITCH(n, RETURN_READ_PMEVCNTRN); > + return 0; > +} > +#undef RETURN_READ_PMEVCNTRN > + > +#define WRITE_PMEVCNTRN(n) \ > + write_sysreg(val, pmevcntr##n##_el0); > +static void write_pmevcntrn(int n, unsigned long val) > +{ > + PMEVN_SWITCH(n, WRITE_PMEVCNTRN); > +} > +#undef WRITE_PMEVCNTRN > + > +#define WRITE_PMEVTYPERN(n) \ > + write_sysreg(val, pmevtyper##n##_el0); > +static void write_pmevtypern(int n, unsigned long val) > +{ > + PMEVN_SWITCH(n, WRITE_PMEVTYPERN); > +} > +#undef WRITE_PMEVTYPERN > + > +#undef PMEVN_SWITCH > + > static inline u32 armv8pmu_pmcr_read(void) > { > return read_sysreg(pmcr_el0); > @@ -397,17 +468,11 @@ static inline int armv8pmu_counter_has_overflowed(u32 pmnc, int idx) > return pmnc & BIT(ARMV8_IDX_TO_COUNTER(idx)); > } > > -static inline void armv8pmu_select_counter(int idx) > +static inline u32 armv8pmu_read_evcntr(int idx) > { > u32 counter = ARMV8_IDX_TO_COUNTER(idx); > - write_sysreg(counter, pmselr_el0); > - isb(); > -} > > -static inline u32 armv8pmu_read_evcntr(int idx) > -{ > - armv8pmu_select_counter(idx); > - return read_sysreg(pmxevcntr_el0); > + return read_pmevcntrn(counter); > } > > static inline u64 armv8pmu_read_hw_counter(struct perf_event *event) > @@ -441,8 +506,9 @@ static u64 armv8pmu_read_counter(struct perf_event *event) > > static inline void armv8pmu_write_evcntr(int idx, u32 value) > { > - armv8pmu_select_counter(idx); > - write_sysreg(value, pmxevcntr_el0); > + u32 counter = ARMV8_IDX_TO_COUNTER(idx); > + > + write_pmevcntrn(counter, value); > } > > static inline void armv8pmu_write_hw_counter(struct perf_event *event, > @@ -483,9 +549,10 @@ static void armv8pmu_write_counter(struct perf_event *event, u64 value) > > static inline void armv8pmu_write_evtype(int idx, u32 val) > { > - armv8pmu_select_counter(idx); > + u32 counter = ARMV8_IDX_TO_COUNTER(idx); > + > val &= ARMV8_PMU_EVTYPE_MASK; > - write_sysreg(val, pmxevtyper_el0); > + write_pmevtypern(counter, val); > } > > static inline void armv8pmu_write_event_type(struct perf_event *event) > @@ -505,7 +572,10 @@ static inline void armv8pmu_write_event_type(struct perf_event *event) > armv8pmu_write_evtype(idx - 1, hwc->config_base); > armv8pmu_write_evtype(idx, chain_evt); > } else { > - armv8pmu_write_evtype(idx, hwc->config_base); > + if (idx == ARMV8_IDX_CYCLE_COUNTER) > + write_sysreg(hwc->config_base, pmccfiltr_el0); > + else > + armv8pmu_write_evtype(idx, hwc->config_base); > } > } > > -- > 1.9.1 > > _______________________________________________ > linux-arm-kernel mailing list > linux-arm-kernel@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel >
On 10/07/2019 11:57, Steven Price wrote: > On 08/07/2019 15:32, Julien Thierry wrote: >> From: Mark Rutland <mark.rutland@arm.com> >> >> Currently we access the counter registers and their respective type >> registers indirectly. This requires us to write to PMSELR, issue an ISB, >> then access the relevant PMXEV* registers. >> >> This is unfortunate, because: >> >> * Under virtualization, accessing one registers requires two traps to >> the hypervisor, even though we could access the register directly with >> a single trap. >> >> * We have to issue an ISB which we could otherwise avoid the cost of. >> >> * When we use NMIs, the NMI handler will have to save/restore the select >> register in case the code it preempted was attempting to access a >> counter or its type register. >> >> We can avoid these issues by directly accessing the relevant registers. >> This patch adds helpers to do so. >> >> Signed-off-by: Mark Rutland <mark.rutland@arm.com> >> [Julien T.: Don't inline read/write functions to avoid big code-size >> increase, remove unused read_pmevtypern function, >> fix counter index issue.] >> Signed-off-by: Julien Thierry <julien.thierry@arm.com> >> Cc: Will Deacon <will.deacon@arm.com> >> Cc: Peter Zijlstra <peterz@infradead.org> >> Cc: Ingo Molnar <mingo@redhat.com> >> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> >> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> >> Cc: Jiri Olsa <jolsa@redhat.com> >> Cc: Namhyung Kim <namhyung@kernel.org> >> Cc: Catalin Marinas <catalin.marinas@arm.com> >> --- >> arch/arm64/kernel/perf_event.c | 96 ++++++++++++++++++++++++++++++++++++------ >> 1 file changed, 83 insertions(+), 13 deletions(-) >> >> diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c >> index 96e90e2..7759f8a 100644 >> --- a/arch/arm64/kernel/perf_event.c >> +++ b/arch/arm64/kernel/perf_event.c >> @@ -369,6 +369,77 @@ static inline bool armv8pmu_event_is_chained(struct perf_event *event) >> #define ARMV8_IDX_TO_COUNTER(x) \ >> (((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK) >> >> +/* >> + * This code is really good >> + */ >> + >> +#define PMEVN_CASE(n, case_macro) \ >> + case n: case_macro(n); break >> + >> +#define PMEVN_SWITCH(x, case_macro) \ >> + do { \ >> + switch (x) { \ >> + PMEVN_CASE(0, case_macro); \ >> + PMEVN_CASE(1, case_macro); \ >> + PMEVN_CASE(2, case_macro); \ >> + PMEVN_CASE(3, case_macro); \ >> + PMEVN_CASE(4, case_macro); \ >> + PMEVN_CASE(5, case_macro); \ >> + PMEVN_CASE(6, case_macro); \ >> + PMEVN_CASE(7, case_macro); \ >> + PMEVN_CASE(8, case_macro); \ >> + PMEVN_CASE(9, case_macro); \ >> + PMEVN_CASE(10, case_macro); \ >> + PMEVN_CASE(11, case_macro); \ >> + PMEVN_CASE(12, case_macro); \ >> + PMEVN_CASE(13, case_macro); \ >> + PMEVN_CASE(14, case_macro); \ >> + PMEVN_CASE(15, case_macro); \ >> + PMEVN_CASE(16, case_macro); \ >> + PMEVN_CASE(17, case_macro); \ >> + PMEVN_CASE(18, case_macro); \ >> + PMEVN_CASE(19, case_macro); \ > > Is 20 missing on purpose? > That would have been fun to debug. Well spotted! I'll fix it in the next version. Thanks,
On 7/10/19 4:01 AM, Julien Thierry wrote: > > > On 10/07/2019 11:57, Steven Price wrote: >> On 08/07/2019 15:32, Julien Thierry wrote: >>> From: Mark Rutland <mark.rutland@arm.com> >>> >>> Currently we access the counter registers and their respective type >>> registers indirectly. This requires us to write to PMSELR, issue an ISB, >>> then access the relevant PMXEV* registers. >>> >>> This is unfortunate, because: >>> >>> * Under virtualization, accessing one registers requires two traps to >>> the hypervisor, even though we could access the register directly with >>> a single trap. >>> >>> * We have to issue an ISB which we could otherwise avoid the cost of. >>> >>> * When we use NMIs, the NMI handler will have to save/restore the select >>> register in case the code it preempted was attempting to access a >>> counter or its type register. >>> >>> We can avoid these issues by directly accessing the relevant registers. >>> This patch adds helpers to do so. >>> >>> Signed-off-by: Mark Rutland <mark.rutland@arm.com> >>> [Julien T.: Don't inline read/write functions to avoid big code-size >>> increase, remove unused read_pmevtypern function, >>> fix counter index issue.] >>> Signed-off-by: Julien Thierry <julien.thierry@arm.com> >>> Cc: Will Deacon <will.deacon@arm.com> >>> Cc: Peter Zijlstra <peterz@infradead.org> >>> Cc: Ingo Molnar <mingo@redhat.com> >>> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> >>> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> >>> Cc: Jiri Olsa <jolsa@redhat.com> >>> Cc: Namhyung Kim <namhyung@kernel.org> >>> Cc: Catalin Marinas <catalin.marinas@arm.com> >>> --- >>> arch/arm64/kernel/perf_event.c | 96 ++++++++++++++++++++++++++++++++++++------ >>> 1 file changed, 83 insertions(+), 13 deletions(-) >>> >>> diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c >>> index 96e90e2..7759f8a 100644 >>> --- a/arch/arm64/kernel/perf_event.c >>> +++ b/arch/arm64/kernel/perf_event.c >>> @@ -369,6 +369,77 @@ static inline bool armv8pmu_event_is_chained(struct perf_event *event) >>> #define ARMV8_IDX_TO_COUNTER(x) \ >>> (((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK) >>> >>> +/* >>> + * This code is really good >>> + */ >>> + >>> +#define PMEVN_CASE(n, case_macro) \ >>> + case n: case_macro(n); break >>> + >>> +#define PMEVN_SWITCH(x, case_macro) \ >>> + do { \ >>> + switch (x) { \ >>> + PMEVN_CASE(0, case_macro); \ >>> + PMEVN_CASE(1, case_macro); \ >>> + PMEVN_CASE(2, case_macro); \ >>> + PMEVN_CASE(3, case_macro); \ >>> + PMEVN_CASE(4, case_macro); \ >>> + PMEVN_CASE(5, case_macro); \ >>> + PMEVN_CASE(6, case_macro); \ >>> + PMEVN_CASE(7, case_macro); \ >>> + PMEVN_CASE(8, case_macro); \ >>> + PMEVN_CASE(9, case_macro); \ >>> + PMEVN_CASE(10, case_macro); \ >>> + PMEVN_CASE(11, case_macro); \ >>> + PMEVN_CASE(12, case_macro); \ >>> + PMEVN_CASE(13, case_macro); \ >>> + PMEVN_CASE(14, case_macro); \ >>> + PMEVN_CASE(15, case_macro); \ >>> + PMEVN_CASE(16, case_macro); \ >>> + PMEVN_CASE(17, case_macro); \ >>> + PMEVN_CASE(18, case_macro); \ >>> + PMEVN_CASE(19, case_macro); \ >> >> Is 20 missing on purpose? >> > > That would have been fun to debug. Well spotted! > > I'll fix it in the next version. > > Thanks, > Tried perf top/record on this patch and are working fine. Output of perf record on "iperf -c 127.0.0.1 -t 30" is below. (single core) With Pseudo-NMI: 20.35% [k] lock_acquire 16.95% [k] lock_release 11.02% [k] __arch_copy_from_user 7.78% [k] lock_is_held_type 2.12% [k] ipt_do_table 1.34% [k] kmem_cache_free 1.25% [k] _raw_spin_unlock_irqrestore 1.21% [k] __nf_conntrack_find_get 1.06% [k] get_page_from_freelist 1.04% [k] ktime_get 1.03% [k] kfree 1.00% [k] nf_conntrack_tcp_packet 0.96% [k] tcp_sendmsg_locked 0.87% [k] __softirqentry_text_start 0.87% [k] process_backlog 0.76% [k] __local_bh_enable_ip 0.75% [k] ip_finish_output2 0.68% [k] __tcp_transmit_skb 0.62% [k] enqueue_to_backlog 0.60% [k] __lock_acquire.isra.17 0.58% [k] __free_pages_ok 0.54% [k] nf_conntrack_in With IRQ: 16.49% [k] __arch_copy_from_user 12.38% [k] _raw_spin_unlock_irqrestore 9.41% [k] lock_acquire 6.92% [k] lock_release 3.71% [k] lock_is_held_type 2.80% [k] ipt_do_table 2.06% [k] get_page_from_freelist 1.82% [k] ktime_get 1.73% [k] process_backlog 1.27% [k] nf_conntrack_tcp_packet 1.21% [k] enqueue_to_backlog 1.17% [k] __tcp_transmit_skb 1.12% [k] ip_finish_output2 1.11% [k] tcp_sendmsg_locked 1.06% [k] __free_pages_ok 1.05% [k] tcp_ack 0.99% [k] __netif_receive_skb_core 0.88% [k] __nf_conntrack_find_get 0.71% [k] nf_conntrack_in 0.61% [k] kmem_cache_free 0.59% [k] kfree 0.57% [k] __alloc_pages_nodemask Thanks Juilen and Wei, Tested-by: Shijith Thotton <sthotton@marvell.com>
Hi Shijith, On 16/07/2019 11:33, Shijith Thotton wrote: > > > On 7/10/19 4:01 AM, Julien Thierry wrote: >> >> >> On 10/07/2019 11:57, Steven Price wrote: >>> On 08/07/2019 15:32, Julien Thierry wrote: >>>> From: Mark Rutland <mark.rutland@arm.com> >>>> >>>> Currently we access the counter registers and their respective type >>>> registers indirectly. This requires us to write to PMSELR, issue an ISB, >>>> then access the relevant PMXEV* registers. >>>> >>>> This is unfortunate, because: >>>> >>>> * Under virtualization, accessing one registers requires two traps to >>>> the hypervisor, even though we could access the register directly with >>>> a single trap. >>>> >>>> * We have to issue an ISB which we could otherwise avoid the cost of. >>>> >>>> * When we use NMIs, the NMI handler will have to save/restore the select >>>> register in case the code it preempted was attempting to access a >>>> counter or its type register. >>>> >>>> We can avoid these issues by directly accessing the relevant registers. >>>> This patch adds helpers to do so. >>>> >>>> Signed-off-by: Mark Rutland <mark.rutland@arm.com> >>>> [Julien T.: Don't inline read/write functions to avoid big code-size >>>> increase, remove unused read_pmevtypern function, >>>> fix counter index issue.] >>>> Signed-off-by: Julien Thierry <julien.thierry@arm.com> >>>> Cc: Will Deacon <will.deacon@arm.com> >>>> Cc: Peter Zijlstra <peterz@infradead.org> >>>> Cc: Ingo Molnar <mingo@redhat.com> >>>> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> >>>> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> >>>> Cc: Jiri Olsa <jolsa@redhat.com> >>>> Cc: Namhyung Kim <namhyung@kernel.org> >>>> Cc: Catalin Marinas <catalin.marinas@arm.com> >>>> --- >>>> arch/arm64/kernel/perf_event.c | 96 ++++++++++++++++++++++++++++++++++++------ >>>> 1 file changed, 83 insertions(+), 13 deletions(-) >>>> >>>> diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c >>>> index 96e90e2..7759f8a 100644 >>>> --- a/arch/arm64/kernel/perf_event.c >>>> +++ b/arch/arm64/kernel/perf_event.c >>>> @@ -369,6 +369,77 @@ static inline bool armv8pmu_event_is_chained(struct perf_event *event) >>>> #define ARMV8_IDX_TO_COUNTER(x) \ >>>> (((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK) >>>> >>>> +/* >>>> + * This code is really good >>>> + */ >>>> + >>>> +#define PMEVN_CASE(n, case_macro) \ >>>> + case n: case_macro(n); break >>>> + >>>> +#define PMEVN_SWITCH(x, case_macro) \ >>>> + do { \ >>>> + switch (x) { \ >>>> + PMEVN_CASE(0, case_macro); \ >>>> + PMEVN_CASE(1, case_macro); \ >>>> + PMEVN_CASE(2, case_macro); \ >>>> + PMEVN_CASE(3, case_macro); \ >>>> + PMEVN_CASE(4, case_macro); \ >>>> + PMEVN_CASE(5, case_macro); \ >>>> + PMEVN_CASE(6, case_macro); \ >>>> + PMEVN_CASE(7, case_macro); \ >>>> + PMEVN_CASE(8, case_macro); \ >>>> + PMEVN_CASE(9, case_macro); \ >>>> + PMEVN_CASE(10, case_macro); \ >>>> + PMEVN_CASE(11, case_macro); \ >>>> + PMEVN_CASE(12, case_macro); \ >>>> + PMEVN_CASE(13, case_macro); \ >>>> + PMEVN_CASE(14, case_macro); \ >>>> + PMEVN_CASE(15, case_macro); \ >>>> + PMEVN_CASE(16, case_macro); \ >>>> + PMEVN_CASE(17, case_macro); \ >>>> + PMEVN_CASE(18, case_macro); \ >>>> + PMEVN_CASE(19, case_macro); \ >>> >>> Is 20 missing on purpose? >>> >> >> That would have been fun to debug. Well spotted! >> >> I'll fix it in the next version. >> >> Thanks, >> > > Tried perf top/record on this patch and are working fine. > Output of perf record on "iperf -c 127.0.0.1 -t 30" is below. (single core) > > With Pseudo-NMI: > 20.35% [k] lock_acquire > 16.95% [k] lock_release > 11.02% [k] __arch_copy_from_user > 7.78% [k] lock_is_held_type > 2.12% [k] ipt_do_table > 1.34% [k] kmem_cache_free > 1.25% [k] _raw_spin_unlock_irqrestore > 1.21% [k] __nf_conntrack_find_get > 1.06% [k] get_page_from_freelist > 1.04% [k] ktime_get > 1.03% [k] kfree > 1.00% [k] nf_conntrack_tcp_packet > 0.96% [k] tcp_sendmsg_locked > 0.87% [k] __softirqentry_text_start > 0.87% [k] process_backlog > 0.76% [k] __local_bh_enable_ip > 0.75% [k] ip_finish_output2 > 0.68% [k] __tcp_transmit_skb > 0.62% [k] enqueue_to_backlog > 0.60% [k] __lock_acquire.isra.17 > 0.58% [k] __free_pages_ok > 0.54% [k] nf_conntrack_in > > With IRQ: > 16.49% [k] __arch_copy_from_user > 12.38% [k] _raw_spin_unlock_irqrestore > 9.41% [k] lock_acquire > 6.92% [k] lock_release > 3.71% [k] lock_is_held_type > 2.80% [k] ipt_do_table > 2.06% [k] get_page_from_freelist > 1.82% [k] ktime_get > 1.73% [k] process_backlog > 1.27% [k] nf_conntrack_tcp_packet > 1.21% [k] enqueue_to_backlog > 1.17% [k] __tcp_transmit_skb > 1.12% [k] ip_finish_output2 > 1.11% [k] tcp_sendmsg_locked > 1.06% [k] __free_pages_ok > 1.05% [k] tcp_ack > 0.99% [k] __netif_receive_skb_core > 0.88% [k] __nf_conntrack_find_get > 0.71% [k] nf_conntrack_in > 0.61% [k] kmem_cache_free > 0.59% [k] kfree > 0.57% [k] __alloc_pages_nodemask > > Thanks Juilen and Wei, > Tested-by: Shijith Thotton <sthotton@marvell.com> > Thanks for testing this and confirming the improvement. I'm gonna post a new version soon. Is it alright if I apply this tag for the other arm64 patches that enable the use of Pseudo-NMI for the PMU? (I'm mostly thinking of patches 8 and 9 since there haven't been comments on them and won't have behavioural changes in the next version). Thanks, -- Julien Thierry IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
Hi Julien, On 7/16/19 3:54 AM, Julien Thierry wrote: > On 16/07/2019 11:33, Shijith Thotton wrote: >> On 7/10/19 4:01 AM, Julien Thierry wrote: >>> On 10/07/2019 11:57, Steven Price wrote: >>>> On 08/07/2019 15:32, Julien Thierry wrote: >>>>> From: Mark Rutland <mark.rutland@arm.com> >>>>> >>>>> Currently we access the counter registers and their respective type >>>>> registers indirectly. This requires us to write to PMSELR, issue an ISB, >>>>> then access the relevant PMXEV* registers. >>>>> >>>>> This is unfortunate, because: >>>>> >>>>> * Under virtualization, accessing one registers requires two traps to >>>>> the hypervisor, even though we could access the register directly with >>>>> a single trap. >>>>> >>>>> * We have to issue an ISB which we could otherwise avoid the cost of. >>>>> >>>>> * When we use NMIs, the NMI handler will have to save/restore the select >>>>> register in case the code it preempted was attempting to access a >>>>> counter or its type register. >>>>> >>>>> We can avoid these issues by directly accessing the relevant registers. >>>>> This patch adds helpers to do so. >>>>> >>>>> Signed-off-by: Mark Rutland <mark.rutland@arm.com> >>>>> [Julien T.: Don't inline read/write functions to avoid big code-size >>>>> increase, remove unused read_pmevtypern function, >>>>> fix counter index issue.] >>>>> Signed-off-by: Julien Thierry <julien.thierry@arm.com> >>>>> Cc: Will Deacon <will.deacon@arm.com> >>>>> Cc: Peter Zijlstra <peterz@infradead.org> >>>>> Cc: Ingo Molnar <mingo@redhat.com> >>>>> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> >>>>> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> >>>>> Cc: Jiri Olsa <jolsa@redhat.com> >>>>> Cc: Namhyung Kim <namhyung@kernel.org> >>>>> Cc: Catalin Marinas <catalin.marinas@arm.com> >>>>> --- >>>>> arch/arm64/kernel/perf_event.c | 96 ++++++++++++++++++++++++++++++++++++------ >>>>> 1 file changed, 83 insertions(+), 13 deletions(-) >>>>> >>>>> diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c >>>>> index 96e90e2..7759f8a 100644 >>>>> --- a/arch/arm64/kernel/perf_event.c >>>>> +++ b/arch/arm64/kernel/perf_event.c >>>>> @@ -369,6 +369,77 @@ static inline bool armv8pmu_event_is_chained(struct perf_event *event) >>>>> #define ARMV8_IDX_TO_COUNTER(x) \ >>>>> (((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK) >>>>> >>>>> +/* >>>>> + * This code is really good >>>>> + */ >>>>> + >>>>> +#define PMEVN_CASE(n, case_macro) \ >>>>> + case n: case_macro(n); break >>>>> + >>>>> +#define PMEVN_SWITCH(x, case_macro) \ >>>>> + do { \ >>>>> + switch (x) { \ >>>>> + PMEVN_CASE(0, case_macro); \ >>>>> + PMEVN_CASE(1, case_macro); \ >>>>> + PMEVN_CASE(2, case_macro); \ >>>>> + PMEVN_CASE(3, case_macro); \ >>>>> + PMEVN_CASE(4, case_macro); \ >>>>> + PMEVN_CASE(5, case_macro); \ >>>>> + PMEVN_CASE(6, case_macro); \ >>>>> + PMEVN_CASE(7, case_macro); \ >>>>> + PMEVN_CASE(8, case_macro); \ >>>>> + PMEVN_CASE(9, case_macro); \ >>>>> + PMEVN_CASE(10, case_macro); \ >>>>> + PMEVN_CASE(11, case_macro); \ >>>>> + PMEVN_CASE(12, case_macro); \ >>>>> + PMEVN_CASE(13, case_macro); \ >>>>> + PMEVN_CASE(14, case_macro); \ >>>>> + PMEVN_CASE(15, case_macro); \ >>>>> + PMEVN_CASE(16, case_macro); \ >>>>> + PMEVN_CASE(17, case_macro); \ >>>>> + PMEVN_CASE(18, case_macro); \ >>>>> + PMEVN_CASE(19, case_macro); \ >>>> >>>> Is 20 missing on purpose? >>>> >>> >>> That would have been fun to debug. Well spotted! >>> >>> I'll fix it in the next version. >>> >>> Thanks, >>> >> >> Tried perf top/record on this patch and are working fine. >> Output of perf record on "iperf -c 127.0.0.1 -t 30" is below. (single core) >> >> With Pseudo-NMI: >> 20.35% [k] lock_acquire >> 16.95% [k] lock_release >> 11.02% [k] __arch_copy_from_user >> 7.78% [k] lock_is_held_type >> 2.12% [k] ipt_do_table >> 1.34% [k] kmem_cache_free >> 1.25% [k] _raw_spin_unlock_irqrestore >> 1.21% [k] __nf_conntrack_find_get >> 1.06% [k] get_page_from_freelist >> 1.04% [k] ktime_get >> 1.03% [k] kfree >> 1.00% [k] nf_conntrack_tcp_packet >> 0.96% [k] tcp_sendmsg_locked >> 0.87% [k] __softirqentry_text_start >> 0.87% [k] process_backlog >> 0.76% [k] __local_bh_enable_ip >> 0.75% [k] ip_finish_output2 >> 0.68% [k] __tcp_transmit_skb >> 0.62% [k] enqueue_to_backlog >> 0.60% [k] __lock_acquire.isra.17 >> 0.58% [k] __free_pages_ok >> 0.54% [k] nf_conntrack_in >> >> With IRQ: >> 16.49% [k] __arch_copy_from_user >> 12.38% [k] _raw_spin_unlock_irqrestore >> 9.41% [k] lock_acquire >> 6.92% [k] lock_release >> 3.71% [k] lock_is_held_type >> 2.80% [k] ipt_do_table >> 2.06% [k] get_page_from_freelist >> 1.82% [k] ktime_get >> 1.73% [k] process_backlog >> 1.27% [k] nf_conntrack_tcp_packet >> 1.21% [k] enqueue_to_backlog >> 1.17% [k] __tcp_transmit_skb >> 1.12% [k] ip_finish_output2 >> 1.11% [k] tcp_sendmsg_locked >> 1.06% [k] __free_pages_ok >> 1.05% [k] tcp_ack >> 0.99% [k] __netif_receive_skb_core >> 0.88% [k] __nf_conntrack_find_get >> 0.71% [k] nf_conntrack_in >> 0.61% [k] kmem_cache_free >> 0.59% [k] kfree >> 0.57% [k] __alloc_pages_nodemask >> >> Thanks Juilen and Wei, >> Tested-by: Shijith Thotton <sthotton@marvell.com> >> > > Thanks for testing this and confirming the improvement. > > I'm gonna post a new version soon. Is it alright if I apply this tag for > the other arm64 patches that enable the use of Pseudo-NMI for the PMU? > (I'm mostly thinking of patches 8 and 9 since there haven't been > comments on them and won't have behavioural changes in the next version). > Yes please. Thanks, Shijith
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 96e90e2..7759f8a 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -369,6 +369,77 @@ static inline bool armv8pmu_event_is_chained(struct perf_event *event) #define ARMV8_IDX_TO_COUNTER(x) \ (((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK) +/* + * This code is really good + */ + +#define PMEVN_CASE(n, case_macro) \ + case n: case_macro(n); break + +#define PMEVN_SWITCH(x, case_macro) \ + do { \ + switch (x) { \ + PMEVN_CASE(0, case_macro); \ + PMEVN_CASE(1, case_macro); \ + PMEVN_CASE(2, case_macro); \ + PMEVN_CASE(3, case_macro); \ + PMEVN_CASE(4, case_macro); \ + PMEVN_CASE(5, case_macro); \ + PMEVN_CASE(6, case_macro); \ + PMEVN_CASE(7, case_macro); \ + PMEVN_CASE(8, case_macro); \ + PMEVN_CASE(9, case_macro); \ + PMEVN_CASE(10, case_macro); \ + PMEVN_CASE(11, case_macro); \ + PMEVN_CASE(12, case_macro); \ + PMEVN_CASE(13, case_macro); \ + PMEVN_CASE(14, case_macro); \ + PMEVN_CASE(15, case_macro); \ + PMEVN_CASE(16, case_macro); \ + PMEVN_CASE(17, case_macro); \ + PMEVN_CASE(18, case_macro); \ + PMEVN_CASE(19, case_macro); \ + PMEVN_CASE(21, case_macro); \ + PMEVN_CASE(22, case_macro); \ + PMEVN_CASE(23, case_macro); \ + PMEVN_CASE(24, case_macro); \ + PMEVN_CASE(25, case_macro); \ + PMEVN_CASE(26, case_macro); \ + PMEVN_CASE(27, case_macro); \ + PMEVN_CASE(28, case_macro); \ + PMEVN_CASE(29, case_macro); \ + PMEVN_CASE(30, case_macro); \ + default: WARN(1, "Inavlid PMEV* index"); \ + } \ + } while (0) + +#define RETURN_READ_PMEVCNTRN(n) \ + return read_sysreg(pmevcntr##n##_el0); +static unsigned long read_pmevcntrn(int n) +{ + PMEVN_SWITCH(n, RETURN_READ_PMEVCNTRN); + return 0; +} +#undef RETURN_READ_PMEVCNTRN + +#define WRITE_PMEVCNTRN(n) \ + write_sysreg(val, pmevcntr##n##_el0); +static void write_pmevcntrn(int n, unsigned long val) +{ + PMEVN_SWITCH(n, WRITE_PMEVCNTRN); +} +#undef WRITE_PMEVCNTRN + +#define WRITE_PMEVTYPERN(n) \ + write_sysreg(val, pmevtyper##n##_el0); +static void write_pmevtypern(int n, unsigned long val) +{ + PMEVN_SWITCH(n, WRITE_PMEVTYPERN); +} +#undef WRITE_PMEVTYPERN + +#undef PMEVN_SWITCH + static inline u32 armv8pmu_pmcr_read(void) { return read_sysreg(pmcr_el0); @@ -397,17 +468,11 @@ static inline int armv8pmu_counter_has_overflowed(u32 pmnc, int idx) return pmnc & BIT(ARMV8_IDX_TO_COUNTER(idx)); } -static inline void armv8pmu_select_counter(int idx) +static inline u32 armv8pmu_read_evcntr(int idx) { u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(counter, pmselr_el0); - isb(); -} -static inline u32 armv8pmu_read_evcntr(int idx) -{ - armv8pmu_select_counter(idx); - return read_sysreg(pmxevcntr_el0); + return read_pmevcntrn(counter); } static inline u64 armv8pmu_read_hw_counter(struct perf_event *event) @@ -441,8 +506,9 @@ static u64 armv8pmu_read_counter(struct perf_event *event) static inline void armv8pmu_write_evcntr(int idx, u32 value) { - armv8pmu_select_counter(idx); - write_sysreg(value, pmxevcntr_el0); + u32 counter = ARMV8_IDX_TO_COUNTER(idx); + + write_pmevcntrn(counter, value); } static inline void armv8pmu_write_hw_counter(struct perf_event *event, @@ -483,9 +549,10 @@ static void armv8pmu_write_counter(struct perf_event *event, u64 value) static inline void armv8pmu_write_evtype(int idx, u32 val) { - armv8pmu_select_counter(idx); + u32 counter = ARMV8_IDX_TO_COUNTER(idx); + val &= ARMV8_PMU_EVTYPE_MASK; - write_sysreg(val, pmxevtyper_el0); + write_pmevtypern(counter, val); } static inline void armv8pmu_write_event_type(struct perf_event *event) @@ -505,7 +572,10 @@ static inline void armv8pmu_write_event_type(struct perf_event *event) armv8pmu_write_evtype(idx - 1, hwc->config_base); armv8pmu_write_evtype(idx, chain_evt); } else { - armv8pmu_write_evtype(idx, hwc->config_base); + if (idx == ARMV8_IDX_CYCLE_COUNTER) + write_sysreg(hwc->config_base, pmccfiltr_el0); + else + armv8pmu_write_evtype(idx, hwc->config_base); } }