
[V16,3/8] drivers: perf: arm_pmuv3: Enable branch stack sampling framework

Message ID 20240125094119.2542332-4-anshuman.khandual@arm.com (mailing list archive)
State New, archived
Series arm64/perf: Enable branch stack sampling

Commit Message

Anshuman Khandual Jan. 25, 2024, 9:41 a.m. UTC
Branch stack sampling support, i.e. capturing branch records during execution
in core perf, rides along with normal HW events being scheduled on the PMU.
This prepares the ARMV8 PMU framework for branch stack support on relevant
PMUs with the required HW implementation.

ARMV8 PMU hardware support for branch stack sampling is indicated via a new
feature flag called 'has_branch_stack' that can be ascertained via probing.
This modifies the current gate in armpmu_event_init() which blocks branch
stack sampling based perf events unconditionally, and instead allows such perf
events to be initialized on PMU hardware with the required support.

Branch stack sampling is enabled and disabled along with regular PMU events.
This adds the required function callbacks, in armv8pmu_branch_xxx() format, to
drive the PMU branch stack hardware when supported. This also adds fallback
stub definitions for these callbacks for PMUs which do not have the required
support.

If a task gets scheduled out, the current branch records get saved in the
task's context data, which can be later used to fill in the records upon an
event overflow. Hence, we enable PERF_ATTACH_TASK_DATA (event->attach_state
based flag) for branch stack requesting perf events. But this also requires
adding support for pmu::sched_task() callback to arm_pmu.

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
Changes in V16:

- Renamed arm_brbe.h as arm_pmuv3_branch.h
- Updated perf_sample_save_brstack()'s new argument requirements with NULL

 drivers/perf/arm_pmu.c          |  57 ++++++++++++-
 drivers/perf/arm_pmuv3.c        | 141 +++++++++++++++++++++++++++++++-
 drivers/perf/arm_pmuv3_branch.h |  50 +++++++++++
 include/linux/perf/arm_pmu.h    |  29 ++++++-
 include/linux/perf/arm_pmuv3.h  |   1 -
 5 files changed, 273 insertions(+), 5 deletions(-)
 create mode 100644 drivers/perf/arm_pmuv3_branch.h

Comments

Suzuki K Poulose Jan. 25, 2024, 1:44 p.m. UTC | #1
On 25/01/2024 09:41, Anshuman Khandual wrote:
> Branch stack sampling support i.e capturing branch records during execution
> in core perf, rides along with normal HW events being scheduled on the PMU.
> This prepares ARMV8 PMU framework for branch stack support on relevant PMUs
> with required HW implementation.
> 
> ARMV8 PMU hardware support for branch stack sampling is indicated via a new
> feature flag called 'has_branch_stack' that can be ascertained via probing.
> This modifies current gate in armpmu_event_init() which blocks branch stack
> sampling based perf events unconditionally. Instead allows such perf events
> getting initialized on supporting PMU hardware.
> 
> Branch stack sampling is enabled and disabled along with regular PMU events
> . This adds required function callbacks in armv8pmu_branch_xxx() format, to
> drive the PMU branch stack hardware when supported. This also adds fallback
> stub definitions for these callbacks for PMUs which would not have required
> support.
> 
> If a task gets scheduled out, the current branch records get saved in the
> task's context data, which can be later used to fill in the records upon an
> event overflow. Hence, we enable PERF_ATTACH_TASK_DATA (event->attach_state
> based flag) for branch stack requesting perf events. But this also requires
> adding support for pmu::sched_task() callback to arm_pmu.
> 
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> ---
> Changes in V16:
> 
> - Renamed arm_brbe.h as arm_pmuv3_branch.h
> - Updated perf_sample_save_brstack()'s new argument requirements with NULL
> 
>   drivers/perf/arm_pmu.c          |  57 ++++++++++++-
>   drivers/perf/arm_pmuv3.c        | 141 +++++++++++++++++++++++++++++++-
>   drivers/perf/arm_pmuv3_branch.h |  50 +++++++++++
>   include/linux/perf/arm_pmu.h    |  29 ++++++-
>   include/linux/perf/arm_pmuv3.h  |   1 -
>   5 files changed, 273 insertions(+), 5 deletions(-)
>   create mode 100644 drivers/perf/arm_pmuv3_branch.h
> 
> diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
> index 8458fe2cebb4..16f488ae7747 100644
> --- a/drivers/perf/arm_pmu.c
> +++ b/drivers/perf/arm_pmu.c
> @@ -317,6 +317,15 @@ armpmu_del(struct perf_event *event, int flags)
>   	struct hw_perf_event *hwc = &event->hw;
>   	int idx = hwc->idx;
>   
> +	if (has_branch_stack(event)) {
> +		WARN_ON_ONCE(!hw_events->brbe_users);
> +		hw_events->brbe_users--;

^^ Should we do this only if the event matches the sample type ? Put the 
other way around, what does brbe_users track ? The number of events that
"share" the BRBE instance ? Or the number of active events that
has_branch_stack() ?

> +		if (!hw_events->brbe_users) {
> +			hw_events->brbe_context = NULL;
> +			hw_events->brbe_sample_type = 0;
> +		}
> +	}
> +
>   	armpmu_stop(event, PERF_EF_UPDATE);
>   	hw_events->events[idx] = NULL;
>   	armpmu->clear_event_idx(hw_events, event);
> @@ -333,6 +342,38 @@ armpmu_add(struct perf_event *event, int flags)
>   	struct hw_perf_event *hwc = &event->hw;
>   	int idx;
>   
> +	if (has_branch_stack(event)) {
> +		/*
> +		 * Reset branch records buffer if a new CPU bound event
> +		 * gets scheduled on a PMU. Otherwise existing branch
> +		 * records present in the buffer might just leak into
> +		 * such events.
> +		 *
> +		 * Also reset current 'hw_events->brbe_context' because
> +		 * any previous task bound event now would have lost an
> +		 * opportunity for continuous branch records.
> +		 */
> +		if (!event->ctx->task) {
> +			hw_events->brbe_context = NULL;
> +			if (armpmu->branch_reset)
> +				armpmu->branch_reset();
> +		}
> +
> +		/*
> +		 * Reset branch records buffer if a new task event gets
> +		 * scheduled on a PMU which might have existing records.
> +		 * Otherwise older branch records present in the buffer
> +		 * might leak into the new task event.
> +		 */
> +		if (event->ctx->task && hw_events->brbe_context != event->ctx) {
> +			hw_events->brbe_context = event->ctx;
> +			if (armpmu->branch_reset)
> +				armpmu->branch_reset();
> +		}
> +		hw_events->brbe_users++;
> +		hw_events->brbe_sample_type = event->attr.branch_sample_type;
> +	}
> +
>   	/* An event following a process won't be stopped earlier */
>   	if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
>   		return -ENOENT;
> @@ -511,13 +552,24 @@ static int armpmu_event_init(struct perf_event *event)
>   		!cpumask_test_cpu(event->cpu, &armpmu->supported_cpus))
>   		return -ENOENT;
>   
> -	/* does not support taken branch sampling */
> -	if (has_branch_stack(event))
> +	/*
> +	 * Branch stack sampling events are allowed
> +	 * only on PMU which has required support.
> +	 */
> +	if (has_branch_stack(event) && !armpmu->has_branch_stack)
>   		return -EOPNOTSUPP;
>   
>   	return __hw_perf_event_init(event);
>   }
>   
> +static void armpmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
> +{
> +	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
> +
> +	if (armpmu->sched_task)
> +		armpmu->sched_task(pmu_ctx, sched_in);
> +}
> +
>   static void armpmu_enable(struct pmu *pmu)
>   {
>   	struct arm_pmu *armpmu = to_arm_pmu(pmu);
> @@ -864,6 +916,7 @@ struct arm_pmu *armpmu_alloc(void)
>   	}
>   
>   	pmu->pmu = (struct pmu) {
> +		.sched_task	= armpmu_sched_task,
>   		.pmu_enable	= armpmu_enable,
>   		.pmu_disable	= armpmu_disable,
>   		.event_init	= armpmu_event_init,
> diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
> index 23fa6c5da82c..9e17764a0929 100644
> --- a/drivers/perf/arm_pmuv3.c
> +++ b/drivers/perf/arm_pmuv3.c
> @@ -26,6 +26,7 @@
>   #include <linux/nmi.h>
>   
>   #include <asm/arm_pmuv3.h>
> +#include "arm_pmuv3_branch.h"
>   
>   /* ARMv8 Cortex-A53 specific event types. */
>   #define ARMV8_A53_PERFCTR_PREF_LINEFILL				0xC2
> @@ -829,14 +830,56 @@ static void armv8pmu_start(struct arm_pmu *cpu_pmu)
>   	armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);
>   
>   	kvm_vcpu_pmu_resync_el0();
> +	if (cpu_pmu->has_branch_stack)
> +		armv8pmu_branch_enable(cpu_pmu);

Is there a reason why we do this after kvm_vcpu_pmu_resync_el0() ?
Ideally, we should start counting as soon as the PMU is on ?

>   }
>   
>   static void armv8pmu_stop(struct arm_pmu *cpu_pmu)
>   {
> +	if (cpu_pmu->has_branch_stack)
> +		armv8pmu_branch_disable();
> +
>   	/* Disable all counters */
>   	armv8pmu_pmcr_write(armv8pmu_pmcr_read() & ~ARMV8_PMU_PMCR_E);
>   }
>   
> +static void read_branch_records(struct pmu_hw_events *cpuc,
> +				struct perf_event *event,
> +				struct perf_sample_data *data,
> +				bool *branch_captured)
> +{
> +	/*
> +	 * CPU specific branch records buffer must have been allocated already
> +	 * for the hardware records to be captured and processed further.
> +	 */
> +	if (WARN_ON(!cpuc->branches))
> +		return;
> +
> +	/*
> +	 * Overflowed event's branch_sample_type does not match the configured
> +	 * branch filters in the BRBE HW. So the captured branch records here
> +	 * cannot be co-related to the overflowed event. Report to the user as
> +	 * if no branch records have been captured, and flush branch records.
> +	 * The same scenario is applicable when the current task context does
> +	 * not match with overflown event.
> +	 */
> +	if ((cpuc->brbe_sample_type != event->attr.branch_sample_type) ||
> +	    (event->ctx->task && cpuc->brbe_context != event->ctx))
> +		return;
> +
> +	/*
> +	 * Read the branch records from the hardware once after the PMU IRQ
> +	 * has been triggered but subsequently same records can be used for
> +	 * other events that might have been overflowed simultaneously thus
> +	 * saving much CPU cycles.
> +	 */
> +	if (!*branch_captured) {
> +		armv8pmu_branch_read(cpuc, event);
> +		*branch_captured = true;
> +	}
> +	perf_sample_save_brstack(data, event, &cpuc->branches->branch_stack, NULL);
> +}
> +
>   static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>   {
>   	u32 pmovsr;
> @@ -844,6 +887,7 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>   	struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
>   	struct pt_regs *regs;
>   	int idx;
> +	bool branch_captured = false;
>   
>   	/*
>   	 * Get and reset the IRQ flags
> @@ -887,6 +931,13 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>   		if (!armpmu_event_set_period(event))
>   			continue;
>   
> +		/*
> +		 * PMU IRQ should remain asserted until all branch records
> +		 * are captured and processed into struct perf_sample_data.
> +		 */
> +		if (has_branch_stack(event) && cpu_pmu->has_branch_stack)

nit: Do we really need the cpu_pmu->has_branch_stack check ? The event
wouldn't reach here if the PMU doesn't support it ?

> +			read_branch_records(cpuc, event, &data, &branch_captured);
> +
>   		/*
>   		 * Perf event overflow will queue the processing of the event as
>   		 * an irq_work which will be taken care of in the handling of
> @@ -896,6 +947,8 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>   			cpu_pmu->disable(event);
>   	}
>   	armv8pmu_start(cpu_pmu);
> +	if (cpu_pmu->has_branch_stack)
> +		armv8pmu_branch_reset();
>   
>   	return IRQ_HANDLED;
>   }
> @@ -985,6 +1038,24 @@ static int armv8pmu_user_event_idx(struct perf_event *event)
>   	return event->hw.idx;
>   }
>   
> +static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
> +{
> +	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
> +	void *task_ctx = pmu_ctx->task_ctx_data;
> +
> +	if (armpmu->has_branch_stack) {
> +		/* Save branch records in task_ctx on sched out */
> +		if (task_ctx && !sched_in) {
> +			armv8pmu_branch_save(armpmu, task_ctx);
> +			return;
> +		}
> +
> +		/* Reset branch records on sched in */
> +		if (sched_in)
> +			armv8pmu_branch_reset();
> +	}
> +}
> +
>   /*
>    * Add an event filter to a given event.
>    */
> @@ -1077,6 +1148,9 @@ static void armv8pmu_reset(void *info)
>   		pmcr |= ARMV8_PMU_PMCR_LP;
>   
>   	armv8pmu_pmcr_write(pmcr);
> +
> +	if (cpu_pmu->has_branch_stack)
> +		armv8pmu_branch_reset();
>   }
>   
>   static int __armv8_pmuv3_map_event_id(struct arm_pmu *armpmu,
> @@ -1114,6 +1188,20 @@ static int __armv8_pmuv3_map_event(struct perf_event *event,
>   
>   	hw_event_id = __armv8_pmuv3_map_event_id(armpmu, event);
>   
> +	if (has_branch_stack(event)) {
> +		if (!armv8pmu_branch_attr_valid(event))
> +			return -EOPNOTSUPP;
> +
> +		/*
> +		 * If a task gets scheduled out, the current branch records
> +		 * get saved in the task's context data, which can be later
> +		 * used to fill in the records upon an event overflow. Let's
> +		 * enable PERF_ATTACH_TASK_DATA in 'event->attach_state' for
> +		 * all branch stack sampling perf events.
> +		 */
> +		event->attach_state |= PERF_ATTACH_TASK_DATA;
> +	}
> +
>   	/*
>   	 * CHAIN events only work when paired with an adjacent counter, and it
>   	 * never makes sense for a user to open one in isolation, as they'll be
> @@ -1229,6 +1317,41 @@ static void __armv8pmu_probe_pmu(void *info)
>   		cpu_pmu->reg_pmmir = read_pmmir();
>   	else
>   		cpu_pmu->reg_pmmir = 0;
> +
> +	/*
> +	 * BRBE is being probed on a single cpu for a
> +	 * given PMU. The remaining cpus, are assumed
> +	 * to have the exact same BRBE implementation.
> +	 */
> +	armv8pmu_branch_probe(cpu_pmu);
> +}
> +
> +static int branch_records_alloc(struct arm_pmu *armpmu)
> +{
> +	struct branch_records __percpu *records;
> +	int cpu;
> +
> +	records = alloc_percpu_gfp(struct branch_records, GFP_KERNEL);
> +	if (!records)
> +		return -ENOMEM;
> +
> +	/*
> +	 * percpu memory allocated for 'records' gets completely consumed
> +	 * here, and never required to be freed up later. So permanently
> +	 * losing access to this anchor i.e 'records' is acceptable.
> +	 *
> +	 * Otherwise this allocation handle would have to be saved up for
> +	 * free_percpu() release later if required.
> +	 */
> +	for_each_possible_cpu(cpu) {
> +		struct pmu_hw_events *events_cpu;
> +		struct branch_records *records_cpu;
> +
> +		events_cpu = per_cpu_ptr(armpmu->hw_events, cpu);
> +		records_cpu = per_cpu_ptr(records, cpu);
> +		events_cpu->branches = records_cpu;
> +	}
> +	return 0;
>   }
>   
>   static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
> @@ -1245,7 +1368,21 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>   	if (ret)
>   		return ret;
>   
> -	return probe.present ? 0 : -ENODEV;
> +	if (!probe.present)
> +		return -ENODEV;
> +
> +	if (cpu_pmu->has_branch_stack) {
> +		ret = armv8pmu_task_ctx_cache_alloc(cpu_pmu);
> +		if (ret)
> +			return ret;
> +
> +		ret = branch_records_alloc(cpu_pmu);
> +		if (ret) {
> +			armv8pmu_task_ctx_cache_free(cpu_pmu);
> +			return ret;
> +		}
> +	}
> +	return 0;
>   }
>   
>   static void armv8pmu_disable_user_access_ipi(void *unused)
> @@ -1304,6 +1441,8 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
>   	cpu_pmu->set_event_filter	= armv8pmu_set_event_filter;
>   
>   	cpu_pmu->pmu.event_idx		= armv8pmu_user_event_idx;
> +	cpu_pmu->sched_task		= armv8pmu_sched_task;
> +	cpu_pmu->branch_reset		= armv8pmu_branch_reset;
>   
>   	cpu_pmu->name			= name;
>   	cpu_pmu->map_event		= map_event;
> diff --git a/drivers/perf/arm_pmuv3_branch.h b/drivers/perf/arm_pmuv3_branch.h
> new file mode 100644
> index 000000000000..609e4d4ccac6
> --- /dev/null
> +++ b/drivers/perf/arm_pmuv3_branch.h
> @@ -0,0 +1,50 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Branch Record Buffer Extension Helpers.
> + *
> + * Copyright (C) 2022-2023 ARM Limited
> + *
> + * Author: Anshuman Khandual <anshuman.khandual@arm.com>
> + */
> +#include <linux/perf/arm_pmu.h>
> +
> +static inline void armv8pmu_branch_reset(void)
> +{
> +}
> +
> +static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu)
> +{
> +}
> +
> +static inline bool armv8pmu_branch_attr_valid(struct perf_event *event)
> +{
> +	WARN_ON_ONCE(!has_branch_stack(event));
> +	return false;
> +}
> +
> +static inline void armv8pmu_branch_enable(struct arm_pmu *arm_pmu)
> +{
> +}
> +
> +static inline void armv8pmu_branch_disable(void)
> +{
> +}
> +
> +static inline void armv8pmu_branch_read(struct pmu_hw_events *cpuc,
> +					struct perf_event *event)
> +{
> +	WARN_ON_ONCE(!has_branch_stack(event));
> +}
> +
> +static inline void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx)
> +{
> +}
> +
> +static inline int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu)
> +{
> +	return 0;
> +}
> +
> +static inline void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu)
> +{
> +}
> diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
> index b3b34f6670cf..8cfcc735c0f7 100644
> --- a/include/linux/perf/arm_pmu.h
> +++ b/include/linux/perf/arm_pmu.h
> @@ -46,6 +46,18 @@ static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_63BIT) == ARMPMU_EVT_63BIT);
>   	},								\
>   }
>   
> +/*
> + * Maximum branch record entries which could be processed
> + * for core perf branch stack sampling support, regardless
> + * of the hardware support available on a given ARM PMU.
> + */
> +#define MAX_BRANCH_RECORDS 64
> +
> +struct branch_records {
> +	struct perf_branch_stack	branch_stack;
> +	struct perf_branch_entry	branch_entries[MAX_BRANCH_RECORDS];
> +};
> +
>   /* The events for a given PMU register set. */
>   struct pmu_hw_events {
>   	/*
> @@ -66,6 +78,17 @@ struct pmu_hw_events {
>   	struct arm_pmu		*percpu_pmu;
>   
>   	int irq;
> +
> +	struct branch_records	*branches;
> +
> +	/* Active context for task events */
> +	void			*brbe_context;
> +
> +	/* Active events requesting branch records */

Please see my comment above.

> +	unsigned int		brbe_users;
> +
> +	/* Active branch sample type filters */
> +	unsigned long		brbe_sample_type;
>   };
>   
>   enum armpmu_attr_groups {
> @@ -96,8 +119,12 @@ struct arm_pmu {
>   	void		(*stop)(struct arm_pmu *);
>   	void		(*reset)(void *);
>   	int		(*map_event)(struct perf_event *event);
> +	void		(*sched_task)(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
> +	void		(*branch_reset)(void);
>   	int		num_events;
> -	bool		secure_access; /* 32-bit ARM only */
> +	unsigned int	secure_access	: 1, /* 32-bit ARM only */
> +			has_branch_stack: 1, /* 64-bit ARM only */
> +			reserved	: 30;
>   #define ARMV8_PMUV3_MAX_COMMON_EVENTS		0x40
>   	DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS);
>   #define ARMV8_PMUV3_EXT_COMMON_EVENT_BASE	0x4000
> diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
> index 46377e134d67..c3e7d2cfb737 100644
> --- a/include/linux/perf/arm_pmuv3.h
> +++ b/include/linux/perf/arm_pmuv3.h
> @@ -308,5 +308,4 @@
>   		default: WARN(1, "Invalid PMEV* index\n");	\
>   		}						\
>   	} while (0)
> -

nit: Unrelated change ?

Suzuki

>   #endif
Anshuman Khandual Jan. 29, 2024, 4:35 a.m. UTC | #2
On 1/25/24 19:14, Suzuki K Poulose wrote:
> On 25/01/2024 09:41, Anshuman Khandual wrote:
>> Branch stack sampling support i.e capturing branch records during execution
>> in core perf, rides along with normal HW events being scheduled on the PMU.
>> This prepares ARMV8 PMU framework for branch stack support on relevant PMUs
>> with required HW implementation.
>>
>> ARMV8 PMU hardware support for branch stack sampling is indicated via a new
>> feature flag called 'has_branch_stack' that can be ascertained via probing.
>> This modifies current gate in armpmu_event_init() which blocks branch stack
>> sampling based perf events unconditionally. Instead allows such perf events
>> getting initialized on supporting PMU hardware.
>>
>> Branch stack sampling is enabled and disabled along with regular PMU events
>> . This adds required function callbacks in armv8pmu_branch_xxx() format, to
>> drive the PMU branch stack hardware when supported. This also adds fallback
>> stub definitions for these callbacks for PMUs which would not have required
>> support.
>>
>> If a task gets scheduled out, the current branch records get saved in the
>> task's context data, which can be later used to fill in the records upon an
>> event overflow. Hence, we enable PERF_ATTACH_TASK_DATA (event->attach_state
>> based flag) for branch stack requesting perf events. But this also requires
>> adding support for pmu::sched_task() callback to arm_pmu.
>>
>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>> Cc: Will Deacon <will@kernel.org>
>> Cc: Mark Rutland <mark.rutland@arm.com>
>> Cc: linux-arm-kernel@lists.infradead.org
>> Cc: linux-kernel@vger.kernel.org
>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>> ---
>> Changes in V16:
>>
>> - Renamed arm_brbe.h as arm_pmuv3_branch.h
>> - Updated perf_sample_save_brstack()'s new argument requirements with NULL
>>
>>   drivers/perf/arm_pmu.c          |  57 ++++++++++++-
>>   drivers/perf/arm_pmuv3.c        | 141 +++++++++++++++++++++++++++++++-
>>   drivers/perf/arm_pmuv3_branch.h |  50 +++++++++++
>>   include/linux/perf/arm_pmu.h    |  29 ++++++-
>>   include/linux/perf/arm_pmuv3.h  |   1 -
>>   5 files changed, 273 insertions(+), 5 deletions(-)
>>   create mode 100644 drivers/perf/arm_pmuv3_branch.h
>>
>> diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
>> index 8458fe2cebb4..16f488ae7747 100644
>> --- a/drivers/perf/arm_pmu.c
>> +++ b/drivers/perf/arm_pmu.c
>> @@ -317,6 +317,15 @@ armpmu_del(struct perf_event *event, int flags)
>>       struct hw_perf_event *hwc = &event->hw;
>>       int idx = hwc->idx;
>>   +    if (has_branch_stack(event)) {
>> +        WARN_ON_ONCE(!hw_events->brbe_users);
>> +        hw_events->brbe_users--;
> 
> ^^ Should we do this only if the event matches the sample type ? Put the other way around, what does brbe_users track ? The number of events that
> "share" the BRBE instance ? Or the number of active events that
> has_branch_stack() ?

Active perf events with has_branch_stack(), irrespective of whether there is
a branch_sample_type match or not, i.e. irrespective of whether their branch
records will actually be consumed or not.

> 
>> +        if (!hw_events->brbe_users) {
>> +            hw_events->brbe_context = NULL;
>> +            hw_events->brbe_sample_type = 0;
>> +        }
>> +    }
>> +
>>       armpmu_stop(event, PERF_EF_UPDATE);
>>       hw_events->events[idx] = NULL;
>>       armpmu->clear_event_idx(hw_events, event);
>> @@ -333,6 +342,38 @@ armpmu_add(struct perf_event *event, int flags)
>>       struct hw_perf_event *hwc = &event->hw;
>>       int idx;
>>   +    if (has_branch_stack(event)) {
>> +        /*
>> +         * Reset branch records buffer if a new CPU bound event
>> +         * gets scheduled on a PMU. Otherwise existing branch
>> +         * records present in the buffer might just leak into
>> +         * such events.
>> +         *
>> +         * Also reset current 'hw_events->brbe_context' because
>> +         * any previous task bound event now would have lost an
>> +         * opportunity for continuous branch records.
>> +         */
>> +        if (!event->ctx->task) {
>> +            hw_events->brbe_context = NULL;
>> +            if (armpmu->branch_reset)
>> +                armpmu->branch_reset();
>> +        }
>> +
>> +        /*
>> +         * Reset branch records buffer if a new task event gets
>> +         * scheduled on a PMU which might have existing records.
>> +         * Otherwise older branch records present in the buffer
>> +         * might leak into the new task event.
>> +         */
>> +        if (event->ctx->task && hw_events->brbe_context != event->ctx) {
>> +            hw_events->brbe_context = event->ctx;
>> +            if (armpmu->branch_reset)
>> +                armpmu->branch_reset();
>> +        }
>> +        hw_events->brbe_users++;
>> +        hw_events->brbe_sample_type = event->attr.branch_sample_type;
>> +    }
>> +
>>       /* An event following a process won't be stopped earlier */
>>       if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
>>           return -ENOENT;
>> @@ -511,13 +552,24 @@ static int armpmu_event_init(struct perf_event *event)
>>           !cpumask_test_cpu(event->cpu, &armpmu->supported_cpus))
>>           return -ENOENT;
>>   -    /* does not support taken branch sampling */
>> -    if (has_branch_stack(event))
>> +    /*
>> +     * Branch stack sampling events are allowed
>> +     * only on PMU which has required support.
>> +     */
>> +    if (has_branch_stack(event) && !armpmu->has_branch_stack)
>>           return -EOPNOTSUPP;
>>         return __hw_perf_event_init(event);
>>   }
>>   +static void armpmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
>> +{
>> +    struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
>> +
>> +    if (armpmu->sched_task)
>> +        armpmu->sched_task(pmu_ctx, sched_in);
>> +}
>> +
>>   static void armpmu_enable(struct pmu *pmu)
>>   {
>>       struct arm_pmu *armpmu = to_arm_pmu(pmu);
>> @@ -864,6 +916,7 @@ struct arm_pmu *armpmu_alloc(void)
>>       }
>>         pmu->pmu = (struct pmu) {
>> +        .sched_task    = armpmu_sched_task,
>>           .pmu_enable    = armpmu_enable,
>>           .pmu_disable    = armpmu_disable,
>>           .event_init    = armpmu_event_init,
>> diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
>> index 23fa6c5da82c..9e17764a0929 100644
>> --- a/drivers/perf/arm_pmuv3.c
>> +++ b/drivers/perf/arm_pmuv3.c
>> @@ -26,6 +26,7 @@
>>   #include <linux/nmi.h>
>>     #include <asm/arm_pmuv3.h>
>> +#include "arm_pmuv3_branch.h"
>>     /* ARMv8 Cortex-A53 specific event types. */
>>   #define ARMV8_A53_PERFCTR_PREF_LINEFILL                0xC2
>> @@ -829,14 +830,56 @@ static void armv8pmu_start(struct arm_pmu *cpu_pmu)
>>       armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);
>>         kvm_vcpu_pmu_resync_el0();
>> +    if (cpu_pmu->has_branch_stack)
>> +        armv8pmu_branch_enable(cpu_pmu);
> 
> Is there a reason why do this after kvm_vcpu_pmu_resync_el0() ?
> Ideally, we should get counting as soon as the PMU is on ?

But if the kernel is also being traced, branches from kvm_vcpu_pmu_resync_el0()
might get into the final BRBE samples as well. Placing branch_enable() last
helps avoid such situations. Although the same could also be argued about the
normal PMU event counters being enabled via armv8pmu_pmcr_write(), so
armv8pmu_branch_enable() could be moved before kvm_vcpu_pmu_resync_el0().
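A rough sketch of how the tail of armv8pmu_start() would then look (earlier
parts of the function elided):

static void armv8pmu_start(struct arm_pmu *cpu_pmu)
{
	/* ... user access handling elided ... */

	/* Enable all counters */
	armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);

	/*
	 * Enable BRBE together with the regular counters. As with the
	 * counters themselves, branches taken inside the resync call
	 * below could then show up in the captured records when the
	 * kernel itself is being traced.
	 */
	if (cpu_pmu->has_branch_stack)
		armv8pmu_branch_enable(cpu_pmu);

	kvm_vcpu_pmu_resync_el0();
}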

> 
>>   }
>>     static void armv8pmu_stop(struct arm_pmu *cpu_pmu)
>>   {
>> +    if (cpu_pmu->has_branch_stack)
>> +        armv8pmu_branch_disable();
>> +
>>       /* Disable all counters */
>>       armv8pmu_pmcr_write(armv8pmu_pmcr_read() & ~ARMV8_PMU_PMCR_E);
>>   }
>>   +static void read_branch_records(struct pmu_hw_events *cpuc,
>> +                struct perf_event *event,
>> +                struct perf_sample_data *data,
>> +                bool *branch_captured)
>> +{
>> +    /*
>> +     * CPU specific branch records buffer must have been allocated already
>> +     * for the hardware records to be captured and processed further.
>> +     */
>> +    if (WARN_ON(!cpuc->branches))
>> +        return;
>> +
>> +    /*
>> +     * Overflowed event's branch_sample_type does not match the configured
>> +     * branch filters in the BRBE HW. So the captured branch records here
>> +     * cannot be co-related to the overflowed event. Report to the user as
>> +     * if no branch records have been captured, and flush branch records.
>> +     * The same scenario is applicable when the current task context does
>> +     * not match with overflown event.
>> +     */
>> +    if ((cpuc->brbe_sample_type != event->attr.branch_sample_type) ||
>> +        (event->ctx->task && cpuc->brbe_context != event->ctx))
>> +        return;
>> +
>> +    /*
>> +     * Read the branch records from the hardware once after the PMU IRQ
>> +     * has been triggered but subsequently same records can be used for
>> +     * other events that might have been overflowed simultaneously thus
>> +     * saving much CPU cycles.
>> +     */
>> +    if (!*branch_captured) {
>> +        armv8pmu_branch_read(cpuc, event);
>> +        *branch_captured = true;
>> +    }
>> +    perf_sample_save_brstack(data, event, &cpuc->branches->branch_stack, NULL);
>> +}
>> +
>>   static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>   {
>>       u32 pmovsr;
>> @@ -844,6 +887,7 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>       struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
>>       struct pt_regs *regs;
>>       int idx;
>> +    bool branch_captured = false;
>>         /*
>>        * Get and reset the IRQ flags
>> @@ -887,6 +931,13 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>           if (!armpmu_event_set_period(event))
>>               continue;
>>   +        /*
>> +         * PMU IRQ should remain asserted until all branch records
>> +         * are captured and processed into struct perf_sample_data.
>> +         */
>> +        if (has_branch_stack(event) && cpu_pmu->has_branch_stack)
> 
> nit: Do we really need the cpu_pmu->has_branch_stack check ? The event
> wouldn't reach here if the PMU doesn't supported it ?

This is just an additional check - a has_branch_stack() based event could not
have reached here otherwise, i.e. without the PMU supporting branch records.

> 
>> +            read_branch_records(cpuc, event, &data, &branch_captured);
>> +
>>           /*
>>            * Perf event overflow will queue the processing of the event as
>>            * an irq_work which will be taken care of in the handling of
>> @@ -896,6 +947,8 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>               cpu_pmu->disable(event);
>>       }
>>       armv8pmu_start(cpu_pmu);
>> +    if (cpu_pmu->has_branch_stack)
>> +        armv8pmu_branch_reset();
>>         return IRQ_HANDLED;
>>   }
>> @@ -985,6 +1038,24 @@ static int armv8pmu_user_event_idx(struct perf_event *event)
>>       return event->hw.idx;
>>   }
>>   +static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
>> +{
>> +    struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
>> +    void *task_ctx = pmu_ctx->task_ctx_data;
>> +
>> +    if (armpmu->has_branch_stack) {
>> +        /* Save branch records in task_ctx on sched out */
>> +        if (task_ctx && !sched_in) {
>> +            armv8pmu_branch_save(armpmu, task_ctx);
>> +            return;
>> +        }
>> +
>> +        /* Reset branch records on sched in */
>> +        if (sched_in)
>> +            armv8pmu_branch_reset();
>> +    }
>> +}
>> +
>>   /*
>>    * Add an event filter to a given event.
>>    */
>> @@ -1077,6 +1148,9 @@ static void armv8pmu_reset(void *info)
>>           pmcr |= ARMV8_PMU_PMCR_LP;
>>         armv8pmu_pmcr_write(pmcr);
>> +
>> +    if (cpu_pmu->has_branch_stack)
>> +        armv8pmu_branch_reset();
>>   }
>>     static int __armv8_pmuv3_map_event_id(struct arm_pmu *armpmu,
>> @@ -1114,6 +1188,20 @@ static int __armv8_pmuv3_map_event(struct perf_event *event,
>>         hw_event_id = __armv8_pmuv3_map_event_id(armpmu, event);
>>   +    if (has_branch_stack(event)) {
>> +        if (!armv8pmu_branch_attr_valid(event))
>> +            return -EOPNOTSUPP;
>> +
>> +        /*
>> +         * If a task gets scheduled out, the current branch records
>> +         * get saved in the task's context data, which can be later
>> +         * used to fill in the records upon an event overflow. Let's
>> +         * enable PERF_ATTACH_TASK_DATA in 'event->attach_state' for
>> +         * all branch stack sampling perf events.
>> +         */
>> +        event->attach_state |= PERF_ATTACH_TASK_DATA;
>> +    }
>> +
>>       /*
>>        * CHAIN events only work when paired with an adjacent counter, and it
>>        * never makes sense for a user to open one in isolation, as they'll be
>> @@ -1229,6 +1317,41 @@ static void __armv8pmu_probe_pmu(void *info)
>>           cpu_pmu->reg_pmmir = read_pmmir();
>>       else
>>           cpu_pmu->reg_pmmir = 0;
>> +
>> +    /*
>> +     * BRBE is being probed on a single cpu for a
>> +     * given PMU. The remaining cpus, are assumed
>> +     * to have the exact same BRBE implementation.
>> +     */
>> +    armv8pmu_branch_probe(cpu_pmu);
>> +}
>> +
>> +static int branch_records_alloc(struct arm_pmu *armpmu)
>> +{
>> +    struct branch_records __percpu *records;
>> +    int cpu;
>> +
>> +    records = alloc_percpu_gfp(struct branch_records, GFP_KERNEL);
>> +    if (!records)
>> +        return -ENOMEM;
>> +
>> +    /*
>> +     * percpu memory allocated for 'records' gets completely consumed
>> +     * here, and never required to be freed up later. So permanently
>> +     * losing access to this anchor i.e 'records' is acceptable.
>> +     *
>> +     * Otherwise this allocation handle would have to be saved up for
>> +     * free_percpu() release later if required.
>> +     */
>> +    for_each_possible_cpu(cpu) {
>> +        struct pmu_hw_events *events_cpu;
>> +        struct branch_records *records_cpu;
>> +
>> +        events_cpu = per_cpu_ptr(armpmu->hw_events, cpu);
>> +        records_cpu = per_cpu_ptr(records, cpu);
>> +        events_cpu->branches = records_cpu;
>> +    }
>> +    return 0;
>>   }
>>     static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>> @@ -1245,7 +1368,21 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>>       if (ret)
>>           return ret;
>>   -    return probe.present ? 0 : -ENODEV;
>> +    if (!probe.present)
>> +        return -ENODEV;
>> +
>> +    if (cpu_pmu->has_branch_stack) {
>> +        ret = armv8pmu_task_ctx_cache_alloc(cpu_pmu);
>> +        if (ret)
>> +            return ret;
>> +
>> +        ret = branch_records_alloc(cpu_pmu);
>> +        if (ret) {
>> +            armv8pmu_task_ctx_cache_free(cpu_pmu);
>> +            return ret;
>> +        }
>> +    }
>> +    return 0;
>>   }
>>     static void armv8pmu_disable_user_access_ipi(void *unused)
>> @@ -1304,6 +1441,8 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
>>       cpu_pmu->set_event_filter    = armv8pmu_set_event_filter;
>>         cpu_pmu->pmu.event_idx        = armv8pmu_user_event_idx;
>> +    cpu_pmu->sched_task        = armv8pmu_sched_task;
>> +    cpu_pmu->branch_reset        = armv8pmu_branch_reset;
>>         cpu_pmu->name            = name;
>>       cpu_pmu->map_event        = map_event;
>> diff --git a/drivers/perf/arm_pmuv3_branch.h b/drivers/perf/arm_pmuv3_branch.h
>> new file mode 100644
>> index 000000000000..609e4d4ccac6
>> --- /dev/null
>> +++ b/drivers/perf/arm_pmuv3_branch.h
>> @@ -0,0 +1,50 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +/*
>> + * Branch Record Buffer Extension Helpers.
>> + *
>> + * Copyright (C) 2022-2023 ARM Limited
>> + *
>> + * Author: Anshuman Khandual <anshuman.khandual@arm.com>
>> + */
>> +#include <linux/perf/arm_pmu.h>
>> +
>> +static inline void armv8pmu_branch_reset(void)
>> +{
>> +}
>> +
>> +static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu)
>> +{
>> +}
>> +
>> +static inline bool armv8pmu_branch_attr_valid(struct perf_event *event)
>> +{
>> +    WARN_ON_ONCE(!has_branch_stack(event));
>> +    return false;
>> +}
>> +
>> +static inline void armv8pmu_branch_enable(struct arm_pmu *arm_pmu)
>> +{
>> +}
>> +
>> +static inline void armv8pmu_branch_disable(void)
>> +{
>> +}
>> +
>> +static inline void armv8pmu_branch_read(struct pmu_hw_events *cpuc,
>> +                    struct perf_event *event)
>> +{
>> +    WARN_ON_ONCE(!has_branch_stack(event));
>> +}
>> +
>> +static inline void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx)
>> +{
>> +}
>> +
>> +static inline int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu)
>> +{
>> +    return 0;
>> +}
>> +
>> +static inline void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu)
>> +{
>> +}
>> diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
>> index b3b34f6670cf..8cfcc735c0f7 100644
>> --- a/include/linux/perf/arm_pmu.h
>> +++ b/include/linux/perf/arm_pmu.h
>> @@ -46,6 +46,18 @@ static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_63BIT) == ARMPMU_EVT_63BIT);
>>       },                                \
>>   }
>>   +/*
>> + * Maximum branch record entries which could be processed
>> + * for core perf branch stack sampling support, regardless
>> + * of the hardware support available on a given ARM PMU.
>> + */
>> +#define MAX_BRANCH_RECORDS 64
>> +
>> +struct branch_records {
>> +    struct perf_branch_stack    branch_stack;
>> +    struct perf_branch_entry    branch_entries[MAX_BRANCH_RECORDS];
>> +};
>> +
>>   /* The events for a given PMU register set. */
>>   struct pmu_hw_events {
>>       /*
>> @@ -66,6 +78,17 @@ struct pmu_hw_events {
>>       struct arm_pmu        *percpu_pmu;
>>         int irq;
>> +
>> +    struct branch_records    *branches;
>> +
>> +    /* Active context for task events */
>> +    void            *brbe_context;
>> +
>> +    /* Active events requesting branch records */
> 
> Please see my comment above.

This is true - brbe_users tracks the number of active perf events requesting
branch records, irrespective of whether their branch_sample_type values match
each other or not.

> 
>> +    unsigned int        brbe_users;
>> +
>> +    /* Active branch sample type filters */
>> +    unsigned long        brbe_sample_type;
>>   };
>>     enum armpmu_attr_groups {
>> @@ -96,8 +119,12 @@ struct arm_pmu {
>>       void        (*stop)(struct arm_pmu *);
>>       void        (*reset)(void *);
>>       int        (*map_event)(struct perf_event *event);
>> +    void        (*sched_task)(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
>> +    void        (*branch_reset)(void);
>>       int        num_events;
>> -    bool        secure_access; /* 32-bit ARM only */
>> +    unsigned int    secure_access    : 1, /* 32-bit ARM only */
>> +            has_branch_stack: 1, /* 64-bit ARM only */
>> +            reserved    : 30;
>>   #define ARMV8_PMUV3_MAX_COMMON_EVENTS        0x40
>>       DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS);
>>   #define ARMV8_PMUV3_EXT_COMMON_EVENT_BASE    0x4000
>> diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
>> index 46377e134d67..c3e7d2cfb737 100644
>> --- a/include/linux/perf/arm_pmuv3.h
>> +++ b/include/linux/perf/arm_pmuv3.h
>> @@ -308,5 +308,4 @@
>>           default: WARN(1, "Invalid PMEV* index\n");    \
>>           }                        \
>>       } while (0)
>> -
> 
> nit: Unrelated change ?

Right, will fix and fold.
Mark Rutland Feb. 21, 2024, 5:25 p.m. UTC | #3
Hi Anshuman,

On Thu, Jan 25, 2024 at 03:11:14PM +0530, Anshuman Khandual wrote:
> Branch stack sampling support i.e capturing branch records during execution
> in core perf, rides along with normal HW events being scheduled on the PMU.
> This prepares ARMV8 PMU framework for branch stack support on relevant PMUs
> with required HW implementation.

Please can we start a bit more clearly, e.g.

| drivers: perf: arm_pmu: add infrastructure for branch stack sampling
| 
| In order to support the Branch Record Buffer Extension (BRBE), we need to
| extend the arm_pmu framework with some basic infrastructure for branch stack
| sampling which arm_pmu drivers can opt-in to using. Subsequent patches will
| use this to add support for BRBE in the PMUv3 driver.

> ARMV8 PMU hardware support for branch stack sampling is indicated via a new
> feature flag called 'has_branch_stack' that can be ascertained via probing.
> This modifies current gate in armpmu_event_init() which blocks branch stack
> sampling based perf events unconditionally. Instead allows such perf events
> getting initialized on supporting PMU hardware.

This paragraph can be deleted. The addition of 'has_branch_stack' and its use
in armpmu_event_init() is trivial and obvious in-context, and this distracts
from the important parts of this patch.

> Branch stack sampling is enabled and disabled along with regular PMU events
> . This adds required function callbacks in armv8pmu_branch_xxx() format, to
> drive the PMU branch stack hardware when supported. This also adds fallback
> stub definitions for these callbacks for PMUs which would not have required
> support.

Those additions to the PMUv3 driver should all be in the next patch.

We don't add anything for the other PMU drivers that don't support branch
sampling, so why do we need to do *anything* to the PMUv3 driver here, given we
add the support in the next patch? Those additions only make this patch bigger
and more confusing (and hence more painful to review).

> If a task gets scheduled out, the current branch records get saved in the
> task's context data, which can be later used to fill in the records upon an
> event overflow. Hence, we enable PERF_ATTACH_TASK_DATA (event->attach_state
> based flag) for branch stack requesting perf events. But this also requires
> adding support for pmu::sched_task() callback to arm_pmu.

I think what this is trying to say is:

| With BRBE, the hardware records branches into a hardware FIFO, which will be
| sampled by software when perf events overflow. A task may be context-switched
| an arbitrary number of times between overflows, and to avoid losing samples
| we need to save the current records when a task is context-switched out. To
| do this we'll need to use the pmu::sched_task() callback, and we'll need to
| allocate some per-task storage space using PERF_ATTACH_TASK_DATA.
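Condensed from the patch, that boils down to setting PERF_ATTACH_TASK_DATA in
event->attach_state at map time, plus a sched_task() hook along these lines:

static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
	void *task_ctx = pmu_ctx->task_ctx_data;

	if (!armpmu->has_branch_stack)
		return;

	/* Save the live records into the per-task storage on sched out */
	if (task_ctx && !sched_in)
		armv8pmu_branch_save(armpmu, task_ctx);

	/* Discard stale records belonging to someone else on sched in */
	if (sched_in)
		armv8pmu_branch_reset();
}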

> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> ---
> Changes in V16:
> 
> - Renamed arm_brbe.h as arm_pmuv3_branch.h
> - Updated perf_sample_save_brstack()'s new argument requirements with NULL
> 
>  drivers/perf/arm_pmu.c          |  57 ++++++++++++-
>  drivers/perf/arm_pmuv3.c        | 141 +++++++++++++++++++++++++++++++-
>  drivers/perf/arm_pmuv3_branch.h |  50 +++++++++++
>  include/linux/perf/arm_pmu.h    |  29 ++++++-
>  include/linux/perf/arm_pmuv3.h  |   1 -
>  5 files changed, 273 insertions(+), 5 deletions(-)
>  create mode 100644 drivers/perf/arm_pmuv3_branch.h
> 
> diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
> index 8458fe2cebb4..16f488ae7747 100644
> --- a/drivers/perf/arm_pmu.c
> +++ b/drivers/perf/arm_pmu.c
> @@ -317,6 +317,15 @@ armpmu_del(struct perf_event *event, int flags)
>  	struct hw_perf_event *hwc = &event->hw;
>  	int idx = hwc->idx;
>  
> +	if (has_branch_stack(event)) {
> +		WARN_ON_ONCE(!hw_events->brbe_users);
> +		hw_events->brbe_users--;
> +		if (!hw_events->brbe_users) {
> +			hw_events->brbe_context = NULL;
> +			hw_events->brbe_sample_type = 0;
> +		}
> +	}
> +

If this is going to leak into the core arm_pmu code, use "branch_stack" rather
than "brbe" for these field names.

However, I reckon we could just have two new callbacks on arm_pmu:

	branch_stack_add(struct perf_event *event, ...);
	branch_stack_del(struct perf_event *event, ...);

... and hide all of the details in the PMUv3 (or BRBE) driver for now, and the
code above can just do:

	if (has_branch_stack(event))
		branch_stack_del(event, ...);

... and likewise in armpmu_add().

That way the actual management logic for the context and so on can be added in
the next patch, where the lifetime would be *much* clearer.
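A rough sketch of that shape, with the callback names and exact signatures
purely illustrative:

	/* struct arm_pmu: two optional callbacks supplied by the driver */
	void	(*branch_stack_add)(struct perf_event *event,
				    struct pmu_hw_events *hw_events);
	void	(*branch_stack_del)(struct perf_event *event,
				    struct pmu_hw_events *hw_events);

static void armpmu_del(struct perf_event *event, int flags)
{
	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
	struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events);
	struct hw_perf_event *hwc = &event->hw;
	int idx = hwc->idx;

	/* All branch stack state handling stays inside the driver */
	if (has_branch_stack(event) && armpmu->branch_stack_del)
		armpmu->branch_stack_del(event, hw_events);

	armpmu_stop(event, PERF_EF_UPDATE);
	hw_events->events[idx] = NULL;
	armpmu->clear_event_idx(hw_events, event);
	/* ... */
}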

>  	armpmu_stop(event, PERF_EF_UPDATE);
>  	hw_events->events[idx] = NULL;
>  	armpmu->clear_event_idx(hw_events, event);
> @@ -333,6 +342,38 @@ armpmu_add(struct perf_event *event, int flags)
>  	struct hw_perf_event *hwc = &event->hw;
>  	int idx;
>  
> +	if (has_branch_stack(event)) {
> +		/*
> +		 * Reset branch records buffer if a new CPU bound event
> +		 * gets scheduled on a PMU. Otherwise existing branch
> +		 * records present in the buffer might just leak into
> +		 * such events.
> +		 *
> +		 * Also reset current 'hw_events->brbe_context' because
> +		 * any previous task bound event now would have lost an
> +		 * opportunity for continuous branch records.
> +		 */

Doesn't this mean some user silently loses events? Why is that ok?

> +		if (!event->ctx->task) {
> +			hw_events->brbe_context = NULL;
> +			if (armpmu->branch_reset)
> +				armpmu->branch_reset();
> +		}
> +
> +		/*
> +		 * Reset branch records buffer if a new task event gets
> +		 * scheduled on a PMU which might have existing records.
> +		 * Otherwise older branch records present in the buffer
> +		 * might leak into the new task event.
> +		 */
> +		if (event->ctx->task && hw_events->brbe_context != event->ctx) {
> +			hw_events->brbe_context = event->ctx;
> +			if (armpmu->branch_reset)
> +				armpmu->branch_reset();
> +		}

Same question here.

How does this work on other architectures?

What do we do if the CPU-bound and task-bound events want different filters,
etc?

This is the sort of gnarly detail that should be explained (or at least
introduced) in the commit message.

> +		hw_events->brbe_users++;
> +		hw_events->brbe_sample_type = event->attr.branch_sample_type;

What exactly is brbe_sample_type, and why does it get overriden *every time* we
add a new event? What happens when events have different values for
brbe_sample_type? Or is that forbidden somehow?

> +	}
> +
>  	/* An event following a process won't be stopped earlier */
>  	if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
>  		return -ENOENT;

Unless this cpumask check has been made redundant, it means that the code above
it is obviously wrong, since that pokes the BRBE HW and increments brbe_users
*before* we decide whether the event can be installed on this CPU. That'll blow
up on big.LITTLE,  e.g. we try and install a 'big' CPU event on a 'little' CPU,
poke the BRBE HW and increment brbe_users, then *after* that we abort
installing the event.

Even ignoring big.LITTLE, we can fail immediately after this when we don't have
enough counters, since the following code is:

|        /* An event following a process won't be stopped earlier */
|        if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
|                return -ENOENT;
|
|        /* If we don't have a space for the counter then finish early. */
|        idx = armpmu->get_event_idx(hw_events, event);
|        if (idx < 0)
|                return idx;

... which'll go wrong if you try to open 1 more event than the CPU has
counters.
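Whatever shape the bookkeeping ends up taking, it needs to move below the
points where armpmu_add() can still bail out, roughly like this
(armpmu_branch_stack_add() here is a hypothetical stand-in for the code
above):

static int armpmu_add(struct perf_event *event, int flags)
{
	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
	struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events);
	int idx;

	/* An event following a process won't be stopped earlier */
	if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
		return -ENOENT;

	/* If we don't have a space for the counter then finish early. */
	idx = armpmu->get_event_idx(hw_events, event);
	if (idx < 0)
		return idx;

	/*
	 * Only now is the event guaranteed to be installed on this CPU,
	 * so only now touch the branch stack state and hardware.
	 */
	if (has_branch_stack(event))
		armpmu_branch_stack_add(event, hw_events);	/* hypothetical */

	/* ... remainder of armpmu_add() unchanged ... */
	return 0;
}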

> @@ -511,13 +552,24 @@ static int armpmu_event_init(struct perf_event *event)
>  		!cpumask_test_cpu(event->cpu, &armpmu->supported_cpus))
>  		return -ENOENT;
>  
> -	/* does not support taken branch sampling */
> -	if (has_branch_stack(event))
> +	/*
> +	 * Branch stack sampling events are allowed
> +	 * only on PMU which has required support.
> +	 */
> +	if (has_branch_stack(event) && !armpmu->has_branch_stack)
>  		return -EOPNOTSUPP;
>  	return __hw_perf_event_init(event);
>  }
>  

I think we can delete the comment entirely here, but the code itself looks
fine here.

> +static void armpmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
> +{
> +	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
> +
> +	if (armpmu->sched_task)
> +		armpmu->sched_task(pmu_ctx, sched_in);
> +}

This looks fine.

>  static void armpmu_enable(struct pmu *pmu)
>  {
>  	struct arm_pmu *armpmu = to_arm_pmu(pmu);
> @@ -864,6 +916,7 @@ struct arm_pmu *armpmu_alloc(void)
>  	}
>  
>  	pmu->pmu = (struct pmu) {
> +		.sched_task	= armpmu_sched_task,
>  		.pmu_enable	= armpmu_enable,
>  		.pmu_disable	= armpmu_disable,
>  		.event_init	= armpmu_event_init,
> diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
> index 23fa6c5da82c..9e17764a0929 100644
> --- a/drivers/perf/arm_pmuv3.c
> +++ b/drivers/perf/arm_pmuv3.c
> @@ -26,6 +26,7 @@
>  #include <linux/nmi.h>
>  
>  #include <asm/arm_pmuv3.h>
> +#include "arm_pmuv3_branch.h"

As above, I do not think that the PMUv3 driver should change at all in this
patch. As of this patch it achieves nothing, and it makes it really hard to
understand what's going on because the important aspects are spread randomly
across this patch and the next patch which actually adds the BRBE management.

Please factor the PMUv3 changes out into the patch adding the actual BRBE code.

[...]

> diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
> index 46377e134d67..c3e7d2cfb737 100644
> --- a/include/linux/perf/arm_pmuv3.h
> +++ b/include/linux/perf/arm_pmuv3.h
> @@ -308,5 +308,4 @@
>  		default: WARN(1, "Invalid PMEV* index\n");	\
>  		}						\
>  	} while (0)
> -
>  #endif

Unrelated whitespace change.

Mark.
Anshuman Khandual March 1, 2024, 5:37 a.m. UTC | #4
On 2/21/24 22:55, Mark Rutland wrote:
> Hi Anshuman,
> 
> On Thu, Jan 25, 2024 at 03:11:14PM +0530, Anshuman Khandual wrote:
>> Branch stack sampling support i.e capturing branch records during execution
>> in core perf, rides along with normal HW events being scheduled on the PMU.
>> This prepares ARMV8 PMU framework for branch stack support on relevant PMUs
>> with required HW implementation.
> 
> Please can we start a bit more clearly, e.g.
> 
> | drivers: perf: arm_pmu: add infrastructure for branch stack sampling
> | 
> | In order to support the Branch Record Buffer Extension (BRBE), we need to
> | extend the arm_pmu framework with some basic infrastructure for branch stack
> | sampling which arm_pmu drivers can opt-in to using. Subsequent patches will
> | use this to add support for BRBE in the PMUv3 driver.

Added this paragraph at the very beginning.

> 
>> ARMV8 PMU hardware support for branch stack sampling is indicated via a new
>> feature flag called 'has_branch_stack' that can be ascertained via probing.
>> This modifies current gate in armpmu_event_init() which blocks branch stack
>> sampling based perf events unconditionally. Instead allows such perf events
>> getting initialized on supporting PMU hardware.
> 
> This paragraph can be deleted. The addition of 'has_branch_stack' and its use
> in armpmu_event_init() is trivial and obvious in-context, and this distracts
> from the important parts of this patch.

Okay, dropped the above paragraph.

> 
>> Branch stack sampling is enabled and disabled along with regular PMU events
>> . This adds required function callbacks in armv8pmu_branch_xxx() format, to
>> drive the PMU branch stack hardware when supported. This also adds fallback
>> stub definitions for these callbacks for PMUs which would not have required
>> support.
> 
> Those additions to the PMUv3 driver should all be in the next patch.

Sure, will do that.

> 
> We don't add anything for the other PMU drivers that don't support branch
> sampling, so why do we need to do *anything* to the PMUv3 driver here, given we
> add the support in the next patch? Those additions only make this patch bigger
> and more confusing (and hence more painful to review).

Understood.

> 
>> If a task gets scheduled out, the current branch records get saved in the
>> task's context data, which can be later used to fill in the records upon an
>> event overflow. Hence, we enable PERF_ATTACH_TASK_DATA (event->attach_state
>> based flag) for branch stack requesting perf events. But this also requires
>> adding support for pmu::sched_task() callback to arm_pmu.
> 
> I think what this is trying to say is:
> 
> | With BRBE, the hardware records branches into a hardware FIFO, which will be
> | sampled by software when perf events overflow. A task may be context-switched
> | an arbitrary number of times between overflows, and to avoid losing samples
> | we need to save the current records when a task is context-switched out. To
> | do these we'll need to use the pmu::sched_task() callback, and we'll need to
> | allocate some per-task storage space using PERF_ATTACH_TASK_DATA.

Replaced as suggested.

> 
>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>> Cc: Will Deacon <will@kernel.org>
>> Cc: Mark Rutland <mark.rutland@arm.com>
>> Cc: linux-arm-kernel@lists.infradead.org
>> Cc: linux-kernel@vger.kernel.org
>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>> ---
>> Changes in V16:
>>
>> - Renamed arm_brbe.h as arm_pmuv3_branch.h
>> - Updated perf_sample_save_brstack()'s new argument requirements with NULL
>>
>>  drivers/perf/arm_pmu.c          |  57 ++++++++++++-
>>  drivers/perf/arm_pmuv3.c        | 141 +++++++++++++++++++++++++++++++-
>>  drivers/perf/arm_pmuv3_branch.h |  50 +++++++++++
>>  include/linux/perf/arm_pmu.h    |  29 ++++++-
>>  include/linux/perf/arm_pmuv3.h  |   1 -
>>  5 files changed, 273 insertions(+), 5 deletions(-)
>>  create mode 100644 drivers/perf/arm_pmuv3_branch.h
>>
>> diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
>> index 8458fe2cebb4..16f488ae7747 100644
>> --- a/drivers/perf/arm_pmu.c
>> +++ b/drivers/perf/arm_pmu.c
>> @@ -317,6 +317,15 @@ armpmu_del(struct perf_event *event, int flags)
>>  	struct hw_perf_event *hwc = &event->hw;
>>  	int idx = hwc->idx;
>>  
>> +	if (has_branch_stack(event)) {
>> +		WARN_ON_ONCE(!hw_events->brbe_users);
>> +		hw_events->brbe_users--;
>> +		if (!hw_events->brbe_users) {
>> +			hw_events->brbe_context = NULL;
>> +			hw_events->brbe_sample_type = 0;
>> +		}
>> +	}
>> +
> 
> If this is going to leak into the core arm_pmu code, use "branch_stack" rather
> than "brbe" for these field names.

Right, makes sense. I too was contemplating that rename, will change.

> 
> However, I reckon we could just have two new callbacks on arm_pmu:
> 
> 	branch_stack_add(struct perf_event *event, ...);
> 	branch_stack_del(struct perf_event *event, ...);
> 
> ... and hide all of the details in the PMUv3 (or BRBE) driver for now, and the
> code above can just do:
> 
> 	if (has_branch_stack(event))
> 		branch_stack_del(event, ...);
> 
> ... and likewise in armpmu_add().
> 
> That way the actuel management logic for the context and so on can be added in
> the next patch, where the lifetime would be *much* clearer.

Right, will change as required.

> 
>>  	armpmu_stop(event, PERF_EF_UPDATE);
>>  	hw_events->events[idx] = NULL;
>>  	armpmu->clear_event_idx(hw_events, event);
>> @@ -333,6 +342,38 @@ armpmu_add(struct perf_event *event, int flags)
>>  	struct hw_perf_event *hwc = &event->hw;
>>  	int idx;
>>  
>> +	if (has_branch_stack(event)) {
>> +		/*
>> +		 * Reset branch records buffer if a new CPU bound event
>> +		 * gets scheduled on a PMU. Otherwise existing branch
>> +		 * records present in the buffer might just leak into
>> +		 * such events.
>> +		 *
>> +		 * Also reset current 'hw_events->brbe_context' because
>> +		 * any previous task bound event now would have lost an
>> +		 * opportunity for continuous branch records.
>> +		 */
> 
> Doesn't this mean some user silently loses events? Why is that ok?

A previous task bound event that has been on the CPU will lose the current
branch records available in BRBE when this happens. The buffer needs a reset
to preserve the integrity of the records for the upcoming CPU bound event.
The following options are available in such cases.

- Let it lose some samples; it is going to be rare anyway (proposed here)
- Call armv8pmu_branch_save() to save them off on the event, before reset
- Tell the event that it has lost some samples - PERF_RECORD_LOST ?

Please suggest which would be a better solution ? Or might there be some other
approach for this scenario ?

> 
>> +		if (!event->ctx->task) {
>> +			hw_events->brbe_context = NULL;
>> +			if (armpmu->branch_reset)
>> +				armpmu->branch_reset();
>> +		}
>> +
>> +		/*
>> +		 * Reset branch records buffer if a new task event gets
>> +		 * scheduled on a PMU which might have existing records.
>> +		 * Otherwise older branch records present in the buffer
>> +		 * might leak into the new task event.
>> +		 */
>> +		if (event->ctx->task && hw_events->brbe_context != event->ctx) {
>> +			hw_events->brbe_context = event->ctx;
>> +			if (armpmu->branch_reset)
>> +				armpmu->branch_reset();
>> +		}
> 
> Same question here.

As explained above.

> 
> How does this work on other architectures?

I had gone through some of them before but don't recollect the details right now.
I will get back on this later.

> 
> What do we do if the CPU-bound and task-bound events want different filters,
> etc?

Unless the same task comes back again on the CPU, the buffer needs to be reset
in all other scenarios to preserve the integrity of the captured branch records
for the target event. Filter differences then do not really matter.

> 
> This is the sort of gnarly detail that should be explained (or at least
> introduced) in the commit message.

Understood, will try and update the commit message accordingly.

> 
>> +		hw_events->brbe_users++;
>> +		hw_events->brbe_sample_type = event->attr.branch_sample_type;
> 
> What exactly is brbe_sample_type, and why does it get overridden *every time* we
> add a new event? What happens when events have different values for
> brbe_sample_type? Or is that forbidden somehow?

brbe_sample_type contains the final perf branch filter that gets programmed into
the BRBE HW for the recording session. The proposed solution here goes with the
last perf event's 'attr.branch_sample_type' as events get collected on the given
PMU via the pmu_add() callback.
	
	hw_events->brbe_sample_type = event->attr.branch_sample_type

So in a scenario where multiple branch events are programmed with different filter
requests, the captured branch records during PMU IRQ might not match the requests
for many events that were scheduled together. Hence we only give the branch records
to the matching events.

static void read_branch_records()
{
...
        /*
         * Overflowed event's branch_sample_type does not match the configured
         * branch filters in the BRBE HW. So the captured branch records here
         * cannot be co-related to the overflowed event. Report to the user as
         * if no branch records have been captured, and flush branch records.
         * The same scenario is applicable when the current task context does
         * not match with overflown event.
         */
        if ((cpuc->branch_sample_type != event->attr.branch_sample_type) ||
            (event->ctx->task && cpuc->branch_context != event->ctx))
                return;
...
}

Please note that we don't prohibit the events from being grouped together on the PMU
i.e pmu_add() does not fail when filters do not match. But there might be some other
approaches that could be taken in such scenarios.

A. Fail pmu_add() when branch_sample_type does not match

B. OR together all event's event->attr.branch_sample_type on a given PMU

	- Then captured records need to be post processed to find applicable samples
	  matching event's original filter request

	- But it might add some more latency to PMU IRQ handling ?

But please do let me know if there are better solutions that can be taken up.
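
For illustration, option B could look roughly like this - only a sketch, and
branch_entry_matches() is a made up helper (checking privilege, branch type etc
against the event's filter), not something that exists in this series:

	/* branch_stack_add() path: accumulate filters instead of overwriting */
	hw_events->brbe_sample_type |= event->attr.branch_sample_type;

static void filter_branch_records(struct pmu_hw_events *cpuc,
				  struct perf_event *event,
				  struct branch_records *out)
{
	u64 wanted = event->attr.branch_sample_type;
	int i, nr = 0;

	/* Keep only records that satisfy the overflowed event's own filter */
	for (i = 0; i < cpuc->branches->branch_stack.nr; i++) {
		struct perf_branch_entry *entry = &cpuc->branches->branch_entries[i];

		if (!branch_entry_matches(entry, wanted))
			continue;
		out->branch_entries[nr++] = *entry;
	}
	out->branch_stack.nr = nr;
}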

> 
>> +	}
>> +
>>  	/* An event following a process won't be stopped earlier */
>>  	if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
>>  		return -ENOENT;
> 
> Unless this cpumask check has been made redundant, it means that the code above
> it is obviously wrong, since that pokes the BRBE HW and increments brbe_users
> *before* we decide whether the event can be installed on this CPU. That'll blow
> up on big.LITTLE,  e.g. we try and install a 'big' CPU event on a 'little' CPU,
> poke the BRBE HW and increment brbe_users, then *after* that we abort
> installing the event.

Agreed, aborting the installation of the event on the cpu after incrementing
brbe_users will be problematic.

> 
> Even ignoring big.LITTLE, we can fail immediately after this when we don't have
> enough counters, since the following code is:
> 
> |        /* An event following a process won't be stopped earlier */
> |        if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
> |                return -ENOENT;
> |
> |        /* If we don't have a space for the counter then finish early. */
> |        idx = armpmu->get_event_idx(hw_events, event);
> |        if (idx < 0)
> |                return idx;
> 
> ... which'll go wrong if you try to open 1 more event than the CPU has
> counters.

Agreed, the event needs to clear that test as well before incrementing brbe_users.

Should the branch stack context be installed only after the event has cleared
get_event_idx() successfully, along with the HW counters availability check etc,
before proceeding to install on the CPU? IOW just move the block a bit down

diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index ac07911263a9..d657ce337f10 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -336,9 +336,6 @@ armpmu_add(struct perf_event *event, int flags)
        struct hw_perf_event *hwc = &event->hw;
        int idx;
 
-       if (has_branch_stack(event))
-               armpmu->branch_stack_add(event, hw_events);
-
        /* An event following a process won't be stopped earlier */
        if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
                return -ENOENT;
@@ -348,6 +345,9 @@ armpmu_add(struct perf_event *event, int flags)
        if (idx < 0)
                return idx;
 
+       if (has_branch_stack(event))
+               armpmu->branch_stack_add(event, hw_events);
+
        /*
         * If there is an event in the counter we are going to use then make
         * sure it is disabled.


> 
>> @@ -511,13 +552,24 @@ static int armpmu_event_init(struct perf_event *event)
>>  		!cpumask_test_cpu(event->cpu, &armpmu->supported_cpus))
>>  		return -ENOENT;
>>  
>> -	/* does not support taken branch sampling */
>> -	if (has_branch_stack(event))
>> +	/*
>> +	 * Branch stack sampling events are allowed
>> +	 * only on PMU which has required support.
>> +	 */
>> +	if (has_branch_stack(event) && !armpmu->has_branch_stack)
>>  		return -EOPNOTSUPP;
>>  	return __hw_perf_event_init(event);
>>  }
>>  
> 
> I think we can delete the comment entirely here, but the code itself looks
> fine here.

Sure, will delete the above comment.

> 
>> +static void armpmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
>> +{
>> +	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
>> +
>> +	if (armpmu->sched_task)
>> +		armpmu->sched_task(pmu_ctx, sched_in);
>> +}
> 
> This looks fine.
> 
>>  static void armpmu_enable(struct pmu *pmu)
>>  {
>>  	struct arm_pmu *armpmu = to_arm_pmu(pmu);
>> @@ -864,6 +916,7 @@ struct arm_pmu *armpmu_alloc(void)
>>  	}
>>  
>>  	pmu->pmu = (struct pmu) {
>> +		.sched_task	= armpmu_sched_task,
>>  		.pmu_enable	= armpmu_enable,
>>  		.pmu_disable	= armpmu_disable,
>>  		.event_init	= armpmu_event_init,
>> diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
>> index 23fa6c5da82c..9e17764a0929 100644
>> --- a/drivers/perf/arm_pmuv3.c
>> +++ b/drivers/perf/arm_pmuv3.c
>> @@ -26,6 +26,7 @@
>>  #include <linux/nmi.h>
>>  
>>  #include <asm/arm_pmuv3.h>
>> +#include "arm_pmuv3_branch.h"
> 
> As above, I do not think that the PMUv3 driver should change at all in this
> patch. As of this patch it achieves nothing, and it makes it really hard to
> understand what's going on because the important aspects are spread randomly
> across this patch and the next patch which actually adds the BRBE management.
> 
> Please factor the PMUv3 changes out into the patch adding the actual BRBE code.

Sure, will keep the following changes in this patch.

A. drivers/perf/arm_pmu.c

	- armpmu_add() --> armpmu->branch_stack_add()
	- armpmu_del() --> armpmu->branch_stack_del()
	- Allowing has_branch_stack() events in armpmu_event_init()
	- Adding callback arm_pmu->pmu->sched_task = armpmu_sched_task

B. include/linux/perf/arm_pmu.h

	- Adding branch elements into pmu_hw_events
	- Adding branch callbacks into arm_pmu
	- Adding sched_task() into arm_pmu
	- Adding has_branch_stack into arm_pmu

Move everything else into the next patch implementing BRBE.

drivers/perf/arm_pmuv3.c 
drivers/perf/arm_pmuv3_branch.h

> 
> [...]
> 
>> diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
>> index 46377e134d67..c3e7d2cfb737 100644
>> --- a/include/linux/perf/arm_pmuv3.h
>> +++ b/include/linux/perf/arm_pmuv3.h
>> @@ -308,5 +308,4 @@
>>  		default: WARN(1, "Invalid PMEV* index\n");	\
>>  		}						\
>>  	} while (0)
>> -
>>  #endif
> 
> Unrelated whitespace change.

Already folded this in.
Mark Rutland March 1, 2024, 1:52 p.m. UTC | #5
On Fri, Mar 01, 2024 at 11:07:32AM +0530, Anshuman Khandual wrote:
> On 2/21/24 22:55, Mark Rutland wrote:
> > On Thu, Jan 25, 2024 at 03:11:14PM +0530, Anshuman Khandual wrote:

> >> @@ -333,6 +342,38 @@ armpmu_add(struct perf_event *event, int flags)
> >>  	struct hw_perf_event *hwc = &event->hw;
> >>  	int idx;
> >>  
> >> +	if (has_branch_stack(event)) {
> >> +		/*
> >> +		 * Reset branch records buffer if a new CPU bound event
> >> +		 * gets scheduled on a PMU. Otherwise existing branch
> >> +		 * records present in the buffer might just leak into
> >> +		 * such events.
> >> +		 *
> >> +		 * Also reset current 'hw_events->brbe_context' because
> >> +		 * any previous task bound event now would have lost an
> >> +		 * opportunity for continuous branch records.
> >> +		 */
> > 
> > Doesn't this mean some user silently loses events? Why is that ok?
> 
> The previous task bound event that has been on the CPU will lose the current
> branch records available in BRBE when this happens. The buffer needs a reset to
> preserve record integrity for the upcoming CPU bound event. The following
> options are available in such cases.
> 
> - Let it lose some samples, anyway it's going to be rare (proposed here)
> - Call armv8pmu_branch_save() to save them off on the event, before reset
> - Tell the event that it has lost some samples - PERF_RECORD_LOST ?
> 
> Please suggest which would be a better solution? Or is there some other
> approach for this scenario?

TBH, I'm not immediately sure what the best option is here, and this is part of
the bigger problem of "how do multiple events with branch sampling interact?".

I'll need to go explore that problem space (and see what other architectures
do). For now, it would be good if you could get the patch restructuring
(i.e. splitting the PMUv3 and arm_pmu changes) sorted first, and then we can
consider the BRBE sharing / lifetime problems atop that.

So for now (i.e. for v17), leave this as-is; 

[...]

> > 
> >> +		hw_events->brbe_users++;
> >> +		hw_events->brbe_sample_type = event->attr.branch_sample_type;
> > 
> > What exactly is brbe_sample_type, and why does it get overridden *every time* we
> > add a new event? What happens when events have different values for
> > brbe_sample_type? Or is that forbidden somehow?
> 
> brbe_sample_type contains the final perf branch filter that gets programmed into
> the BRBE HW for the recording session. The proposed solution here goes with the
> last perf event's 'attr.branch_sample_type' as events get collected on the given
> PMU via the pmu_add() callback.
> 	
> 	hw_events->brbe_sample_type = event->attr.branch_sample_type
> 
> So in a scenario where multiple branch events are programmed with different filter
> requests, the captured branch records during PMU IRQ might not match the requests
> for many events that were scheduled together. Hence we only give the branch records
> to the matching events.
> 
> static void read_branch_records()
> {
> ...
>         /*
>          * Overflowed event's branch_sample_type does not match the configured
>          * branch filters in the BRBE HW. So the captured branch records here
>          * cannot be co-related to the overflowed event. Report to the user as
>          * if no branch records have been captured, and flush branch records.
>          * The same scenario is applicable when the current task context does
>          * not match with overflown event.
>          */
>         if ((cpuc->branch_sample_type != event->attr.branch_sample_type) ||
>             (event->ctx->task && cpuc->branch_context != event->ctx))
>                 return;
> ...
> }

I see; it's good that we filter that in read_branch_records(), but this doesn't
feel right from a lifetime perspective. For example, if you install a pinned
per-cpu event with branch sample type A, then a task temporarily installs a
task-bound event with branch sample type B, the type in HW will be left as B
and the cpu-bound event will never get samples again.

So I think we'll have to change *something* here, but that's part of the bigger
question above, so please leave this as-is for now.

> Please note that we don't prohibit the events from being grouped together on the PMU
> i.e pmu_add() does not fail when filters do not match. But there might be some other
> approaches that could be taken in such scenarios.
>
> A. Fail pmu_add() when branch_sample_type does not match
> 
> B. OR together all event's event->attr.branch_sample_type on a given PMU
> 
> 	- Then captured records need to be post processed to find applicable samples
> 	  matching event's original filter request
> 
> 	- But it might add some more latency to PMU IRQ handling ?
> 
> But please do let me know if there are better solutions that can be taken up.

It looks like LBR always does SW filtering, and I don't think it's actually
that expensive, so B looks like a nicer option.

However, I think that's part of that bigger question above, so for now please
leave that as-is.

> > 
> >> +	}
> >> +
> >>  	/* An event following a process won't be stopped earlier */
> >>  	if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
> >>  		return -ENOENT;
> > 
> > Unless this cpumask check has been made redundant, it means that the code above
> > it is obviously wrong, since that pokes the BRBE HW and increments brbe_users
> > *before* we decide whether the event can be installed on this CPU. That'll blow
> > up on big.LITTLE,  e.g. we try and install a 'big' CPU event on a 'little' CPU,
> > poke the BRBE HW and increment brbe_users, then *after* that we abort
> > installing the event.
> 
> Agreed, aborting the installation of the event on the cpu after incrementing
> brbe_users will be problematic.
> 
> > Even ignoring big.LITTLE, we can fail immediately after this when we don't have
> > enough counters, since the following code is:
> > 
> > |        /* An event following a process won't be stopped earlier */
> > |        if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
> > |                return -ENOENT;
> > |
> > |        /* If we don't have a space for the counter then finish early. */
> > |        idx = armpmu->get_event_idx(hw_events, event);
> > |        if (idx < 0)
> > |                return idx;
> > 
> > ... which'll go wrong if you try to open 1 more event than the CPU has
> > counters.
> 
> Agreed, the event needs to clear that test as well before incrementing brbe_users.
> 
> Should the branch stack context be installed only after the event has cleared
> get_event_idx() successfully, along with the HW counters availability check etc,
> before proceeding to install on the CPU? IOW just move the block a bit down

Yes; for now that should be sufficient.

[...]

> >> diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
> >> index 23fa6c5da82c..9e17764a0929 100644
> >> --- a/drivers/perf/arm_pmuv3.c
> >> +++ b/drivers/perf/arm_pmuv3.c
> >> @@ -26,6 +26,7 @@
> >>  #include <linux/nmi.h>
> >>  
> >>  #include <asm/arm_pmuv3.h>
> >> +#include "arm_pmuv3_branch.h"
> > 
> > As above, I do not think that the PMUv3 driver should change at all in this
> > patch. As of this patch it achieves nothing, and it makes it really hard to
> > understand what's going on because the important aspects are spread randomly
> > across this patch and the next patch which actually adds the BRBE management.
> > 
> > Please factor the PMUv3 changes out into the patch adding the actual BRBE code.
> 
> Sure, will keep the following changes in this patch.
> 
> A. drivers/perf/arm_pmu.c
> 
> 	- armpmu_add() --> armpmu->branch_stack_add()
> 	- armpmu_del() --> armpmu->branch_stack_del()
> 	- Allowing has_branch_stack() events in armpmu_event_init()
> 	- Adding callback arm_pmu->pmu->sched_task = armpmu_sched_task
> 
> B. include/linux/perf/arm_pmu.h
> 
> 	- Adding branch elements into pmu_hw_events
> 	- Adding branch callbacks into arm_pmu
> 	- Adding sched_task() into arm_pmu
> 	- Adding has_branch_stack into arm_pmu
> 
> Move everything else into the next patch implementing BRBE.
> 
> drivers/perf/arm_pmuv3.c 
> drivers/perf/arm_pmuv3_branch.h

That looks good. Depending on what we do about BRBE sharing we might need an
armpmu::branch_stack_init() callback for event_init(), so if you end up needing
one now that's also fine.
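
(For illustration only - such a hook might end up looking something like the
below, with the exact name and semantics still TBD:)

	/* in armpmu_event_init(), replacing the plain has_branch_stack() gate */
	if (has_branch_stack(event)) {
		if (!armpmu->has_branch_stack)
			return -EOPNOTSUPP;

		/*
		 * e.g. validate attr.branch_sample_type and set up any per
		 * event state up front, rather than waiting for armpmu_add().
		 */
		if (armpmu->branch_stack_init && !armpmu->branch_stack_init(event))
			return -EOPNOTSUPP;
	}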

Thanks,
Mark.
diff mbox series

Patch

diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 8458fe2cebb4..16f488ae7747 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -317,6 +317,15 @@  armpmu_del(struct perf_event *event, int flags)
 	struct hw_perf_event *hwc = &event->hw;
 	int idx = hwc->idx;
 
+	if (has_branch_stack(event)) {
+		WARN_ON_ONCE(!hw_events->brbe_users);
+		hw_events->brbe_users--;
+		if (!hw_events->brbe_users) {
+			hw_events->brbe_context = NULL;
+			hw_events->brbe_sample_type = 0;
+		}
+	}
+
 	armpmu_stop(event, PERF_EF_UPDATE);
 	hw_events->events[idx] = NULL;
 	armpmu->clear_event_idx(hw_events, event);
@@ -333,6 +342,38 @@  armpmu_add(struct perf_event *event, int flags)
 	struct hw_perf_event *hwc = &event->hw;
 	int idx;
 
+	if (has_branch_stack(event)) {
+		/*
+		 * Reset branch records buffer if a new CPU bound event
+		 * gets scheduled on a PMU. Otherwise existing branch
+		 * records present in the buffer might just leak into
+		 * such events.
+		 *
+		 * Also reset current 'hw_events->brbe_context' because
+		 * any previous task bound event now would have lost an
+		 * opportunity for continuous branch records.
+		 */
+		if (!event->ctx->task) {
+			hw_events->brbe_context = NULL;
+			if (armpmu->branch_reset)
+				armpmu->branch_reset();
+		}
+
+		/*
+		 * Reset branch records buffer if a new task event gets
+		 * scheduled on a PMU which might have existing records.
+		 * Otherwise older branch records present in the buffer
+		 * might leak into the new task event.
+		 */
+		if (event->ctx->task && hw_events->brbe_context != event->ctx) {
+			hw_events->brbe_context = event->ctx;
+			if (armpmu->branch_reset)
+				armpmu->branch_reset();
+		}
+		hw_events->brbe_users++;
+		hw_events->brbe_sample_type = event->attr.branch_sample_type;
+	}
+
 	/* An event following a process won't be stopped earlier */
 	if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
 		return -ENOENT;
@@ -511,13 +552,24 @@  static int armpmu_event_init(struct perf_event *event)
 		!cpumask_test_cpu(event->cpu, &armpmu->supported_cpus))
 		return -ENOENT;
 
-	/* does not support taken branch sampling */
-	if (has_branch_stack(event))
+	/*
+	 * Branch stack sampling events are allowed
+	 * only on PMU which has required support.
+	 */
+	if (has_branch_stack(event) && !armpmu->has_branch_stack)
 		return -EOPNOTSUPP;
 
 	return __hw_perf_event_init(event);
 }
 
+static void armpmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
+{
+	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
+
+	if (armpmu->sched_task)
+		armpmu->sched_task(pmu_ctx, sched_in);
+}
+
 static void armpmu_enable(struct pmu *pmu)
 {
 	struct arm_pmu *armpmu = to_arm_pmu(pmu);
@@ -864,6 +916,7 @@  struct arm_pmu *armpmu_alloc(void)
 	}
 
 	pmu->pmu = (struct pmu) {
+		.sched_task	= armpmu_sched_task,
 		.pmu_enable	= armpmu_enable,
 		.pmu_disable	= armpmu_disable,
 		.event_init	= armpmu_event_init,
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 23fa6c5da82c..9e17764a0929 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -26,6 +26,7 @@ 
 #include <linux/nmi.h>
 
 #include <asm/arm_pmuv3.h>
+#include "arm_pmuv3_branch.h"
 
 /* ARMv8 Cortex-A53 specific event types. */
 #define ARMV8_A53_PERFCTR_PREF_LINEFILL				0xC2
@@ -829,14 +830,56 @@  static void armv8pmu_start(struct arm_pmu *cpu_pmu)
 	armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);
 
 	kvm_vcpu_pmu_resync_el0();
+	if (cpu_pmu->has_branch_stack)
+		armv8pmu_branch_enable(cpu_pmu);
 }
 
 static void armv8pmu_stop(struct arm_pmu *cpu_pmu)
 {
+	if (cpu_pmu->has_branch_stack)
+		armv8pmu_branch_disable();
+
 	/* Disable all counters */
 	armv8pmu_pmcr_write(armv8pmu_pmcr_read() & ~ARMV8_PMU_PMCR_E);
 }
 
+static void read_branch_records(struct pmu_hw_events *cpuc,
+				struct perf_event *event,
+				struct perf_sample_data *data,
+				bool *branch_captured)
+{
+	/*
+	 * CPU specific branch records buffer must have been allocated already
+	 * for the hardware records to be captured and processed further.
+	 */
+	if (WARN_ON(!cpuc->branches))
+		return;
+
+	/*
+	 * Overflowed event's branch_sample_type does not match the configured
+	 * branch filters in the BRBE HW. So the captured branch records here
+	 * cannot be co-related to the overflowed event. Report to the user as
+	 * if no branch records have been captured, and flush branch records.
+	 * The same scenario is applicable when the current task context does
+	 * not match with overflown event.
+	 */
+	if ((cpuc->brbe_sample_type != event->attr.branch_sample_type) ||
+	    (event->ctx->task && cpuc->brbe_context != event->ctx))
+		return;
+
+	/*
+	 * Read the branch records from the hardware once after the PMU IRQ
+	 * has been triggered but subsequently same records can be used for
+	 * other events that might have been overflowed simultaneously thus
+	 * saving much CPU cycles.
+	 */
+	if (!*branch_captured) {
+		armv8pmu_branch_read(cpuc, event);
+		*branch_captured = true;
+	}
+	perf_sample_save_brstack(data, event, &cpuc->branches->branch_stack, NULL);
+}
+
 static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
 {
 	u32 pmovsr;
@@ -844,6 +887,7 @@  static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
 	struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
 	struct pt_regs *regs;
 	int idx;
+	bool branch_captured = false;
 
 	/*
 	 * Get and reset the IRQ flags
@@ -887,6 +931,13 @@  static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
 		if (!armpmu_event_set_period(event))
 			continue;
 
+		/*
+		 * PMU IRQ should remain asserted until all branch records
+		 * are captured and processed into struct perf_sample_data.
+		 */
+		if (has_branch_stack(event) && cpu_pmu->has_branch_stack)
+			read_branch_records(cpuc, event, &data, &branch_captured);
+
 		/*
 		 * Perf event overflow will queue the processing of the event as
 		 * an irq_work which will be taken care of in the handling of
@@ -896,6 +947,8 @@  static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
 			cpu_pmu->disable(event);
 	}
 	armv8pmu_start(cpu_pmu);
+	if (cpu_pmu->has_branch_stack)
+		armv8pmu_branch_reset();
 
 	return IRQ_HANDLED;
 }
@@ -985,6 +1038,24 @@  static int armv8pmu_user_event_idx(struct perf_event *event)
 	return event->hw.idx;
 }
 
+static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
+{
+	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
+	void *task_ctx = pmu_ctx->task_ctx_data;
+
+	if (armpmu->has_branch_stack) {
+		/* Save branch records in task_ctx on sched out */
+		if (task_ctx && !sched_in) {
+			armv8pmu_branch_save(armpmu, task_ctx);
+			return;
+		}
+
+		/* Reset branch records on sched in */
+		if (sched_in)
+			armv8pmu_branch_reset();
+	}
+}
+
 /*
  * Add an event filter to a given event.
  */
@@ -1077,6 +1148,9 @@  static void armv8pmu_reset(void *info)
 		pmcr |= ARMV8_PMU_PMCR_LP;
 
 	armv8pmu_pmcr_write(pmcr);
+
+	if (cpu_pmu->has_branch_stack)
+		armv8pmu_branch_reset();
 }
 
 static int __armv8_pmuv3_map_event_id(struct arm_pmu *armpmu,
@@ -1114,6 +1188,20 @@  static int __armv8_pmuv3_map_event(struct perf_event *event,
 
 	hw_event_id = __armv8_pmuv3_map_event_id(armpmu, event);
 
+	if (has_branch_stack(event)) {
+		if (!armv8pmu_branch_attr_valid(event))
+			return -EOPNOTSUPP;
+
+		/*
+		 * If a task gets scheduled out, the current branch records
+		 * get saved in the task's context data, which can be later
+		 * used to fill in the records upon an event overflow. Let's
+		 * enable PERF_ATTACH_TASK_DATA in 'event->attach_state' for
+		 * all branch stack sampling perf events.
+		 */
+		event->attach_state |= PERF_ATTACH_TASK_DATA;
+	}
+
 	/*
 	 * CHAIN events only work when paired with an adjacent counter, and it
 	 * never makes sense for a user to open one in isolation, as they'll be
@@ -1229,6 +1317,41 @@  static void __armv8pmu_probe_pmu(void *info)
 		cpu_pmu->reg_pmmir = read_pmmir();
 	else
 		cpu_pmu->reg_pmmir = 0;
+
+	/*
+	 * BRBE is being probed on a single cpu for a
+	 * given PMU. The remaining cpus, are assumed
+	 * to have the exact same BRBE implementation.
+	 */
+	armv8pmu_branch_probe(cpu_pmu);
+}
+
+static int branch_records_alloc(struct arm_pmu *armpmu)
+{
+	struct branch_records __percpu *records;
+	int cpu;
+
+	records = alloc_percpu_gfp(struct branch_records, GFP_KERNEL);
+	if (!records)
+		return -ENOMEM;
+
+	/*
+	 * percpu memory allocated for 'records' gets completely consumed
+	 * here, and never required to be freed up later. So permanently
+	 * losing access to this anchor i.e 'records' is acceptable.
+	 *
+	 * Otherwise this allocation handle would have to be saved up for
+	 * free_percpu() release later if required.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct pmu_hw_events *events_cpu;
+		struct branch_records *records_cpu;
+
+		events_cpu = per_cpu_ptr(armpmu->hw_events, cpu);
+		records_cpu = per_cpu_ptr(records, cpu);
+		events_cpu->branches = records_cpu;
+	}
+	return 0;
 }
 
 static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
@@ -1245,7 +1368,21 @@  static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
 	if (ret)
 		return ret;
 
-	return probe.present ? 0 : -ENODEV;
+	if (!probe.present)
+		return -ENODEV;
+
+	if (cpu_pmu->has_branch_stack) {
+		ret = armv8pmu_task_ctx_cache_alloc(cpu_pmu);
+		if (ret)
+			return ret;
+
+		ret = branch_records_alloc(cpu_pmu);
+		if (ret) {
+			armv8pmu_task_ctx_cache_free(cpu_pmu);
+			return ret;
+		}
+	}
+	return 0;
 }
 
 static void armv8pmu_disable_user_access_ipi(void *unused)
@@ -1304,6 +1441,8 @@  static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
 	cpu_pmu->set_event_filter	= armv8pmu_set_event_filter;
 
 	cpu_pmu->pmu.event_idx		= armv8pmu_user_event_idx;
+	cpu_pmu->sched_task		= armv8pmu_sched_task;
+	cpu_pmu->branch_reset		= armv8pmu_branch_reset;
 
 	cpu_pmu->name			= name;
 	cpu_pmu->map_event		= map_event;
diff --git a/drivers/perf/arm_pmuv3_branch.h b/drivers/perf/arm_pmuv3_branch.h
new file mode 100644
index 000000000000..609e4d4ccac6
--- /dev/null
+++ b/drivers/perf/arm_pmuv3_branch.h
@@ -0,0 +1,50 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Branch Record Buffer Extension Helpers.
+ *
+ * Copyright (C) 2022-2023 ARM Limited
+ *
+ * Author: Anshuman Khandual <anshuman.khandual@arm.com>
+ */
+#include <linux/perf/arm_pmu.h>
+
+static inline void armv8pmu_branch_reset(void)
+{
+}
+
+static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu)
+{
+}
+
+static inline bool armv8pmu_branch_attr_valid(struct perf_event *event)
+{
+	WARN_ON_ONCE(!has_branch_stack(event));
+	return false;
+}
+
+static inline void armv8pmu_branch_enable(struct arm_pmu *arm_pmu)
+{
+}
+
+static inline void armv8pmu_branch_disable(void)
+{
+}
+
+static inline void armv8pmu_branch_read(struct pmu_hw_events *cpuc,
+					struct perf_event *event)
+{
+	WARN_ON_ONCE(!has_branch_stack(event));
+}
+
+static inline void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx)
+{
+}
+
+static inline int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu)
+{
+	return 0;
+}
+
+static inline void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu)
+{
+}
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index b3b34f6670cf..8cfcc735c0f7 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -46,6 +46,18 @@  static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_63BIT) == ARMPMU_EVT_63BIT);
 	},								\
 }
 
+/*
+ * Maximum branch record entries which could be processed
+ * for core perf branch stack sampling support, regardless
+ * of the hardware support available on a given ARM PMU.
+ */
+#define MAX_BRANCH_RECORDS 64
+
+struct branch_records {
+	struct perf_branch_stack	branch_stack;
+	struct perf_branch_entry	branch_entries[MAX_BRANCH_RECORDS];
+};
+
 /* The events for a given PMU register set. */
 struct pmu_hw_events {
 	/*
@@ -66,6 +78,17 @@  struct pmu_hw_events {
 	struct arm_pmu		*percpu_pmu;
 
 	int irq;
+
+	struct branch_records	*branches;
+
+	/* Active context for task events */
+	void			*brbe_context;
+
+	/* Active events requesting branch records */
+	unsigned int		brbe_users;
+
+	/* Active branch sample type filters */
+	unsigned long		brbe_sample_type;
 };
 
 enum armpmu_attr_groups {
@@ -96,8 +119,12 @@  struct arm_pmu {
 	void		(*stop)(struct arm_pmu *);
 	void		(*reset)(void *);
 	int		(*map_event)(struct perf_event *event);
+	void		(*sched_task)(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
+	void		(*branch_reset)(void);
 	int		num_events;
-	bool		secure_access; /* 32-bit ARM only */
+	unsigned int	secure_access	: 1, /* 32-bit ARM only */
+			has_branch_stack: 1, /* 64-bit ARM only */
+			reserved	: 30;
 #define ARMV8_PMUV3_MAX_COMMON_EVENTS		0x40
 	DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS);
 #define ARMV8_PMUV3_EXT_COMMON_EVENT_BASE	0x4000
diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index 46377e134d67..c3e7d2cfb737 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -308,5 +308,4 @@ 
 		default: WARN(1, "Invalid PMEV* index\n");	\
 		}						\
 	} while (0)
-
 #endif