diff mbox series

[RFC,6/9] drivers/perf: riscv: Implement SBI PMU snapshot function

Message ID 20231205024310.1593100-7-atishp@rivosinc.com (mailing list archive)
State Superseded
Headers show
Series RISC-V SBI v2.0 PMU improvements and Perf sampling in KVM guest | expand

Checks

Context Check Description
conchuod/vmtest-for-next-PR fail PR summary
conchuod/patch-6-test-1 success .github/scripts/patches/build_rv32_defconfig.sh
conchuod/patch-6-test-2 success .github/scripts/patches/build_rv64_clang_allmodconfig.sh
conchuod/patch-6-test-3 success .github/scripts/patches/build_rv64_gcc_allmodconfig.sh
conchuod/patch-6-test-4 success .github/scripts/patches/build_rv64_nommu_k210_defconfig.sh
conchuod/patch-6-test-5 success .github/scripts/patches/build_rv64_nommu_virt_defconfig.sh
conchuod/patch-6-test-6 warning .github/scripts/patches/checkpatch.sh
conchuod/patch-6-test-7 success .github/scripts/patches/dtb_warn_rv64.sh
conchuod/patch-6-test-8 success .github/scripts/patches/header_inline.sh
conchuod/patch-6-test-9 success .github/scripts/patches/kdoc.sh
conchuod/patch-6-test-10 success .github/scripts/patches/module_param.sh
conchuod/patch-6-test-11 success .github/scripts/patches/verify_fixes.sh
conchuod/patch-6-test-12 success .github/scripts/patches/verify_signedoff.sh

Commit Message

Atish Kumar Patra Dec. 5, 2023, 2:43 a.m. UTC
SBI v2.0 introduced the PMU snapshot feature, which adds the following
features.

1. Read counter values directly from the shared memory instead of
csr read.
2. Start multiple counters with initial values with one SBI call.

These features reduce the number of traps to the higher
privilege mode. If the kernel is in VS mode while the hypervisor
deploys the trap & emulate method, this would minimize all the hpmcounter
CSR read traps. If the kernel is running in S-mode, the benefits are
reduced to CSR latency vs DRAM/cache latency as there is no trap
involved while accessing the hpmcounter CSRs.

In both modes, it saves a number of ecalls when starting
multiple counters together with initial values. This is a likely
scenario if multiple counters overflow at the same time.

Signed-off-by: Atish Patra <atishp@rivosinc.com>
---
 drivers/perf/riscv_pmu.c       |   1 +
 drivers/perf/riscv_pmu_sbi.c   | 203 ++++++++++++++++++++++++++++++---
 include/linux/perf/riscv_pmu.h |   6 +
 3 files changed, 197 insertions(+), 13 deletions(-)

Comments

Conor Dooley Dec. 7, 2023, 1:05 p.m. UTC | #1
Hey Atish,

On Mon, Dec 04, 2023 at 06:43:07PM -0800, Atish Patra wrote:
> SBI v2.0 SBI introduced PMU snapshot feature which adds the following
> features.
> 
> 1. Read counter values directly from the shared memory instead of
> csr read.
> 2. Start multiple counters with initial values with one SBI call.
> 
> These functionalities optimizes the number of traps to the higher
> privilege mode. If the kernel is in VS mode while the hypervisor
> deploy trap & emulate method, this would minimize all the hpmcounter
> CSR read traps. If the kernel is running in S-mode, the benfits
> reduced to CSR latency vs DRAM/cache latency as there is no trap
> involved while accessing the hpmcounter CSRs.
> 
> In both modes, it does saves the number of ecalls while starting
> multiple counter together with an initial values. This is a likely
> scenario if multiple counters overflow at the same time.
> 
> Signed-off-by: Atish Patra <atishp@rivosinc.com>
> ---
>  drivers/perf/riscv_pmu.c       |   1 +
>  drivers/perf/riscv_pmu_sbi.c   | 203 ++++++++++++++++++++++++++++++---
>  include/linux/perf/riscv_pmu.h |   6 +
>  3 files changed, 197 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/perf/riscv_pmu.c b/drivers/perf/riscv_pmu.c
> index 0dda70e1ef90..5b57acb770d3 100644
> --- a/drivers/perf/riscv_pmu.c
> +++ b/drivers/perf/riscv_pmu.c
> @@ -412,6 +412,7 @@ struct riscv_pmu *riscv_pmu_alloc(void)
>  		cpuc->n_events = 0;
>  		for (i = 0; i < RISCV_MAX_COUNTERS; i++)
>  			cpuc->events[i] = NULL;
> +		cpuc->snapshot_addr = NULL;
>  	}
>  	pmu->pmu = (struct pmu) {
>  		.event_init	= riscv_pmu_event_init,
> diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
> index 1c9049e6b574..1b8b6de63b69 100644
> --- a/drivers/perf/riscv_pmu_sbi.c
> +++ b/drivers/perf/riscv_pmu_sbi.c
> @@ -36,6 +36,9 @@ PMU_FORMAT_ATTR(event, "config:0-47");
>  PMU_FORMAT_ATTR(firmware, "config:63");
>  
>  static bool sbi_v2_available;
> +static DEFINE_STATIC_KEY_FALSE(sbi_pmu_snapshot_available);
> +#define sbi_pmu_snapshot_available() \
> +	static_branch_unlikely(&sbi_pmu_snapshot_available)
>  
>  static struct attribute *riscv_arch_formats_attr[] = {
>  	&format_attr_event.attr,
> @@ -485,14 +488,101 @@ static int pmu_sbi_event_map(struct perf_event *event, u64 *econfig)
>  	return ret;
>  }
>  
> +static void pmu_sbi_snapshot_free(struct riscv_pmu *pmu)
> +{
> +	int cpu;

> +	struct cpu_hw_events *cpu_hw_evt;

This is only used inside the scope of the for loop.

> +
> +	for_each_possible_cpu(cpu) {
> +		cpu_hw_evt = per_cpu_ptr(pmu->hw_events, cpu);
> +		if (!cpu_hw_evt->snapshot_addr)
> +			continue;

Could you add a blank line here please?

> +		free_page((unsigned long)cpu_hw_evt->snapshot_addr);
> +		cpu_hw_evt->snapshot_addr = NULL;
> +		cpu_hw_evt->snapshot_addr_phys = 0;

Why do these need to be explicitly zeroed?

> +	}
> +}
> +
> +static int pmu_sbi_snapshot_alloc(struct riscv_pmu *pmu)
> +{
> +	int cpu;

> +	struct page *snapshot_page;
> +	struct cpu_hw_events *cpu_hw_evt;

Same here re scope

> +
> +	for_each_possible_cpu(cpu) {
> +		cpu_hw_evt = per_cpu_ptr(pmu->hw_events, cpu);
> +		if (cpu_hw_evt->snapshot_addr)
> +			continue;

Same here re blank line

> +		snapshot_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
> +		if (!snapshot_page) {
> +			pmu_sbi_snapshot_free(pmu);
> +			return -ENOMEM;
> +		}
> +		cpu_hw_evt->snapshot_addr = page_to_virt(snapshot_page);
> +		cpu_hw_evt->snapshot_addr_phys = page_to_phys(snapshot_page);
> +	}
> +
> +	return 0;
> +}
> +
> +static void pmu_sbi_snapshot_disable(void)
> +{
> +	sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, -1,
> +		  -1, 0, 0, 0, 0);
> +}
> +
> +static int pmu_sbi_snapshot_setup(struct riscv_pmu *pmu, int cpu)
> +{
> +	struct cpu_hw_events *cpu_hw_evt;
> +	struct sbiret ret = {0};
> +	int rc;
> +
> +	cpu_hw_evt = per_cpu_ptr(pmu->hw_events, cpu);
> +	if (!cpu_hw_evt->snapshot_addr_phys)
> +		return -EINVAL;
> +
> +	if (cpu_hw_evt->snapshot_set_done)
> +		return 0;
> +
> +#if defined(CONFIG_32BIT)

Why does this need to be an `#if defined()`? Does the code not compile
if you use IS_ENABLED()?

> +	ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, cpu_hw_evt->snapshot_addr_phys,
> +		       (u64)(cpu_hw_evt->snapshot_addr_phys) >> 32, 0, 0, 0, 0);
> +#else
> +	ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, cpu_hw_evt->snapshot_addr_phys,
> +			0, 0, 0, 0, 0);
> +#endif

> +	/* Free up the snapshot area memory and fall back to default SBI */

What does "fall back to the default SBI" mean? SBI is an interface so I
don't understand what it means in this context. Secondly, 
> +	if (ret.error) {
> +		if (ret.error != SBI_ERR_NOT_SUPPORTED)
> +			pr_warn("%s: pmu snapshot setup failed with error %ld\n", __func__,
> +				ret.error);

Why is the function relevant here? Is the error message in-and-of-itself
not sufficient here? Where else would one be setting up the snapshots
other than the setup function?

> +		rc = sbi_err_map_linux_errno(ret.error);

> +		if (rc)
> +			return rc;

Is it even possible for !rc at this point? You've already checked that
ret.error is non zero, so this just becomes
`return sbi_err_map_linux_errno(ret.error);`?

> +	}
> +
> +	cpu_hw_evt->snapshot_set_done = true;
> +
> +	return 0;
> +}
> +
>  static u64 pmu_sbi_ctr_read(struct perf_event *event)
>  {
>  	struct hw_perf_event *hwc = &event->hw;
>  	int idx = hwc->idx;
>  	struct sbiret ret;
>  	u64 val = 0;
> +	struct riscv_pmu *pmu = to_riscv_pmu(event->pmu);
> +	struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
> +	struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
>  	union sbi_pmu_ctr_info info = pmu_ctr_list[idx];
>  
> +	/* Read the value from the shared memory directly */

Statement of the obvious, no?

> +	if (sbi_pmu_snapshot_available()) {
> +		val = sdata->ctr_values[idx];
> +		goto done;

s/goto done/return val/
There's no cleanup to be done here, what purpose does the goto serve?

> +	}
> +
>  	if (pmu_sbi_is_fw_event(event)) {
>  		ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_FW_READ,
>  				hwc->idx, 0, 0, 0, 0, 0);
> @@ -512,6 +602,7 @@ static u64 pmu_sbi_ctr_read(struct perf_event *event)
>  			val = ((u64)riscv_pmu_ctr_read_csr(info.csr + 0x80)) << 31 | val;
>  	}
>  
> +done:
>  	return val;
>  }
>  
> @@ -539,6 +630,7 @@ static void pmu_sbi_ctr_start(struct perf_event *event, u64 ival)
>  	struct hw_perf_event *hwc = &event->hw;
>  	unsigned long flag = SBI_PMU_START_FLAG_SET_INIT_VALUE;
>  
> +	/* There is no benefit setting SNAPSHOT FLAG for a single counter */
>  #if defined(CONFIG_32BIT)
>  	ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, hwc->idx,
>  			1, flag, ival, ival >> 32, 0);
> @@ -559,16 +651,29 @@ static void pmu_sbi_ctr_stop(struct perf_event *event, unsigned long flag)
>  {
>  	struct sbiret ret;
>  	struct hw_perf_event *hwc = &event->hw;
> +	struct riscv_pmu *pmu = to_riscv_pmu(event->pmu);
> +	struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
> +	struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
>  
>  	if ((hwc->flags & PERF_EVENT_FLAG_USER_ACCESS) &&
>  	    (hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT))
>  		pmu_sbi_reset_scounteren((void *)event);
>  
> +	if (sbi_pmu_snapshot_available())
> +		flag |= SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT;
> +
>  	ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, hwc->idx, 1, flag, 0, 0, 0);
> -	if (ret.error && (ret.error != SBI_ERR_ALREADY_STOPPED) &&
> -		flag != SBI_PMU_STOP_FLAG_RESET)
> +	if (!ret.error && sbi_pmu_snapshot_available()) {

> +		/* Snapshot is taken relative to the counter idx base. Apply a fixup. */
> +		if (hwc->idx > 0) {
> +			sdata->ctr_values[hwc->idx] = sdata->ctr_values[0];
> +			sdata->ctr_values[0] = 0;

Why is this being zeroed in this manner? Why is zeroing it not required
if hwc->idx == 0? You've got a comment there that could probably do with
elaboration.

> +		}
> +	} else if (ret.error && (ret.error != SBI_ERR_ALREADY_STOPPED) &&
> +		flag != SBI_PMU_STOP_FLAG_RESET) {
>  		pr_err("Stopping counter idx %d failed with error %d\n",
>  			hwc->idx, sbi_err_map_linux_errno(ret.error));
> +	}
>  }
>  
>  static int pmu_sbi_find_num_ctrs(void)
> @@ -626,10 +731,14 @@ static inline void pmu_sbi_stop_all(struct riscv_pmu *pmu)
>  static inline void pmu_sbi_stop_hw_ctrs(struct riscv_pmu *pmu)
>  {
>  	struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
> +	unsigned long flag = 0;
> +
> +	if (sbi_pmu_snapshot_available())
> +		flag = SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT;
>  
>  	/* No need to check the error here as we can't do anything about the error */
>  	sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, 0,
> -		  cpu_hw_evt->used_hw_ctrs[0], 0, 0, 0, 0);
> +		  cpu_hw_evt->used_hw_ctrs[0], flag, 0, 0, 0);
>  }
>  
>  /*
> @@ -638,11 +747,10 @@ static inline void pmu_sbi_stop_hw_ctrs(struct riscv_pmu *pmu)
>   * while the overflowed counters need to be started with updated initialization
>   * value.
>   */
> -static inline void pmu_sbi_start_overflow_mask(struct riscv_pmu *pmu,
> -					       unsigned long ctr_ovf_mask)
> +static noinline void pmu_sbi_start_ovf_ctrs_sbi(struct cpu_hw_events *cpu_hw_evt,
> +						   unsigned long ctr_ovf_mask)
>  {
>  	int idx = 0;
> -	struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
>  	struct perf_event *event;
>  	unsigned long flag = SBI_PMU_START_FLAG_SET_INIT_VALUE;
>  	unsigned long ctr_start_mask = 0;
> @@ -677,6 +785,49 @@ static inline void pmu_sbi_start_overflow_mask(struct riscv_pmu *pmu,
>  	}
>  }
>  
> +static noinline void pmu_sbi_start_ovf_ctrs_snapshot(struct cpu_hw_events *cpu_hw_evt,
> +						   unsigned long ctr_ovf_mask)
> +{
> +	int idx = 0;
> +	struct perf_event *event;
> +	unsigned long flag = SBI_PMU_START_FLAG_INIT_FROM_SNAPSHOT;
> +	uint64_t max_period;
> +	struct hw_perf_event *hwc;
> +	u64 init_val = 0;
> +	unsigned long ctr_start_mask = 0;
> +	struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
> +
> +	for_each_set_bit(idx, cpu_hw_evt->used_hw_ctrs, RISCV_MAX_COUNTERS) {
> +		if (ctr_ovf_mask & (1 << idx)) {
> +			event = cpu_hw_evt->events[idx];
> +			hwc = &event->hw;
> +			max_period = riscv_pmu_ctr_get_width_mask(event);
> +			init_val = local64_read(&hwc->prev_count) & max_period;
> +			sdata->ctr_values[idx] = init_val;
> +		}
> +		/* We donot need to update the non-overflow counters the previous

		/*
		 * We don't need to update the non-overflow counters as the previous


> +		 * value should have been there already.
> +		 */
> +	}
> +
> +	ctr_start_mask = cpu_hw_evt->used_hw_ctrs[0];
> +
> +	/* Start all the counters in a single shot */
> +	sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, 0, ctr_start_mask,
> +		  flag, 0, 0, 0);
> +}
> +
> +static void pmu_sbi_start_overflow_mask(struct riscv_pmu *pmu,
> +					unsigned long ctr_ovf_mask)
> +{
> +	struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
> +
> +	if (sbi_pmu_snapshot_available())
> +		pmu_sbi_start_ovf_ctrs_snapshot(cpu_hw_evt, ctr_ovf_mask);
> +	else
> +		pmu_sbi_start_ovf_ctrs_sbi(cpu_hw_evt, ctr_ovf_mask);
> +}
> +
>  static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
>  {
>  	struct perf_sample_data data;
> @@ -690,6 +841,7 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
>  	unsigned long overflowed_ctrs = 0;
>  	struct cpu_hw_events *cpu_hw_evt = dev;
>  	u64 start_clock = sched_clock();
> +	struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
>  
>  	if (WARN_ON_ONCE(!cpu_hw_evt))
>  		return IRQ_NONE;
> @@ -711,8 +863,10 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
>  	pmu_sbi_stop_hw_ctrs(pmu);
>  
>  	/* Overflow status register should only be read after counter are stopped */
> -	ALT_SBI_PMU_OVERFLOW(overflow);
> -
> +	if (sbi_pmu_snapshot_available())
> +		overflow = sdata->ctr_overflow_mask;
> +	else
> +		ALT_SBI_PMU_OVERFLOW(overflow);
>  	/*
>  	 * Overflow interrupt pending bit should only be cleared after stopping
>  	 * all the counters to avoid any race condition.
> @@ -774,6 +928,7 @@ static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node)
>  {
>  	struct riscv_pmu *pmu = hlist_entry_safe(node, struct riscv_pmu, node);
>  	struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
> +	int ret = 0;
>  
>  	/*
>  	 * We keep enabling userspace access to CYCLE, TIME and INSTRET via the
> @@ -794,7 +949,10 @@ static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node)
>  		enable_percpu_irq(riscv_pmu_irq, IRQ_TYPE_NONE);
>  	}
>  
> -	return 0;
> +	if (sbi_pmu_snapshot_available())
> +		ret = pmu_sbi_snapshot_setup(pmu, cpu);
> +
> +	return ret;

I'd just write this as

	if (sbi_pmu_snapshot_available())
		return pmu_sbi_snapshot_setup(pmu, cpu);

	return 0;

and drop the newly added variable I think.

>  }
>  
>  static int pmu_sbi_dying_cpu(unsigned int cpu, struct hlist_node *node)
> @@ -807,6 +965,9 @@ static int pmu_sbi_dying_cpu(unsigned int cpu, struct hlist_node *node)
>  	/* Disable all counters access for user mode now */
>  	csr_write(CSR_SCOUNTEREN, 0x0);
>  
> +	if (sbi_pmu_snapshot_available())
> +		pmu_sbi_snapshot_disable();
> +
>  	return 0;
>  }
>  
> @@ -1076,10 +1237,6 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
>  	pmu->event_unmapped = pmu_sbi_event_unmapped;
>  	pmu->csr_index = pmu_sbi_csr_index;
>  
> -	ret = cpuhp_state_add_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
> -	if (ret)
> -		return ret;
> -
>  	ret = riscv_pm_pmu_register(pmu);
>  	if (ret)
>  		goto out_unregister;
> @@ -1088,8 +1245,28 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
>  	if (ret)
>  		goto out_unregister;
>  
> +	/* SBI PMU Snasphot is only available in SBI v2.0 */

s/Snasphot/Snapshot/

> +	if (sbi_v2_available) {
> +		ret = pmu_sbi_snapshot_alloc(pmu);
> +		if (ret)
> +			goto out_unregister;

A blank line here aids readability by breaking up the reuse of ret.

> +		ret = pmu_sbi_snapshot_setup(pmu, smp_processor_id());
> +		if (!ret) {
> +			pr_info("SBI PMU snapshot is available to optimize the PMU traps\n");

Why the verbose message? Could we standardise on one wording for the SBI
function probing stuff? Most users seem to be "SBI FOO extension detected".
Only IPI has additional wording and PMU differs slightly.

> +			/* We enable it once here for the boot cpu. If snapshot shmem fails during

Again, comment style here. What does "snapshot shmem" mean? I think
there's a missing action here. Registration? Allocation?

> +			 * cpu hotplug on, it should bail out.

Should or will? What action does "bail out" correspond to?

Thanks,
Conor.

> +			 */
> +			static_branch_enable(&sbi_pmu_snapshot_available);
> +		}
> +		/* Snapshot is an optional feature. Continue if not available */
> +	}
> +
>  	register_sysctl("kernel", sbi_pmu_sysctl_table);
>  
> +	ret = cpuhp_state_add_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
> +	if (ret)
> +		return ret;
> +
>  	return 0;
>  
>  out_unregister:
> diff --git a/include/linux/perf/riscv_pmu.h b/include/linux/perf/riscv_pmu.h
> index 43282e22ebe1..c3fa90970042 100644
> --- a/include/linux/perf/riscv_pmu.h
> +++ b/include/linux/perf/riscv_pmu.h
> @@ -39,6 +39,12 @@ struct cpu_hw_events {
>  	DECLARE_BITMAP(used_hw_ctrs, RISCV_MAX_COUNTERS);
>  	/* currently enabled firmware counters */
>  	DECLARE_BITMAP(used_fw_ctrs, RISCV_MAX_COUNTERS);
> +	/* The virtual address of the shared memory where counter snapshot will be taken */
> +	void *snapshot_addr;
> +	/* The physical address of the shared memory where counter snapshot will be taken */
> +	phys_addr_t snapshot_addr_phys;
> +	/* Boolean flag to indicate setup is already done */
> +	bool snapshot_set_done;
>  };
>  
>  struct riscv_pmu {
> -- 
> 2.34.1
>
Atish Kumar Patra Dec. 17, 2023, 1:39 a.m. UTC | #2
On Thu, Dec 7, 2023 at 5:06 AM Conor Dooley <conor.dooley@microchip.com> wrote:
>
> Hey Atish,
>
> On Mon, Dec 04, 2023 at 06:43:07PM -0800, Atish Patra wrote:
> > SBI v2.0 SBI introduced PMU snapshot feature which adds the following
> > features.
> >
> > 1. Read counter values directly from the shared memory instead of
> > csr read.
> > 2. Start multiple counters with initial values with one SBI call.
> >
> > These functionalities optimizes the number of traps to the higher
> > privilege mode. If the kernel is in VS mode while the hypervisor
> > deploy trap & emulate method, this would minimize all the hpmcounter
> > CSR read traps. If the kernel is running in S-mode, the benfits
> > reduced to CSR latency vs DRAM/cache latency as there is no trap
> > involved while accessing the hpmcounter CSRs.
> >
> > In both modes, it does saves the number of ecalls while starting
> > multiple counter together with an initial values. This is a likely
> > scenario if multiple counters overflow at the same time.
> >
> > Signed-off-by: Atish Patra <atishp@rivosinc.com>
> > ---
> >  drivers/perf/riscv_pmu.c       |   1 +
> >  drivers/perf/riscv_pmu_sbi.c   | 203 ++++++++++++++++++++++++++++++---
> >  include/linux/perf/riscv_pmu.h |   6 +
> >  3 files changed, 197 insertions(+), 13 deletions(-)
> >
> > diff --git a/drivers/perf/riscv_pmu.c b/drivers/perf/riscv_pmu.c
> > index 0dda70e1ef90..5b57acb770d3 100644
> > --- a/drivers/perf/riscv_pmu.c
> > +++ b/drivers/perf/riscv_pmu.c
> > @@ -412,6 +412,7 @@ struct riscv_pmu *riscv_pmu_alloc(void)
> >               cpuc->n_events = 0;
> >               for (i = 0; i < RISCV_MAX_COUNTERS; i++)
> >                       cpuc->events[i] = NULL;
> > +             cpuc->snapshot_addr = NULL;
> >       }
> >       pmu->pmu = (struct pmu) {
> >               .event_init     = riscv_pmu_event_init,
> > diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
> > index 1c9049e6b574..1b8b6de63b69 100644
> > --- a/drivers/perf/riscv_pmu_sbi.c
> > +++ b/drivers/perf/riscv_pmu_sbi.c
> > @@ -36,6 +36,9 @@ PMU_FORMAT_ATTR(event, "config:0-47");
> >  PMU_FORMAT_ATTR(firmware, "config:63");
> >
> >  static bool sbi_v2_available;
> > +static DEFINE_STATIC_KEY_FALSE(sbi_pmu_snapshot_available);
> > +#define sbi_pmu_snapshot_available() \
> > +     static_branch_unlikely(&sbi_pmu_snapshot_available)
> >
> >  static struct attribute *riscv_arch_formats_attr[] = {
> >       &format_attr_event.attr,
> > @@ -485,14 +488,101 @@ static int pmu_sbi_event_map(struct perf_event *event, u64 *econfig)
> >       return ret;
> >  }
> >
> > +static void pmu_sbi_snapshot_free(struct riscv_pmu *pmu)
> > +{
> > +     int cpu;
>
> > +     struct cpu_hw_events *cpu_hw_evt;
>
> This is only used inside the scope of the for loop.
>

Are you suggesting using mixed declarations? Personally, I
prefer all the declarations upfront for readability.
Let me know if you think that's an issue or violates coding style.

> > +
> > +     for_each_possible_cpu(cpu) {
> > +             cpu_hw_evt = per_cpu_ptr(pmu->hw_events, cpu);
> > +             if (!cpu_hw_evt->snapshot_addr)
> > +                     continue;
>
> Could you add a blank line here please?

Done.

>
> > +             free_page((unsigned long)cpu_hw_evt->snapshot_addr);
> > +             cpu_hw_evt->snapshot_addr = NULL;
> > +             cpu_hw_evt->snapshot_addr_phys = 0;
>
> Why do these need to be explicitly zeroed?
>

We may get an allocation failure while allocating for all cpus. That's why,
we need to free the page and zero out the pointers for all the
possible cpus in that case.

> > +     }
> > +}
> > +
> > +static int pmu_sbi_snapshot_alloc(struct riscv_pmu *pmu)
> > +{
> > +     int cpu;
>
> > +     struct page *snapshot_page;
> > +     struct cpu_hw_events *cpu_hw_evt;
>
> Same here re scope
>

same reply as above.

> > +
> > +     for_each_possible_cpu(cpu) {
> > +             cpu_hw_evt = per_cpu_ptr(pmu->hw_events, cpu);
> > +             if (cpu_hw_evt->snapshot_addr)
> > +                     continue;
>
> Same here re blank line
>

Done.

> > +             snapshot_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
> > +             if (!snapshot_page) {
> > +                     pmu_sbi_snapshot_free(pmu);
> > +                     return -ENOMEM;
> > +             }
> > +             cpu_hw_evt->snapshot_addr = page_to_virt(snapshot_page);
> > +             cpu_hw_evt->snapshot_addr_phys = page_to_phys(snapshot_page);
> > +     }
> > +
> > +     return 0;
> > +}
> > +
> > +static void pmu_sbi_snapshot_disable(void)
> > +{
> > +     sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, -1,
> > +               -1, 0, 0, 0, 0);
> > +}
> > +
> > +static int pmu_sbi_snapshot_setup(struct riscv_pmu *pmu, int cpu)
> > +{
> > +     struct cpu_hw_events *cpu_hw_evt;
> > +     struct sbiret ret = {0};
> > +     int rc;
> > +
> > +     cpu_hw_evt = per_cpu_ptr(pmu->hw_events, cpu);
> > +     if (!cpu_hw_evt->snapshot_addr_phys)
> > +             return -EINVAL;
> > +
> > +     if (cpu_hw_evt->snapshot_set_done)
> > +             return 0;
> > +
> > +#if defined(CONFIG_32BIT)
>
> Why does this need to be an `#if defined()`? Does the code not compile
> if you use IS_ENABLED()?
>

changed it to IS_ENABLED.

> > +     ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, cpu_hw_evt->snapshot_addr_phys,
> > +                    (u64)(cpu_hw_evt->snapshot_addr_phys) >> 32, 0, 0, 0, 0);
> > +#else
> > +     ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, cpu_hw_evt->snapshot_addr_phys,
> > +                     0, 0, 0, 0, 0);
> > +#endif
>
> > +     /* Free up the snapshot area memory and fall back to default SBI */
>
> What does "fall back to the default SBI mean"? SBI is an interface so I
> don't understand what it means in this context. Secondly,

In the absence of SBI PMU snapshot, the driver would try to read the
counters directly and end up trapping.
Also, it would not use the SBI PMU snapshot flags in the SBI start/stop calls.
Snapshot is an alternative mechanism to minimize the traps. I just
wanted to highlight that.

How about this?
/* Free up the snapshot area memory and fall back to default SBI PMU
calls without snapshot */


> > +     if (ret.error) {
> > +             if (ret.error != SBI_ERR_NOT_SUPPORTED)
> > +                     pr_warn("%s: pmu snapshot setup failed with error %ld\n", __func__,
> > +                             ret.error);
>
> Why is the function relevant here? Is the error message in-and-of-itself
> not sufficient here? Where else would one be setting up the snapshots
> other than the setup function?
>

The SBI implementation (e.g. OpenSBI) may or may not provide a snapshot
feature. This error message indicates
that the SBI implementation supports PMU snapshot but setup failed due to
some other error.

> > +             rc = sbi_err_map_linux_errno(ret.error);
>
> > +             if (rc)
> > +                     return rc;
>
> Is it even possible for !rc at this point? You've already checked that
> ret.error is non zero, so this just becomes
> `return sbi_err_map_linux_errno(ret.error);`?
>

Good catch. Thanks. Fixed it.

> > +     }
> > +
> > +     cpu_hw_evt->snapshot_set_done = true;
> > +
> > +     return 0;
> > +}
> > +
> >  static u64 pmu_sbi_ctr_read(struct perf_event *event)
> >  {
> >       struct hw_perf_event *hwc = &event->hw;
> >       int idx = hwc->idx;
> >       struct sbiret ret;
> >       u64 val = 0;
> > +     struct riscv_pmu *pmu = to_riscv_pmu(event->pmu);
> > +     struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
> > +     struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
> >       union sbi_pmu_ctr_info info = pmu_ctr_list[idx];
> >
> > +     /* Read the value from the shared memory directly */
>
> Statement of the obvious, no?
>

Probably. Just wanted to be explicit for the reader who didn't read
the spec to understand how snapshot works.

> > +     if (sbi_pmu_snapshot_available()) {
> > +             val = sdata->ctr_values[idx];
> > +             goto done;
>
> s/goto done/return val/
> There's no cleanup to be done here, what purpose does the goto serve?
>

Sure. Done.

> > +     }
> > +
> >       if (pmu_sbi_is_fw_event(event)) {
> >               ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_FW_READ,
> >                               hwc->idx, 0, 0, 0, 0, 0);
> > @@ -512,6 +602,7 @@ static u64 pmu_sbi_ctr_read(struct perf_event *event)
> >                       val = ((u64)riscv_pmu_ctr_read_csr(info.csr + 0x80)) << 31 | val;
> >       }
> >
> > +done:
> >       return val;
> >  }
> >
> > @@ -539,6 +630,7 @@ static void pmu_sbi_ctr_start(struct perf_event *event, u64 ival)
> >       struct hw_perf_event *hwc = &event->hw;
> >       unsigned long flag = SBI_PMU_START_FLAG_SET_INIT_VALUE;
> >
> > +     /* There is no benefit setting SNAPSHOT FLAG for a single counter */
> >  #if defined(CONFIG_32BIT)
> >       ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, hwc->idx,
> >                       1, flag, ival, ival >> 32, 0);
> > @@ -559,16 +651,29 @@ static void pmu_sbi_ctr_stop(struct perf_event *event, unsigned long flag)
> >  {
> >       struct sbiret ret;
> >       struct hw_perf_event *hwc = &event->hw;
> > +     struct riscv_pmu *pmu = to_riscv_pmu(event->pmu);
> > +     struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
> > +     struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
> >
> >       if ((hwc->flags & PERF_EVENT_FLAG_USER_ACCESS) &&
> >           (hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT))
> >               pmu_sbi_reset_scounteren((void *)event);
> >
> > +     if (sbi_pmu_snapshot_available())
> > +             flag |= SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT;
> > +
> >       ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, hwc->idx, 1, flag, 0, 0, 0);
> > -     if (ret.error && (ret.error != SBI_ERR_ALREADY_STOPPED) &&
> > -             flag != SBI_PMU_STOP_FLAG_RESET)
> > +     if (!ret.error && sbi_pmu_snapshot_available()) {
>
> > +             /* Snapshot is taken relative to the counter idx base. Apply a fixup. */
> > +             if (hwc->idx > 0) {
> > +                     sdata->ctr_values[hwc->idx] = sdata->ctr_values[0];
> > +                     sdata->ctr_values[0] = 0;
>
> Why is this being zeroed in this manner? Why is zeroing it not required
> if hwc->idx == 0? You've got a comment there that could probably do with
> elaboration.
>

hwc->idx is the counter_idx_base here. If it is zero, the counter0 value
is updated directly in the shared memory. However, if the base > 0, the
value is stored relative to the base, so we need to move it from the first
entry of the shared memory to the counter's own entry. Does that make sense?

> > +             }
> > +     } else if (ret.error && (ret.error != SBI_ERR_ALREADY_STOPPED) &&
> > +             flag != SBI_PMU_STOP_FLAG_RESET) {
> >               pr_err("Stopping counter idx %d failed with error %d\n",
> >                       hwc->idx, sbi_err_map_linux_errno(ret.error));
> > +     }
> >  }
> >
> >  static int pmu_sbi_find_num_ctrs(void)
> > @@ -626,10 +731,14 @@ static inline void pmu_sbi_stop_all(struct riscv_pmu *pmu)
> >  static inline void pmu_sbi_stop_hw_ctrs(struct riscv_pmu *pmu)
> >  {
> >       struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
> > +     unsigned long flag = 0;
> > +
> > +     if (sbi_pmu_snapshot_available())
> > +             flag = SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT;
> >
> >       /* No need to check the error here as we can't do anything about the error */
> >       sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, 0,
> > -               cpu_hw_evt->used_hw_ctrs[0], 0, 0, 0, 0);
> > +               cpu_hw_evt->used_hw_ctrs[0], flag, 0, 0, 0);
> >  }
> >
> >  /*
> > @@ -638,11 +747,10 @@ static inline void pmu_sbi_stop_hw_ctrs(struct riscv_pmu *pmu)
> >   * while the overflowed counters need to be started with updated initialization
> >   * value.
> >   */
> > -static inline void pmu_sbi_start_overflow_mask(struct riscv_pmu *pmu,
> > -                                            unsigned long ctr_ovf_mask)
> > +static noinline void pmu_sbi_start_ovf_ctrs_sbi(struct cpu_hw_events *cpu_hw_evt,
> > +                                                unsigned long ctr_ovf_mask)
> >  {
> >       int idx = 0;
> > -     struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
> >       struct perf_event *event;
> >       unsigned long flag = SBI_PMU_START_FLAG_SET_INIT_VALUE;
> >       unsigned long ctr_start_mask = 0;
> > @@ -677,6 +785,49 @@ static inline void pmu_sbi_start_overflow_mask(struct riscv_pmu *pmu,
> >       }
> >  }
> >
> > +static noinline void pmu_sbi_start_ovf_ctrs_snapshot(struct cpu_hw_events *cpu_hw_evt,
> > +                                                unsigned long ctr_ovf_mask)
> > +{
> > +     int idx = 0;
> > +     struct perf_event *event;
> > +     unsigned long flag = SBI_PMU_START_FLAG_INIT_FROM_SNAPSHOT;
> > +     uint64_t max_period;
> > +     struct hw_perf_event *hwc;
> > +     u64 init_val = 0;
> > +     unsigned long ctr_start_mask = 0;
> > +     struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
> > +
> > +     for_each_set_bit(idx, cpu_hw_evt->used_hw_ctrs, RISCV_MAX_COUNTERS) {
> > +             if (ctr_ovf_mask & (1 << idx)) {
> > +                     event = cpu_hw_evt->events[idx];
> > +                     hwc = &event->hw;
> > +                     max_period = riscv_pmu_ctr_get_width_mask(event);
> > +                     init_val = local64_read(&hwc->prev_count) & max_period;
> > +                     sdata->ctr_values[idx] = init_val;
> > +             }
> > +             /* We donot need to update the non-overflow counters the previous
>
>                 /*
>                  * We don't need to update the non-overflow counters as the previous
>
>
> > +              * value should have been there already.
> > +              */
> > +     }
> > +
> > +     ctr_start_mask = cpu_hw_evt->used_hw_ctrs[0];
> > +
> > +     /* Start all the counters in a single shot */
> > +     sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, 0, ctr_start_mask,
> > +               flag, 0, 0, 0);
> > +}
> > +
> > +static void pmu_sbi_start_overflow_mask(struct riscv_pmu *pmu,
> > +                                     unsigned long ctr_ovf_mask)
> > +{
> > +     struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
> > +
> > +     if (sbi_pmu_snapshot_available())
> > +             pmu_sbi_start_ovf_ctrs_snapshot(cpu_hw_evt, ctr_ovf_mask);
> > +     else
> > +             pmu_sbi_start_ovf_ctrs_sbi(cpu_hw_evt, ctr_ovf_mask);
> > +}
> > +
> >  static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
> >  {
> >       struct perf_sample_data data;
> > @@ -690,6 +841,7 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
> >       unsigned long overflowed_ctrs = 0;
> >       struct cpu_hw_events *cpu_hw_evt = dev;
> >       u64 start_clock = sched_clock();
> > +     struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
> >
> >       if (WARN_ON_ONCE(!cpu_hw_evt))
> >               return IRQ_NONE;
> > @@ -711,8 +863,10 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
> >       pmu_sbi_stop_hw_ctrs(pmu);
> >
> >       /* Overflow status register should only be read after counter are stopped */
> > -     ALT_SBI_PMU_OVERFLOW(overflow);
> > -
> > +     if (sbi_pmu_snapshot_available())
> > +             overflow = sdata->ctr_overflow_mask;
> > +     else
> > +             ALT_SBI_PMU_OVERFLOW(overflow);
> >       /*
> >        * Overflow interrupt pending bit should only be cleared after stopping
> >        * all the counters to avoid any race condition.
> > @@ -774,6 +928,7 @@ static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node)
> >  {
> >       struct riscv_pmu *pmu = hlist_entry_safe(node, struct riscv_pmu, node);
> >       struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
> > +     int ret = 0;
> >
> >       /*
> >        * We keep enabling userspace access to CYCLE, TIME and INSTRET via the
> > @@ -794,7 +949,10 @@ static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node)
> >               enable_percpu_irq(riscv_pmu_irq, IRQ_TYPE_NONE);
> >       }
> >
> > -     return 0;
> > +     if (sbi_pmu_snapshot_available())
> > +             ret = pmu_sbi_snapshot_setup(pmu, cpu);
> > +
> > +     return ret;
>
> I'd just write this as
>
>         if (sbi_pmu_snapshot_available())
>                 return pmu_sbi_snapshot_setup(pmu, cpu);
>
>         return 0;
>
> and drop the newly added variable I think.
>

Sure. Just a preference thingy.

> >  }
> >
> >  static int pmu_sbi_dying_cpu(unsigned int cpu, struct hlist_node *node)
> > @@ -807,6 +965,9 @@ static int pmu_sbi_dying_cpu(unsigned int cpu, struct hlist_node *node)
> >       /* Disable all counters access for user mode now */
> >       csr_write(CSR_SCOUNTEREN, 0x0);
> >
> > +     if (sbi_pmu_snapshot_available())
> > +             pmu_sbi_snapshot_disable();
> > +
> >       return 0;
> >  }
> >
> > @@ -1076,10 +1237,6 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
> >       pmu->event_unmapped = pmu_sbi_event_unmapped;
> >       pmu->csr_index = pmu_sbi_csr_index;
> >
> > -     ret = cpuhp_state_add_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
> > -     if (ret)
> > -             return ret;
> > -
> >       ret = riscv_pm_pmu_register(pmu);
> >       if (ret)
> >               goto out_unregister;
> > @@ -1088,8 +1245,28 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
> >       if (ret)
> >               goto out_unregister;
> >
> > +     /* SBI PMU Snasphot is only available in SBI v2.0 */
>
> s/Snasphot/Snapshot/
>

Thanks. Fixed.

> > +     if (sbi_v2_available) {
> > +             ret = pmu_sbi_snapshot_alloc(pmu);
> > +             if (ret)
> > +                     goto out_unregister;
>
> A blank line here aids readability by breaking up the reuse of ret.

done.

>
> > +             ret = pmu_sbi_snapshot_setup(pmu, smp_processor_id());
> > +             if (!ret) {
> > +                     pr_info("SBI PMU snapshot is available to optimize the PMU traps\n");
>
> Why the verbose message? Could we standardise on one wording for the SBI
> function probing stuff? Most users seem to be "SBI FOO extension detected".
> Only IPI has additional wording and PMU differs slightly.

The additional information is there so that users understand that the PMU
functionality uses fewer traps on this system.
We can just use the shorter message below and expect users to read up on
the purpose of the snapshot in the spec:
"SBI PMU snapshot available"

>
> > +                     /* We enable it once here for the boot cpu. If snapshot shmem fails during
>
> Again, comment style here. What does "snapshot shmem" mean? I think
> there's a missing action here. Registration? Allocation?
>

Fixed it. It is supposed to be "snapshot shmem setup"

> > +                      * cpu hotplug on, it should bail out.
>
> Should or will? What action does "bail out" correspond to?
>

It means bailing out of the cpu hotplug process. We don't support
heterogeneous pmus for snapshot.
If the SBI implementation returns success for SBI_EXT_PMU_SNAPSHOT_SET_SHMEM
on the boot cpu but fails for other cpus while bringing them up, it is
problematic to handle that.

> Thanks,
> Conor.
>
> > +                      */
> > +                     static_branch_enable(&sbi_pmu_snapshot_available);
> > +             }
> > +             /* Snapshot is an optional feature. Continue if not available */
> > +     }
> > +
> >       register_sysctl("kernel", sbi_pmu_sysctl_table);
> >
> > +     ret = cpuhp_state_add_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
> > +     if (ret)
> > +             return ret;
> > +
> >       return 0;
> >
> >  out_unregister:
> > diff --git a/include/linux/perf/riscv_pmu.h b/include/linux/perf/riscv_pmu.h
> > index 43282e22ebe1..c3fa90970042 100644
> > --- a/include/linux/perf/riscv_pmu.h
> > +++ b/include/linux/perf/riscv_pmu.h
> > @@ -39,6 +39,12 @@ struct cpu_hw_events {
> >       DECLARE_BITMAP(used_hw_ctrs, RISCV_MAX_COUNTERS);
> >       /* currently enabled firmware counters */
> >       DECLARE_BITMAP(used_fw_ctrs, RISCV_MAX_COUNTERS);
> > +     /* The virtual address of the shared memory where counter snapshot will be taken */
> > +     void *snapshot_addr;
> > +     /* The physical address of the shared memory where counter snapshot will be taken */
> > +     phys_addr_t snapshot_addr_phys;
> > +     /* Boolean flag to indicate setup is already done */
> > +     bool snapshot_set_done;
> >  };
> >
> >  struct riscv_pmu {
> > --
> > 2.34.1
> >
Conor Dooley Dec. 17, 2023, 12:10 p.m. UTC | #3
On Sat, Dec 16, 2023 at 05:39:12PM -0800, Atish Kumar Patra wrote:
> On Thu, Dec 7, 2023 at 5:06 AM Conor Dooley <conor.dooley@microchip.com> wrote:
> > On Mon, Dec 04, 2023 at 06:43:07PM -0800, Atish Patra wrote:

> > > +static void pmu_sbi_snapshot_free(struct riscv_pmu *pmu)
> > > +{
> > > +     int cpu;
> >
> > > +     struct cpu_hw_events *cpu_hw_evt;
> >
> > This is only used inside the scope of the for loop.
> >
> 
> Do you intend to suggest using mixed declarations ? Personally, I
> prefer all the declarations upfront for readability.
> Let me know if you think that's an issue or violates coding style.

I was suggesting

int cpu;

for_each_possible_cpu(cpu)
	struct cpu_hw_events *cpu_hw_evt = per....()

I've been asked to do this in some subsystems I submitted code to,
and checkpatch etc do not complain about it. I don't think there is any
specific commentary in the coding style about minimising the scope of
variables however.

> > > +     /* Free up the snapshot area memory and fall back to default SBI */
> >
> > What does "fall back to the default SBI mean"? SBI is an interface so I
> > don't understand what it means in this context. Secondly,
> 
> In absence of SBI PMU snapshot, the driver would try to read the
> counters directly and end up traps.
> Also, it would not use the SBI PMU snapshot flags in the SBI start/stop calls.
> Snapshot is an alternative mechanism to minimize the traps. I just
> wanted to highlight that.
> 
> How about this ?
> "Free up the snapshot area memory and fall back to default SBI PMU
> calls without snapshot */

Yeah, that's fine (modulo the */ placement). The original comment just
seemed truncated.

> > > +     if (ret.error) {
> > > +             if (ret.error != SBI_ERR_NOT_SUPPORTED)
> > > +                     pr_warn("%s: pmu snapshot setup failed with error %ld\n", __func__,
> > > +                             ret.error);
> >
> > Why is the function relevant here? Is the error message in-and-of-itself
> > not sufficient here? Where else would one be setting up the snapshots
> > other than the setup function?
> >
> 
> The SBI implementation (i.e OpenSBI) may or may not provide a snapshot
> feature. This error message indicates
> that SBI implementation supports PMU snapshot but setup failed for
> some other error.

I don't see what this has to do with printing out the function. This is
a unique error message, and there is no other place where the setup is
done AFAICT.

> > > +             /* Snapshot is taken relative to the counter idx base. Apply a fixup. */
> > > +             if (hwc->idx > 0) {
> > > +                     sdata->ctr_values[hwc->idx] = sdata->ctr_values[0];
> > > +                     sdata->ctr_values[0] = 0;
> >
> > Why is this being zeroed in this manner? Why is zeroing it not required
> > if hwc->idx == 0? You've got a comment there that could probably do with
> > elaboration.
> >
> 
> hwc->idx is the counter_idx_base here. If it is zero, that means the
> counter0 value is updated
> in the shared memory. However, if the base > 0, we need to update the
> relative counter value
> from the shared memory. Does it make sense ?

Please expand on the comment so that it contains this information.

> > > +             ret = pmu_sbi_snapshot_setup(pmu, smp_processor_id());
> > > +             if (!ret) {
> > > +                     pr_info("SBI PMU snapshot is available to optimize the PMU traps\n");
> >
> > Why the verbose message? Could we standardise on one wording for the SBI
> > function probing stuff? Most users seem to be "SBI FOO extension detected".
> > Only IPI has additional wording and PMU differs slightly.
> 
> Additional information is for users to understand PMU functionality
> uses less traps on this system.
> We can just resort to and expect users to read upon the purpose of the
> snapshot from the spec.
> "SBI PMU snapshot available"

What I was asking for was alignment with the majority of other SBI
extensions that use the format I mentioned above.

> 
> >
> > > +                     /* We enable it once here for the boot cpu. If snapshot shmem fails during
> >
> > Again, comment style here. What does "snapshot shmem" mean? I think
> > there's a missing action here. Registration? Allocation?
> >
> 
> Fixed it. It is supposed to be "snapshot shmem setup"
> 
> > > +                      * cpu hotplug on, it should bail out.
> >
> > Should or will? What action does "bail out" correspond to?
> >
> 
> bail out the cpu hotplug process. We don't support heterogeneous pmus
> for snapshot.
> If the SBI implementation returns success for SBI_EXT_PMU_SNAPSHOT_SET_SHMEM
> boot cpu but fails for other cpus while bringing them up, it is
> problematic to handle that.

"bail out" should be replaced by a more technical explanation of what is
going to happen. "should" is a weird word to use, either the cpuhotplug
code does or does not deal with this case, and since that code is also
in the kernel, this patchset should ensure that it does handle the case,
no? If the kernel does handle it "should" should be replaced with more
definitive wording.

Thanks,
Conor.
Atish Kumar Patra Dec. 18, 2023, 12:57 a.m. UTC | #4
On Sun, Dec 17, 2023 at 4:10 AM Conor Dooley <conor@kernel.org> wrote:
>
> On Sat, Dec 16, 2023 at 05:39:12PM -0800, Atish Kumar Patra wrote:
> > On Thu, Dec 7, 2023 at 5:06 AM Conor Dooley <conor.dooley@microchip.com> wrote:
> > > On Mon, Dec 04, 2023 at 06:43:07PM -0800, Atish Patra wrote:
>
> > > > +static void pmu_sbi_snapshot_free(struct riscv_pmu *pmu)
> > > > +{
> > > > +     int cpu;
> > >
> > > > +     struct cpu_hw_events *cpu_hw_evt;
> > >
> > > This is only used inside the scope of the for loop.
> > >
> >
> > Do you intend to suggest using mixed declarations ? Personally, I
> > prefer all the declarations upfront for readability.
> > Let me know if you think that's an issue or violates coding style.
>
> I was suggesting
>
> int cpu;
>
> for_each_possible_cpu(cpu)
>         struct cpu_hw_events *cpu_hw_evt = per....()
>

That's what I meant by mixed declarations.

> I've been asked to do this in some subsystems I submitted code to,
> and checkpatch etc do not complain about it. I don't think there is any
> specific commentary in the coding style about minimising the scope of
> variables however.
>

I didn't know of any subsystem that prefers mixed declarations over upfront ones.

> > > > +     /* Free up the snapshot area memory and fall back to default SBI */
> > >
> > > What does "fall back to the default SBI mean"? SBI is an interface so I
> > > don't understand what it means in this context. Secondly,
> >
> > In absence of SBI PMU snapshot, the driver would try to read the
> > counters directly and end up traps.
> > Also, it would not use the SBI PMU snapshot flags in the SBI start/stop calls.
> > Snapshot is an alternative mechanism to minimize the traps. I just
> > wanted to highlight that.
> >
> > How about this ?
> > "Free up the snapshot area memory and fall back to default SBI PMU
> > calls without snapshot */
>
> Yeah, that's fine (modulo the */ placement). The original comment just
> seemed truncated.
>

ok.

> > > > +     if (ret.error) {
> > > > +             if (ret.error != SBI_ERR_NOT_SUPPORTED)
> > > > +                     pr_warn("%s: pmu snapshot setup failed with error %ld\n", __func__,
> > > > +                             ret.error);
> > >
> > > Why is the function relevant here? Is the error message in-and-of-itself
> > > not sufficient here? Where else would one be setting up the snapshots
> > > other than the setup function?
> > >
> >
> > The SBI implementation (i.e OpenSBI) may or may not provide a snapshot
> > feature. This error message indicates
> > that SBI implementation supports PMU snapshot but setup failed for
> > some other error.
>
> I don't see what this has to do with printing out the function. This is
> a unique error message, and there is no other place where the setup is
> done AFAICT.
>

Ahh, you were concerned about the function name in the log. I
misunderstood it at first.
The function name is not relevant and has already been removed.

> > > > +             /* Snapshot is taken relative to the counter idx base. Apply a fixup. */
> > > > +             if (hwc->idx > 0) {
> > > > +                     sdata->ctr_values[hwc->idx] = sdata->ctr_values[0];
> > > > +                     sdata->ctr_values[0] = 0;
> > >
> > > Why is this being zeroed in this manner? Why is zeroing it not required
> > > if hwc->idx == 0? You've got a comment there that could probably do with
> > > elaboration.
> > >
> >
> > hwc->idx is the counter_idx_base here. If it is zero, that means the
> > counter0 value is updated
> > in the shared memory. However, if the base > 0, we need to update the
> > relative counter value
> > from the shared memory. Does it make sense ?
>
> Please expand on the comment so that it contains this information.
>

Sure.

> > > > +             ret = pmu_sbi_snapshot_setup(pmu, smp_processor_id());
> > > > +             if (!ret) {
> > > > +                     pr_info("SBI PMU snapshot is available to optimize the PMU traps\n");
> > >
> > > Why the verbose message? Could we standardise on one wording for the SBI
> > > function probing stuff? Most users seem to be "SBI FOO extension detected".
> > > Only IPI has additional wording and PMU differs slightly.
> >
> > Additional information is for users to understand PMU functionality
> > uses less traps on this system.
> > We can just resort to and expect users to read upon the purpose of the
> > snapshot from the spec.
> > "SBI PMU snapshot available"
>
> What I was asking for was alignment with the majority of other SBI
> extensions that use the format I mentioned above.
>

PMU snapshot is a function, and my previous suggestion aligns with the
log message for PMU extension availability.
I can change it to "SBI PMU snapshot detected".

> >
> > >
> > > > +                     /* We enable it once here for the boot cpu. If snapshot shmem fails during
> > >
> > > Again, comment style here. What does "snapshot shmem" mean? I think
> > > there's a missing action here. Registration? Allocation?
> > >
> >
> > Fixed it. It is supposed to be "snapshot shmem setup"
> >
> > > > +                      * cpu hotplug on, it should bail out.
> > >
> > > Should or will? What action does "bail out" correspond to?
> > >
> >
> > bail out the cpu hotplug process. We don't support heterogeneous pmus
> > for snapshot.
> > If the SBI implementation returns success for SBI_EXT_PMU_SNAPSHOT_SET_SHMEM
> > boot cpu but fails for other cpus while bringing them up, it is
> > problematic to handle that.
>
> "bail out" should be replaced by a more technical explanation of what is
> going to happen. "should" is a weird word to use, either the cpuhotplug
> code does or does not deal with this case, and since that code is also
> in the kernel, this patchset should ensure that it does handle the case,
> no? If the kernel does handle it "should" should be replaced with more
> definitive wording.
>

ok. I will improve the comment to explain a bit more.

> Thanks,
> Conor.
diff mbox series

Patch

diff --git a/drivers/perf/riscv_pmu.c b/drivers/perf/riscv_pmu.c
index 0dda70e1ef90..5b57acb770d3 100644
--- a/drivers/perf/riscv_pmu.c
+++ b/drivers/perf/riscv_pmu.c
@@ -412,6 +412,7 @@  struct riscv_pmu *riscv_pmu_alloc(void)
 		cpuc->n_events = 0;
 		for (i = 0; i < RISCV_MAX_COUNTERS; i++)
 			cpuc->events[i] = NULL;
+		cpuc->snapshot_addr = NULL;
 	}
 	pmu->pmu = (struct pmu) {
 		.event_init	= riscv_pmu_event_init,
diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
index 1c9049e6b574..1b8b6de63b69 100644
--- a/drivers/perf/riscv_pmu_sbi.c
+++ b/drivers/perf/riscv_pmu_sbi.c
@@ -36,6 +36,9 @@  PMU_FORMAT_ATTR(event, "config:0-47");
 PMU_FORMAT_ATTR(firmware, "config:63");
 
 static bool sbi_v2_available;
+static DEFINE_STATIC_KEY_FALSE(sbi_pmu_snapshot_available);
+#define sbi_pmu_snapshot_available() \
+	static_branch_unlikely(&sbi_pmu_snapshot_available)
 
 static struct attribute *riscv_arch_formats_attr[] = {
 	&format_attr_event.attr,
@@ -485,14 +488,101 @@  static int pmu_sbi_event_map(struct perf_event *event, u64 *econfig)
 	return ret;
 }
 
+static void pmu_sbi_snapshot_free(struct riscv_pmu *pmu)
+{
+	int cpu;
+	struct cpu_hw_events *cpu_hw_evt;
+
+	for_each_possible_cpu(cpu) {
+		cpu_hw_evt = per_cpu_ptr(pmu->hw_events, cpu);
+		if (!cpu_hw_evt->snapshot_addr)
+			continue;
+		free_page((unsigned long)cpu_hw_evt->snapshot_addr);
+		cpu_hw_evt->snapshot_addr = NULL;
+		cpu_hw_evt->snapshot_addr_phys = 0;
+	}
+}
+
+static int pmu_sbi_snapshot_alloc(struct riscv_pmu *pmu)
+{
+	int cpu;
+	struct page *snapshot_page;
+	struct cpu_hw_events *cpu_hw_evt;
+
+	for_each_possible_cpu(cpu) {
+		cpu_hw_evt = per_cpu_ptr(pmu->hw_events, cpu);
+		if (cpu_hw_evt->snapshot_addr)
+			continue;
+		snapshot_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
+		if (!snapshot_page) {
+			pmu_sbi_snapshot_free(pmu);
+			return -ENOMEM;
+		}
+		cpu_hw_evt->snapshot_addr = page_to_virt(snapshot_page);
+		cpu_hw_evt->snapshot_addr_phys = page_to_phys(snapshot_page);
+	}
+
+	return 0;
+}
+
+static void pmu_sbi_snapshot_disable(void)
+{
+	sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, -1,
+		  -1, 0, 0, 0, 0);
+}
+
+static int pmu_sbi_snapshot_setup(struct riscv_pmu *pmu, int cpu)
+{
+	struct cpu_hw_events *cpu_hw_evt;
+	struct sbiret ret = {0};
+	int rc;
+
+	cpu_hw_evt = per_cpu_ptr(pmu->hw_events, cpu);
+	if (!cpu_hw_evt->snapshot_addr_phys)
+		return -EINVAL;
+
+	if (cpu_hw_evt->snapshot_set_done)
+		return 0;
+
+#if defined(CONFIG_32BIT)
+	ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, cpu_hw_evt->snapshot_addr_phys,
+		       (u64)(cpu_hw_evt->snapshot_addr_phys) >> 32, 0, 0, 0, 0);
+#else
+	ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, cpu_hw_evt->snapshot_addr_phys,
+			0, 0, 0, 0, 0);
+#endif
+	/* Free up the snapshot area memory and fall back to default SBI */
+	if (ret.error) {
+		if (ret.error != SBI_ERR_NOT_SUPPORTED)
+			pr_warn("%s: pmu snapshot setup failed with error %ld\n", __func__,
+				ret.error);
+		rc = sbi_err_map_linux_errno(ret.error);
+		if (rc)
+			return rc;
+	}
+
+	cpu_hw_evt->snapshot_set_done = true;
+
+	return 0;
+}
+
 static u64 pmu_sbi_ctr_read(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	int idx = hwc->idx;
 	struct sbiret ret;
 	u64 val = 0;
+	struct riscv_pmu *pmu = to_riscv_pmu(event->pmu);
+	struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
+	struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
 	union sbi_pmu_ctr_info info = pmu_ctr_list[idx];
 
+	/* Read the value from the shared memory directly */
+	if (sbi_pmu_snapshot_available()) {
+		val = sdata->ctr_values[idx];
+		goto done;
+	}
+
 	if (pmu_sbi_is_fw_event(event)) {
 		ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_FW_READ,
 				hwc->idx, 0, 0, 0, 0, 0);
@@ -512,6 +602,7 @@  static u64 pmu_sbi_ctr_read(struct perf_event *event)
 			val = ((u64)riscv_pmu_ctr_read_csr(info.csr + 0x80)) << 31 | val;
 	}
 
+done:
 	return val;
 }
 
@@ -539,6 +630,7 @@  static void pmu_sbi_ctr_start(struct perf_event *event, u64 ival)
 	struct hw_perf_event *hwc = &event->hw;
 	unsigned long flag = SBI_PMU_START_FLAG_SET_INIT_VALUE;
 
+	/* There is no benefit setting SNAPSHOT FLAG for a single counter */
 #if defined(CONFIG_32BIT)
 	ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, hwc->idx,
 			1, flag, ival, ival >> 32, 0);
@@ -559,16 +651,29 @@  static void pmu_sbi_ctr_stop(struct perf_event *event, unsigned long flag)
 {
 	struct sbiret ret;
 	struct hw_perf_event *hwc = &event->hw;
+	struct riscv_pmu *pmu = to_riscv_pmu(event->pmu);
+	struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
+	struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
 
 	if ((hwc->flags & PERF_EVENT_FLAG_USER_ACCESS) &&
 	    (hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT))
 		pmu_sbi_reset_scounteren((void *)event);
 
+	if (sbi_pmu_snapshot_available())
+		flag |= SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT;
+
 	ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, hwc->idx, 1, flag, 0, 0, 0);
-	if (ret.error && (ret.error != SBI_ERR_ALREADY_STOPPED) &&
-		flag != SBI_PMU_STOP_FLAG_RESET)
+	if (!ret.error && sbi_pmu_snapshot_available()) {
+		/* Snapshot is taken relative to the counter idx base. Apply a fixup. */
+		if (hwc->idx > 0) {
+			sdata->ctr_values[hwc->idx] = sdata->ctr_values[0];
+			sdata->ctr_values[0] = 0;
+		}
+	} else if (ret.error && (ret.error != SBI_ERR_ALREADY_STOPPED) &&
+		flag != SBI_PMU_STOP_FLAG_RESET) {
 		pr_err("Stopping counter idx %d failed with error %d\n",
 			hwc->idx, sbi_err_map_linux_errno(ret.error));
+	}
 }
 
 static int pmu_sbi_find_num_ctrs(void)
@@ -626,10 +731,14 @@  static inline void pmu_sbi_stop_all(struct riscv_pmu *pmu)
 static inline void pmu_sbi_stop_hw_ctrs(struct riscv_pmu *pmu)
 {
 	struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
+	unsigned long flag = 0;
+
+	if (sbi_pmu_snapshot_available())
+		flag = SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT;
 
 	/* No need to check the error here as we can't do anything about the error */
 	sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, 0,
-		  cpu_hw_evt->used_hw_ctrs[0], 0, 0, 0, 0);
+		  cpu_hw_evt->used_hw_ctrs[0], flag, 0, 0, 0);
 }
 
 /*
@@ -638,11 +747,10 @@  static inline void pmu_sbi_stop_hw_ctrs(struct riscv_pmu *pmu)
  * while the overflowed counters need to be started with updated initialization
  * value.
  */
-static inline void pmu_sbi_start_overflow_mask(struct riscv_pmu *pmu,
-					       unsigned long ctr_ovf_mask)
+static noinline void pmu_sbi_start_ovf_ctrs_sbi(struct cpu_hw_events *cpu_hw_evt,
+						   unsigned long ctr_ovf_mask)
 {
 	int idx = 0;
-	struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
 	struct perf_event *event;
 	unsigned long flag = SBI_PMU_START_FLAG_SET_INIT_VALUE;
 	unsigned long ctr_start_mask = 0;
@@ -677,6 +785,49 @@  static inline void pmu_sbi_start_overflow_mask(struct riscv_pmu *pmu,
 	}
 }
 
+static noinline void pmu_sbi_start_ovf_ctrs_snapshot(struct cpu_hw_events *cpu_hw_evt,
+						   unsigned long ctr_ovf_mask)
+{
+	int idx = 0;
+	struct perf_event *event;
+	unsigned long flag = SBI_PMU_START_FLAG_INIT_FROM_SNAPSHOT;
+	uint64_t max_period;
+	struct hw_perf_event *hwc;
+	u64 init_val = 0;
+	unsigned long ctr_start_mask = 0;
+	struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
+
+	for_each_set_bit(idx, cpu_hw_evt->used_hw_ctrs, RISCV_MAX_COUNTERS) {
+		if (ctr_ovf_mask & (1 << idx)) {
+			event = cpu_hw_evt->events[idx];
+			hwc = &event->hw;
+			max_period = riscv_pmu_ctr_get_width_mask(event);
+			init_val = local64_read(&hwc->prev_count) & max_period;
+			sdata->ctr_values[idx] = init_val;
+		}
+		/* We donot need to update the non-overflow counters the previous
+		 * value should have been there already.
+		 */
+	}
+
+	ctr_start_mask = cpu_hw_evt->used_hw_ctrs[0];
+
+	/* Start all the counters in a single shot */
+	sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, 0, ctr_start_mask,
+		  flag, 0, 0, 0);
+}
+
+static void pmu_sbi_start_overflow_mask(struct riscv_pmu *pmu,
+					unsigned long ctr_ovf_mask)
+{
+	struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
+
+	if (sbi_pmu_snapshot_available())
+		pmu_sbi_start_ovf_ctrs_snapshot(cpu_hw_evt, ctr_ovf_mask);
+	else
+		pmu_sbi_start_ovf_ctrs_sbi(cpu_hw_evt, ctr_ovf_mask);
+}
+
 static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
 {
 	struct perf_sample_data data;
@@ -690,6 +841,7 @@  static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
 	unsigned long overflowed_ctrs = 0;
 	struct cpu_hw_events *cpu_hw_evt = dev;
 	u64 start_clock = sched_clock();
+	struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
 
 	if (WARN_ON_ONCE(!cpu_hw_evt))
 		return IRQ_NONE;
@@ -711,8 +863,10 @@  static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
 	pmu_sbi_stop_hw_ctrs(pmu);
 
 	/* Overflow status register should only be read after counter are stopped */
-	ALT_SBI_PMU_OVERFLOW(overflow);
-
+	if (sbi_pmu_snapshot_available())
+		overflow = sdata->ctr_overflow_mask;
+	else
+		ALT_SBI_PMU_OVERFLOW(overflow);
 	/*
 	 * Overflow interrupt pending bit should only be cleared after stopping
 	 * all the counters to avoid any race condition.
@@ -774,6 +928,7 @@  static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node)
 {
 	struct riscv_pmu *pmu = hlist_entry_safe(node, struct riscv_pmu, node);
 	struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
+	int ret = 0;
 
 	/*
 	 * We keep enabling userspace access to CYCLE, TIME and INSTRET via the
@@ -794,7 +949,10 @@  static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node)
 		enable_percpu_irq(riscv_pmu_irq, IRQ_TYPE_NONE);
 	}
 
-	return 0;
+	if (sbi_pmu_snapshot_available())
+		ret = pmu_sbi_snapshot_setup(pmu, cpu);
+
+	return ret;
 }
 
 static int pmu_sbi_dying_cpu(unsigned int cpu, struct hlist_node *node)
@@ -807,6 +965,9 @@  static int pmu_sbi_dying_cpu(unsigned int cpu, struct hlist_node *node)
 	/* Disable all counters access for user mode now */
 	csr_write(CSR_SCOUNTEREN, 0x0);
 
+	if (sbi_pmu_snapshot_available())
+		pmu_sbi_snapshot_disable();
+
 	return 0;
 }
 
@@ -1076,10 +1237,6 @@  static int pmu_sbi_device_probe(struct platform_device *pdev)
 	pmu->event_unmapped = pmu_sbi_event_unmapped;
 	pmu->csr_index = pmu_sbi_csr_index;
 
-	ret = cpuhp_state_add_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
-	if (ret)
-		return ret;
-
 	ret = riscv_pm_pmu_register(pmu);
 	if (ret)
 		goto out_unregister;
@@ -1088,8 +1245,28 @@  static int pmu_sbi_device_probe(struct platform_device *pdev)
 	if (ret)
 		goto out_unregister;
 
+	/* SBI PMU Snasphot is only available in SBI v2.0 */
+	if (sbi_v2_available) {
+		ret = pmu_sbi_snapshot_alloc(pmu);
+		if (ret)
+			goto out_unregister;
+		ret = pmu_sbi_snapshot_setup(pmu, smp_processor_id());
+		if (!ret) {
+			pr_info("SBI PMU snapshot is available to optimize the PMU traps\n");
+			/* We enable it once here for the boot cpu. If snapshot shmem fails during
+			 * cpu hotplug on, it should bail out.
+			 */
+			static_branch_enable(&sbi_pmu_snapshot_available);
+		}
+		/* Snapshot is an optional feature. Continue if not available */
+	}
+
 	register_sysctl("kernel", sbi_pmu_sysctl_table);
 
+	ret = cpuhp_state_add_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
+	if (ret)
+		return ret;
+
 	return 0;
 
 out_unregister:
diff --git a/include/linux/perf/riscv_pmu.h b/include/linux/perf/riscv_pmu.h
index 43282e22ebe1..c3fa90970042 100644
--- a/include/linux/perf/riscv_pmu.h
+++ b/include/linux/perf/riscv_pmu.h
@@ -39,6 +39,12 @@  struct cpu_hw_events {
 	DECLARE_BITMAP(used_hw_ctrs, RISCV_MAX_COUNTERS);
 	/* currently enabled firmware counters */
 	DECLARE_BITMAP(used_fw_ctrs, RISCV_MAX_COUNTERS);
+	/* The virtual address of the shared memory where counter snapshot will be taken */
+	void *snapshot_addr;
+	/* The physical address of the shared memory where counter snapshot will be taken */
+	phys_addr_t snapshot_addr_phys;
+	/* Boolean flag to indicate setup is already done */
+	bool snapshot_set_done;
 };
 
 struct riscv_pmu {