diff mbox

[v4,4/5] drivers/perf: Add support for ARMv8.2 Statistical Profiling Extension

Message ID 1496676177-29356-5-git-send-email-will.deacon@arm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Will Deacon June 5, 2017, 3:22 p.m. UTC
The ARMv8.2 architecture introduces the optional Statistical Profiling
Extension (SPE).

SPE can be used to profile a population of operations in the CPU pipeline
after instruction decode. These are either architected instructions (i.e.
a dynamic instruction trace) or CPU-specific uops and the choice is fixed
statically in the hardware and advertised to userspace via caps/. Sampling
is controlled using a sampling interval, similar to a regular PMU counter,
but also with an optional random perturbation to avoid falling into patterns
where you continuously profile the same instruction in a hot loop.

After each operation is decoded, the interval counter is decremented. When
it hits zero, an operation is chosen for profiling and tracked within the
pipeline until it retires. Along the way, information such as TLB lookups,
cache misses, time spent to issue etc is captured in the form of a sample.
The sample is then filtered according to certain criteria (e.g. load
latency) that can be specified in the event config (described under
format/) and, if the sample satisfies the filter, it is written out to
memory as a record, otherwise it is discarded. Only one operation can
be sampled at a time.

The in-memory buffer is linear and virtually addressed, raising an
interrupt when it fills up. The PMU driver handles these interrupts to
give the appearance of a ring buffer, as expected by the AUX code.

The in-memory trace-like format is self-describing (though not parseable
in reverse) and written as a series of records, with each record
corresponding to a sample and consisting of a sequence of packets. These
packets are defined by the architecture, although some have CPU-specific
fields for recording information specific to the microarchitecture.

As a simple example, a record generated for a branch instruction may
consist of the following packets:

  0 (Address) : Virtual PC of the branch instruction
  1 (Type)    : Conditional direct branch
  2 (Counter) : Number of cycles taken from Dispatch to Issue
  3 (Address) : Virtual branch target + condition flags
  4 (Counter) : Number of cycles taken from Dispatch to Complete
  5 (Events)  : Mispredicted as not-taken
  6 (END)     : End of record

It is also possible to toggle properties such as timestamp packets in
each record.

This patch adds support for SPE in the form of a new perf driver.

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/perf/Kconfig       |    8 +
 drivers/perf/Makefile      |    1 +
 drivers/perf/arm_spe_pmu.c | 1243 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 1252 insertions(+)
 create mode 100644 drivers/perf/arm_spe_pmu.c

Comments

Kim Phillips June 5, 2017, 3:55 p.m. UTC | #1
On Mon, 5 Jun 2017 16:22:56 +0100
Will Deacon <will.deacon@arm.com> wrote:

> +/* Perf callbacks */
> +static int arm_spe_pmu_event_init(struct perf_event *event)
> +{
> +	u64 reg;
> +	struct perf_event_attr *attr = &event->attr;
> +	struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
> +
> +	/* This is, of course, deeply driver-specific */
> +	if (attr->type != event->pmu->type)
> +		return -ENOENT;
> +
> +	if (event->cpu >= 0 &&
> +	    !cpumask_test_cpu(event->cpu, &spe_pmu->supported_cpus))
> +		return -ENOENT;
> +
> +	if (arm_spe_event_to_pmsevfr(event) & PMSEVFR_EL1_RES0)
> +		return -EOPNOTSUPP;
> +
> +	if (event->hw.sample_period < spe_pmu->min_period ||
> +	    event->hw.sample_period & PMSIRR_EL1_IVAL_MASK)
> +		return -EOPNOTSUPP;
> +
> +	if (attr->exclude_idle)
> +		return -EOPNOTSUPP;
> +
> +	/*
> +	 * Feedback-directed frequency throttling doesn't work when we
> +	 * have a buffer of samples. We'd need to manually count the
> +	 * samples in the buffer when it fills up and adjust the event
> +	 * count to reflect that. Instead, force the user to specify a
> +	 * sample period instead.
> +	 */
> +	if (attr->freq)
> +		return -EINVAL;
> +
> +	reg = arm_spe_event_to_pmsfcr(event);
> +	if ((reg & BIT(PMSFCR_EL1_FE_SHIFT)) &&
> +	    !(spe_pmu->features & SPE_PMU_FEAT_FILT_EVT))
> +		return -EOPNOTSUPP;
> +
> +	if ((reg & BIT(PMSFCR_EL1_FT_SHIFT)) &&
> +	    !(spe_pmu->features & SPE_PMU_FEAT_FILT_TYP))
> +		return -EOPNOTSUPP;
> +
> +	if ((reg & BIT(PMSFCR_EL1_FL_SHIFT)) &&
> +	    !(spe_pmu->features & SPE_PMU_FEAT_FILT_LAT))
> +		return -EOPNOTSUPP;
> +
> +	return 0;
> +}

AFAICT, my comments from the last submission have still not been fully
addressed:

http://lists.infradead.org/pipermail/linux-arm-kernel/2017-May/508027.html

Thanks,

Kim
Will Deacon June 5, 2017, 4:11 p.m. UTC | #2
On Mon, Jun 05, 2017 at 10:55:16AM -0500, Kim Phillips wrote:
> On Mon, 5 Jun 2017 16:22:56 +0100
> Will Deacon <will.deacon@arm.com> wrote:
> 
> > +/* Perf callbacks */
> > +static int arm_spe_pmu_event_init(struct perf_event *event)
> > +{
> > +	u64 reg;
> > +	struct perf_event_attr *attr = &event->attr;
> > +	struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
> > +
> > +	/* This is, of course, deeply driver-specific */
> > +	if (attr->type != event->pmu->type)
> > +		return -ENOENT;
> > +
> > +	if (event->cpu >= 0 &&
> > +	    !cpumask_test_cpu(event->cpu, &spe_pmu->supported_cpus))
> > +		return -ENOENT;
> > +
> > +	if (arm_spe_event_to_pmsevfr(event) & PMSEVFR_EL1_RES0)
> > +		return -EOPNOTSUPP;
> > +
> > +	if (event->hw.sample_period < spe_pmu->min_period ||
> > +	    event->hw.sample_period & PMSIRR_EL1_IVAL_MASK)
> > +		return -EOPNOTSUPP;
> > +
> > +	if (attr->exclude_idle)
> > +		return -EOPNOTSUPP;
> > +
> > +	/*
> > +	 * Feedback-directed frequency throttling doesn't work when we
> > +	 * have a buffer of samples. We'd need to manually count the
> > +	 * samples in the buffer when it fills up and adjust the event
> > +	 * count to reflect that. Instead, force the user to specify a
> > +	 * sample period instead.
> > +	 */
> > +	if (attr->freq)
> > +		return -EINVAL;
> > +
> > +	reg = arm_spe_event_to_pmsfcr(event);
> > +	if ((reg & BIT(PMSFCR_EL1_FE_SHIFT)) &&
> > +	    !(spe_pmu->features & SPE_PMU_FEAT_FILT_EVT))
> > +		return -EOPNOTSUPP;
> > +
> > +	if ((reg & BIT(PMSFCR_EL1_FT_SHIFT)) &&
> > +	    !(spe_pmu->features & SPE_PMU_FEAT_FILT_TYP))
> > +		return -EOPNOTSUPP;
> > +
> > +	if ((reg & BIT(PMSFCR_EL1_FL_SHIFT)) &&
> > +	    !(spe_pmu->features & SPE_PMU_FEAT_FILT_LAT))
> > +		return -EOPNOTSUPP;
> > +
> > +	return 0;
> > +}
> 
> AFAICT, my comments from the last submission have still not been fully
> addressed:
> 
> http://lists.infradead.org/pipermail/linux-arm-kernel/2017-May/508027.html

To be frank, I really don't plan to address them and, even if I did, I would
trust Mark to NAK the change. If you're desperate for pr_debug, I'll add it
to keep you happy, but anything more than that needs to come in the form of
a separate patch submission addressing the wider problem of error reporting
from PMU drivers back to userspace. Patches welcome, but I suspect you're
still busy working on the tools code.

Do you have any constructive comments on the patch?

Will
Mark Rutland June 15, 2017, 2:57 p.m. UTC | #3
Hi Will,

Sorry for the delay on this review; it's taken me a while to ingest DDI
0586A and get a feel for how profiling PMUs work.

I have a number of comments below.

On Mon, Jun 05, 2017 at 04:22:56PM +0100, Will Deacon wrote:
> +/* ID registers */
> +#define PMSIDR_EL1			sys_reg(3, 0, 9, 9, 7)

Nit: could we please give the sysreg definitions a SYS_ prefix, for
consistency with other (architected) sysregs?

Ideally, as these are architected they'd live in <asm/sysreg.h>, but I'm
happy to factor that out later.

> +#define PMSIDR_EL1_FE_SHIFT		0
> +#define PMSIDR_EL1_FT_SHIFT		1
> +#define PMSIDR_EL1_FL_SHIFT		2
> +#define PMSIDR_EL1_ARCHINST_SHIFT	3
> +#define PMSIDR_EL1_LDS_SHIFT		4
> +#define PMSIDR_EL1_ERND_SHIFT		5
> +#define PMSIDR_EL1_INTERVAL_SHIFT	8
> +#define PMSIDR_EL1_INTERVAL_MASK	0xfUL
> +#define PMSIDR_EL1_MAXSIZE_SHIFT	12
> +#define PMSIDR_EL1_MAXSIZE_MASK		0xfUL
> +#define PMSIDR_EL1_COUNTSIZE_SHIFT	16
> +#define PMSIDR_EL1_COUNTSIZE_MASK	0xfUL
> +
> +#define PMBIDR_EL1			sys_reg(3, 0, 9, 10, 7)
> +#define PMBIDR_EL1_ALIGN_SHIFT		0
> +#define PMBIDR_EL1_ALIGN_MASK		0xfU
> +#define PMBIDR_EL1_P_SHIFT		4
> +#define PMBIDR_EL1_F_SHIFT		5
> +
> +/* Sampling controls */
> +#define PMSCR_EL1			sys_reg(3, 0, 9, 9, 0)
> +#define PMSCR_EL1_E0SPE_SHIFT		0
> +#define PMSCR_EL1_E1SPE_SHIFT		1
> +#define PMSCR_EL1_CX_SHIFT		3
> +#define PMSCR_EL1_PA_SHIFT		4
> +#define PMSCR_EL1_TS_SHIFT		5
> +#define PMSCR_EL1_PCT_SHIFT		6
> +
> +#define PMSICR_EL1			sys_reg(3, 0, 9, 9, 2)
> +
> +#define PMSIRR_EL1			sys_reg(3, 0, 9, 9, 3)
> +#define PMSIRR_EL1_RND_SHIFT		0
> +#define PMSIRR_EL1_IVAL_MASK		0xffUL

This is a little odd. This covers RND and (some) RES0 bits, not
INTERVAL, so it's the opposite polarity to the rest of the masks, and
incomplete (missing the upper 32 RES0 bits).

Could we please have this in the same polarity as the other masks,
precisely covering the INTERVAL field? i.e.

#define PMSIRR_EL1_INTERVAL_SHIFT	8
#define PMSIRR_EL1_INTERVAL_MASK	0xffffffUL

... I've proposed corresponding updates to the usages below.

> +
> +/* Filtering controls */
> +#define PMSFCR_EL1			sys_reg(3, 0, 9, 9, 4)
> +#define PMSFCR_EL1_FE_SHIFT		0
> +#define PMSFCR_EL1_FT_SHIFT		1
> +#define PMSFCR_EL1_FL_SHIFT		2
> +#define PMSFCR_EL1_B_SHIFT		16
> +#define PMSFCR_EL1_LD_SHIFT		17
> +#define PMSFCR_EL1_ST_SHIFT		18
> +
> +#define PMSEVFR_EL1			sys_reg(3, 0, 9, 9, 5)
> +#define PMSEVFR_EL1_RES0		0x0000ffff00ff0f55UL
> +
> +#define PMSLATFR_EL1			sys_reg(3, 0, 9, 9, 6)
> +#define PMSLATFR_EL1_MINLAT_SHIFT	0

I was going to suggest that we need a PMSLATFR_EL1_MINLAT_MASK, but I
see this is handled implicitly by ATTR_CFG_FLD_min_latency_{LO,HI} and
ATTR_CFG_GET_FLD().

> +
> +/* Buffer controls */
> +#define PMBLIMITR_EL1			sys_reg(3, 0, 9, 10, 0)
> +#define PMBLIMITR_EL1_E_SHIFT		0
> +#define PMBLIMITR_EL1_FM_SHIFT		1
> +#define PMBLIMITR_EL1_FM_MASK		0x3UL
> +#define PMBLIMITR_EL1_FM_STOP_IRQ	(0 << PMBLIMITR_EL1_FM_SHIFT)
> +
> +#define PMBPTR_EL1			sys_reg(3, 0, 9, 10, 1)
> +
> +/* Buffer error reporting */
> +#define PMBSR_EL1			sys_reg(3, 0, 9, 10, 3)
> +#define PMBSR_EL1_COLL_SHIFT		16
> +#define PMBSR_EL1_S_SHIFT		17
> +#define PMBSR_EL1_EA_SHIFT		18
> +#define PMBSR_EL1_DL_SHIFT		19
> +#define PMBSR_EL1_EC_SHIFT		26
> +#define PMBSR_EL1_EC_MASK		0x3fUL
> +
> +#define PMBSR_EL1_EC_BUF		(0x0UL << PMBSR_EL1_EC_SHIFT)
> +#define PMBSR_EL1_EC_FAULT_S1		(0x24UL << PMBSR_EL1_EC_SHIFT)
> +#define PMBSR_EL1_EC_FAULT_S2		(0x25UL << PMBSR_EL1_EC_SHIFT)
> +
> +#define PMBSR_EL1_FAULT_FSC_SHIFT	0
> +#define PMBSR_EL1_FAULT_FSC_MASK	0x3fUL

Nit: it might be worth having MSS in the name to show the field
hierarchy, e.g. PMBSR_EL1_MSS_FAULT_FSC_MASK

> +
> +#define PMBSR_EL1_BUF_BSC_SHIFT		0
> +#define PMBSR_EL1_BUF_BSC_MASK		0x3fUL
> +
> +#define PMBSR_EL1_BUF_BSC_FULL		(0x1UL << PMBSR_EL1_BUF_BSC_SHIFT)

Likewise here.

> +
> +#define psb_csync()			asm volatile("hint #17")

Other than my comments above, all the register/field/insn definitions
appear to be correct per DDI 0586A.

> +
> +struct arm_spe_pmu_buf {
> +	int					nr_pages;
> +	bool					snapshot;
> +	void					*base;
> +};
> +
> +struct arm_spe_pmu {
> +	struct pmu				pmu;
> +	struct platform_device			*pdev;
> +	cpumask_t				supported_cpus;
> +	struct hlist_node			hotplug_node;
> +
> +	int					irq; /* PPI */
> +
> +	u16					min_period;
> +	u16					cnt_width;

Elsewhere, we refer to this as the counter size (e.g.
SPE_PMU_CAP_CNT_SZ, and count_size in the sysfs interface).

Could we please pick one of "width" and "size" and use it consistently?

I guess "size" is preferable, given the architectural name is
"CountSize".

> +
> +#define SPE_PMU_FEAT_FILT_EVT			(1UL << 0)
> +#define SPE_PMU_FEAT_FILT_TYP			(1UL << 1)
> +#define SPE_PMU_FEAT_FILT_LAT			(1UL << 2)
> +#define SPE_PMU_FEAT_ARCH_INST			(1UL << 3)
> +#define SPE_PMU_FEAT_LDS			(1UL << 4)
> +#define SPE_PMU_FEAT_ERND			(1UL << 5)
> +#define SPE_PMU_FEAT_DEV_PROBED			(1UL << 63)
> +	u64					features;
> +
> +	u16					max_record_sz;
> +	u16					align;
> +	struct perf_output_handle __percpu	*handle;
> +};
> +
> +#define to_spe_pmu(p) (container_of(p, struct arm_spe_pmu, pmu))
> +
> +/* Convert a free-running index from perf into an SPE buffer offset */
> +#define PERF_IDX2OFF(idx, buf)	((idx) & (((buf)->nr_pages << PAGE_SHIFT) - 1))

The masking logic here assumes nr_pages is a power of two.

That's not always true, as arm_spe_pmu_setup_aux() only ensures that
nr_pages is a multiple of 2 * PAGE_SIZE, and only when using snapshot
mode.

For example, with ten 4K pages:

nr_pages:			0b1010
(nr_pages << PAGE_SHIFT):	0b1010000000000000
(nr_pages << PAGE_SHIFT) - 1:	0b1001111111111111

> +
> +/* Keep track of our dynamic hotplug state */
> +static enum cpuhp_state arm_spe_pmu_online;
> +
> +/* This sysfs gunk was really good fun to write. */
> +enum arm_spe_pmu_capabilities {
> +	SPE_PMU_CAP_ARCH_INST = 0,

No need for the initializer; enums start at zero.

> +	SPE_PMU_CAP_ERND,
> +	SPE_PMU_CAP_FEAT_MAX,
> +	SPE_PMU_CAP_CNT_SZ = SPE_PMU_CAP_FEAT_MAX,
> +	SPE_PMU_CAP_MIN_IVAL,
> +};

IMO, it would be worth having s/CAP/CAP_FEAT/ for the HW features.

We could get rid of the confusing SPE_PMU_CAP_FEAT_MAX definition here,
if we were to:

> +
> +static int arm_spe_pmu_feat_caps[SPE_PMU_CAP_FEAT_MAX] = {
> +	[SPE_PMU_CAP_ARCH_INST]	= SPE_PMU_FEAT_ARCH_INST,
> +	[SPE_PMU_CAP_ERND]	= SPE_PMU_FEAT_ERND,
> +};

.. change this to:

static int arm_spe_pmu_feat_caps[] = {
	...
};

... and:

> +
> +static u32 arm_spe_pmu_cap_get(struct arm_spe_pmu *spe_pmu, int cap)
> +{
> +	if (cap < SPE_PMU_CAP_FEAT_MAX)

... change this to:

	if (cap < ARRAY_SIZE(arm_spe_pmu_feat_caps))

> +		return !!(spe_pmu->features & arm_spe_pmu_feat_caps[cap]);
> +
> +	switch (cap) {
> +	case SPE_PMU_CAP_CNT_SZ:
> +		return spe_pmu->cnt_width;
> +	case SPE_PMU_CAP_MIN_IVAL:
> +		return spe_pmu->min_period;
> +	default:
> +		WARN(1, "unknown cap %d\n", cap);
> +	}
> +
> +	return 0;
> +}
> +
> +static ssize_t arm_spe_pmu_cap_show(struct device *dev,
> +				    struct device_attribute *attr,
> +				    char *buf)
> +{
> +	struct platform_device *pdev = to_platform_device(dev);
> +	struct arm_spe_pmu *spe_pmu = platform_get_drvdata(pdev);
> +	struct dev_ext_attribute *ea =
> +		container_of(attr, struct dev_ext_attribute, attr);
> +	int cap = (long)ea->var;
> +
> +	return snprintf(buf, PAGE_SIZE, "%u\n",
> +		arm_spe_pmu_cap_get(spe_pmu, cap));
> +}
> +
> +#define SPE_EXT_ATTR_ENTRY(_name, _func, _var)				\
> +	&((struct dev_ext_attribute[]) {				\
> +		{ __ATTR(_name, S_IRUGO, _func, NULL), (void *)_var }	\
> +	})[0].attr.attr
> +
> +#define SPE_CAP_EXT_ATTR_ENTRY(_name, _var)				\
> +	SPE_EXT_ATTR_ENTRY(_name, arm_spe_pmu_cap_show, _var)
> +
> +static struct attribute *arm_spe_pmu_cap_attr[] = {
> +	SPE_CAP_EXT_ATTR_ENTRY(arch_inst, SPE_PMU_CAP_ARCH_INST),
> +	SPE_CAP_EXT_ATTR_ENTRY(ernd, SPE_PMU_CAP_ERND),
> +	SPE_CAP_EXT_ATTR_ENTRY(count_size, SPE_PMU_CAP_CNT_SZ),
> +	SPE_CAP_EXT_ATTR_ENTRY(min_interval, SPE_PMU_CAP_MIN_IVAL),
> +	NULL,
> +};

I'd have expected GCC to warn about (integer/enum) _var values being
cast straight to void *, given the size mismatch.

Is that not the case, or do we need an unsigned long cast in
SPE_CAP_EXT_ATTR_ENTRY()?

Maybe GCC only complains the other way around.

[...]

> +/* Convert between user ABI and register values */
> +static u64 arm_spe_event_to_pmscr(struct perf_event *event)
> +{
> +	struct perf_event_attr *attr = &event->attr;
> +	u64 reg = 0;
> +
> +	reg |= ATTR_CFG_GET_FLD(attr, ts_enable) << PMSCR_EL1_TS_SHIFT;
> +	reg |= ATTR_CFG_GET_FLD(attr, pa_enable) << PMSCR_EL1_PA_SHIFT;

We should limit PA access to privileged users.

> +
> +	if (!attr->exclude_user)
> +		reg |= BIT(PMSCR_EL1_E0SPE_SHIFT);
> +
> +	if (!attr->exclude_kernel)
> +		reg |= BIT(PMSCR_EL1_E1SPE_SHIFT);
> +
> +	if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR))
> +		reg |= BIT(PMSCR_EL1_CX_SHIFT);

... maybe likewise for CONTEXTIDR, too.

> +
> +	return reg;
> +}
> +
> +static void arm_spe_event_sanitise_period(struct perf_event *event)
> +{
> +	struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
> +	u64 period = event->hw.sample_period & ~PMSIRR_EL1_IVAL_MASK;

... as noted above, the upper 32 bits are RES0 in addition to the low 8
bits, so we need to explicitly check bits 31:8, e.g.

	u64 period = event->hw.sample_period;
	period &= (PMSIRR_EL1_INTERVAL_MASK << PMSIRR_EL1_INTERVAL_SHIFT);

> +
> +	if (period < spe_pmu->min_period)
> +		period = spe_pmu->min_period;

We already verify this in arm_spe_pmu_event_init(), so we don't need to
check this here.

We can drop arm_spe_event_sanitise_period() entirely. Given we validate
the period at event_init() time, there's no need to sanitize the value.

> +
> +	event->hw.sample_period = period;
> +}
> +
> +static u64 arm_spe_event_to_pmsirr(struct perf_event *event)
> +{
> +	struct perf_event_attr *attr = &event->attr;
> +	u64 reg = 0;
> +
> +	arm_spe_event_sanitise_period(event);
> +
> +	reg |= ATTR_CFG_GET_FLD(attr, jitter) << PMSIRR_EL1_RND_SHIFT;
> +	reg |= event->hw.sample_period;
> +
> +	return reg;
> +}

Given the above:

	u64 reg = event->hw.sample_period;
	reg |= ATTR_CFG_GET_FLD(attr, jitter) << PMSIRR_EL1_RND_SHIFT;

	return reg;

[...]

> +static bool arm_spe_pmu_buffer_mgmt_pending(u64 pmbsr)
> +{
> +	const char *err_str;
> +
> +	/* Service required? */
> +	if (!(pmbsr & BIT(PMBSR_EL1_S_SHIFT)))
> +		return false;
> +
> +	/* We only expect buffer management events */
> +	switch (pmbsr & (PMBSR_EL1_EC_MASK << PMBSR_EL1_EC_SHIFT)) {
> +	case PMBSR_EL1_EC_BUF:
> +		/* Handled below */
> +		break;
> +	case PMBSR_EL1_EC_FAULT_S1:
> +	case PMBSR_EL1_EC_FAULT_S2:
> +		err_str = "Unexpected buffer fault";
> +		goto out_err;
> +	default:
> +		err_str = "Unknown error code";
> +		goto out_err;
> +	}

For the error cases, I take it the assumption is that we leave
PMBSR_EL1.S set, so that the HW doesn't start again?

> +
> +	/* Buffer management event */
> +	switch (pmbsr & (PMBSR_EL1_BUF_BSC_MASK << PMBSR_EL1_BUF_BSC_SHIFT)) {
> +	case PMBSR_EL1_BUF_BSC_FULL:
> +		/* Ensure new profiling data is visible to the CPU */
> +		psb_csync();
> +		dsb(nsh);

I think that NSH might not be sufficient, given how this function is
used by callers below. I'll comment specifically in those call-sites.

> +		return true;
> +	default:
> +		err_str = "Unknown buffer status code";
> +	}
> +
> +out_err:
> +	pr_err_ratelimited("%s on CPU %d [PMBSR=0x%08llx]\n", err_str,
> +			   smp_processor_id(), pmbsr);

It might be worth dumping pmbsr with %016lx. The upper 64 bits are
currently RES0, but they do exist.

> +	return false;
> +}
> +

Could we have a comment block here to describe (roughly) what 
we're trying to do for the snapshot case?

> +static u64 arm_spe_pmu_next_snapshot_off(struct perf_output_handle *handle)
> +{
> +	struct arm_spe_pmu_buf *buf = perf_get_aux(handle);
> +	struct arm_spe_pmu *spe_pmu = to_spe_pmu(handle->event->pmu);
> +	u64 head = PERF_IDX2OFF(handle->head, buf);
> +	u64 limit = buf->nr_pages * PAGE_SIZE;
> +
> +	/*
> +	 * The trace format isn't parseable in reverse, so clamp
> +	 * the limit to half of the buffer size in snapshot mode
> +	 * so that the worst case is half a buffer of records, as
> +	 * opposed to a single record.
> +	 */
> +	if (head < limit >> 1)
> +		limit >>= 1;

I was going to ask how we ensured nr_pages was 2 * SZ_4K * k, but I see
that arm_spe_pmu_setup_aux() ensures that when using snapshot mode.

> +
> +	/*
> +	 * If we're within max_record_sz of the limit, we must
> +	 * pad, move the head index and recompute the limit.
> +	 */
> +	if (limit - head < spe_pmu->max_record_sz) {
> +		memset(buf->base + head, 0, limit - head);

Could we have a mnemonic for the padding byte, and/or a helper that
wraps memset? e.g.

static void pad_buffer(void *start, u64 size)
{
	/* The padding packet is a single zero byte */
	memset(start, 0, size);
}

> +		handle->head = PERF_IDX2OFF(limit, buf);
> +		limit = ((buf->nr_pages * PAGE_SIZE) >> 1) + handle->head;
> +	}
> +
> +	return limit;
> +}
> +
> +static u64 __arm_spe_pmu_next_off(struct perf_output_handle *handle)
> +{
> +	struct arm_spe_pmu_buf *buf = perf_get_aux(handle);
> +	u64 head = PERF_IDX2OFF(handle->head, buf);
> +	u64 tail = PERF_IDX2OFF(handle->head + handle->size, buf);
> +	u64 wakeup = PERF_IDX2OFF(handle->wakeup, buf);
> +	u64 limit = buf->nr_pages * PAGE_SIZE;
> +
> +	/*
> +	 * Set the limit pointer to either the watermark or the
> +	 * current tail pointer; whichever comes first.
> +	 */
> +	if (handle->head + handle->size <= handle->wakeup) {
> +		/* The tail is next, so check for wrapping */
> +		if (tail >= head) {
> +			/*
> +			 * No wrapping, but need to align downwards to
> +			 * avoid corrupting unconsumed data.
> +			 */
> +			limit = round_down(tail, PAGE_SIZE);
> +
> +		}
> +	} else if (wakeup >= head) {

When wakeup == head, do we need to signal a wakeup event somehow?
Currently we'll pad the buffer, signal truncation, and end output, which
seems a little odd, but maybe that's what perf expects.

> +		/*
> +		 * The wakeup is next and doesn't wrap. Align upwards to
> +		 * ensure that we do indeed reach the watermark.
> +		 */
> +		limit = round_up(wakeup, PAGE_SIZE);
> +
> +		/*
> +		 * If rounding up crosses the tail, then we have to
> +		 * round down to avoid corrupting unconsumed data.
> +		 * Hopefully the tail will have moved by the time we
> +		 * hit the new limit.
> +		 */
> +		if (wakeup < tail && limit > tail)
> +			limit = round_down(wakeup, PAGE_SIZE);
> +	}

It took me a while to grok that we must consider the wakeup in
free-running counter space to avoid early wakeups, while we must
consider the tail in ring-buffer offset space to avoid clobbering data.

With that understanding, I think we have an issue here. If wakeup is
more than buffer size in the future, and the buffer is empty, I think we
set the limit too low.

In that case, we'd evaluate:

	handle->head + handle->size <= handle->wakeup

... as true, since size is at most buffer size. Thus we'd go into the
first if block. There we'd evaluate:

	tail >= head

... as true, since when the buffer is empty, head == tail. Thus, we'd
set the limit to:

	round_down(tail, PAGE_SIZE)

... which'll leave us with limit <= head, since head == tail. Thus,
we'll hit the case below:

> +
> +	/*
> +	 * If rounding down crosses the head, then the buffer is full,
> +	 * so pad to tail and end the session.
> +	 */
> +	if (limit <= head) {
> +		memset(buf->base + head, 0, handle->size);
> +		perf_aux_output_skip(handle, handle->size);
> +		perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
> +		perf_aux_output_end(handle, 0);
> +		limit = 0;
> +	}
> +
> +	return limit;
> +}

... and end all output, even though the entire buffer was empty, and we
could have returned the end of the buffer as the limit.

It might be that something prevents wakeup from being that far in the
future, but in previous discussions we'd assumed that it could be any
arbitrary value.

I believe we can solve that, and simplify the logic as below. I've left
the wakeup < head and wakeup == head cases as above, ignored and
terminating respectively.

static u64 __arm_spe_pmu_next_off(struct perf_output_handle *handle)
{
	struct arm_spe_pmu_buf *buf = perf_get_aux(handle);
	const u64 bufsize = buf->nr_pages * PAGE_SIZE;
	u64 limit = bufsize;
	u64 head = PERF_IDX2OFF(handle->head, buf);
	u64 tail = PERF_IDX2OFF(handle->head + handle->size, buf);
	u64 wakeup = PERF_IDX2OFF(handle->wakeup, buf);

	if (!handle->size)
		goto no_space;
	
	/*
	 * Avoid clobbering unconsumed data. We know we have space, so
	 * if we see head == tail we know that the buffer is empty. If
	 * head > tail, then there's nothing to clobber prior to
	 * wrapping.
	 */
	if (head < tail)
		limit = round_down(tail, PAGE_SIZE);
	
	/*
	 * Wakeup may be arbitrarily far into future. If it's not in the
	 * current generation, either we'll wrap before hitting it, or
	 * it's in the past and has been handled already.
	 *
	 * If there's a wakeup before we wrap, arrange to be woken up by
	 * the page boundary following it. Keep the tail boundary if
	 * that's lower.
	 */
	if ((handle->wakeup / bufsize) == (handle->head / bufsize)) &&
	    head <= wakeup)
		limit = min(limit, round_up(wakeup, PAGE_SIZE));

	if (limit <= head)
		goto no_space;
	
	return limit;

no_space:
	memset(buf->base + head, 0, handle->size);
	perf_aux_output_skip(handle, handle->size);
	perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
	perf_aux_output_end(handle, 0);

	return 0;
}

> +
> +static u64 arm_spe_pmu_next_off(struct perf_output_handle *handle)
> +{
> +	struct arm_spe_pmu_buf *buf = perf_get_aux(handle);
> +	struct arm_spe_pmu *spe_pmu = to_spe_pmu(handle->event->pmu);
> +	u64 limit = __arm_spe_pmu_next_off(handle);
> +	u64 head = PERF_IDX2OFF(handle->head, buf);
> +
> +	/*
> +	 * If the head has come too close to the end of the buffer,
> +	 * then pad to the end and recompute the limit.
> +	 */
> +	if (limit && (limit - head < spe_pmu->max_record_sz)) {
> +		memset(buf->base + head, 0, limit - head);
> +		perf_aux_output_skip(handle, limit - head);
> +		limit = __arm_spe_pmu_next_off(handle);
> +	}
> +
> +	return limit;
> +}
> +
> +static void arm_spe_perf_aux_output_begin(struct perf_output_handle *handle,
> +					  struct perf_event *event)
> +{
> +	u64 base, limit;
> +	struct arm_spe_pmu_buf *buf;
> +
> +	/* Start a new aux session */
> +	buf = perf_aux_output_begin(handle, event);
> +	if (!buf) {
> +		event->hw.state |= PERF_HES_STOPPED;
> +		/*
> +		 * We still need to clear the limit pointer, since the
> +		 * profiler might only be disabled by virtue of a fault.
> +		 */
> +		limit = 0;
> +		goto out_write_limit;
> +	}
> +
> +	limit = buf->snapshot ? arm_spe_pmu_next_snapshot_off(handle)
> +			      : arm_spe_pmu_next_off(handle);
> +	if (limit)
> +		limit |= BIT(PMBLIMITR_EL1_E_SHIFT);
> +
> +	base = (u64)buf->base + PERF_IDX2OFF(handle->head, buf);
> +	write_sysreg_s(base, PMBPTR_EL1);
> +	limit += (u64)buf->base;
> +

I believe an isb() is necessary here to ensure the write to PMBPTR_EL1
occurs before the write to PMBLIMITR_EL1 enables the PMU. Otherwise, the
CPU could execute those out-of-order.

It's not clear to me whether that is sufficient for the PMU to observe
the new PMBPTR_EL1 before the new PMBLIMITR_EL1 value, though I assume
it must be.

> +out_write_limit:
> +	write_sysreg_s(limit, PMBLIMITR_EL1);
> +}
> +
> +static bool arm_spe_perf_aux_output_end(struct perf_output_handle *handle,
> +					struct perf_event *event,
> +					bool resume)
> +{
> +	u64 pmbptr, pmbsr, offset, size;
> +	struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
> +	struct arm_spe_pmu_buf *buf = perf_get_aux(handle);
> +	bool truncated;
> +
> +	/*
> +	 * We can be called via IRQ work trying to disable the PMU after
> +	 * a buffer full event. In this case, the aux session has already
> +	 * been stopped, so there's nothing to do here.
> +	 */
> +	if (!buf)
> +		return false;
> +
> +	/*
> +	 * If there isn't a pending management event and we're not stopping
> +	 * the current session, then just leave everything alone.
> +	 */
> +	pmbsr = read_sysreg_s(PMBSR_EL1);

When we call from arm_spe_pmu_irq_handler(), I think we need
synchronisation before reading PMBSR_EL1.

AFAICT from the spec, a context synchronisation event doesn't ensure
that the PMU's indirect write to PMBSR_EL1 is visible to the PE's direct
read above. I beleive we need a PSB CSYNC (and subsequent ISB) to ensure
that.

The only other caller is from arm_spe_pmu_stop(), which first calls
arm_spe_pmu_disable_and_drain_local(), so I guess the new barriers
should live in arm_spe_pmu_irq_handler(). I'll comment there.

> +	if (!arm_spe_pmu_buffer_mgmt_pending(pmbsr) && resume)
> +		return false; /* Spurious IRQ */
> +
> +	/* Ensure hardware updates to PMBPTR_EL1 are visible */
> +	isb();

Can we please move this into arm_spe_pmu_buffer_mgmt_pending(), after
the associated PSB CSYNC?

Then we can say that arm_spe_pmu_buffer_mgmt_pending() ensures all HW
updates have been synchronised (and made visible) if it returns true,
and it's easier to see that the synchronisation is correct.

> +
> +	/*
> +	 * Work out how much data has been written since the last update
> +	 * to the head index.
> +	 */
> +	pmbptr = round_down(read_sysreg_s(PMBPTR_EL1), spe_pmu->align);

I don't believe we need to align this.

Per the spec, PMBPTR_EL1[M:0] are RES0 in HW unless sync external abort
reporting is present, in which case they're valid. We write these bits
as zero, unless we have a bug elsewhere.

... so either the bits are zero, and we're fine, or an external abourt
has been hit. In the external abort case, we have no idea how far we
need to reverse the base pointer anyhow.

> +	offset = pmbptr - (u64)buf->base;
> +	size = offset - PERF_IDX2OFF(handle->head, buf);
> +
> +	if (buf->snapshot)
> +		handle->head = offset;

It's be worth a /* see arm_spe_pmu_next_snapshot_off() */ comment
or similar to explain what we're going for the snapshot case here.

> +
> +	/*
> +	 * Either the buffer is full or we're stopping the session. Check
> +	 * that we didn't write a partial record, since this can result
> +	 * in unparseable trace and we must disable the event.
> +	 */
> +	if (pmbsr & BIT(PMBSR_EL1_COLL_SHIFT))
> +		perf_aux_output_flag(handle, PERF_AUX_FLAG_COLLISION);
> +
> +	truncated = pmbsr & BIT(PMBSR_EL1_DL_SHIFT);
> +	if (truncated)
> +		perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
> +
> +	perf_aux_output_end(handle, size);

The comment block above perf_aux_output_end() says:

  It is the pmu driver's responsibility to observe ordering rules of the
  hardware, so that all the data is externally visible before this is
  called.

... but in arm_spe_pmu_buffer_mgmt_pending() we only ensured that the
data was visible in the current NSH domain (i.e. only to this CPU).

I followed the callchain for updating head:

perf_aux_output_end()
-> perf_event_aux_event()
-> perf_output_end()
-> perf_output_put_handle()

... I see that there's an smp_wmb() (i.e. a DMB ISHST) on that path, but
it's not clear to me if that's sufficient to ensure that the PMU's
writes are made visible to other CPUs.

Given the comment, I'd feel happier if we had something here or in
arm_spe_pmu_buffer_mgmt_pending() to ensure that the PMU's prior writes
are visible to other CPUs.

> +
> +	/*
> +	 * If we're not resuming the session, then we can clear the fault
> +	 * and we're done, otherwise we need to start a new session.
> +	 */
> +	if (!resume)
> +		write_sysreg_s(0, PMBSR_EL1);
> +	else if (!truncated)
> +		arm_spe_perf_aux_output_begin(handle, event);
> +
> +	return true;
> +}
> +
> +/* IRQ handling */
> +static irqreturn_t arm_spe_pmu_irq_handler(int irq, void *dev)
> +{
> +	struct perf_output_handle *handle = dev;
> +
> +	if (!perf_get_aux(handle))
> +		return IRQ_NONE;
> +
> +	if (!arm_spe_perf_aux_output_end(handle, handle->event, true))
> +		return IRQ_NONE;

As commented in arm_spe_perf_aux_output_end(), I think we need a
psb_csync(); isb() sequence prior to the read of PMBSR_EL1 in
arm_spe_perf_aux_output_end() to ensure that it is up-to-date w.r.t. the
interrupt.

> +
> +	irq_work_run();
> +	isb(); /* Ensure the buffer is disabled if data loss has occurred */

What exactly are we synchronising here?

AFAICT, when truncation occurs we don't clear PMBLIMITR_EL1.E, so the
buffer is only implicitly disabled by the PMU's indirect write
PMBSR_EL1.S, which we must have already synchronised prior to reading
PMBSR_EL1.

... so I can't see why this is necessary.

> +	write_sysreg_s(0, PMBSR_EL1);

... and regardless, we clear PMBSR_EL1.S here, which'll start the PMU
again, even if truncation occured, which I don't think we want.

Can we have arm_spe_perf_aux_output_end() clear PMBLIMITR_EL1.E when
truncation occurs?

> +	return IRQ_HANDLED;
> +}
> +
> +/* Perf callbacks */
> +static int arm_spe_pmu_event_init(struct perf_event *event)
> +{
> +	u64 reg;
> +	struct perf_event_attr *attr = &event->attr;
> +	struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
> +
> +	/* This is, of course, deeply driver-specific */
> +	if (attr->type != event->pmu->type)
> +		return -ENOENT;
> +
> +	if (event->cpu >= 0 &&
> +	    !cpumask_test_cpu(event->cpu, &spe_pmu->supported_cpus))
> +		return -ENOENT;

We're not rejecting cpu < 0, so I take it we're trying to handle
per-task events?

As I've mentioned before, that case worries me. One thing I've just
realised we need to figure out is what happens if attr.inherit is set.
The core doesn't reject that, and I suspect we may need to here.

> +
> +	if (arm_spe_event_to_pmsevfr(event) & PMSEVFR_EL1_RES0)
> +		return -EOPNOTSUPP;
> +
> +	if (event->hw.sample_period < spe_pmu->min_period ||
> +	    event->hw.sample_period & PMSIRR_EL1_IVAL_MASK)
> +		return -EOPNOTSUPP;

As mentioned in the sysreg comments, we need to check the upper 32 bits
of the PMSIRR value are zero, so we'll need something like:

	if (event->hw.sample_period < spe_pmu->min_period)
		return -EOPNOTSUPP;
	
	if (event->hw.sample_period &
	    ~(PMSIRR_EL1_INTERVAL_MASK << PMSIRR_EL1_INTERVAL_SHIFT))
		return -EOPNOTSUPP;

I think there's a slight miswording in the spec. Page 56 of the spec
(DDI 0586A) says of the PMSIRR_EL1.INTERVAL field:

    Software should set this to a value greater than the minimum
    indicated by PMSIDR_EL1.Interval.

... whereas here we're checking it's *at least* the minimum interval,
which I think is the intent of the spec. That's probably something we
should have clarified spec-side, with s/greater than/at least/.

> +
> +	if (attr->exclude_idle)
> +		return -EOPNOTSUPP;
> +
> +	/*
> +	 * Feedback-directed frequency throttling doesn't work when we
> +	 * have a buffer of samples. We'd need to manually count the
> +	 * samples in the buffer when it fills up and adjust the event
> +	 * count to reflect that. Instead, force the user to specify a
> +	 * sample period instead.
> +	 */
> +	if (attr->freq)
> +		return -EINVAL;
> +
> +	reg = arm_spe_event_to_pmsfcr(event);
> +	if ((reg & BIT(PMSFCR_EL1_FE_SHIFT)) &&
> +	    !(spe_pmu->features & SPE_PMU_FEAT_FILT_EVT))
> +		return -EOPNOTSUPP;
> +
> +	if ((reg & BIT(PMSFCR_EL1_FT_SHIFT)) &&
> +	    !(spe_pmu->features & SPE_PMU_FEAT_FILT_TYP))
> +		return -EOPNOTSUPP;
> +
> +	if ((reg & BIT(PMSFCR_EL1_FL_SHIFT)) &&
> +	    !(spe_pmu->features & SPE_PMU_FEAT_FILT_LAT))
> +		return -EOPNOTSUPP;

Does anything prevent this event from being added to a group?

Surely we should check that here?

> +
> +	return 0;
> +}
> +
> +static void arm_spe_pmu_start(struct perf_event *event, int flags)
> +{
> +	u64 reg;
> +	struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
> +	struct hw_perf_event *hwc = &event->hw;
> +	struct perf_output_handle *handle = this_cpu_ptr(spe_pmu->handle);
> +
> +	hwc->state = 0;
> +	arm_spe_perf_aux_output_begin(handle, event);
> +	if (hwc->state)
> +		return;

I was expecting we'd do this last, since PMBLIIMITR.E enables profiling.

I understand that we're relying on the PMSCR_EL1 filtering value to
prevent anything being written to tbe buffer until we've vonfigured the
options, but I'd feel a lot happier if we consistently relied upon
PMBLIMITR.E for that.

> +
> +	reg = arm_spe_event_to_pmsfcr(event);
> +	write_sysreg_s(reg, PMSFCR_EL1);
> +
> +	reg = arm_spe_event_to_pmsevfr(event);
> +	write_sysreg_s(reg, PMSEVFR_EL1);
> +
> +	reg = arm_spe_event_to_pmslatfr(event);
> +	write_sysreg_s(reg, PMSLATFR_EL1);
> +
> +	if (flags & PERF_EF_RELOAD) {
> +		reg = arm_spe_event_to_pmsirr(event);
> +		write_sysreg_s(reg, PMSIRR_EL1);
> +		isb();
> +		reg = local64_read(&hwc->period_left);
> +		write_sysreg_s(reg, PMSICR_EL1);
> +	}
> +
> +	reg = arm_spe_event_to_pmscr(event);
> +	isb();
> +	write_sysreg_s(reg, PMSCR_EL1);
> +}
> +
> +static void arm_spe_pmu_disable_and_drain_local(void)
> +{
> +	/* Disable profiling at EL0 and EL1 */
> +	write_sysreg_s(0, PMSCR_EL1);
> +	isb();
> +
> +	/* Drain any buffered data */
> +	psb_csync();
> +	dsb(nsh);
> +
> +	/* Disable the profiling buffer */
> +	write_sysreg_s(0, PMBLIMITR_EL1);

Can't this be done when we clear PMSCR_EL1? Surely buffered data would
be written out regardless?

> +}

[...]

> +static void *arm_spe_pmu_setup_aux(int cpu, void **pages, int nr_pages,
> +				   bool snapshot)
> +{
> +	int i;
> +	struct page **pglist;
> +	struct arm_spe_pmu_buf *buf;
> +
> +	/*
> +	 * We require an even number of pages for snapshot mode, so that
> +	 * we can effectively treat the buffer as consisting of two equal
> +	 * parts and give userspace a fighting chance of getting some
> +	 * useful data out of it.
> +	 */
> +	if (!nr_pages || (snapshot && (nr_pages & 1)))
> +		return NULL;

As noted above, we may need to ensure that this is a pwoer of two.

> +
> +	buf = kzalloc_node(sizeof(*buf), GFP_KERNEL, cpu_to_node(cpu));
> +	if (!buf)
> +		return NULL;
> +
> +	pglist = kcalloc(nr_pages, sizeof(*pglist), GFP_KERNEL);
> +	if (!pglist)
> +		goto out_free_buf;
> +
> +	for (i = 0; i < nr_pages; ++i) {
> +		struct page *page = virt_to_page(pages[i]);
> +
> +		if (PagePrivate(page)) {
> +			pr_warn("unexpected high-order page for auxbuf!");

It looks like the intel-pt driver expects high-order pages.

What prevents us from seeing those?

Why can't we handle those?

How are these pages pinned? Does the core ensure that?

> +			goto out_free_pglist;
> +		}
> +
> +		pglist[i] = virt_to_page(pages[i]);
> +	}
> +
> +	buf->base = vmap(pglist, nr_pages, VM_MAP, PAGE_KERNEL);
> +	if (!buf->base)
> +		goto out_free_pglist;
> +
> +	buf->nr_pages	= nr_pages;
> +	buf->snapshot	= snapshot;
> +
> +	kfree(pglist);
> +	return buf;
> +
> +out_free_pglist:
> +	kfree(pglist);
> +out_free_buf:
> +	kfree(buf);
> +	return NULL;
> +}
> +
> +static void arm_spe_pmu_free_aux(void *aux)
> +{
> +	struct arm_spe_pmu_buf *buf = aux;
> +
> +	vunmap(buf->base);
> +	kfree(buf);
> +}
> +
> +/* Initialisation and teardown functions */
> +static int arm_spe_pmu_perf_init(struct arm_spe_pmu *spe_pmu)
> +{
> +	static atomic_t pmu_idx = ATOMIC_INIT(-1);
> +
> +	int idx;
> +	char *name;
> +	struct device *dev = &spe_pmu->pdev->dev;
> +
> +	spe_pmu->pmu = (struct pmu) {
> +		.capabilities	= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE,
> +		.attr_groups	= arm_spe_pmu_attr_groups,
> +		/*
> +		 * We hitch a ride on the software context here, so that
> +		 * we can support per-task profiling (which is not possible
> +		 * with the invalid context as it doesn't get sched callbacks).
> +		 * This requires that userspace either uses a dummy event for
> +		 * perf_event_open, since the aux buffer is not setup until
> +		 * a subsequent mmap, or creates the profiling event in a
> +		 * disabled state and explicitly PERF_EVENT_IOC_ENABLEs it
> +		 * once the buffer has been created.
> +		 */
> +		.task_ctx_nr	= perf_sw_context,

While other tracing PMUs do this, I think this is a horrible bodge, and
a bad idea, given it violates assumptions made in the core code.

For example, unlike true SW events, add() and start() can fail, so a
tracing event can unexpectedly stop SW events from being scheduled.

AFAICT, we could also try to move a tracing event into a later-created
HW PMU group, which is very worrying.

I really think we should have a separate tracing context for this class
of PMU, or we make it so that the invalid context can receive sched
callbacks.

> +		.event_init	= arm_spe_pmu_event_init,
> +		.add		= arm_spe_pmu_add,
> +		.del		= arm_spe_pmu_del,
> +		.start		= arm_spe_pmu_start,
> +		.stop		= arm_spe_pmu_stop,
> +		.read		= arm_spe_pmu_read,
> +		.setup_aux	= arm_spe_pmu_setup_aux,
> +		.free_aux	= arm_spe_pmu_free_aux,
> +	};
> +
> +	idx = atomic_inc_return(&pmu_idx);
> +	name = devm_kasprintf(dev, GFP_KERNEL, "%s_%d", PMUNAME, idx);
> +	return perf_pmu_register(&spe_pmu->pmu, name, -1);
> +}
> +
> +static void arm_spe_pmu_perf_destroy(struct arm_spe_pmu *spe_pmu)
> +{
> +	perf_pmu_unregister(&spe_pmu->pmu);
> +}
> +
> +static void __arm_spe_pmu_dev_probe(void *info)
> +{
> +	int fld;
> +	u64 reg;
> +	struct arm_spe_pmu *spe_pmu = info;
> +	struct device *dev = &spe_pmu->pdev->dev;
> +
> +	fld = cpuid_feature_extract_unsigned_field(read_cpuid(ID_AA64DFR0_EL1),
> +						   ID_AA64DFR0_PMSVER_SHIFT);
> +	if (!fld) {
> +		dev_err(dev,
> +			"unsupported ID_AA64DFR0_EL1.PMSVer [%d] on CPU %d\n",
> +			fld, smp_processor_id());
> +		return;
> +	}

Given we only bail out when PMSver is zero, surely we can just say:

	dev_err(dev, "SPE not supported on cpu %d", smp_processor_id())

Otherwise, the rest of the probing logic and boilerplate code looked
fine to me.

Thanks,
Mark.
Will Deacon June 21, 2017, 3:39 p.m. UTC | #4
Hi Mark,

Thanks for the extensive review. Replies inline.

On Thu, Jun 15, 2017 at 03:57:29PM +0100, Mark Rutland wrote:
> On Mon, Jun 05, 2017 at 04:22:56PM +0100, Will Deacon wrote:
> > +/* ID registers */
> > +#define PMSIDR_EL1			sys_reg(3, 0, 9, 9, 7)
> 
> Nit: could we please give the sysreg definitions a SYS_ prefix, for
> consistency with other (architected) sysregs?
> 
> Ideally, as these are architected they'd live in <asm/sysreg.h>, but I'm
> happy to factor that out later.

I don't see the point in adding the prefixes without moving the #defines
into sysreg.h, so I'll look at just doing it in one go.

> > +#define PMSIRR_EL1			sys_reg(3, 0, 9, 9, 3)
> > +#define PMSIRR_EL1_RND_SHIFT		0
> > +#define PMSIRR_EL1_IVAL_MASK		0xffUL
> 
> This is a little odd. This covers RND and (some) RES0 bits, not
> INTERVAL, so it's the opposite polarity to the rest of the masks, and
> incomplete (missing the upper 32 RES0 bits).
> 
> Could we please have this in the same polarity as the other masks,
> precisely covering the INTERVAL field? i.e.
> 
> #define PMSIRR_EL1_INTERVAL_SHIFT	8
> #define PMSIRR_EL1_INTERVAL_MASK	0xffffffUL

Yup, I'll fix that.

> > +#define PMBSR_EL1_EC_BUF		(0x0UL << PMBSR_EL1_EC_SHIFT)
> > +#define PMBSR_EL1_EC_FAULT_S1		(0x24UL << PMBSR_EL1_EC_SHIFT)
> > +#define PMBSR_EL1_EC_FAULT_S2		(0x25UL << PMBSR_EL1_EC_SHIFT)
> > +
> > +#define PMBSR_EL1_FAULT_FSC_SHIFT	0
> > +#define PMBSR_EL1_FAULT_FSC_MASK	0x3fUL
> 
> Nit: it might be worth having MSS in the name to show the field
> hierarchy, e.g. PMBSR_EL1_MSS_FAULT_FSC_MASK

I'd rather avoid growing the name, and I don't find this confusing as-is.

> > +struct arm_spe_pmu {
> > +	struct pmu				pmu;
> > +	struct platform_device			*pdev;
> > +	cpumask_t				supported_cpus;
> > +	struct hlist_node			hotplug_node;
> > +
> > +	int					irq; /* PPI */
> > +
> > +	u16					min_period;
> > +	u16					cnt_width;
> 
> Elsewhere, we refer to this as the counter size (e.g.
> SPE_PMU_CAP_CNT_SZ, and count_size in the sysfs interface).
> 
> Could we please pick one of "width" and "size" and use it consistently?
> 
> I guess "size" is preferable, given the architectural name is
> "CountSize".

Will fix.

> > +/* Convert a free-running index from perf into an SPE buffer offset */
> > +#define PERF_IDX2OFF(idx, buf)	((idx) & (((buf)->nr_pages << PAGE_SHIFT) - 1))
> 
> The masking logic here assumes nr_pages is a power of two.
> 
> That's not always true, as arm_spe_pmu_setup_aux() only ensures that
> nr_pages is a multiple of 2 * PAGE_SIZE, and only when using snapshot
> mode.
> 
> For example, with ten 4K pages:
> 
> nr_pages:			0b1010
> (nr_pages << PAGE_SHIFT):	0b1010000000000000
> (nr_pages << PAGE_SHIFT) - 1:	0b1001111111111111

Nice. Fixed (using a bloody mod operator).

> > +
> > +/* Keep track of our dynamic hotplug state */
> > +static enum cpuhp_state arm_spe_pmu_online;
> > +
> > +/* This sysfs gunk was really good fun to write. */
> > +enum arm_spe_pmu_capabilities {
> > +	SPE_PMU_CAP_ARCH_INST = 0,
> 
> No need for the initializer; enums start at zero.

I know, but I do like to make it explicit that I'm relying on that here.
This is done all over the place in the kernel.

> 
> > +	SPE_PMU_CAP_ERND,
> > +	SPE_PMU_CAP_FEAT_MAX,
> > +	SPE_PMU_CAP_CNT_SZ = SPE_PMU_CAP_FEAT_MAX,
> > +	SPE_PMU_CAP_MIN_IVAL,
> > +};
> 
> IMO, it would be worth having s/CAP/CAP_FEAT/ for the HW features.
> 
> We could get rid of the confusing SPE_PMU_CAP_FEAT_MAX definition here,
> if we were to:
> 
> > +
> > +static int arm_spe_pmu_feat_caps[SPE_PMU_CAP_FEAT_MAX] = {
> > +	[SPE_PMU_CAP_ARCH_INST]	= SPE_PMU_FEAT_ARCH_INST,
> > +	[SPE_PMU_CAP_ERND]	= SPE_PMU_FEAT_ERND,
> > +};
> 
> .. change this to:
> 
> static int arm_spe_pmu_feat_caps[] = {
> 	...
> };
> 
> ... and:
> 
> > +
> > +static u32 arm_spe_pmu_cap_get(struct arm_spe_pmu *spe_pmu, int cap)
> > +{
> > +	if (cap < SPE_PMU_CAP_FEAT_MAX)
> 
> ... change this to:
> 
> 	if (cap < ARRAY_SIZE(arm_spe_pmu_feat_caps))

I quite liked this suggestion at first and I even implemented it, but the
result was IMO less maintainable than the above. There's ABI involved here,
so I want to make it as difficult as possible to break the ABI when adding a
new hardware capability to the driver. The current code does a good job of
that:

  - If you add a boolean feature in the wrong place in arm_spe_capabilities,
    then you'll get a WARN

  - If you add a non-boolean feature in the wrong place then it will be
    reported as non-present, unless you add an entry in
    arm_spe_pmu_feat_caps (at which point you'd realise the mistake)

  - If you only update arm_spe_pmu_feat_caps, then you'll get a build error.

With your change, it's a lot easier to break things subtly, so I'd rather
keep this as-is unless you have non-cosmetic reasons to change it.

> > +#define SPE_EXT_ATTR_ENTRY(_name, _func, _var)				\
> > +	&((struct dev_ext_attribute[]) {				\
> > +		{ __ATTR(_name, S_IRUGO, _func, NULL), (void *)_var }	\
> > +	})[0].attr.attr
> > +
> > +#define SPE_CAP_EXT_ATTR_ENTRY(_name, _var)				\
> > +	SPE_EXT_ATTR_ENTRY(_name, arm_spe_pmu_cap_show, _var)
> > +
> > +static struct attribute *arm_spe_pmu_cap_attr[] = {
> > +	SPE_CAP_EXT_ATTR_ENTRY(arch_inst, SPE_PMU_CAP_ARCH_INST),
> > +	SPE_CAP_EXT_ATTR_ENTRY(ernd, SPE_PMU_CAP_ERND),
> > +	SPE_CAP_EXT_ATTR_ENTRY(count_size, SPE_PMU_CAP_CNT_SZ),
> > +	SPE_CAP_EXT_ATTR_ENTRY(min_interval, SPE_PMU_CAP_MIN_IVAL),
> > +	NULL,
> > +};
> 
> I'd have expected GCC to warn about (integer/enum) _var values being
> cast straight to void *, given the size mismatch.
> 
> Is that not the case, or do we need an unsigned long cast in
> SPE_CAP_EXT_ATTR_ENTRY()?
> 
> Maybe GCC only complains the other way around.

GCC 7 seems to be perfectly happy with this code.

> 
> [...]
> 
> > +/* Convert between user ABI and register values */
> > +static u64 arm_spe_event_to_pmscr(struct perf_event *event)
> > +{
> > +	struct perf_event_attr *attr = &event->attr;
> > +	u64 reg = 0;
> > +
> > +	reg |= ATTR_CFG_GET_FLD(attr, ts_enable) << PMSCR_EL1_TS_SHIFT;
> > +	reg |= ATTR_CFG_GET_FLD(attr, pa_enable) << PMSCR_EL1_PA_SHIFT;
> 
> We should limit PA access to privileged users.
> 
> > +
> > +	if (!attr->exclude_user)
> > +		reg |= BIT(PMSCR_EL1_E0SPE_SHIFT);
> > +
> > +	if (!attr->exclude_kernel)
> > +		reg |= BIT(PMSCR_EL1_E1SPE_SHIFT);
> > +
> > +	if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR))
> > +		reg |= BIT(PMSCR_EL1_CX_SHIFT);
> 
> ... maybe likewise for CONTEXTIDR, too.

Yes, agreed. Fixed.

> > +static void arm_spe_event_sanitise_period(struct perf_event *event)
> > +{
> > +	struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
> > +	u64 period = event->hw.sample_period & ~PMSIRR_EL1_IVAL_MASK;
> 
> ... as noted above, the upper 32 bits are RES0 in addition to the low 8
> bits, so we need to explicitly check bits 31:8, e.g.
> 
> 	u64 period = event->hw.sample_period;
> 	period &= (PMSIRR_EL1_INTERVAL_MASK << PMSIRR_EL1_INTERVAL_SHIFT);

Fixed.

> > +
> > +	if (period < spe_pmu->min_period)
> > +		period = spe_pmu->min_period;
> 
> We already verify this in arm_spe_pmu_event_init(), so we don't need to
> check this here.
> 
> We can drop arm_spe_event_sanitise_period() entirely. Given we validate
> the period at event_init() time, there's no need to sanitize the value.

What about PERF_IOC_PERIOD? I don't think that re-inits the event.

> > +static bool arm_spe_pmu_buffer_mgmt_pending(u64 pmbsr)
> > +{
> > +	const char *err_str;
> > +
> > +	/* Service required? */
> > +	if (!(pmbsr & BIT(PMBSR_EL1_S_SHIFT)))
> > +		return false;
> > +
> > +	/* We only expect buffer management events */
> > +	switch (pmbsr & (PMBSR_EL1_EC_MASK << PMBSR_EL1_EC_SHIFT)) {
> > +	case PMBSR_EL1_EC_BUF:
> > +		/* Handled below */
> > +		break;
> > +	case PMBSR_EL1_EC_FAULT_S1:
> > +	case PMBSR_EL1_EC_FAULT_S2:
> > +		err_str = "Unexpected buffer fault";
> > +		goto out_err;
> > +	default:
> > +		err_str = "Unknown error code";
> > +		goto out_err;
> > +	}
> 
> For the error cases, I take it the assumption is that we leave
> PMBSR_EL1.S set, so that the HW doesn't start again?

No, I don't think I actually handle these cases at all. Whilst they're
probably catastrophic (vmapped mappings are faulting!), I should at least
try to park the profiler. Will fix for the next version.

> > +out_err:
> > +	pr_err_ratelimited("%s on CPU %d [PMBSR=0x%08llx]\n", err_str,
> > +			   smp_processor_id(), pmbsr);
> 
> It might be worth dumping pmbsr with %016lx. The upper 64 bits are
> currently RES0, but they do exist.

Ok.

> > +	return false;
> > +}
> > +
> 
> Could we have a comment block here to describe (roughly) what 
> we're trying to do for the snapshot case?

Ok, I'll try to think of something to say.

> > +
> > +	/*
> > +	 * If we're within max_record_sz of the limit, we must
> > +	 * pad, move the head index and recompute the limit.
> > +	 */
> > +	if (limit - head < spe_pmu->max_record_sz) {
> > +		memset(buf->base + head, 0, limit - head);
> 
> Could we have a mnemonic for the padding byte, and/or a helper that
> wraps memset? e.g.
> 
> static void pad_buffer(void *start, u64 size)
> {
> 	/* The padding packet is a single zero byte */
> 	memset(start, 0, size);
> }

Sure.

> > +		handle->head = PERF_IDX2OFF(limit, buf);
> > +		limit = ((buf->nr_pages * PAGE_SIZE) >> 1) + handle->head;
> > +	}
> > +
> > +	return limit;
> > +}
> > +
> > +static u64 __arm_spe_pmu_next_off(struct perf_output_handle *handle)
> > +{
> > +	struct arm_spe_pmu_buf *buf = perf_get_aux(handle);
> > +	u64 head = PERF_IDX2OFF(handle->head, buf);
> > +	u64 tail = PERF_IDX2OFF(handle->head + handle->size, buf);
> > +	u64 wakeup = PERF_IDX2OFF(handle->wakeup, buf);
> > +	u64 limit = buf->nr_pages * PAGE_SIZE;
> > +
> > +	/*
> > +	 * Set the limit pointer to either the watermark or the
> > +	 * current tail pointer; whichever comes first.
> > +	 */
> > +	if (handle->head + handle->size <= handle->wakeup) {
> > +		/* The tail is next, so check for wrapping */
> > +		if (tail >= head) {
> > +			/*
> > +			 * No wrapping, but need to align downwards to
> > +			 * avoid corrupting unconsumed data.
> > +			 */
> > +			limit = round_down(tail, PAGE_SIZE);
> > +
> > +		}
> > +	} else if (wakeup >= head) {
> 
> When wakeup == head, do we need to signal a wakeup event somehow?
> Currently we'll pad the buffer, signal truncation, and end output, which
> seems a little odd, but maybe that's what perf expects.

Wakeup can never be equal to head here. We know that the wakeup is next (by
the first if above) and therefore it is within handle->size of head. Since
we're starting a session, then we either have:

  1. Wakeup calculated by perf_aux_output_{skip,end}, or
  2. Wakeup calculated by rb_alloc_aux (initial mmap)

In case (1), the wakeup will have been signalled when it was calculated to
be equal to head, and then the wakeup will have been moved to the next
watermark point. In case (2), the only way it can be equal to head is if
the watermark was 0, but an initial watermark is converted to half the
buffer size by the core.

Admittedly, I'd not realised this at the time (hence the >= check), and it
looks like we're going to rewrite this anyway :)

> > +		/*
> > +		 * The wakeup is next and doesn't wrap. Align upwards to
> > +		 * ensure that we do indeed reach the watermark.
> > +		 */
> > +		limit = round_up(wakeup, PAGE_SIZE);
> > +
> > +		/*
> > +		 * If rounding up crosses the tail, then we have to
> > +		 * round down to avoid corrupting unconsumed data.
> > +		 * Hopefully the tail will have moved by the time we
> > +		 * hit the new limit.
> > +		 */
> > +		if (wakeup < tail && limit > tail)
> > +			limit = round_down(wakeup, PAGE_SIZE);
> > +	}
> 
> It took me a while to grok that we must consider the wakeup in
> free-running counter space to avoid early wakeups, while we must
> consider the tail in ring-buffer offset space to avoid clobbering data.
> 
> With that understanding, I think we have an issue here. If wakeup is
> more than buffer size in the future, and the buffer is empty, I think we
> set the limit too low.
> 
> In that case, we'd evaluate:
> 
> 	handle->head + handle->size <= handle->wakeup
> 
> ... as true, since size is at most buffer size. Thus we'd go into the
> first if block. There we'd evaluate:

If the buffer is empty, then size is exactly buffer size - 1, but I take
your point.

> 
> 	tail >= head
> 
> ... as true, since when the buffer is empty, head == tail. Thus, we'd
> set the limit to:
> 
> 	round_down(tail, PAGE_SIZE)
> 
> ... which'll leave us with limit <= head, since head == tail. Thus,
> we'll hit the case below:
> 
> > +
> > +	/*
> > +	 * If rounding down crosses the head, then the buffer is full,
> > +	 * so pad to tail and end the session.
> > +	 */
> > +	if (limit <= head) {
> > +		memset(buf->base + head, 0, handle->size);
> > +		perf_aux_output_skip(handle, handle->size);
> > +		perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
> > +		perf_aux_output_end(handle, 0);
> > +		limit = 0;
> > +	}
> > +
> > +	return limit;
> > +}
> 
> ... and end all output, even though the entire buffer was empty, and we
> could have returned the end of the buffer as the limit.
> 
> It might be that something prevents wakeup from being that far in the
> future, but in previous discussions we'd assumed that it could be any
> arbitrary value.

Yes, I think this case does indeed go wrong. Well spotted!

> I believe we can solve that, and simplify the logic as below. I've left
> the wakeup < head and wakeup == head cases as above, ignored and
> terminating respectively.

I think this mostly works, some suggestions/questions below.

> static u64 __arm_spe_pmu_next_off(struct perf_output_handle *handle)
> {
> 	struct arm_spe_pmu_buf *buf = perf_get_aux(handle);
> 	const u64 bufsize = buf->nr_pages * PAGE_SIZE;
> 	u64 limit = bufsize;
> 	u64 head = PERF_IDX2OFF(handle->head, buf);
> 	u64 tail = PERF_IDX2OFF(handle->head + handle->size, buf);
> 	u64 wakeup = PERF_IDX2OFF(handle->wakeup, buf);
> 
> 	if (!handle->size)
> 		goto no_space;

We can avoid the memset/output_skip in this case.

> 	/*
> 	 * Avoid clobbering unconsumed data. We know we have space, so
> 	 * if we see head == tail we know that the buffer is empty. If
> 	 * head > tail, then there's nothing to clobber prior to
> 	 * wrapping.
> 	 */
> 	if (head < tail)
> 		limit = round_down(tail, PAGE_SIZE);
> 	
> 	/*
> 	 * Wakeup may be arbitrarily far into future. If it's not in the
> 	 * current generation, either we'll wrap before hitting it, or
> 	 * it's in the past and has been handled already.
> 	 *
> 	 * If there's a wakeup before we wrap, arrange to be woken up by
> 	 * the page boundary following it. Keep the tail boundary if
> 	 * that's lower.
> 	 */
> 	if ((handle->wakeup / bufsize) == (handle->head / bufsize)) &&

I'd really like to get rid of these divisions, since we're not working with
nice powers of 2 here. Can't you just do:

  handle->wakeup < (handle->head + handle->size)

to establish that they're in the same "generation"?

> 	    head <= wakeup)
> 		limit = min(limit, round_up(wakeup, PAGE_SIZE));
> 
> 	if (limit <= head)
> 		goto no_space;

Does this correctly handle the case where the buffer is full and head ==
tail, but limit == bufsize? AFAICT, we can return a limit of bufsize and
corrupt the whole buffer.

> > +static void arm_spe_perf_aux_output_begin(struct perf_output_handle *handle,
> > +					  struct perf_event *event)
> > +{
> > +	u64 base, limit;
> > +	struct arm_spe_pmu_buf *buf;
> > +
> > +	/* Start a new aux session */
> > +	buf = perf_aux_output_begin(handle, event);
> > +	if (!buf) {
> > +		event->hw.state |= PERF_HES_STOPPED;
> > +		/*
> > +		 * We still need to clear the limit pointer, since the
> > +		 * profiler might only be disabled by virtue of a fault.
> > +		 */
> > +		limit = 0;
> > +		goto out_write_limit;
> > +	}
> > +
> > +	limit = buf->snapshot ? arm_spe_pmu_next_snapshot_off(handle)
> > +			      : arm_spe_pmu_next_off(handle);
> > +	if (limit)
> > +		limit |= BIT(PMBLIMITR_EL1_E_SHIFT);
> > +
> > +	base = (u64)buf->base + PERF_IDX2OFF(handle->head, buf);
> > +	write_sysreg_s(base, PMBPTR_EL1);
> > +	limit += (u64)buf->base;
> > +
> 
> I believe an isb() is necessary here to ensure the write to PMBPTR_EL1
> occurs before the write to PMBLIMITR_EL1 enables the PMU. Otherwise, the
> CPU could execute those out-of-order.

This function is always called in a context where the profiler is disabled
due to some other control (e.g. in PMSCR or because we're in fault context)
so the isb isn't necessary.

> > +out_write_limit:
> > +	write_sysreg_s(limit, PMBLIMITR_EL1);
> > +}
> > +
> > +static bool arm_spe_perf_aux_output_end(struct perf_output_handle *handle,
> > +					struct perf_event *event,
> > +					bool resume)
> > +{
> > +	u64 pmbptr, pmbsr, offset, size;
> > +	struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
> > +	struct arm_spe_pmu_buf *buf = perf_get_aux(handle);
> > +	bool truncated;
> > +
> > +	/*
> > +	 * We can be called via IRQ work trying to disable the PMU after
> > +	 * a buffer full event. In this case, the aux session has already
> > +	 * been stopped, so there's nothing to do here.
> > +	 */
> > +	if (!buf)
> > +		return false;
> > +
> > +	/*
> > +	 * If there isn't a pending management event and we're not stopping
> > +	 * the current session, then just leave everything alone.
> > +	 */
> > +	pmbsr = read_sysreg_s(PMBSR_EL1);
> 
> When we call from arm_spe_pmu_irq_handler(), I think we need
> synchronisation before reading PMBSR_EL1.
> 
> AFAICT from the spec, a context synchronisation event doesn't ensure
> that the PMU's indirect write to PMBSR_EL1 is visible to the PE's direct
> read above. I beleive we need a PSB CSYNC (and subsequent ISB) to ensure
> that.

I don't think that's right, but the spec isn't completely clear. PSB CSYNC
is about the profiling data itself, but in this case we've taken an IRQ
already so the PMBSR will be up-to-date. I'll seek clarification anyway.

> The only other caller is from arm_spe_pmu_stop(), which first calls
> arm_spe_pmu_disable_and_drain_local(), so I guess the new barriers
> should live in arm_spe_pmu_irq_handler(). I'll comment there.
> 
> > +	if (!arm_spe_pmu_buffer_mgmt_pending(pmbsr) && resume)
> > +		return false; /* Spurious IRQ */
> > +
> > +	/* Ensure hardware updates to PMBPTR_EL1 are visible */
> > +	isb();
> 
> Can we please move this into arm_spe_pmu_buffer_mgmt_pending(), after
> the associated PSB CSYNC?

Hmmm, I deliberately *didn't* do that because I wanted
arm_spe_pmu_buffer_mgmt_pending to ensure the buffer writes are visible, and
then the caller can decide if it cares about indirect SPE register writes
being visible. In reality, I ended up with a single caller, but let's see
how it looks when I rework it to deal with fatal aborts.

> > +	/*
> > +	 * Work out how much data has been written since the last update
> > +	 * to the head index.
> > +	 */
> > +	pmbptr = round_down(read_sysreg_s(PMBPTR_EL1), spe_pmu->align);
> 
> I don't believe we need to align this.
> 
> Per the spec, PMBPTR_EL1[M:0] are RES0 in HW unless sync external abort
> reporting is present, in which case they're valid. We write these bits
> as zero, unless we have a bug elsewhere.
> 
> ... so either the bits are zero, and we're fine, or an external abourt
> has been hit. In the external abort case, we have no idea how far we
> need to reverse the base pointer anyhow.

As part of the fatal abort handling, I should probably round down to
max_record_sz (keyed off DL==1, which I don't think can ever happen
at the moment).

> > +	offset = pmbptr - (u64)buf->base;
> > +	size = offset - PERF_IDX2OFF(handle->head, buf);
> > +
> > +	if (buf->snapshot)
> > +		handle->head = offset;
> 
> It's be worth a /* see arm_spe_pmu_next_snapshot_off() */ comment
> or similar to explain what we're going for the snapshot case here.

Ok.

> > +
> > +	/*
> > +	 * Either the buffer is full or we're stopping the session. Check
> > +	 * that we didn't write a partial record, since this can result
> > +	 * in unparseable trace and we must disable the event.
> > +	 */
> > +	if (pmbsr & BIT(PMBSR_EL1_COLL_SHIFT))
> > +		perf_aux_output_flag(handle, PERF_AUX_FLAG_COLLISION);
> > +
> > +	truncated = pmbsr & BIT(PMBSR_EL1_DL_SHIFT);
> > +	if (truncated)
> > +		perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
> > +
> > +	perf_aux_output_end(handle, size);
> 
> The comment block above perf_aux_output_end() says:
> 
>   It is the pmu driver's responsibility to observe ordering rules of the
>   hardware, so that all the data is externally visible before this is
>   called.
> 
> ... but in arm_spe_pmu_buffer_mgmt_pending() we only ensured that the
> data was visible in the current NSH domain (i.e. only to this CPU).
> 
> I followed the callchain for updating head:
> 
> perf_aux_output_end()
> -> perf_event_aux_event()
> -> perf_output_end()
> -> perf_output_put_handle()
> 
> ... I see that there's an smp_wmb() (i.e. a DMB ISHST) on that path, but
> it's not clear to me if that's sufficient to ensure that the PMU's
> writes are made visible to other CPUs.

With the new memory model, it should be sufficient; the DSB NSH ensures
data is visible to us locally, and then we order that before the update
of the ring buffer.

> Given the comment, I'd feel happier if we had something here or in
> arm_spe_pmu_buffer_mgmt_pending() to ensure that the PMU's prior writes
> are visible to other CPUs.

I could add a comment?

> > +	/*
> > +	 * If we're not resuming the session, then we can clear the fault
> > +	 * and we're done, otherwise we need to start a new session.
> > +	 */
> > +	if (!resume)
> > +		write_sysreg_s(0, PMBSR_EL1);
> > +	else if (!truncated)
> > +		arm_spe_perf_aux_output_begin(handle, event);
> > +
> > +	return true;
> > +}
> > +
> > +/* IRQ handling */
> > +static irqreturn_t arm_spe_pmu_irq_handler(int irq, void *dev)
> > +{
> > +	struct perf_output_handle *handle = dev;
> > +
> > +	if (!perf_get_aux(handle))
> > +		return IRQ_NONE;
> > +
> > +	if (!arm_spe_perf_aux_output_end(handle, handle->event, true))
> > +		return IRQ_NONE;
> 
> As commented in arm_spe_perf_aux_output_end(), I think we need a
> psb_csync(); isb() sequence prior to the read of PMBSR_EL1 in
> arm_spe_perf_aux_output_end() to ensure that it is up-to-date w.r.t. the
> interrupt.

As above, I don't agree but am checking this with the architects.

> > +	irq_work_run();
> > +	isb(); /* Ensure the buffer is disabled if data loss has occurred */
> 
> What exactly are we synchronising here?
> 
> AFAICT, when truncation occurs we don't clear PMBLIMITR_EL1.E, so the
> buffer is only implicitly disabled by the PMU's indirect write
> PMBSR_EL1.S, which we must have already synchronised prior to reading
> PMBSR_EL1.

When you report truncation to perf_aux_output_end, it eventually (the
irq_work_run() above) calls back into arm_spe_pmu_stop, and I want to make
sure we've nobbled the limit pointer.

It would probably be clearer just to add an ISB to the end of
arm_spe_pmu_disable_and_drain_local, which goes back to your previous comments
about the mgmt_pending code.

> ... so I can't see why this is necessary.
> 
> > +	write_sysreg_s(0, PMBSR_EL1);
> 
> ... and regardless, we clear PMBSR_EL1.S here, which'll start the PMU
> again, even if truncation occured, which I don't think we want.
> 
> Can we have arm_spe_perf_aux_output_end() clear PMBLIMITR_EL1.E when
> truncation occurs?

Right, that's exactly what happens. It's just highly convoluted.

> 
> > +	return IRQ_HANDLED;
> > +}
> > +
> > +/* Perf callbacks */
> > +static int arm_spe_pmu_event_init(struct perf_event *event)
> > +{
> > +	u64 reg;
> > +	struct perf_event_attr *attr = &event->attr;
> > +	struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
> > +
> > +	/* This is, of course, deeply driver-specific */
> > +	if (attr->type != event->pmu->type)
> > +		return -ENOENT;
> > +
> > +	if (event->cpu >= 0 &&
> > +	    !cpumask_test_cpu(event->cpu, &spe_pmu->supported_cpus))
> > +		return -ENOENT;
> 
> We're not rejecting cpu < 0, so I take it we're trying to handle
> per-task events?

Yes, like intel-pt does.

> As I've mentioned before, that case worries me. One thing I've just
> realised we need to figure out is what happens if attr.inherit is set.
> The core doesn't reject that, and I suspect we may need to here.

That's already rejected by perf_mmap.

> > +
> > +	if (arm_spe_event_to_pmsevfr(event) & PMSEVFR_EL1_RES0)
> > +		return -EOPNOTSUPP;
> > +
> > +	if (event->hw.sample_period < spe_pmu->min_period ||
> > +	    event->hw.sample_period & PMSIRR_EL1_IVAL_MASK)
> > +		return -EOPNOTSUPP;
> 
> As mentioned in the sysreg comments, we need to check the upper 32 bits
> of the PMSIRR value are zero, so we'll need something like:
> 
> 	if (event->hw.sample_period < spe_pmu->min_period)
> 		return -EOPNOTSUPP;
> 	
> 	if (event->hw.sample_period &
> 	    ~(PMSIRR_EL1_INTERVAL_MASK << PMSIRR_EL1_INTERVAL_SHIFT))
> 		return -EOPNOTSUPP;

I wonder if we're actually better off just truncating the interval. That
way, if the interval is extended in the future, then new software won't
get an error on older cores. It feels a bit weird putting the max interval
in sysfs.

> > +	if (attr->exclude_idle)
> > +		return -EOPNOTSUPP;
> > +
> > +	/*
> > +	 * Feedback-directed frequency throttling doesn't work when we
> > +	 * have a buffer of samples. We'd need to manually count the
> > +	 * samples in the buffer when it fills up and adjust the event
> > +	 * count to reflect that. Instead, force the user to specify a
> > +	 * sample period instead.
> > +	 */
> > +	if (attr->freq)
> > +		return -EINVAL;
> > +
> > +	reg = arm_spe_event_to_pmsfcr(event);
> > +	if ((reg & BIT(PMSFCR_EL1_FE_SHIFT)) &&
> > +	    !(spe_pmu->features & SPE_PMU_FEAT_FILT_EVT))
> > +		return -EOPNOTSUPP;
> > +
> > +	if ((reg & BIT(PMSFCR_EL1_FT_SHIFT)) &&
> > +	    !(spe_pmu->features & SPE_PMU_FEAT_FILT_TYP))
> > +		return -EOPNOTSUPP;
> > +
> > +	if ((reg & BIT(PMSFCR_EL1_FL_SHIFT)) &&
> > +	    !(spe_pmu->features & SPE_PMU_FEAT_FILT_LAT))
> > +		return -EOPNOTSUPP;
> 
> Does anything prevent this event from being added to a group?

PERF_PMU_CAP_EXCLUSIVE should take care of that in the core.

> > +static void arm_spe_pmu_start(struct perf_event *event, int flags)
> > +{
> > +	u64 reg;
> > +	struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
> > +	struct hw_perf_event *hwc = &event->hw;
> > +	struct perf_output_handle *handle = this_cpu_ptr(spe_pmu->handle);
> > +
> > +	hwc->state = 0;
> > +	arm_spe_perf_aux_output_begin(handle, event);
> > +	if (hwc->state)
> > +		return;
> 
> I was expecting we'd do this last, since PMBLIIMITR.E enables profiling.
> 
> I understand that we're relying on the PMSCR_EL1 filtering value to
> prevent anything being written to tbe buffer until we've vonfigured the
> options, but I'd feel a lot happier if we consistently relied upon
> PMBLIMITR.E for that.

I actually prefer failing fast if we can, so I'd rather keep this as-is
given that it works and your objections are down to personal taste.

> > +
> > +	reg = arm_spe_event_to_pmsfcr(event);
> > +	write_sysreg_s(reg, PMSFCR_EL1);
> > +
> > +	reg = arm_spe_event_to_pmsevfr(event);
> > +	write_sysreg_s(reg, PMSEVFR_EL1);
> > +
> > +	reg = arm_spe_event_to_pmslatfr(event);
> > +	write_sysreg_s(reg, PMSLATFR_EL1);
> > +
> > +	if (flags & PERF_EF_RELOAD) {
> > +		reg = arm_spe_event_to_pmsirr(event);
> > +		write_sysreg_s(reg, PMSIRR_EL1);
> > +		isb();
> > +		reg = local64_read(&hwc->period_left);
> > +		write_sysreg_s(reg, PMSICR_EL1);
> > +	}
> > +
> > +	reg = arm_spe_event_to_pmscr(event);
> > +	isb();
> > +	write_sysreg_s(reg, PMSCR_EL1);
> > +}
> > +
> > +static void arm_spe_pmu_disable_and_drain_local(void)
> > +{
> > +	/* Disable profiling at EL0 and EL1 */
> > +	write_sysreg_s(0, PMSCR_EL1);
> > +	isb();
> > +
> > +	/* Drain any buffered data */
> > +	psb_csync();
> > +	dsb(nsh);
> > +
> > +	/* Disable the profiling buffer */
> > +	write_sysreg_s(0, PMBLIMITR_EL1);
> 
> Can't this be done when we clear PMSCR_EL1? Surely buffered data would
> be written out regardless?

I think the buffered data could be silently dropped if you did that.

> > +static void *arm_spe_pmu_setup_aux(int cpu, void **pages, int nr_pages,
> > +				   bool snapshot)
> > +{
> > +	int i;
> > +	struct page **pglist;
> > +	struct arm_spe_pmu_buf *buf;
> > +
> > +	/*
> > +	 * We require an even number of pages for snapshot mode, so that
> > +	 * we can effectively treat the buffer as consisting of two equal
> > +	 * parts and give userspace a fighting chance of getting some
> > +	 * useful data out of it.
> > +	 */
> > +	if (!nr_pages || (snapshot && (nr_pages & 1)))
> > +		return NULL;
> 
> As noted above, we may need to ensure that this is a pwoer of two.

Sorry, I don't follow. The power-of-two bug was in my IDX2OFF macro, which
I've fixed. For snapshot mode, we just need an even number of pages.

> > +	buf = kzalloc_node(sizeof(*buf), GFP_KERNEL, cpu_to_node(cpu));
> > +	if (!buf)
> > +		return NULL;
> > +
> > +	pglist = kcalloc(nr_pages, sizeof(*pglist), GFP_KERNEL);
> > +	if (!pglist)
> > +		goto out_free_buf;
> > +
> > +	for (i = 0; i < nr_pages; ++i) {
> > +		struct page *page = virt_to_page(pages[i]);
> > +
> > +		if (PagePrivate(page)) {
> > +			pr_warn("unexpected high-order page for auxbuf!");
> 
> It looks like the intel-pt driver expects high-order pages.
> 
> What prevents us from seeing those?

The fact that we don't set the PERF_PMU_CAP_AUX_NO_SG capability. The intel
driver uses physical address and scatter/gather lists, whereas ours just
takes virtual addresses for the buffer.

> Why can't we handle those?

We never get them, so we don't need to.

> How are these pages pinned? Does the core ensure that?

Yes, they're GFP_KERNEL pages underneath.

> > +	spe_pmu->pmu = (struct pmu) {
> > +		.capabilities	= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE,
> > +		.attr_groups	= arm_spe_pmu_attr_groups,
> > +		/*
> > +		 * We hitch a ride on the software context here, so that
> > +		 * we can support per-task profiling (which is not possible
> > +		 * with the invalid context as it doesn't get sched callbacks).
> > +		 * This requires that userspace either uses a dummy event for
> > +		 * perf_event_open, since the aux buffer is not setup until
> > +		 * a subsequent mmap, or creates the profiling event in a
> > +		 * disabled state and explicitly PERF_EVENT_IOC_ENABLEs it
> > +		 * once the buffer has been created.
> > +		 */
> > +		.task_ctx_nr	= perf_sw_context,
> 
> While other tracing PMUs do this, I think this is a horrible bodge, and
> a bad idea, given it violates assumptions made in the core code.
> 
> For example, unlike true SW events, add() and start() can fail, so a
> tracing event can unexpectedly stop SW events from being scheduled.
> 
> AFAICT, we could also try to move a tracing event into a later-created
> HW PMU group, which is very worrying.
> 
> I really think we should have a separate tracing context for this class
> of PMU, or we make it so that the invalid context can receive sched
> callbacks.

Fair point, and I already have a comment to call this out. Whilst I'm not
against seeing this fixed, I think it should be a separate patch series
given that this is a common idiom amongst system/uncore PMU drivers.

> > +static void __arm_spe_pmu_dev_probe(void *info)
> > +{
> > +	int fld;
> > +	u64 reg;
> > +	struct arm_spe_pmu *spe_pmu = info;
> > +	struct device *dev = &spe_pmu->pdev->dev;
> > +
> > +	fld = cpuid_feature_extract_unsigned_field(read_cpuid(ID_AA64DFR0_EL1),
> > +						   ID_AA64DFR0_PMSVER_SHIFT);
> > +	if (!fld) {
> > +		dev_err(dev,
> > +			"unsupported ID_AA64DFR0_EL1.PMSVer [%d] on CPU %d\n",
> > +			fld, smp_processor_id());
> > +		return;
> > +	}
> 
> Given we only bail out when PMSver is zero, surely we can just say:
> 
> 	dev_err(dev, "SPE not supported on cpu %d", smp_processor_id())

I'd rather leave it like this, so we have the information when we start
supporting additional values of fld.

Will
Mark Rutland June 27, 2017, 5:12 p.m. UTC | #5
Hi,

On Wed, Jun 21, 2017 at 04:39:33PM +0100, Will Deacon wrote:
> On Thu, Jun 15, 2017 at 03:57:29PM +0100, Mark Rutland wrote:
> > On Mon, Jun 05, 2017 at 04:22:56PM +0100, Will Deacon wrote:

> > > +	SPE_PMU_CAP_ERND,
> > > +	SPE_PMU_CAP_FEAT_MAX,
> > > +	SPE_PMU_CAP_CNT_SZ = SPE_PMU_CAP_FEAT_MAX,
> > > +	SPE_PMU_CAP_MIN_IVAL,
> > > +};

> > We could get rid of the confusing SPE_PMU_CAP_FEAT_MAX definition here,
> > if we were to:
> > 
> > > +static int arm_spe_pmu_feat_caps[SPE_PMU_CAP_FEAT_MAX] = {
> > > +	[SPE_PMU_CAP_ARCH_INST]	= SPE_PMU_FEAT_ARCH_INST,
> > > +	[SPE_PMU_CAP_ERND]	= SPE_PMU_FEAT_ERND,
> > > +};
> > 
> > .. change this to:
> > 
> > static int arm_spe_pmu_feat_caps[] = {
> > 	...
> > };
> > 
> > ... and:
> > 
> > > +static u32 arm_spe_pmu_cap_get(struct arm_spe_pmu *spe_pmu, int cap)
> > > +{
> > > +	if (cap < SPE_PMU_CAP_FEAT_MAX)
> > 
> > ... change this to:
> > 
> > 	if (cap < ARRAY_SIZE(arm_spe_pmu_feat_caps))
> 
> I quite liked this suggestion at first and I even implemented it, but the
> result was IMO less maintainable than the above. There's ABI involved here,
> so I want to make it as difficult as possible to break the ABI when adding a
> new hardware capability to the driver. The current code does a good job of
> that:
> 
>   - If you add a boolean feature in the wrong place in arm_spe_capabilities,
>     then you'll get a WARN
> 
>   - If you add a non-boolean feature in the wrong place then it will be
>     reported as non-present, unless you add an entry in
>     arm_spe_pmu_feat_caps (at which point you'd realise the mistake)
> 
>   - If you only update arm_spe_pmu_feat_caps, then you'll get a build error.
> 
> With your change, it's a lot easier to break things subtly, so I'd rather
> keep this as-is unless you have non-cosmetic reasons to change it.

No objections here; that's a pretty good rationale for keeping this
as-is.

[...]

> > > +static void arm_spe_event_sanitise_period(struct perf_event *event)
> > > +{
> > > +	struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
> > > +	u64 period = event->hw.sample_period & ~PMSIRR_EL1_IVAL_MASK;

> > > +	if (period < spe_pmu->min_period)
> > > +		period = spe_pmu->min_period;
> > 
> > We already verify this in arm_spe_pmu_event_init(), so we don't need to
> > check this here.
> > 
> > We can drop arm_spe_event_sanitise_period() entirely. Given we validate
> > the period at event_init() time, there's no need to sanitize the value.
> 
> What about PERF_IOC_PERIOD? I don't think that re-inits the event.

Ugh; good point.

Given that, it's arguably worth not validating the period at
event_init() time, to have consistent behaviour across opening an event
with a given period, and fiddling with tha via PERF_IOC_PERIOD.

I also think that rather than masking the period with
PMSIRR_EL1_IVAL_MASK we should have something like:

	if (period > SPE_PMU_MAX_PERIOD)
		period = SPE_PMU_MAX_PERIOD;

... as that avoids the user asking for max_period + 1, and getting the
minimum period, and other such weirdness.

[...]

> > > +static bool arm_spe_pmu_buffer_mgmt_pending(u64 pmbsr)
> > > +{
> > > +	const char *err_str;
> > > +
> > > +	/* Service required? */
> > > +	if (!(pmbsr & BIT(PMBSR_EL1_S_SHIFT)))
> > > +		return false;
> > > +
> > > +	/* We only expect buffer management events */
> > > +	switch (pmbsr & (PMBSR_EL1_EC_MASK << PMBSR_EL1_EC_SHIFT)) {
> > > +	case PMBSR_EL1_EC_BUF:
> > > +		/* Handled below */
> > > +		break;
> > > +	case PMBSR_EL1_EC_FAULT_S1:
> > > +	case PMBSR_EL1_EC_FAULT_S2:
> > > +		err_str = "Unexpected buffer fault";
> > > +		goto out_err;
> > > +	default:
> > > +		err_str = "Unknown error code";
> > > +		goto out_err;
> > > +	}
> > 
> > For the error cases, I take it the assumption is that we leave
> > PMBSR_EL1.S set, so that the HW doesn't start again?
> 
> No, I don't think I actually handle these cases at all. Whilst they're
> probably catastrophic (vmapped mappings are faulting!), I should at least
> try to park the profiler. Will fix for the next version.

Great; thanks.

[...]

> > > +		handle->head = PERF_IDX2OFF(limit, buf);
> > > +		limit = ((buf->nr_pages * PAGE_SIZE) >> 1) + handle->head;
> > > +	}
> > > +
> > > +	return limit;
> > > +}
> > > +
> > > +static u64 __arm_spe_pmu_next_off(struct perf_output_handle *handle)
> > > +{
> > > +	struct arm_spe_pmu_buf *buf = perf_get_aux(handle);
> > > +	u64 head = PERF_IDX2OFF(handle->head, buf);
> > > +	u64 tail = PERF_IDX2OFF(handle->head + handle->size, buf);
> > > +	u64 wakeup = PERF_IDX2OFF(handle->wakeup, buf);
> > > +	u64 limit = buf->nr_pages * PAGE_SIZE;
> > > +
> > > +	/*
> > > +	 * Set the limit pointer to either the watermark or the
> > > +	 * current tail pointer; whichever comes first.
> > > +	 */
> > > +	if (handle->head + handle->size <= handle->wakeup) {
> > > +		/* The tail is next, so check for wrapping */
> > > +		if (tail >= head) {
> > > +			/*
> > > +			 * No wrapping, but need to align downwards to
> > > +			 * avoid corrupting unconsumed data.
> > > +			 */
> > > +			limit = round_down(tail, PAGE_SIZE);
> > > +
> > > +		}
> > > +	} else if (wakeup >= head) {
> > 
> > When wakeup == head, do we need to signal a wakeup event somehow?
> > Currently we'll pad the buffer, signal truncation, and end output, which
> > seems a little odd, but maybe that's what perf expects.
> 
> Wakeup can never be equal to head here. We know that the wakeup is next (by
> the first if above) and therefore it is within handle->size of head. Since
> we're starting a session, then we either have:
> 
>   1. Wakeup calculated by perf_aux_output_{skip,end}, or
>   2. Wakeup calculated by rb_alloc_aux (initial mmap)
> 
> In case (1), the wakeup will have been signalled when it was calculated to
> be equal to head, and then the wakeup will have been moved to the next
> watermark point. In case (2), the only way it can be equal to head is if
> the watermark was 0, but an initial watermark is converted to half the
> buffer size by the core.
> 
> Admittedly, I'd not realised this at the time (hence the >= check), and it
> looks like we're going to rewrite this anyway :)

Thanks for the explanation; the above is really helpful.

For (2) I agree that after the mmap(), handle->wakeup > handle->head.

For (1), I think that we can hit a case where perf_aux_output_end() will
signal a wakeup, but even after rb->aux_wakeup is moved along, we still
have rb->aux_wakeup <= rb->aux_head.

Rationale for the below; enjoy.

Consider a non-snapshot case where we set up the mmap with aux_watermark
being PAGE_SIZE / 2, and the buffer size being 2 * PAGE_SIZE. Our
initial state of things is:

	rb->aux_head = 0;
	rb->aux_wakeup = 0;
	rb->aux_watermark = PAGE_SIZE / 2;

The first time we start, perf_aux_output_begin() gives us:

	handle->head = 0;
	handle->size = 2 * PAGE_SIZE;
	handle->wakeup = PAGE_SIZE / 2;

Given that, __arm_spe_pmu_next_off() gives us a limit of PAGE_SIZE (both
in your version and mine).

We set SPE off, and it fills PAGE_SIZE worth of buffer, then asserts its
maintenance IRQ. We thus call perf_aux_output_end(handle, PAGE_SIZE),
which adjusts the aux_head, detects we passed wakeup, adjusts the
aux_wakeup, and signals the wakeup:

	local_add(size, &rb->aux_head);
	local_add(rb->aux_watermark, &rb->aux_wakeup);
	...
	perf_output_wakeup(handle);

... leaving us with:

	rb->aux_head = PAGE_SIZE
	rb->aux_wakeup = PAGE_SIZE / 2;
	rb->aux_watermark = PAGE_SIZE / 2;

... so next time we call arm_spe_perf_aux_output_begin() we see:

	handle->head = PAGE_SIZE;
	handle->wakeup = PAGE_SIZE;

So AFAICT, we can see wakeup == head here, though it's arguably the core
code that's at fault. Hopefully I've missed something.

Assuming that above is correct,in either of our __arm_spe_pmu_next_off()
implementations that results in signalling truncation, and calling
perf_aux_output_end(), which will detect another wakeup, and move wakeup
past head. However, as we (spuriously) signalled truncation we end up
stopping.

I think that it would make sense for the perf core to advance the wakeup
beyond head in perf_aux_output_end(), even if this means outputting a
number of wakeup events.

> > > +		/*
> > > +		 * The wakeup is next and doesn't wrap. Align upwards to
> > > +		 * ensure that we do indeed reach the watermark.
> > > +		 */
> > > +		limit = round_up(wakeup, PAGE_SIZE);
> > > +
> > > +		/*
> > > +		 * If rounding up crosses the tail, then we have to
> > > +		 * round down to avoid corrupting unconsumed data.
> > > +		 * Hopefully the tail will have moved by the time we
> > > +		 * hit the new limit.
> > > +		 */
> > > +		if (wakeup < tail && limit > tail)
> > > +			limit = round_down(wakeup, PAGE_SIZE);
> > > +	}
> > 
> > It took me a while to grok that we must consider the wakeup in
> > free-running counter space to avoid early wakeups, while we must
> > consider the tail in ring-buffer offset space to avoid clobbering data.
> > 
> > With that understanding, I think we have an issue here. If wakeup is
> > more than buffer size in the future, and the buffer is empty, I think we
> > set the limit too low.
> > 
> > In that case, we'd evaluate:
> > 
> > 	handle->head + handle->size <= handle->wakeup
> > 
> > ... as true, since size is at most buffer size. Thus we'd go into the
> > first if block. There we'd evaluate:
> 
> If the buffer is empty, then size is exactly buffer size - 1, but I take
> your point.

Heh, I missed that the CIRC_*() helpers  enforced that at least a byte
was always free.

> > 
> > 	tail >= head
> > 
> > ... as true, since when the buffer is empty, head == tail. Thus, we'd
> > set the limit to:
> > 
> > 	round_down(tail, PAGE_SIZE)
> > 
> > ... which'll leave us with limit <= head, since head == tail. Thus,
> > we'll hit the case below:
> > 
> > > +
> > > +	/*
> > > +	 * If rounding down crosses the head, then the buffer is full,
> > > +	 * so pad to tail and end the session.
> > > +	 */
> > > +	if (limit <= head) {
> > > +		memset(buf->base + head, 0, handle->size);
> > > +		perf_aux_output_skip(handle, handle->size);
> > > +		perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
> > > +		perf_aux_output_end(handle, 0);
> > > +		limit = 0;
> > > +	}
> > > +
> > > +	return limit;
> > > +}
> > 
> > ... and end all output, even though the entire buffer was empty, and we
> > could have returned the end of the buffer as the limit.
> > 
> > It might be that something prevents wakeup from being that far in the
> > future, but in previous discussions we'd assumed that it could be any
> > arbitrary value.
> 
> Yes, I think this case does indeed go wrong. Well spotted!
> 
> > I believe we can solve that, and simplify the logic as below. I've left
> > the wakeup < head and wakeup == head cases as above, ignored and
> > terminating respectively.
> 
> I think this mostly works, some suggestions/questions below.
> 
> > static u64 __arm_spe_pmu_next_off(struct perf_output_handle *handle)
> > {
> > 	struct arm_spe_pmu_buf *buf = perf_get_aux(handle);
> > 	const u64 bufsize = buf->nr_pages * PAGE_SIZE;
> > 	u64 limit = bufsize;
> > 	u64 head = PERF_IDX2OFF(handle->head, buf);
> > 	u64 tail = PERF_IDX2OFF(handle->head + handle->size, buf);
> > 	u64 wakeup = PERF_IDX2OFF(handle->wakeup, buf);
> > 
> > 	if (!handle->size)
> > 		goto no_space;
> 
> We can avoid the memset/output_skip in this case.

True.

We can move the no_space label if we remove the other goto (by inverting
the condition paired with it and returning limit there instead). i.e.

	if (!handle->size)
		goto no_space;

	< rounding down limit, etc >

	if (limit > head)
		return limit;
	
	/* 
	 * The buffer wasn't completely full, but we can't use the
	 * remaining space. Fill the unusable remainder with padding
	 */
no_space:
	perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
	perf_aux_output_end(handle, 0);
	return 0;

> > 	/*
> > 	 * Avoid clobbering unconsumed data. We know we have space, so
> > 	 * if we see head == tail we know that the buffer is empty. If
> > 	 * head > tail, then there's nothing to clobber prior to
> > 	 * wrapping.
> > 	 */
> > 	if (head < tail)
> > 		limit = round_down(tail, PAGE_SIZE);
> > 	
> > 	/*
> > 	 * Wakeup may be arbitrarily far into future. If it's not in the
> > 	 * current generation, either we'll wrap before hitting it, or
> > 	 * it's in the past and has been handled already.
> > 	 *
> > 	 * If there's a wakeup before we wrap, arrange to be woken up by
> > 	 * the page boundary following it. Keep the tail boundary if
> > 	 * that's lower.
> > 	 */
> > 	if ((handle->wakeup / bufsize) == (handle->head / bufsize)) &&
> 
> I'd really like to get rid of these divisions, since we're not working with
> nice powers of 2 here. Can't you just do:
> 
>   handle->wakeup < (handle->head + handle->size)
> 
> to establish that they're in the same "generation"?

I initially thought that, but then realised that would also be true if
wakeup was a "generation" behind. When I wrote this, I wasn't sure if
that could happen, and/or what we should do in that case -- as noted
above I'd retained the fact we ignored wakeup in that case.

Extending my argument from earlier, I think that *can* happen for
certain values of aux_watermark, but we have other problems resulting
from that, and we should try to rule that out in the core code.

> > 	    head <= wakeup)
> > 		limit = min(limit, round_up(wakeup, PAGE_SIZE));
> > 
> > 	if (limit <= head)
> > 		goto no_space;
> 
> Does this correctly handle the case where the buffer is full and head
> == tail, but limit == bufsize? AFAICT, we can return a limit of
> bufsize and corrupt the whole buffer.

When head == tail, handle->size == 0, so at the start of the function
we'd have gone straight to no_space.

So AFAICT, we're fine on that front.

[...]

> > > +static void arm_spe_perf_aux_output_begin(struct perf_output_handle *handle,
> > > +					  struct perf_event *event)
> > > +{

> > > +	write_sysreg_s(base, PMBPTR_EL1);
> > > +	limit += (u64)buf->base;
> > > +
> > 
> > I believe an isb() is necessary here to ensure the write to PMBPTR_EL1
> > occurs before the write to PMBLIMITR_EL1 enables the PMU. Otherwise, the
> > CPU could execute those out-of-order.
> 
> This function is always called in a context where the profiler is disabled
> due to some other control (e.g. in PMSCR or because we're in fault context)
> so the isb isn't necessary.
> 
> > > +out_write_limit:
> > > +	write_sysreg_s(limit, PMBLIMITR_EL1);
> > > +}

Ah, I see. Sorry for the noise.

[...]

> > > +static bool arm_spe_perf_aux_output_end(struct perf_output_handle *handle,
> > > +					struct perf_event *event,
> > > +					bool resume)
> > > +{
> > > +	u64 pmbptr, pmbsr, offset, size;
> > > +	struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
> > > +	struct arm_spe_pmu_buf *buf = perf_get_aux(handle);
> > > +	bool truncated;
> > > +
> > > +	/*
> > > +	 * We can be called via IRQ work trying to disable the PMU after
> > > +	 * a buffer full event. In this case, the aux session has already
> > > +	 * been stopped, so there's nothing to do here.
> > > +	 */
> > > +	if (!buf)
> > > +		return false;
> > > +
> > > +	/*
> > > +	 * If there isn't a pending management event and we're not stopping
> > > +	 * the current session, then just leave everything alone.
> > > +	 */
> > > +	pmbsr = read_sysreg_s(PMBSR_EL1);
> > 
> > When we call from arm_spe_pmu_irq_handler(), I think we need
> > synchronisation before reading PMBSR_EL1.
> > 
> > AFAICT from the spec, a context synchronisation event doesn't ensure
> > that the PMU's indirect write to PMBSR_EL1 is visible to the PE's direct
> > read above. I beleive we need a PSB CSYNC (and subsequent ISB) to ensure
> > that.
> 
> I don't think that's right, but the spec isn't completely clear. PSB CSYNC
> is about the profiling data itself, but in this case we've taken an IRQ
> already so the PMBSR will be up-to-date. I'll seek clarification anyway.

Thanks; I'll await this.

> > The only other caller is from arm_spe_pmu_stop(), which first calls
> > arm_spe_pmu_disable_and_drain_local(), so I guess the new barriers
> > should live in arm_spe_pmu_irq_handler(). I'll comment there.
> > 
> > > +	if (!arm_spe_pmu_buffer_mgmt_pending(pmbsr) && resume)
> > > +		return false; /* Spurious IRQ */
> > > +
> > > +	/* Ensure hardware updates to PMBPTR_EL1 are visible */
> > > +	isb();
> > 
> > Can we please move this into arm_spe_pmu_buffer_mgmt_pending(), after
> > the associated PSB CSYNC?
> 
> Hmmm, I deliberately *didn't* do that because I wanted
> arm_spe_pmu_buffer_mgmt_pending to ensure the buffer writes are visible, and
> then the caller can decide if it cares about indirect SPE register writes
> being visible. In reality, I ended up with a single caller, but let's see
> how it looks when I rework it to deal with fatal aborts.

Ok.

> > > +	/*
> > > +	 * Work out how much data has been written since the last update
> > > +	 * to the head index.
> > > +	 */
> > > +	pmbptr = round_down(read_sysreg_s(PMBPTR_EL1), spe_pmu->align);
> > 
> > I don't believe we need to align this.
> > 
> > Per the spec, PMBPTR_EL1[M:0] are RES0 in HW unless sync external abort
> > reporting is present, in which case they're valid. We write these bits
> > as zero, unless we have a bug elsewhere.
> > 
> > ... so either the bits are zero, and we're fine, or an external abourt
> > has been hit. In the external abort case, we have no idea how far we
> > need to reverse the base pointer anyhow.
> 
> As part of the fatal abort handling, I should probably round down to
> max_record_sz (keyed off DL==1, which I don't think can ever happen
> at the moment).

I don't think that's sufficient.

For an async external abort the spec says we cannot assume any valid
data has been written to the buffer when DL==1, so we must throw away
the whole run.

For other cases, it's not clear to me whether PMBPTR_EL1 is guaranteed
to be within max_record_sz of the end of the last complete record. The
spec says that PMBPTR_EL1 might not point at the byte after the last
complete record, and it doesn't say how far beyond this is may be.

Given that the async case is called out specifically, I think the intent
is that it is bounded somehow, but this could do with clarification.

[...]

> > > +	/*
> > > +	 * Either the buffer is full or we're stopping the session. Check
> > > +	 * that we didn't write a partial record, since this can result
> > > +	 * in unparseable trace and we must disable the event.
> > > +	 */
> > > +	if (pmbsr & BIT(PMBSR_EL1_COLL_SHIFT))
> > > +		perf_aux_output_flag(handle, PERF_AUX_FLAG_COLLISION);
> > > +
> > > +	truncated = pmbsr & BIT(PMBSR_EL1_DL_SHIFT);
> > > +	if (truncated)
> > > +		perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
> > > +
> > > +	perf_aux_output_end(handle, size);
> > 
> > The comment block above perf_aux_output_end() says:
> > 
> >   It is the pmu driver's responsibility to observe ordering rules of the
> >   hardware, so that all the data is externally visible before this is
> >   called.
> > 
> > ... but in arm_spe_pmu_buffer_mgmt_pending() we only ensured that the
> > data was visible in the current NSH domain (i.e. only to this CPU).
> > 
> > I followed the callchain for updating head:
> > 
> > perf_aux_output_end()
> > -> perf_event_aux_event()
> > -> perf_output_end()
> > -> perf_output_put_handle()
> > 
> > ... I see that there's an smp_wmb() (i.e. a DMB ISHST) on that path, but
> > it's not clear to me if that's sufficient to ensure that the PMU's
> > writes are made visible to other CPUs.
> 
> With the new memory model, it should be sufficient; the DSB NSH ensures
> data is visible to us locally, and then we order that before the update
> of the ring buffer.
> 
> > Given the comment, I'd feel happier if we had something here or in
> > arm_spe_pmu_buffer_mgmt_pending() to ensure that the PMU's prior writes
> > are visible to other CPUs.
> 
> I could add a comment?

A comment would be great.

> > > +	/*
> > > +	 * If we're not resuming the session, then we can clear the fault
> > > +	 * and we're done, otherwise we need to start a new session.
> > > +	 */
> > > +	if (!resume)
> > > +		write_sysreg_s(0, PMBSR_EL1);
> > > +	else if (!truncated)
> > > +		arm_spe_perf_aux_output_begin(handle, event);
> > > +
> > > +	return true;
> > > +}
> > > +
> > > +/* IRQ handling */
> > > +static irqreturn_t arm_spe_pmu_irq_handler(int irq, void *dev)
> > > +{
> > > +	struct perf_output_handle *handle = dev;
> > > +
> > > +	if (!perf_get_aux(handle))
> > > +		return IRQ_NONE;
> > > +
> > > +	if (!arm_spe_perf_aux_output_end(handle, handle->event, true))
> > > +		return IRQ_NONE;
> > 
> > As commented in arm_spe_perf_aux_output_end(), I think we need a
> > psb_csync(); isb() sequence prior to the read of PMBSR_EL1 in
> > arm_spe_perf_aux_output_end() to ensure that it is up-to-date w.r.t. the
> > interrupt.
> 
> As above, I don't agree but am checking this with the architects.

Sure. I'm also hoping that you're correct here!

> > > +	irq_work_run();
> > > +	isb(); /* Ensure the buffer is disabled if data loss has occurred */
> > 
> > What exactly are we synchronising here?
> > 
> > AFAICT, when truncation occurs we don't clear PMBLIMITR_EL1.E, so the
> > buffer is only implicitly disabled by the PMU's indirect write
> > PMBSR_EL1.S, which we must have already synchronised prior to reading
> > PMBSR_EL1.
> 
> When you report truncation to perf_aux_output_end, it eventually (the
> irq_work_run() above) calls back into arm_spe_pmu_stop, and I want to make
> sure we've nobbled the limit pointer.
> 
> It would probably be clearer just to add an ISB to the end of
> arm_spe_pmu_disable_and_drain_local, which goes back to your previous comments
> about the mgmt_pending code.

That would work for me.

> > ... so I can't see why this is necessary.
> > 
> > > +	write_sysreg_s(0, PMBSR_EL1);
> > 
> > ... and regardless, we clear PMBSR_EL1.S here, which'll start the PMU
> > again, even if truncation occured, which I don't think we want.
> > 
> > Can we have arm_spe_perf_aux_output_end() clear PMBLIMITR_EL1.E when
> > truncation occurs?
> 
> Right, that's exactly what happens. It's just highly convoluted.

Thanks; I see what's going on now. What a joy.

> > > +	return IRQ_HANDLED;
> > > +}
> > > +
> > > +/* Perf callbacks */
> > > +static int arm_spe_pmu_event_init(struct perf_event *event)
> > > +{
> > > +	u64 reg;
> > > +	struct perf_event_attr *attr = &event->attr;
> > > +	struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
> > > +
> > > +	/* This is, of course, deeply driver-specific */
> > > +	if (attr->type != event->pmu->type)
> > > +		return -ENOENT;
> > > +
> > > +	if (event->cpu >= 0 &&
> > > +	    !cpumask_test_cpu(event->cpu, &spe_pmu->supported_cpus))
> > > +		return -ENOENT;
> > 
> > We're not rejecting cpu < 0, so I take it we're trying to handle
> > per-task events?
> 
> Yes, like intel-pt does.
> 
> > As I've mentioned before, that case worries me. One thing I've just
> > realised we need to figure out is what happens if attr.inherit is set.
> > The core doesn't reject that, and I suspect we may need to here.
> 
> That's already rejected by perf_mmap.

Ok. Odd that we even allow those events to be created, though.

> > > +	if (arm_spe_event_to_pmsevfr(event) & PMSEVFR_EL1_RES0)
> > > +		return -EOPNOTSUPP;
> > > +
> > > +	if (event->hw.sample_period < spe_pmu->min_period ||
> > > +	    event->hw.sample_period & PMSIRR_EL1_IVAL_MASK)
> > > +		return -EOPNOTSUPP;
> > 
> > As mentioned in the sysreg comments, we need to check the upper 32 bits
> > of the PMSIRR value are zero, so we'll need something like:
> > 
> > 	if (event->hw.sample_period < spe_pmu->min_period)
> > 		return -EOPNOTSUPP;
> > 	
> > 	if (event->hw.sample_period &
> > 	    ~(PMSIRR_EL1_INTERVAL_MASK << PMSIRR_EL1_INTERVAL_SHIFT))
> > 		return -EOPNOTSUPP;
> 
> I wonder if we're actually better off just truncating the interval. That
> way, if the interval is extended in the future, then new software won't
> get an error on older cores.

As above, given the PERF_EVENT_IOC_INTERVAL case, I agree that it's
better to not validate the interval here, and to truncate it as
necessary when installing the event.

> It feels a bit weird putting the max interval in sysfs.

I have no strong feelings either way. Maybe it's useful for the user to
choose a sensible / optimum userspace buffer size?

[...]

> > Does anything prevent this event from being added to a group?
> 
> PERF_PMU_CAP_EXCLUSIVE should take care of that in the core.

So it does. Phew.

[...]

> > > +static void arm_spe_pmu_disable_and_drain_local(void)
> > > +{
> > > +	/* Disable profiling at EL0 and EL1 */
> > > +	write_sysreg_s(0, PMSCR_EL1);
> > > +	isb();
> > > +
> > > +	/* Drain any buffered data */
> > > +	psb_csync();
> > > +	dsb(nsh);
> > > +
> > > +	/* Disable the profiling buffer */
> > > +	write_sysreg_s(0, PMBLIMITR_EL1);
> > 
> > Can't this be done when we clear PMSCR_EL1? Surely buffered data would
> > be written out regardless?
> 
> I think the buffered data could be silently dropped if you did that.

Having looked over the spec again, I think you're correct. Sorry for the
noise.

I'd missed the distinction between disabling sampling and disabling
the buffer into which samples are fed.

> > > +static void *arm_spe_pmu_setup_aux(int cpu, void **pages, int nr_pages,
> > > +				   bool snapshot)
> > > +{
> > > +	int i;
> > > +	struct page **pglist;
> > > +	struct arm_spe_pmu_buf *buf;
> > > +
> > > +	/*
> > > +	 * We require an even number of pages for snapshot mode, so that
> > > +	 * we can effectively treat the buffer as consisting of two equal
> > > +	 * parts and give userspace a fighting chance of getting some
> > > +	 * useful data out of it.
> > > +	 */
> > > +	if (!nr_pages || (snapshot && (nr_pages & 1)))
> > > +		return NULL;
> > 
> > As noted above, we may need to ensure that this is a pwoer of two.
> 
> Sorry, I don't follow. The power-of-two bug was in my IDX2OFF macro, which
> I've fixed. For snapshot mode, we just need an even number of pages.

Sure; with IDX2OFF() fixed the above is sufficient.

> > > +	buf = kzalloc_node(sizeof(*buf), GFP_KERNEL, cpu_to_node(cpu));
> > > +	if (!buf)
> > > +		return NULL;
> > > +
> > > +	pglist = kcalloc(nr_pages, sizeof(*pglist), GFP_KERNEL);
> > > +	if (!pglist)
> > > +		goto out_free_buf;
> > > +
> > > +	for (i = 0; i < nr_pages; ++i) {
> > > +		struct page *page = virt_to_page(pages[i]);
> > > +
> > > +		if (PagePrivate(page)) {
> > > +			pr_warn("unexpected high-order page for auxbuf!");
> > 
> > It looks like the intel-pt driver expects high-order pages.
> > 
> > What prevents us from seeing those?
> 
> The fact that we don't set the PERF_PMU_CAP_AUX_NO_SG capability. The intel
> driver uses physical address and scatter/gather lists, whereas ours just
> takes virtual addresses for the buffer.
> 
> > Why can't we handle those?
> 
> We never get them, so we don't need to.

Ok.

> > How are these pages pinned? Does the core ensure that?
> 
> Yes, they're GFP_KERNEL pages underneath.

Ok.

> > > +	spe_pmu->pmu = (struct pmu) {
> > > +		.capabilities	= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE,
> > > +		.attr_groups	= arm_spe_pmu_attr_groups,
> > > +		/*
> > > +		 * We hitch a ride on the software context here, so that
> > > +		 * we can support per-task profiling (which is not possible
> > > +		 * with the invalid context as it doesn't get sched callbacks).
> > > +		 * This requires that userspace either uses a dummy event for
> > > +		 * perf_event_open, since the aux buffer is not setup until
> > > +		 * a subsequent mmap, or creates the profiling event in a
> > > +		 * disabled state and explicitly PERF_EVENT_IOC_ENABLEs it
> > > +		 * once the buffer has been created.
> > > +		 */
> > > +		.task_ctx_nr	= perf_sw_context,
> > 
> > While other tracing PMUs do this, I think this is a horrible bodge, and
> > a bad idea, given it violates assumptions made in the core code.
> > 
> > For example, unlike true SW events, add() and start() can fail, so a
> > tracing event can unexpectedly stop SW events from being scheduled.
> > 
> > AFAICT, we could also try to move a tracing event into a later-created
> > HW PMU group, which is very worrying.
> > 
> > I really think we should have a separate tracing context for this class
> > of PMU, or we make it so that the invalid context can receive sched
> > callbacks.
> 
> Fair point, and I already have a comment to call this out. Whilst I'm not
> against seeing this fixed, I think it should be a separate patch series
> given that this is a common idiom amongst system/uncore PMU drivers.

I'll add this to my list of things to look at.

I would like to give this a poke in at least the group attach scenario I
mention above.

Thanks,
Mark.
Mark Rutland July 3, 2017, 5:23 p.m. UTC | #6
On Mon, Jun 05, 2017 at 04:22:56PM +0100, Will Deacon wrote:
> +static const struct of_device_id arm_spe_pmu_of_match[] = {
> +	{ .compatible = "arm,statistical-profiling-extension-v1", .data = (void *)1 },
> +};

I just noticed that we're missing a sentinel entry here. Please could
you append one for v5?

Somehow we've been getting lucky with subsequent memory being zero, but
KASAN screams when it sees us accessing said memory:

[    5.451190] ==================================================================
[    5.459186] BUG: KASAN: global-out-of-bounds in __of_match_node+0x140/0x158
[    5.466595] Read of size 1 at addr ffff20000a504788 by task swapper/0/1
[    5.473615] 
[    5.475388] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.12.0-rc4-00010-g35b026e #2
[    5.483547] Hardware name: ARM Juno development board (r1) (DT)
[    5.489856] Call trace:
[    5.492629] [<ffff200008094540>] dump_backtrace+0x0/0x560
[    5.498445] [<ffff200008094ac0>] show_stack+0x20/0x30
[    5.503910] [<ffff200008ba4280>] dump_stack+0x11c/0x184
[    5.509555] [<ffff200008546a18>] print_address_description+0x40/0x388
[    5.516446] [<ffff200008547080>] kasan_report+0x138/0x398
[    5.522266] [<ffff2000085472f8>] __asan_report_load1_noabort+0x18/0x20
[    5.529238] [<ffff200009a90f00>] __of_match_node+0x140/0x158
[    5.535312] [<ffff200009a90f54>] of_match_node+0x3c/0x60
[    5.541032] [<ffff200009a95fa4>] of_match_device+0x54/0x98
[    5.546939] [<ffff2000091b39ac>] platform_match+0xc4/0x2e8
[    5.552841] [<ffff2000091ada40>] __driver_attach+0x70/0x218
[    5.558831] [<ffff2000091a7c2c>] bus_for_each_dev+0x13c/0x1d0
[    5.564999] [<ffff2000091ac590>] driver_attach+0x48/0x78
[    5.570720] [<ffff2000091ab514>] bus_add_driver+0x26c/0x5e0
[    5.576712] [<ffff2000091af86c>] driver_register+0x16c/0x398
[    5.582802] [<ffff2000091b33f8>] __platform_driver_register+0xd8/0x128
[    5.589775] [<ffff20000a89a18c>] arm_spe_pmu_init+0x60/0x8c
[    5.595766] [<ffff200008084acc>] do_one_initcall+0xcc/0x370
[    5.601762] [<ffff20000a7e1d3c>] kernel_init_freeable+0x5f8/0x6c4
[    5.608294] [<ffff200009f8f7a8>] kernel_init+0x18/0x190
[    5.613924] [<ffff200008084710>] ret_from_fork+0x10/0x40
[    5.619604] 
[    5.621346] The buggy address belongs to the variable:
[    5.626893]  arm_spe_pmu_of_match+0xc8/0x4c0
[    5.631496] 
[    5.633235] Memory state around the buggy address:
[    5.638413]  ffff20000a504680: 00 01 fa fa fa fa fa fa 00 00 00 00 00 00 00 00
[    5.646235]  ffff20000a504700: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[    5.654057] >ffff20000a504780: 00 fa fa fa fa fa fa fa 00 fa fa fa fa fa fa fa
[    5.661852]                       ^
[    5.665683]  ffff20000a504800: 07 fa fa fa fa fa fa fa 00 04 fa fa fa fa fa fa
[    5.673506]  ffff20000a504880: 00 05 fa fa fa fa fa fa 00 06 fa fa fa fa fa fa
[    5.681302] ==================================================================

Thanks,
Mark.
diff mbox

Patch

diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index aa587edaf9ea..2e24b9c5744e 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -42,4 +42,12 @@  config XGENE_PMU
         help
           Say y if you want to use APM X-Gene SoC performance monitors.
 
+config ARM_SPE_PMU
+	tristate "Enable support for the ARMv8.2 Statistical Profiling Extension"
+	depends on PERF_EVENTS && ARM64
+	help
+	  Enable perf support for the ARMv8.2 Statistical Profiling
+	  Extension, which provides periodic sampling of operations in
+	  the CPU pipeline and reports this via the perf AUX interface.
+
 endmenu
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index 6420bd4394d5..eaee60cf4b1b 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -3,3 +3,4 @@  obj-$(CONFIG_ARM_PMU_ACPI) += arm_pmu_acpi.o
 obj-$(CONFIG_QCOM_L2_PMU)	+= qcom_l2_pmu.o
 obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o
 obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o
+obj-$(CONFIG_ARM_SPE_PMU) += arm_spe_pmu.o
diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c
new file mode 100644
index 000000000000..a271300ad27d
--- /dev/null
+++ b/drivers/perf/arm_spe_pmu.c
@@ -0,0 +1,1243 @@ 
+/*
+ * Perf support for the Statistical Profiling Extension, introduced as
+ * part of ARMv8.2.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Copyright (C) 2016 ARM Limited
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ */
+
+#define PMUNAME				"arm_spe"
+#define DRVNAME				PMUNAME "_pmu"
+#define pr_fmt(fmt)			DRVNAME ": " fmt
+
+#include <linux/cpuhotplug.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+#include <asm/sysreg.h>
+
+/* ID registers */
+#define PMSIDR_EL1			sys_reg(3, 0, 9, 9, 7)
+#define PMSIDR_EL1_FE_SHIFT		0
+#define PMSIDR_EL1_FT_SHIFT		1
+#define PMSIDR_EL1_FL_SHIFT		2
+#define PMSIDR_EL1_ARCHINST_SHIFT	3
+#define PMSIDR_EL1_LDS_SHIFT		4
+#define PMSIDR_EL1_ERND_SHIFT		5
+#define PMSIDR_EL1_INTERVAL_SHIFT	8
+#define PMSIDR_EL1_INTERVAL_MASK	0xfUL
+#define PMSIDR_EL1_MAXSIZE_SHIFT	12
+#define PMSIDR_EL1_MAXSIZE_MASK		0xfUL
+#define PMSIDR_EL1_COUNTSIZE_SHIFT	16
+#define PMSIDR_EL1_COUNTSIZE_MASK	0xfUL
+
+#define PMBIDR_EL1			sys_reg(3, 0, 9, 10, 7)
+#define PMBIDR_EL1_ALIGN_SHIFT		0
+#define PMBIDR_EL1_ALIGN_MASK		0xfU
+#define PMBIDR_EL1_P_SHIFT		4
+#define PMBIDR_EL1_F_SHIFT		5
+
+/* Sampling controls */
+#define PMSCR_EL1			sys_reg(3, 0, 9, 9, 0)
+#define PMSCR_EL1_E0SPE_SHIFT		0
+#define PMSCR_EL1_E1SPE_SHIFT		1
+#define PMSCR_EL1_CX_SHIFT		3
+#define PMSCR_EL1_PA_SHIFT		4
+#define PMSCR_EL1_TS_SHIFT		5
+#define PMSCR_EL1_PCT_SHIFT		6
+
+#define PMSICR_EL1			sys_reg(3, 0, 9, 9, 2)
+
+#define PMSIRR_EL1			sys_reg(3, 0, 9, 9, 3)
+#define PMSIRR_EL1_RND_SHIFT		0
+#define PMSIRR_EL1_IVAL_MASK		0xffUL
+
+/* Filtering controls */
+#define PMSFCR_EL1			sys_reg(3, 0, 9, 9, 4)
+#define PMSFCR_EL1_FE_SHIFT		0
+#define PMSFCR_EL1_FT_SHIFT		1
+#define PMSFCR_EL1_FL_SHIFT		2
+#define PMSFCR_EL1_B_SHIFT		16
+#define PMSFCR_EL1_LD_SHIFT		17
+#define PMSFCR_EL1_ST_SHIFT		18
+
+#define PMSEVFR_EL1			sys_reg(3, 0, 9, 9, 5)
+#define PMSEVFR_EL1_RES0		0x0000ffff00ff0f55UL
+
+#define PMSLATFR_EL1			sys_reg(3, 0, 9, 9, 6)
+#define PMSLATFR_EL1_MINLAT_SHIFT	0
+
+/* Buffer controls */
+#define PMBLIMITR_EL1			sys_reg(3, 0, 9, 10, 0)
+#define PMBLIMITR_EL1_E_SHIFT		0
+#define PMBLIMITR_EL1_FM_SHIFT		1
+#define PMBLIMITR_EL1_FM_MASK		0x3UL
+#define PMBLIMITR_EL1_FM_STOP_IRQ	(0 << PMBLIMITR_EL1_FM_SHIFT)
+
+#define PMBPTR_EL1			sys_reg(3, 0, 9, 10, 1)
+
+/* Buffer error reporting */
+#define PMBSR_EL1			sys_reg(3, 0, 9, 10, 3)
+#define PMBSR_EL1_COLL_SHIFT		16
+#define PMBSR_EL1_S_SHIFT		17
+#define PMBSR_EL1_EA_SHIFT		18
+#define PMBSR_EL1_DL_SHIFT		19
+#define PMBSR_EL1_EC_SHIFT		26
+#define PMBSR_EL1_EC_MASK		0x3fUL
+
+#define PMBSR_EL1_EC_BUF		(0x0UL << PMBSR_EL1_EC_SHIFT)
+#define PMBSR_EL1_EC_FAULT_S1		(0x24UL << PMBSR_EL1_EC_SHIFT)
+#define PMBSR_EL1_EC_FAULT_S2		(0x25UL << PMBSR_EL1_EC_SHIFT)
+
+#define PMBSR_EL1_FAULT_FSC_SHIFT	0
+#define PMBSR_EL1_FAULT_FSC_MASK	0x3fUL
+
+#define PMBSR_EL1_BUF_BSC_SHIFT		0
+#define PMBSR_EL1_BUF_BSC_MASK		0x3fUL
+
+#define PMBSR_EL1_BUF_BSC_FULL		(0x1UL << PMBSR_EL1_BUF_BSC_SHIFT)
+
+#define psb_csync()			asm volatile("hint #17")
+
+struct arm_spe_pmu_buf {
+	int					nr_pages;
+	bool					snapshot;
+	void					*base;
+};
+
+struct arm_spe_pmu {
+	struct pmu				pmu;
+	struct platform_device			*pdev;
+	cpumask_t				supported_cpus;
+	struct hlist_node			hotplug_node;
+
+	int					irq; /* PPI */
+
+	u16					min_period;
+	u16					cnt_width;
+
+#define SPE_PMU_FEAT_FILT_EVT			(1UL << 0)
+#define SPE_PMU_FEAT_FILT_TYP			(1UL << 1)
+#define SPE_PMU_FEAT_FILT_LAT			(1UL << 2)
+#define SPE_PMU_FEAT_ARCH_INST			(1UL << 3)
+#define SPE_PMU_FEAT_LDS			(1UL << 4)
+#define SPE_PMU_FEAT_ERND			(1UL << 5)
+#define SPE_PMU_FEAT_DEV_PROBED			(1UL << 63)
+	u64					features;
+
+	u16					max_record_sz;
+	u16					align;
+	struct perf_output_handle __percpu	*handle;
+};
+
+#define to_spe_pmu(p) (container_of(p, struct arm_spe_pmu, pmu))
+
+/* Convert a free-running index from perf into an SPE buffer offset */
+#define PERF_IDX2OFF(idx, buf)	((idx) & (((buf)->nr_pages << PAGE_SHIFT) - 1))
+
+/* Keep track of our dynamic hotplug state */
+static enum cpuhp_state arm_spe_pmu_online;
+
+/* This sysfs gunk was really good fun to write. */
+enum arm_spe_pmu_capabilities {
+	SPE_PMU_CAP_ARCH_INST = 0,
+	SPE_PMU_CAP_ERND,
+	SPE_PMU_CAP_FEAT_MAX,
+	SPE_PMU_CAP_CNT_SZ = SPE_PMU_CAP_FEAT_MAX,
+	SPE_PMU_CAP_MIN_IVAL,
+};
+
+static int arm_spe_pmu_feat_caps[SPE_PMU_CAP_FEAT_MAX] = {
+	[SPE_PMU_CAP_ARCH_INST]	= SPE_PMU_FEAT_ARCH_INST,
+	[SPE_PMU_CAP_ERND]	= SPE_PMU_FEAT_ERND,
+};
+
+static u32 arm_spe_pmu_cap_get(struct arm_spe_pmu *spe_pmu, int cap)
+{
+	if (cap < SPE_PMU_CAP_FEAT_MAX)
+		return !!(spe_pmu->features & arm_spe_pmu_feat_caps[cap]);
+
+	switch (cap) {
+	case SPE_PMU_CAP_CNT_SZ:
+		return spe_pmu->cnt_width;
+	case SPE_PMU_CAP_MIN_IVAL:
+		return spe_pmu->min_period;
+	default:
+		WARN(1, "unknown cap %d\n", cap);
+	}
+
+	return 0;
+}
+
+static ssize_t arm_spe_pmu_cap_show(struct device *dev,
+				    struct device_attribute *attr,
+				    char *buf)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct arm_spe_pmu *spe_pmu = platform_get_drvdata(pdev);
+	struct dev_ext_attribute *ea =
+		container_of(attr, struct dev_ext_attribute, attr);
+	int cap = (long)ea->var;
+
+	return snprintf(buf, PAGE_SIZE, "%u\n",
+		arm_spe_pmu_cap_get(spe_pmu, cap));
+}
+
+#define SPE_EXT_ATTR_ENTRY(_name, _func, _var)				\
+	&((struct dev_ext_attribute[]) {				\
+		{ __ATTR(_name, S_IRUGO, _func, NULL), (void *)_var }	\
+	})[0].attr.attr
+
+#define SPE_CAP_EXT_ATTR_ENTRY(_name, _var)				\
+	SPE_EXT_ATTR_ENTRY(_name, arm_spe_pmu_cap_show, _var)
+
+static struct attribute *arm_spe_pmu_cap_attr[] = {
+	SPE_CAP_EXT_ATTR_ENTRY(arch_inst, SPE_PMU_CAP_ARCH_INST),
+	SPE_CAP_EXT_ATTR_ENTRY(ernd, SPE_PMU_CAP_ERND),
+	SPE_CAP_EXT_ATTR_ENTRY(count_size, SPE_PMU_CAP_CNT_SZ),
+	SPE_CAP_EXT_ATTR_ENTRY(min_interval, SPE_PMU_CAP_MIN_IVAL),
+	NULL,
+};
+
+static struct attribute_group arm_spe_pmu_cap_group = {
+	.name	= "caps",
+	.attrs	= arm_spe_pmu_cap_attr,
+};
+
+/* User ABI */
+#define ATTR_CFG_FLD_ts_enable_CFG		config	/* PMSCR_EL1.TS */
+#define ATTR_CFG_FLD_ts_enable_LO		0
+#define ATTR_CFG_FLD_ts_enable_HI		0
+#define ATTR_CFG_FLD_pa_enable_CFG		config	/* PMSCR_EL1.PA */
+#define ATTR_CFG_FLD_pa_enable_LO		1
+#define ATTR_CFG_FLD_pa_enable_HI		1
+#define ATTR_CFG_FLD_jitter_CFG			config	/* PMSIRR_EL1.RND */
+#define ATTR_CFG_FLD_jitter_LO			16
+#define ATTR_CFG_FLD_jitter_HI			16
+#define ATTR_CFG_FLD_branch_filter_CFG		config	/* PMSFCR_EL1.B */
+#define ATTR_CFG_FLD_branch_filter_LO		32
+#define ATTR_CFG_FLD_branch_filter_HI		32
+#define ATTR_CFG_FLD_load_filter_CFG		config	/* PMSFCR_EL1.LD */
+#define ATTR_CFG_FLD_load_filter_LO		33
+#define ATTR_CFG_FLD_load_filter_HI		33
+#define ATTR_CFG_FLD_store_filter_CFG		config	/* PMSFCR_EL1.ST */
+#define ATTR_CFG_FLD_store_filter_LO		34
+#define ATTR_CFG_FLD_store_filter_HI		34
+
+#define ATTR_CFG_FLD_event_filter_CFG		config1	/* PMSEVFR_EL1 */
+#define ATTR_CFG_FLD_event_filter_LO		0
+#define ATTR_CFG_FLD_event_filter_HI		63
+
+#define ATTR_CFG_FLD_min_latency_CFG		config2	/* PMSLATFR_EL1.MINLAT */
+#define ATTR_CFG_FLD_min_latency_LO		0
+#define ATTR_CFG_FLD_min_latency_HI		11
+
+/* Why does everything I do descend into this? */
+#define __GEN_PMU_FORMAT_ATTR(cfg, lo, hi)				\
+	(lo) == (hi) ? #cfg ":" #lo "\n" : #cfg ":" #lo "-" #hi
+
+#define _GEN_PMU_FORMAT_ATTR(cfg, lo, hi)				\
+	__GEN_PMU_FORMAT_ATTR(cfg, lo, hi)
+
+#define GEN_PMU_FORMAT_ATTR(name)					\
+	PMU_FORMAT_ATTR(name,						\
+	_GEN_PMU_FORMAT_ATTR(ATTR_CFG_FLD_##name##_CFG,			\
+			     ATTR_CFG_FLD_##name##_LO,			\
+			     ATTR_CFG_FLD_##name##_HI))
+
+#define _ATTR_CFG_GET_FLD(attr, cfg, lo, hi)				\
+	((((attr)->cfg) >> lo) & GENMASK(hi - lo, 0))
+
+#define ATTR_CFG_GET_FLD(attr, name)					\
+	_ATTR_CFG_GET_FLD(attr,						\
+			  ATTR_CFG_FLD_##name##_CFG,			\
+			  ATTR_CFG_FLD_##name##_LO,			\
+			  ATTR_CFG_FLD_##name##_HI)
+
+GEN_PMU_FORMAT_ATTR(ts_enable);
+GEN_PMU_FORMAT_ATTR(pa_enable);
+GEN_PMU_FORMAT_ATTR(jitter);
+GEN_PMU_FORMAT_ATTR(load_filter);
+GEN_PMU_FORMAT_ATTR(store_filter);
+GEN_PMU_FORMAT_ATTR(branch_filter);
+GEN_PMU_FORMAT_ATTR(event_filter);
+GEN_PMU_FORMAT_ATTR(min_latency);
+
+static struct attribute *arm_spe_pmu_formats_attr[] = {
+	&format_attr_ts_enable.attr,
+	&format_attr_pa_enable.attr,
+	&format_attr_jitter.attr,
+	&format_attr_load_filter.attr,
+	&format_attr_store_filter.attr,
+	&format_attr_branch_filter.attr,
+	&format_attr_event_filter.attr,
+	&format_attr_min_latency.attr,
+	NULL,
+};
+
+static struct attribute_group arm_spe_pmu_format_group = {
+	.name	= "format",
+	.attrs	= arm_spe_pmu_formats_attr,
+};
+
+static ssize_t arm_spe_pmu_get_attr_cpumask(struct device *dev,
+					    struct device_attribute *attr,
+					    char *buf)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct arm_spe_pmu *spe_pmu = platform_get_drvdata(pdev);
+
+	return cpumap_print_to_pagebuf(true, buf, &spe_pmu->supported_cpus);
+}
+static DEVICE_ATTR(cpumask, S_IRUGO, arm_spe_pmu_get_attr_cpumask, NULL);
+
+static struct attribute *arm_spe_pmu_attrs[] = {
+	&dev_attr_cpumask.attr,
+	NULL,
+};
+
+static struct attribute_group arm_spe_pmu_group = {
+	.attrs	= arm_spe_pmu_attrs,
+};
+
+static const struct attribute_group *arm_spe_pmu_attr_groups[] = {
+	&arm_spe_pmu_group,
+	&arm_spe_pmu_cap_group,
+	&arm_spe_pmu_format_group,
+	NULL,
+};
+
+/* Convert between user ABI and register values */
+static u64 arm_spe_event_to_pmscr(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	u64 reg = 0;
+
+	reg |= ATTR_CFG_GET_FLD(attr, ts_enable) << PMSCR_EL1_TS_SHIFT;
+	reg |= ATTR_CFG_GET_FLD(attr, pa_enable) << PMSCR_EL1_PA_SHIFT;
+
+	if (!attr->exclude_user)
+		reg |= BIT(PMSCR_EL1_E0SPE_SHIFT);
+
+	if (!attr->exclude_kernel)
+		reg |= BIT(PMSCR_EL1_E1SPE_SHIFT);
+
+	if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR))
+		reg |= BIT(PMSCR_EL1_CX_SHIFT);
+
+	return reg;
+}
+
+static void arm_spe_event_sanitise_period(struct perf_event *event)
+{
+	struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
+	u64 period = event->hw.sample_period & ~PMSIRR_EL1_IVAL_MASK;
+
+	if (period < spe_pmu->min_period)
+		period = spe_pmu->min_period;
+
+	event->hw.sample_period = period;
+}
+
+static u64 arm_spe_event_to_pmsirr(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	u64 reg = 0;
+
+	arm_spe_event_sanitise_period(event);
+
+	reg |= ATTR_CFG_GET_FLD(attr, jitter) << PMSIRR_EL1_RND_SHIFT;
+	reg |= event->hw.sample_period;
+
+	return reg;
+}
+
+static u64 arm_spe_event_to_pmsfcr(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	u64 reg = 0;
+
+	reg |= ATTR_CFG_GET_FLD(attr, load_filter) << PMSFCR_EL1_LD_SHIFT;
+	reg |= ATTR_CFG_GET_FLD(attr, store_filter) << PMSFCR_EL1_ST_SHIFT;
+	reg |= ATTR_CFG_GET_FLD(attr, branch_filter) << PMSFCR_EL1_B_SHIFT;
+
+	if (reg)
+		reg |= BIT(PMSFCR_EL1_FT_SHIFT);
+
+	if (ATTR_CFG_GET_FLD(attr, event_filter))
+		reg |= BIT(PMSFCR_EL1_FE_SHIFT);
+
+	if (ATTR_CFG_GET_FLD(attr, min_latency))
+		reg |= BIT(PMSFCR_EL1_FL_SHIFT);
+
+	return reg;
+}
+
+static u64 arm_spe_event_to_pmsevfr(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	return ATTR_CFG_GET_FLD(attr, event_filter);
+}
+
+static u64 arm_spe_event_to_pmslatfr(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	return ATTR_CFG_GET_FLD(attr, min_latency) << PMSLATFR_EL1_MINLAT_SHIFT;
+}
+
+static bool arm_spe_pmu_buffer_mgmt_pending(u64 pmbsr)
+{
+	const char *err_str;
+
+	/* Service required? */
+	if (!(pmbsr & BIT(PMBSR_EL1_S_SHIFT)))
+		return false;
+
+	/* We only expect buffer management events */
+	switch (pmbsr & (PMBSR_EL1_EC_MASK << PMBSR_EL1_EC_SHIFT)) {
+	case PMBSR_EL1_EC_BUF:
+		/* Handled below */
+		break;
+	case PMBSR_EL1_EC_FAULT_S1:
+	case PMBSR_EL1_EC_FAULT_S2:
+		err_str = "Unexpected buffer fault";
+		goto out_err;
+	default:
+		err_str = "Unknown error code";
+		goto out_err;
+	}
+
+	/* Buffer management event */
+	switch (pmbsr & (PMBSR_EL1_BUF_BSC_MASK << PMBSR_EL1_BUF_BSC_SHIFT)) {
+	case PMBSR_EL1_BUF_BSC_FULL:
+		/* Ensure new profiling data is visible to the CPU */
+		psb_csync();
+		dsb(nsh);
+		return true;
+	default:
+		err_str = "Unknown buffer status code";
+	}
+
+out_err:
+	pr_err_ratelimited("%s on CPU %d [PMBSR=0x%08llx]\n", err_str,
+			   smp_processor_id(), pmbsr);
+	return false;
+}
+
+static u64 arm_spe_pmu_next_snapshot_off(struct perf_output_handle *handle)
+{
+	struct arm_spe_pmu_buf *buf = perf_get_aux(handle);
+	struct arm_spe_pmu *spe_pmu = to_spe_pmu(handle->event->pmu);
+	u64 head = PERF_IDX2OFF(handle->head, buf);
+	u64 limit = buf->nr_pages * PAGE_SIZE;
+
+	/*
+	 * The trace format isn't parseable in reverse, so clamp
+	 * the limit to half of the buffer size in snapshot mode
+	 * so that the worst case is half a buffer of records, as
+	 * opposed to a single record.
+	 */
+	if (head < limit >> 1)
+		limit >>= 1;
+
+	/*
+	 * If we're within max_record_sz of the limit, we must
+	 * pad, move the head index and recompute the limit.
+	 */
+	if (limit - head < spe_pmu->max_record_sz) {
+		memset(buf->base + head, 0, limit - head);
+		handle->head = PERF_IDX2OFF(limit, buf);
+		limit = ((buf->nr_pages * PAGE_SIZE) >> 1) + handle->head;
+	}
+
+	return limit;
+}
+
+static u64 __arm_spe_pmu_next_off(struct perf_output_handle *handle)
+{
+	struct arm_spe_pmu_buf *buf = perf_get_aux(handle);
+	u64 head = PERF_IDX2OFF(handle->head, buf);
+	u64 tail = PERF_IDX2OFF(handle->head + handle->size, buf);
+	u64 wakeup = PERF_IDX2OFF(handle->wakeup, buf);
+	u64 limit = buf->nr_pages * PAGE_SIZE;
+
+	/*
+	 * Set the limit pointer to either the watermark or the
+	 * current tail pointer; whichever comes first.
+	 */
+	if (handle->head + handle->size <= handle->wakeup) {
+		/* The tail is next, so check for wrapping */
+		if (tail >= head) {
+			/*
+			 * No wrapping, but need to align downwards to
+			 * avoid corrupting unconsumed data.
+			 */
+			limit = round_down(tail, PAGE_SIZE);
+
+		}
+	} else if (wakeup >= head) {
+		/*
+		 * The wakeup is next and doesn't wrap. Align upwards to
+		 * ensure that we do indeed reach the watermark.
+		 */
+		limit = round_up(wakeup, PAGE_SIZE);
+
+		/*
+		 * If rounding up crosses the tail, then we have to
+		 * round down to avoid corrupting unconsumed data.
+		 * Hopefully the tail will have moved by the time we
+		 * hit the new limit.
+		 */
+		if (wakeup < tail && limit > tail)
+			limit = round_down(wakeup, PAGE_SIZE);
+	}
+
+	/*
+	 * If rounding down crosses the head, then the buffer is full,
+	 * so pad to tail and end the session.
+	 */
+	if (limit <= head) {
+		memset(buf->base + head, 0, handle->size);
+		perf_aux_output_skip(handle, handle->size);
+		perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
+		perf_aux_output_end(handle, 0);
+		limit = 0;
+	}
+
+	return limit;
+}
+
+static u64 arm_spe_pmu_next_off(struct perf_output_handle *handle)
+{
+	struct arm_spe_pmu_buf *buf = perf_get_aux(handle);
+	struct arm_spe_pmu *spe_pmu = to_spe_pmu(handle->event->pmu);
+	u64 limit = __arm_spe_pmu_next_off(handle);
+	u64 head = PERF_IDX2OFF(handle->head, buf);
+
+	/*
+	 * If the head has come too close to the end of the buffer,
+	 * then pad to the end and recompute the limit.
+	 */
+	if (limit && (limit - head < spe_pmu->max_record_sz)) {
+		memset(buf->base + head, 0, limit - head);
+		perf_aux_output_skip(handle, limit - head);
+		limit = __arm_spe_pmu_next_off(handle);
+	}
+
+	return limit;
+}
+
+static void arm_spe_perf_aux_output_begin(struct perf_output_handle *handle,
+					  struct perf_event *event)
+{
+	u64 base, limit;
+	struct arm_spe_pmu_buf *buf;
+
+	/* Start a new aux session */
+	buf = perf_aux_output_begin(handle, event);
+	if (!buf) {
+		event->hw.state |= PERF_HES_STOPPED;
+		/*
+		 * We still need to clear the limit pointer, since the
+		 * profiler might only be disabled by virtue of a fault.
+		 */
+		limit = 0;
+		goto out_write_limit;
+	}
+
+	limit = buf->snapshot ? arm_spe_pmu_next_snapshot_off(handle)
+			      : arm_spe_pmu_next_off(handle);
+	if (limit)
+		limit |= BIT(PMBLIMITR_EL1_E_SHIFT);
+
+	base = (u64)buf->base + PERF_IDX2OFF(handle->head, buf);
+	write_sysreg_s(base, PMBPTR_EL1);
+	limit += (u64)buf->base;
+
+out_write_limit:
+	write_sysreg_s(limit, PMBLIMITR_EL1);
+}
+
+static bool arm_spe_perf_aux_output_end(struct perf_output_handle *handle,
+					struct perf_event *event,
+					bool resume)
+{
+	u64 pmbptr, pmbsr, offset, size;
+	struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
+	struct arm_spe_pmu_buf *buf = perf_get_aux(handle);
+	bool truncated;
+
+	/*
+	 * We can be called via IRQ work trying to disable the PMU after
+	 * a buffer full event. In this case, the aux session has already
+	 * been stopped, so there's nothing to do here.
+	 */
+	if (!buf)
+		return false;
+
+	/*
+	 * If there isn't a pending management event and we're not stopping
+	 * the current session, then just leave everything alone.
+	 */
+	pmbsr = read_sysreg_s(PMBSR_EL1);
+	if (!arm_spe_pmu_buffer_mgmt_pending(pmbsr) && resume)
+		return false; /* Spurious IRQ */
+
+	/* Ensure hardware updates to PMBPTR_EL1 are visible */
+	isb();
+
+	/*
+	 * Work out how much data has been written since the last update
+	 * to the head index.
+	 */
+	pmbptr = round_down(read_sysreg_s(PMBPTR_EL1), spe_pmu->align);
+	offset = pmbptr - (u64)buf->base;
+	size = offset - PERF_IDX2OFF(handle->head, buf);
+
+	if (buf->snapshot)
+		handle->head = offset;
+
+	/*
+	 * Either the buffer is full or we're stopping the session. Check
+	 * that we didn't write a partial record, since this can result
+	 * in unparseable trace and we must disable the event.
+	 */
+	if (pmbsr & BIT(PMBSR_EL1_COLL_SHIFT))
+		perf_aux_output_flag(handle, PERF_AUX_FLAG_COLLISION);
+
+	truncated = pmbsr & BIT(PMBSR_EL1_DL_SHIFT);
+	if (truncated)
+		perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
+
+	perf_aux_output_end(handle, size);
+
+	/*
+	 * If we're not resuming the session, then we can clear the fault
+	 * and we're done, otherwise we need to start a new session.
+	 */
+	if (!resume)
+		write_sysreg_s(0, PMBSR_EL1);
+	else if (!truncated)
+		arm_spe_perf_aux_output_begin(handle, event);
+
+	return true;
+}
+
+/* IRQ handling */
+static irqreturn_t arm_spe_pmu_irq_handler(int irq, void *dev)
+{
+	struct perf_output_handle *handle = dev;
+
+	if (!perf_get_aux(handle))
+		return IRQ_NONE;
+
+	if (!arm_spe_perf_aux_output_end(handle, handle->event, true))
+		return IRQ_NONE;
+
+	irq_work_run();
+	isb(); /* Ensure the buffer is disabled if data loss has occurred */
+	write_sysreg_s(0, PMBSR_EL1);
+	return IRQ_HANDLED;
+}
+
+/* Perf callbacks */
+static int arm_spe_pmu_event_init(struct perf_event *event)
+{
+	u64 reg;
+	struct perf_event_attr *attr = &event->attr;
+	struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
+
+	/* This is, of course, deeply driver-specific */
+	if (attr->type != event->pmu->type)
+		return -ENOENT;
+
+	if (event->cpu >= 0 &&
+	    !cpumask_test_cpu(event->cpu, &spe_pmu->supported_cpus))
+		return -ENOENT;
+
+	if (arm_spe_event_to_pmsevfr(event) & PMSEVFR_EL1_RES0)
+		return -EOPNOTSUPP;
+
+	if (event->hw.sample_period < spe_pmu->min_period ||
+	    event->hw.sample_period & PMSIRR_EL1_IVAL_MASK)
+		return -EOPNOTSUPP;
+
+	if (attr->exclude_idle)
+		return -EOPNOTSUPP;
+
+	/*
+	 * Feedback-directed frequency throttling doesn't work when we
+	 * have a buffer of samples. We'd need to manually count the
+	 * samples in the buffer when it fills up and adjust the event
+	 * count to reflect that. Instead, force the user to specify a
+	 * sample period instead.
+	 */
+	if (attr->freq)
+		return -EINVAL;
+
+	reg = arm_spe_event_to_pmsfcr(event);
+	if ((reg & BIT(PMSFCR_EL1_FE_SHIFT)) &&
+	    !(spe_pmu->features & SPE_PMU_FEAT_FILT_EVT))
+		return -EOPNOTSUPP;
+
+	if ((reg & BIT(PMSFCR_EL1_FT_SHIFT)) &&
+	    !(spe_pmu->features & SPE_PMU_FEAT_FILT_TYP))
+		return -EOPNOTSUPP;
+
+	if ((reg & BIT(PMSFCR_EL1_FL_SHIFT)) &&
+	    !(spe_pmu->features & SPE_PMU_FEAT_FILT_LAT))
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+static void arm_spe_pmu_start(struct perf_event *event, int flags)
+{
+	u64 reg;
+	struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	struct perf_output_handle *handle = this_cpu_ptr(spe_pmu->handle);
+
+	hwc->state = 0;
+	arm_spe_perf_aux_output_begin(handle, event);
+	if (hwc->state)
+		return;
+
+	reg = arm_spe_event_to_pmsfcr(event);
+	write_sysreg_s(reg, PMSFCR_EL1);
+
+	reg = arm_spe_event_to_pmsevfr(event);
+	write_sysreg_s(reg, PMSEVFR_EL1);
+
+	reg = arm_spe_event_to_pmslatfr(event);
+	write_sysreg_s(reg, PMSLATFR_EL1);
+
+	if (flags & PERF_EF_RELOAD) {
+		reg = arm_spe_event_to_pmsirr(event);
+		write_sysreg_s(reg, PMSIRR_EL1);
+		isb();
+		reg = local64_read(&hwc->period_left);
+		write_sysreg_s(reg, PMSICR_EL1);
+	}
+
+	reg = arm_spe_event_to_pmscr(event);
+	isb();
+	write_sysreg_s(reg, PMSCR_EL1);
+}
+
+static void arm_spe_pmu_disable_and_drain_local(void)
+{
+	/* Disable profiling at EL0 and EL1 */
+	write_sysreg_s(0, PMSCR_EL1);
+	isb();
+
+	/* Drain any buffered data */
+	psb_csync();
+	dsb(nsh);
+
+	/* Disable the profiling buffer */
+	write_sysreg_s(0, PMBLIMITR_EL1);
+}
+
+static void arm_spe_pmu_stop(struct perf_event *event, int flags)
+{
+	struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	struct perf_output_handle *handle = this_cpu_ptr(spe_pmu->handle);
+
+	/* If we're already stopped, then nothing to do */
+	if (hwc->state & PERF_HES_STOPPED)
+		return;
+
+	/* Stop all trace generation */
+	arm_spe_pmu_disable_and_drain_local();
+
+	if (flags & PERF_EF_UPDATE) {
+		arm_spe_perf_aux_output_end(handle, event, false);
+		/*
+		 * This may also contain ECOUNT, but nobody else should
+		 * be looking at period_left, since we forbid frequency
+		 * based sampling.
+		 */
+		local64_set(&hwc->period_left, read_sysreg_s(PMSICR_EL1));
+		hwc->state |= PERF_HES_UPTODATE;
+	}
+
+	hwc->state |= PERF_HES_STOPPED;
+}
+
+static int arm_spe_pmu_add(struct perf_event *event, int flags)
+{
+	int ret = 0;
+	struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	int cpu = event->cpu == -1 ? smp_processor_id() : event->cpu;
+
+	if (!cpumask_test_cpu(cpu, &spe_pmu->supported_cpus))
+		return -ENOENT;
+
+	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+	if (flags & PERF_EF_START) {
+		arm_spe_pmu_start(event, PERF_EF_RELOAD);
+		if (hwc->state & PERF_HES_STOPPED)
+			ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+static void arm_spe_pmu_del(struct perf_event *event, int flags)
+{
+	arm_spe_pmu_stop(event, PERF_EF_UPDATE);
+}
+
+static void arm_spe_pmu_read(struct perf_event *event)
+{
+}
+
+static void *arm_spe_pmu_setup_aux(int cpu, void **pages, int nr_pages,
+				   bool snapshot)
+{
+	int i;
+	struct page **pglist;
+	struct arm_spe_pmu_buf *buf;
+
+	/*
+	 * We require an even number of pages for snapshot mode, so that
+	 * we can effectively treat the buffer as consisting of two equal
+	 * parts and give userspace a fighting chance of getting some
+	 * useful data out of it.
+	 */
+	if (!nr_pages || (snapshot && (nr_pages & 1)))
+		return NULL;
+
+	buf = kzalloc_node(sizeof(*buf), GFP_KERNEL, cpu_to_node(cpu));
+	if (!buf)
+		return NULL;
+
+	pglist = kcalloc(nr_pages, sizeof(*pglist), GFP_KERNEL);
+	if (!pglist)
+		goto out_free_buf;
+
+	for (i = 0; i < nr_pages; ++i) {
+		struct page *page = virt_to_page(pages[i]);
+
+		if (PagePrivate(page)) {
+			pr_warn("unexpected high-order page for auxbuf!");
+			goto out_free_pglist;
+		}
+
+		pglist[i] = virt_to_page(pages[i]);
+	}
+
+	buf->base = vmap(pglist, nr_pages, VM_MAP, PAGE_KERNEL);
+	if (!buf->base)
+		goto out_free_pglist;
+
+	buf->nr_pages	= nr_pages;
+	buf->snapshot	= snapshot;
+
+	kfree(pglist);
+	return buf;
+
+out_free_pglist:
+	kfree(pglist);
+out_free_buf:
+	kfree(buf);
+	return NULL;
+}
+
+static void arm_spe_pmu_free_aux(void *aux)
+{
+	struct arm_spe_pmu_buf *buf = aux;
+
+	vunmap(buf->base);
+	kfree(buf);
+}
+
+/* Initialisation and teardown functions */
+static int arm_spe_pmu_perf_init(struct arm_spe_pmu *spe_pmu)
+{
+	static atomic_t pmu_idx = ATOMIC_INIT(-1);
+
+	int idx;
+	char *name;
+	struct device *dev = &spe_pmu->pdev->dev;
+
+	spe_pmu->pmu = (struct pmu) {
+		.capabilities	= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE,
+		.attr_groups	= arm_spe_pmu_attr_groups,
+		/*
+		 * We hitch a ride on the software context here, so that
+		 * we can support per-task profiling (which is not possible
+		 * with the invalid context as it doesn't get sched callbacks).
+		 * This requires that userspace either uses a dummy event for
+		 * perf_event_open, since the aux buffer is not setup until
+		 * a subsequent mmap, or creates the profiling event in a
+		 * disabled state and explicitly PERF_EVENT_IOC_ENABLEs it
+		 * once the buffer has been created.
+		 */
+		.task_ctx_nr	= perf_sw_context,
+		.event_init	= arm_spe_pmu_event_init,
+		.add		= arm_spe_pmu_add,
+		.del		= arm_spe_pmu_del,
+		.start		= arm_spe_pmu_start,
+		.stop		= arm_spe_pmu_stop,
+		.read		= arm_spe_pmu_read,
+		.setup_aux	= arm_spe_pmu_setup_aux,
+		.free_aux	= arm_spe_pmu_free_aux,
+	};
+
+	idx = atomic_inc_return(&pmu_idx);
+	name = devm_kasprintf(dev, GFP_KERNEL, "%s_%d", PMUNAME, idx);
+	return perf_pmu_register(&spe_pmu->pmu, name, -1);
+}
+
+static void arm_spe_pmu_perf_destroy(struct arm_spe_pmu *spe_pmu)
+{
+	perf_pmu_unregister(&spe_pmu->pmu);
+}
+
+static void __arm_spe_pmu_dev_probe(void *info)
+{
+	int fld;
+	u64 reg;
+	struct arm_spe_pmu *spe_pmu = info;
+	struct device *dev = &spe_pmu->pdev->dev;
+
+	fld = cpuid_feature_extract_unsigned_field(read_cpuid(ID_AA64DFR0_EL1),
+						   ID_AA64DFR0_PMSVER_SHIFT);
+	if (!fld) {
+		dev_err(dev,
+			"unsupported ID_AA64DFR0_EL1.PMSVer [%d] on CPU %d\n",
+			fld, smp_processor_id());
+		return;
+	}
+
+	/* Read PMBIDR first to determine whether or not we have access */
+	reg = read_sysreg_s(PMBIDR_EL1);
+	if (reg & BIT(PMBIDR_EL1_P_SHIFT)) {
+		dev_err(dev,
+			"profiling buffer owned by higher exception level\n");
+		return;
+	}
+
+	/* Minimum alignment. If it's out-of-range, then fail the probe */
+	fld = reg >> PMBIDR_EL1_ALIGN_SHIFT & PMBIDR_EL1_ALIGN_MASK;
+	spe_pmu->align = 1 << fld;
+	if (spe_pmu->align > SZ_2K) {
+		dev_err(dev, "unsupported PMBIDR.Align [%d] on CPU %d\n",
+			fld, smp_processor_id());
+		return;
+	}
+
+	/* It's now safe to read PMSIDR and figure out what we've got */
+	reg = read_sysreg_s(PMSIDR_EL1);
+	if (reg & BIT(PMSIDR_EL1_FE_SHIFT))
+		spe_pmu->features |= SPE_PMU_FEAT_FILT_EVT;
+
+	if (reg & BIT(PMSIDR_EL1_FT_SHIFT))
+		spe_pmu->features |= SPE_PMU_FEAT_FILT_TYP;
+
+	if (reg & BIT(PMSIDR_EL1_FL_SHIFT))
+		spe_pmu->features |= SPE_PMU_FEAT_FILT_LAT;
+
+	if (reg & BIT(PMSIDR_EL1_ARCHINST_SHIFT))
+		spe_pmu->features |= SPE_PMU_FEAT_ARCH_INST;
+
+	if (reg & BIT(PMSIDR_EL1_LDS_SHIFT))
+		spe_pmu->features |= SPE_PMU_FEAT_LDS;
+
+	if (reg & BIT(PMSIDR_EL1_ERND_SHIFT))
+		spe_pmu->features |= SPE_PMU_FEAT_ERND;
+
+	/* This field has a spaced out encoding, so just use a look-up */
+	fld = reg >> PMSIDR_EL1_INTERVAL_SHIFT & PMSIDR_EL1_INTERVAL_MASK;
+	switch (fld) {
+	case 0:
+		spe_pmu->min_period = 256;
+		break;
+	case 2:
+		spe_pmu->min_period = 512;
+		break;
+	case 3:
+		spe_pmu->min_period = 768;
+		break;
+	case 4:
+		spe_pmu->min_period = 1024;
+		break;
+	case 5:
+		spe_pmu->min_period = 1536;
+		break;
+	case 6:
+		spe_pmu->min_period = 2048;
+		break;
+	case 7:
+		spe_pmu->min_period = 3072;
+		break;
+	default:
+		dev_warn(dev, "unknown PMSIDR_EL1.Interval [%d]; assuming 8\n",
+			 fld);
+		/* Fallthrough */
+	case 8:
+		spe_pmu->min_period = 4096;
+	}
+
+	/* Maximum record size. If it's out-of-range, then fail the probe */
+	fld = reg >> PMSIDR_EL1_MAXSIZE_SHIFT & PMSIDR_EL1_MAXSIZE_MASK;
+	spe_pmu->max_record_sz = 1 << fld;
+	if (spe_pmu->max_record_sz > SZ_2K || spe_pmu->max_record_sz < 16) {
+		dev_err(dev, "unsupported PMSIDR_EL1.MaxSize [%d] on CPU %d\n",
+			fld, smp_processor_id());
+		return;
+	}
+
+	fld = reg >> PMSIDR_EL1_COUNTSIZE_SHIFT & PMSIDR_EL1_COUNTSIZE_MASK;
+	switch (fld) {
+	default:
+		dev_warn(dev, "unknown PMSIDR_EL1.CountSize [%d]; assuming 2\n",
+			 fld);
+		/* Fallthrough */
+	case 2:
+		spe_pmu->cnt_width = 12;
+	}
+
+	dev_info(dev,
+		 "probed for CPUs %*pbl [max_record_sz %u, align %u, features 0x%llx]\n",
+		 cpumask_pr_args(&spe_pmu->supported_cpus),
+		 spe_pmu->max_record_sz, spe_pmu->align, spe_pmu->features);
+
+	spe_pmu->features |= SPE_PMU_FEAT_DEV_PROBED;
+	return;
+}
+
+static void __arm_spe_pmu_reset_local(void)
+{
+	/*
+	 * This is probably overkill, as we have no idea where we're
+	 * draining any buffered data to...
+	 */
+	arm_spe_pmu_disable_and_drain_local();
+
+	/* Reset the buffer base pointer */
+	write_sysreg_s(0, PMBPTR_EL1);
+	isb();
+
+	/* Clear any pending management interrupts */
+	write_sysreg_s(0, PMBSR_EL1);
+	isb();
+}
+
+static void __arm_spe_pmu_setup_one(void *info)
+{
+	struct arm_spe_pmu *spe_pmu = info;
+
+	__arm_spe_pmu_reset_local();
+	enable_percpu_irq(spe_pmu->irq, IRQ_TYPE_NONE);
+}
+
+static void __arm_spe_pmu_stop_one(void *info)
+{
+	struct arm_spe_pmu *spe_pmu = info;
+
+	disable_percpu_irq(spe_pmu->irq);
+	__arm_spe_pmu_reset_local();
+}
+
+static int arm_spe_pmu_cpu_startup(unsigned int cpu, struct hlist_node *node)
+{
+	struct arm_spe_pmu *spe_pmu;
+
+	spe_pmu = hlist_entry_safe(node, struct arm_spe_pmu, hotplug_node);
+	if (!cpumask_test_cpu(cpu, &spe_pmu->supported_cpus))
+		return 0;
+
+	__arm_spe_pmu_setup_one(spe_pmu);
+	return 0;
+}
+
+static int arm_spe_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node)
+{
+	struct arm_spe_pmu *spe_pmu;
+
+	spe_pmu = hlist_entry_safe(node, struct arm_spe_pmu, hotplug_node);
+	if (!cpumask_test_cpu(cpu, &spe_pmu->supported_cpus))
+		return 0;
+
+	__arm_spe_pmu_stop_one(spe_pmu);
+	return 0;
+}
+
+static int arm_spe_pmu_dev_init(struct arm_spe_pmu *spe_pmu)
+{
+	int ret;
+	cpumask_t *mask = &spe_pmu->supported_cpus;
+
+	/* Make sure we probe the hardware on a relevant CPU */
+	ret = smp_call_function_any(mask,  __arm_spe_pmu_dev_probe, spe_pmu, 1);
+	if (ret || !(spe_pmu->features & SPE_PMU_FEAT_DEV_PROBED))
+		return -ENXIO;
+
+	/* Request our PPIs (note that the IRQ is still disabled) */
+	ret = request_percpu_irq(spe_pmu->irq, arm_spe_pmu_irq_handler, DRVNAME,
+				 spe_pmu->handle);
+	if (ret)
+		return ret;
+
+	/*
+	 * Register our hotplug notifier now so we don't miss any events.
+	 * This will enable the IRQ for any supported CPUs that are already
+	 * up.
+	 */
+	ret = cpuhp_state_add_instance(arm_spe_pmu_online,
+				       &spe_pmu->hotplug_node);
+	if (ret)
+		free_percpu_irq(spe_pmu->irq, spe_pmu->handle);
+
+	return ret;
+}
+
+static void arm_spe_pmu_dev_teardown(struct arm_spe_pmu *spe_pmu)
+{
+	cpuhp_state_remove_instance(arm_spe_pmu_online, &spe_pmu->hotplug_node);
+	free_percpu_irq(spe_pmu->irq, spe_pmu->handle);
+}
+
+/* Driver and device probing */
+static int arm_spe_pmu_irq_probe(struct arm_spe_pmu *spe_pmu)
+{
+	struct platform_device *pdev = spe_pmu->pdev;
+	int irq = platform_get_irq(pdev, 0);
+
+	if (irq < 0) {
+		dev_err(&pdev->dev, "failed to get IRQ (%d)\n", irq);
+		return -ENXIO;
+	}
+
+	if (!irq_is_percpu(irq)) {
+		dev_err(&pdev->dev, "expected PPI but got SPI (%d)\n", irq);
+		return -EINVAL;
+	}
+
+	if (irq_get_percpu_devid_partition(irq, &spe_pmu->supported_cpus)) {
+		dev_err(&pdev->dev, "failed to get PPI partition (%d)\n", irq);
+		return -EINVAL;
+	}
+
+	spe_pmu->irq = irq;
+	return 0;
+}
+
+static const struct of_device_id arm_spe_pmu_of_match[] = {
+	{ .compatible = "arm,statistical-profiling-extension-v1", .data = (void *)1 },
+};
+
+static int arm_spe_pmu_device_dt_probe(struct platform_device *pdev)
+{
+	int ret;
+	struct arm_spe_pmu *spe_pmu;
+	struct device *dev = &pdev->dev;
+
+	spe_pmu = devm_kzalloc(dev, sizeof(*spe_pmu), GFP_KERNEL);
+	if (!spe_pmu) {
+		dev_err(dev, "failed to allocate spe_pmu\n");
+		return -ENOMEM;
+	}
+
+	spe_pmu->handle = alloc_percpu(typeof(*spe_pmu->handle));
+	if (!spe_pmu->handle)
+		return -ENOMEM;
+
+	spe_pmu->pdev = pdev;
+	platform_set_drvdata(pdev, spe_pmu);
+
+	ret = arm_spe_pmu_irq_probe(spe_pmu);
+	if (ret)
+		goto out_free_handle;
+
+	ret = arm_spe_pmu_dev_init(spe_pmu);
+	if (ret)
+		goto out_free_handle;
+
+	ret = arm_spe_pmu_perf_init(spe_pmu);
+	if (ret)
+		goto out_teardown_dev;
+
+	return 0;
+
+out_teardown_dev:
+	arm_spe_pmu_dev_teardown(spe_pmu);
+out_free_handle:
+	free_percpu(spe_pmu->handle);
+	return ret;
+}
+
+static int arm_spe_pmu_device_remove(struct platform_device *pdev)
+{
+	struct arm_spe_pmu *spe_pmu = platform_get_drvdata(pdev);
+
+	arm_spe_pmu_perf_destroy(spe_pmu);
+	arm_spe_pmu_dev_teardown(spe_pmu);
+	free_percpu(spe_pmu->handle);
+	return 0;
+}
+
+static struct platform_driver arm_spe_pmu_driver = {
+	.driver	= {
+		.name		= DRVNAME,
+		.of_match_table	= of_match_ptr(arm_spe_pmu_of_match),
+	},
+	.probe	= arm_spe_pmu_device_dt_probe,
+	.remove	= arm_spe_pmu_device_remove,
+};
+
+static int __init arm_spe_pmu_init(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, DRVNAME,
+				      arm_spe_pmu_cpu_startup,
+				      arm_spe_pmu_cpu_teardown);
+	if (ret < 0)
+		return ret;
+	arm_spe_pmu_online = ret;
+
+	ret = platform_driver_register(&arm_spe_pmu_driver);
+	if (ret)
+		cpuhp_remove_multi_state(arm_spe_pmu_online);
+
+	return ret;
+}
+
+static void __exit arm_spe_pmu_exit(void)
+{
+	platform_driver_unregister(&arm_spe_pmu_driver);
+	cpuhp_remove_multi_state(arm_spe_pmu_online);
+}
+
+module_init(arm_spe_pmu_init);
+module_exit(arm_spe_pmu_exit);
+
+MODULE_DESCRIPTION("Perf driver for the ARMv8.2 Statistical Profiling Extension");
+MODULE_AUTHOR("Will Deacon <will.deacon@arm.com>");
+MODULE_LICENSE("GPL v2");