diff mbox series

[v6,2/2] drivers/perf: hisi: Add driver for HiSilicon PCIe PMU

Message ID 1622467951-32114-3-git-send-email-liuqi115@huawei.com (mailing list archive)
State Not Applicable
Delegated to: Bjorn Helgaas
Headers show
Series drivers/perf: hisi: Add support for PCIe PMU | expand

Commit Message

liuqi (BA) May 31, 2021, 1:32 p.m. UTC
PCIe PMU Root Complex Integrated End Point(RCiEP) device is supported
to sample bandwidth, latency, buffer occupation etc.

Each PMU RCiEP device monitors multiple Root Ports, and each RCiEP is
registered as a PMU in /sys/bus/event_source/devices, so users can
select target PMU, and use filter to do further sets.

Filtering options contains:
event        - select the event.
subevent     - select the subevent.
port         - select target Root Ports. Information of Root Ports
               are shown under sysfs.
bdf          - select requester_id of target EP device.
trig_len     - set trigger condition for starting event statistics.
trigger_mode - set trigger mode. 0 means start collecting statistics
               when bigger than trigger condition, and 1 means smaller.
thr_len      - set threshold for statistics.
thr_mode     - set threshold mode. 0 means count when bigger than
               threshold, and 1 means smaller.

Reviewed-by: John Garry <john.garry@huawei.com>
Signed-off-by: Qi Liu <liuqi115@huawei.com>
---
 MAINTAINERS                                |    6 +
 drivers/perf/Kconfig                       |    2 +
 drivers/perf/Makefile                      |    1 +
 drivers/perf/pci/Kconfig                   |   16 +
 drivers/perf/pci/Makefile                  |    2 +
 drivers/perf/pci/hisilicon/Makefile        |    3 +
 drivers/perf/pci/hisilicon/hisi_pcie_pmu.c | 1019 ++++++++++++++++++++++++++++
 include/linux/cpuhotplug.h                 |    1 +
 8 files changed, 1050 insertions(+)
 create mode 100644 drivers/perf/pci/Kconfig
 create mode 100644 drivers/perf/pci/Makefile
 create mode 100644 drivers/perf/pci/hisilicon/Makefile
 create mode 100644 drivers/perf/pci/hisilicon/hisi_pcie_pmu.c

Comments

Will Deacon June 11, 2021, 4:23 p.m. UTC | #1
On Mon, May 31, 2021 at 09:32:31PM +0800, Qi Liu wrote:
> PCIe PMU Root Complex Integrated End Point(RCiEP) device is supported
> to sample bandwidth, latency, buffer occupation etc.
> 
> Each PMU RCiEP device monitors multiple Root Ports, and each RCiEP is
> registered as a PMU in /sys/bus/event_source/devices, so users can
> select target PMU, and use filter to do further sets.
> 
> Filtering options contains:
> event        - select the event.
> subevent     - select the subevent.
> port         - select target Root Ports. Information of Root Ports
>                are shown under sysfs.
> bdf          - select requester_id of target EP device.
> trig_len     - set trigger condition for starting event statistics.
> trigger_mode - set trigger mode. 0 means starting to statistic when
>                bigger than trigger condition, and 1 means smaller.
> thr_len      - set threshold for statistics.
> thr_mode     - set threshold mode. 0 means count when bigger than
>                threshold, and 1 means smaller.
> 
> Reviewed-by: John Garry <john.garry@huawei.com>
> Signed-off-by: Qi Liu <liuqi115@huawei.com>
> ---
>  MAINTAINERS                                |    6 +
>  drivers/perf/Kconfig                       |    2 +
>  drivers/perf/Makefile                      |    1 +
>  drivers/perf/pci/Kconfig                   |   16 +
>  drivers/perf/pci/Makefile                  |    2 +
>  drivers/perf/pci/hisilicon/Makefile        |    3 +
>  drivers/perf/pci/hisilicon/hisi_pcie_pmu.c | 1019 ++++++++++++++++++++++++++++

Can we keep this under drivers/perf/hisilicon/ please? I don't see the
need to create a 'pci' directory here.

>  include/linux/cpuhotplug.h                 |    1 +
>  8 files changed, 1050 insertions(+)
>  create mode 100644 drivers/perf/pci/Kconfig
>  create mode 100644 drivers/perf/pci/Makefile
>  create mode 100644 drivers/perf/pci/hisilicon/Makefile
>  create mode 100644 drivers/perf/pci/hisilicon/hisi_pcie_pmu.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 81e1ede..dd5c62d 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -8233,6 +8233,12 @@ W:	http://www.hisilicon.com
>  F:	Documentation/admin-guide/perf/hisi-pmu.rst
>  F:	drivers/perf/hisilicon
>  
> +HISILICON PCIE PMU DRIVER
> +M:	Qi Liu <liuqi115@huawei.com>
> +S:	Maintained
> +F:	Documentation/admin-guide/perf/hisi-pcie-pmu.rst
> +F:	drivers/perf/pci/hisilicon/hisi_pcie_pmu.c
> +
>  HISILICON QM AND ZIP Controller DRIVER
>  M:	Zhou Wang <wangzhou1@hisilicon.com>
>  L:	linux-crypto@vger.kernel.org
> diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
> index 77522e5..ddd82fa 100644
> --- a/drivers/perf/Kconfig
> +++ b/drivers/perf/Kconfig
> @@ -139,4 +139,6 @@ config ARM_DMC620_PMU
>  
>  source "drivers/perf/hisilicon/Kconfig"
>  
> +source "drivers/perf/pci/Kconfig"
> +
>  endmenu
> diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
> index 5260b11..1208c08 100644
> --- a/drivers/perf/Makefile
> +++ b/drivers/perf/Makefile
> @@ -14,3 +14,4 @@ obj-$(CONFIG_THUNDERX2_PMU) += thunderx2_pmu.o
>  obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o
>  obj-$(CONFIG_ARM_SPE_PMU) += arm_spe_pmu.o
>  obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o
> +obj-y += pci/
> diff --git a/drivers/perf/pci/Kconfig b/drivers/perf/pci/Kconfig
> new file mode 100644
> index 0000000..36b430f
> --- /dev/null
> +++ b/drivers/perf/pci/Kconfig
> @@ -0,0 +1,16 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +#
> +# PCIe Performance Monitor Drivers
> +#
> +menu "PCIe Performance Monitor"
> +
> +config HISI_PCIE_PMU
> +	tristate "HiSilicon PCIE PERF PMU"
> +	depends on PCI && (ARM64 || COMPILE_TEST)
> +	help
> +	  Provide support for HiSilicon PCIe performance monitoring unit (PMU)
> +	  RCiEP devices.
> +	  Adds the PCIe PMU into perf events system for monitoring latency,
> +	  bandwidth etc.
> +
> +endmenu
> diff --git a/drivers/perf/pci/Makefile b/drivers/perf/pci/Makefile
> new file mode 100644
> index 0000000..a56b1a9
> --- /dev/null
> +++ b/drivers/perf/pci/Makefile
> @@ -0,0 +1,2 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +obj-y += hisilicon/
> diff --git a/drivers/perf/pci/hisilicon/Makefile b/drivers/perf/pci/hisilicon/Makefile
> new file mode 100644
> index 0000000..65b0bd7
> --- /dev/null
> +++ b/drivers/perf/pci/hisilicon/Makefile
> @@ -0,0 +1,3 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +
> +obj-$(CONFIG_HISI_PCIE_PMU) += hisi_pcie_pmu.o
> diff --git a/drivers/perf/pci/hisilicon/hisi_pcie_pmu.c b/drivers/perf/pci/hisilicon/hisi_pcie_pmu.c
> new file mode 100644
> index 0000000..ed411dd
> --- /dev/null
> +++ b/drivers/perf/pci/hisilicon/hisi_pcie_pmu.c
> @@ -0,0 +1,1019 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * This driver adds support for PCIe PMU RCiEP device. Related
> + * perf events are bandwidth, bandwidth utilization, latency
> + * etc.
> + *
> + * Copyright (C) 2021 HiSilicon Limited
> + * Author: Qi Liu<liuqi115@huawei.com>
> + */
> +#include <linux/bitfield.h>
> +#include <linux/bitmap.h>
> +#include <linux/bug.h>
> +#include <linux/cpuhotplug.h>
> +#include <linux/cpumask.h>
> +#include <linux/device.h>
> +#include <linux/err.h>
> +#include <linux/interrupt.h>
> +#include <linux/io-64-nonatomic-hi-lo.h>
> +#include <linux/irq.h>
> +#include <linux/kernel.h>
> +#include <linux/list.h>
> +#include <linux/module.h>
> +#include <linux/pci.h>
> +#include <linux/perf_event.h>
> +
> +#include <asm/div64.h>
> +
> +/* Define registers */
> +#define HISI_PCIE_GLOBAL_CTRL		0x00
> +#define HISI_PCIE_EVENT_CTRL		0x010
> +#define HISI_PCIE_CNT			0x090
> +#define HISI_PCIE_EXT_CNT		0x110
> +#define HISI_PCIE_INT_STAT		0x150
> +#define HISI_PCIE_INT_MASK		0x154
> +#define HISI_PCIE_REG_BDF		0xfe0
> +#define HISI_PCIE_REG_VERSION		0xfe4
> +#define HISI_PCIE_REG_INFO		0xfe8
> +#define HISI_PCIE_REG_FREQ		0xfec
> +
> +/* Define PCIE CTRL CMD */
> +#define HISI_PCIE_GLOBAL_EN		0x01
> +#define HISI_PCIE_GLOBAL_NONE		0
> +#define HISI_PCIE_EVENT_EN		BIT_ULL(20)
> +#define HISI_PCIE_RESET_CNT		BIT_ULL(22)
> +#define HISI_PCIE_DEFAULT_SET		BIT_ULL(34)
> +#define HISI_PCIE_THR_EN		BIT_ULL(26)
> +#define HISI_PCIE_TARGET_EN		BIT_ULL(32)
> +#define HISI_PCIE_TRIG_EN		BIT_ULL(52)
> +
> +/* Define offsets in event ctrl regesiter */
> +#define HISI_PCIE_EVENT_M		GENMASK_ULL(7, 0)
> +#define HISI_PCIE_SUBEVENT_M		GENMASK_ULL(15, 8)
> +#define HISI_PCIE_THR_MODE_M		GENMASK_ULL(27, 27)
> +#define HISI_PCIE_THR_M			GENMASK_ULL(31, 28)
> +#define HISI_PCIE_TARGET_M		GENMASK_ULL(52, 36)
> +#define HISI_PCIE_TRIG_MODE_M		GENMASK_ULL(53, 53)
> +#define HISI_PCIE_TRIG_M		GENMASK_ULL(59, 56)
> +
> +#define HISI_PCIE_MAX_COUNTERS		8
> +#define HISI_PCIE_REG_STEP		8
> +#define HISI_PCIE_EVENT_MAX		0xa2
> +#define HISI_PCIE_SUBEVENT_MAX		0x20
> +#define HISI_PCIE_THR_MAX_VAL		10
> +#define HISI_PCIE_TRIG_MAX_VAL		10
> +#define HISI_PCIE_COUNTER_BITS		64
> +#define HISI_PCIE_MAX_PERIOD		BIT_ULL(63)
> +
> +struct hisi_pcie_pmu {
> +	struct perf_event *hw_events[HISI_PCIE_MAX_COUNTERS];
> +	struct hlist_node node;
> +	struct pci_dev *pdev;
> +	struct pmu pmu;
> +	void __iomem *base;
> +	int irq;
> +	u32 identifier;
> +	/* Minimum and maximum bdf of root ports monitored by PMU */
> +	u16 bdf_min;
> +	u16 bdf_max;
> +	int on_cpu;
> +};
> +
> +#define to_pcie_pmu(p)  (container_of((p), struct hisi_pcie_pmu, pmu))
> +#define GET_PCI_DEVFN(bdf)  ((bdf) & 0xff)
> +
> +#define HISI_PCIE_PMU_FILTER_ATTR(_name, _config, _hi, _lo)		  \
> +	static u64 hisi_pcie_get_##_name(struct perf_event *event)	  \
> +	{								  \
> +		return FIELD_GET(GENMASK(_hi, _lo), event->attr._config); \
> +	}								  \
> +
> +HISI_PCIE_PMU_FILTER_ATTR(event, config, 7, 0);
> +HISI_PCIE_PMU_FILTER_ATTR(subevent, config, 15, 8);
> +HISI_PCIE_PMU_FILTER_ATTR(thr_len, config1, 3, 0);
> +HISI_PCIE_PMU_FILTER_ATTR(thr_mode, config1, 4, 4);
> +HISI_PCIE_PMU_FILTER_ATTR(trig_len, config1, 8, 5);
> +HISI_PCIE_PMU_FILTER_ATTR(trig_mode, config1, 9, 9);
> +HISI_PCIE_PMU_FILTER_ATTR(port, config2, 15, 0);
> +HISI_PCIE_PMU_FILTER_ATTR(bdf, config2, 31, 16);
> +
> +#define HISI_PCIE_BUILD_EVENTS(name)					\
> +	static bool is_##name##_event(u32 idx)				\
> +	{								\
> +		return (idx >= name##_events_list[0] &&			\
> +			idx <= name##_events_list[1]) ||		\
> +			idx == name##_events_list[2];			\
> +	}								\
> +
> +/*
> + * The first element of event list is minimum index of TL-layer events
> + * and the second element is maximum index. The third element is index
> + * of a DL-layer event.
> + */
> +static const u32 bw_events_list[] = {0x04, 0x08, 0x84};
> +static const u32 latency_events_list[] = {0x10, 0x15, 0x85};
> +static const u32 bus_util_events_list[] = {0x20, 0x24, 0x09};
> +static const u32 buf_util_events_list[] = {0x28, 0x2a, 0x33};
> +
> +HISI_PCIE_BUILD_EVENTS(bw);
> +HISI_PCIE_BUILD_EVENTS(latency);
> +HISI_PCIE_BUILD_EVENTS(bus_util);
> +HISI_PCIE_BUILD_EVENTS(buf_util);
> +
> +static ssize_t hisi_pcie_format_sysfs_show(struct device *dev,
> +				    struct device_attribute *attr, char *buf)
> +{
> +	struct dev_ext_attribute *eattr;
> +
> +	eattr = container_of(attr, struct dev_ext_attribute, attr);
> +
> +	return sysfs_emit(buf, "%s\n", (char *)eattr->var);
> +}
> +
> +static ssize_t hisi_pcie_event_sysfs_show(struct device *dev,
> +				   struct device_attribute *attr, char *buf)
> +{
> +	struct dev_ext_attribute *eattr;
> +
> +	eattr = container_of(attr, struct dev_ext_attribute, attr);
> +
> +	return sysfs_emit(buf, "config=0x%lx\n", (unsigned long)eattr->var);
> +}
> +
> +#define HISI_PCIE_PMU_ATTR(_name, _func, _config)			\
> +	(&((struct dev_ext_attribute[]) {				\
> +		{ __ATTR(_name, 0444, _func, NULL), (void *)_config }   \
> +	})[0].attr.attr)

If you rebase onto my patch queue, then you can use PMU_EVENT_ATTR_ID to
define this.

> +#define HISI_PCIE_PMU_FORMAT_ATTR(_name, _format)			\
> +	HISI_PCIE_PMU_ATTR(_name, hisi_pcie_format_sysfs_show, (void *)_format)
> +#define HISI_PCIE_PMU_EVENT_ATTR(_name, _event)			\
> +	HISI_PCIE_PMU_ATTR(_name, hisi_pcie_event_sysfs_show, (void *)_event)
> +
> +static ssize_t hisi_pcie_cpumask_show(struct device *dev,
> +				      struct device_attribute *attr, char *buf)
> +{
> +	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
> +
> +	return sysfs_emit(buf, "%d\n", pcie_pmu->on_cpu);
> +}

This isn't a cpumask.

> +
> +static ssize_t hisi_pcie_identifier_show(struct device *dev,
> +					 struct device_attribute *attr,
> +					 char *buf)
> +{
> +	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
> +
> +	return sysfs_emit(buf, "0x%x\n", pcie_pmu->identifier);
> +}
> +
> +static ssize_t hisi_pcie_bus_show(struct device *dev,
> +				  struct device_attribute *attr, char *buf)
> +{
> +	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
> +
> +	return sysfs_emit(buf, "0x%02x\n", PCI_BUS_NUM(pcie_pmu->bdf_min));
> +}
> +
> +static void hisi_pcie_parse_reg_value(struct hisi_pcie_pmu *pcie_pmu,
> +				      u32 reg_off, u16 *arg0, u16 *arg1)
> +{
> +	u32 val = readl(pcie_pmu->base + reg_off);
> +
> +	*arg0 = val & 0xffff;
> +	*arg1 = (val & 0xffff0000) >> 16;
> +}

Define a new type for the pair of values and return that directly?

> +
> +static u32 hisi_pcie_pmu_get_offset(u32 offset, u32 idx)
> +{
> +	return offset + HISI_PCIE_REG_STEP * idx;
> +}
> +
> +static u32 hisi_pcie_pmu_readl(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset,
> +			       u32 idx)
> +{
> +	u32 offset = hisi_pcie_pmu_get_offset(reg_offset, idx);
> +
> +	return readl(pcie_pmu->base + offset);
> +}
> +
> +static void hisi_pcie_pmu_writel(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset,
> +				 u32 idx, u32 val)
> +{
> +	u32 offset = hisi_pcie_pmu_get_offset(reg_offset, idx);
> +
> +	writel(val, pcie_pmu->base + offset);
> +}
> +
> +static u64 hisi_pcie_pmu_readq(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset,
> +			       u32 idx)
> +{
> +	u32 offset = hisi_pcie_pmu_get_offset(reg_offset, idx);
> +
> +	return readq(pcie_pmu->base + offset);
> +}
> +
> +static void hisi_pcie_pmu_writeq(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset,
> +				 u32 idx, u64 val)
> +{
> +	u32 offset = hisi_pcie_pmu_get_offset(reg_offset, idx);
> +
> +	writeq(val, pcie_pmu->base + offset);
> +}

I'm guessing most (all?) of these IO access can be _relaxed() ?

> +
> +static void hisi_pcie_pmu_config_filter(struct perf_event *event)
> +{
> +	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
> +	struct hw_perf_event *hwc = &event->hw;
> +	u64 reg = HISI_PCIE_DEFAULT_SET;
> +	u64 port, trig_len, thr_len;
> +	u32 idx = hwc->idx;
> +
> +	/* Config HISI_PCIE_EVENT_CTRL according to event and subevent. */
> +	reg |= FIELD_PREP(HISI_PCIE_EVENT_M, hisi_pcie_get_event(event)) |
> +	       FIELD_PREP(HISI_PCIE_SUBEVENT_M, hisi_pcie_get_subevent(event));
> +
> +	/* Config HISI_PCIE_EVENT_CTRL according to ROOT PORT or EP device. */
> +	port = hisi_pcie_get_port(event);
> +	if (port)
> +		reg |= FIELD_PREP(HISI_PCIE_TARGET_M, port);
> +	else
> +		reg |= HISI_PCIE_TARGET_EN |
> +		       FIELD_PREP(HISI_PCIE_TARGET_M, hisi_pcie_get_bdf(event));

Please use braces for multi-line conditional expressions (same elsewhere).

> +
> +	/* Config HISI_PCIE_EVENT_CTRL according to trigger condition. */
> +	trig_len = hisi_pcie_get_trig_len(event);
> +	if (trig_len)
> +		reg |= FIELD_PREP(HISI_PCIE_TRIG_M, trig_len) |
> +		       FIELD_PREP(HISI_PCIE_TRIG_MODE_M,
> +		       hisi_pcie_get_trig_mode(event)) | HISI_PCIE_TRIG_EN;

The formatting is very weird here.

> +
> +	/* Config HISI_PCIE_EVENT_CTRL according to threshold condition. */
> +	thr_len = hisi_pcie_get_thr_len(event);
> +	if (thr_len)
> +		reg |= FIELD_PREP(HISI_PCIE_THR_M, thr_len) |
> +		       FIELD_PREP(HISI_PCIE_THR_MODE_M,
> +		       hisi_pcie_get_thr_mode(event)) | HISI_PCIE_THR_EN;

and here.

> +
> +	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx, reg);
> +}
> +
> +static void hisi_pcie_pmu_clear_filter(struct perf_event *event)
> +{
> +	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
> +	struct hw_perf_event *hwc = &event->hw;
> +
> +	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, hwc->idx,
> +			     HISI_PCIE_DEFAULT_SET);
> +}
> +
> +static bool hisi_pcie_pmu_valid_port(struct hisi_pcie_pmu *pcie_pmu, u16 rp_bdf)
> +{
> +	return rp_bdf >= pcie_pmu->bdf_min && rp_bdf <= pcie_pmu->bdf_max;
> +}
> +
> +static bool hisi_pcie_pmu_valid_requester_id(struct hisi_pcie_pmu *pcie_pmu,
> +					    u32 bdf)
> +{
> +	struct pci_dev *root_port, *pdev;
> +
> +	pdev = pci_get_domain_bus_and_slot(pci_domain_nr(pcie_pmu->pdev->bus),
> +					   PCI_BUS_NUM(bdf),
> +					   GET_PCI_DEVFN(bdf));
> +	if (!pdev)
> +		return false;
> +
> +	root_port = pcie_find_root_port(pdev);
> +	if (!root_port)
> +		return false;
> +
> +	pci_dev_put(pdev);
> +	return hisi_pcie_pmu_valid_port(pcie_pmu, pci_dev_id(root_port));
> +}
> +
> +static bool hisi_pcie_pmu_valid_filter(struct perf_event *event,
> +				       struct hisi_pcie_pmu *pcie_pmu)
> +{
> +	u32 subev_idx = hisi_pcie_get_subevent(event);
> +	u32 event_idx = hisi_pcie_get_event(event);
> +	u32 requester_id = hisi_pcie_get_bdf(event);
> +
> +	if (subev_idx > HISI_PCIE_SUBEVENT_MAX ||
> +	    event_idx > HISI_PCIE_EVENT_MAX) {
> +		pci_err(pcie_pmu->pdev,
> +			"Max event index and max subevent index is: %d, %d.\n",
> +			HISI_PCIE_EVENT_MAX, HISI_PCIE_SUBEVENT_MAX);
> +		return false;
> +	}
> +
> +	if (hisi_pcie_get_thr_len(event) > HISI_PCIE_THR_MAX_VAL)
> +		return false;
> +
> +	if (hisi_pcie_get_trig_len(event) > HISI_PCIE_TRIG_MAX_VAL)
> +		return false;
> +
> +	if (requester_id) {
> +		if (!hisi_pcie_pmu_valid_requester_id(pcie_pmu, requester_id))
> +			return false;
> +	}
> +
> +	return true;
> +}
> +
> +static bool hisi_pcie_pmu_validate_event_group(struct perf_event *event)
> +{
> +	struct perf_event *sibling, *leader = event->group_leader;
> +	int counters = 1;
> +
> +	if (!is_software_event(leader)) {
> +		if (leader->pmu != event->pmu)
> +			return false;
> +
> +		if (leader != event)
> +			counters++;
> +	}
> +
> +	for_each_sibling_event(sibling, event->group_leader) {
> +		if (is_software_event(sibling))
> +			continue;
> +
> +		if (sibling->pmu != event->pmu)
> +			return false;
> +
> +		counters++;
> +	}
> +
> +	return counters <= HISI_PCIE_MAX_COUNTERS;
> +}
> +
> +static int hisi_pcie_pmu_event_init(struct perf_event *event)
> +{
> +	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
> +
> +	event->cpu = pcie_pmu->on_cpu;
> +
> +	if (event->attr.type != event->pmu->type)
> +		return -ENOENT;
> +
> +	/* Sampling is not supported. */
> +	if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
> +		return -EOPNOTSUPP;
> +
> +	if (!hisi_pcie_pmu_valid_filter(event, pcie_pmu)) {
> +		pci_err(pcie_pmu->pdev, "Invalid filter!\n");
> +		return -EINVAL;
> +	}
> +
> +	if (!hisi_pcie_pmu_validate_event_group(event))
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +/*
> + * The bandwidth, latency, bus utilization and buffer occupancy features are
> + * calculated from data in HISI_PCIE_CNT and extended data in HISI_PCIE_EXT_CNT.
> + * Other features are obtained only by HISI_PCIE_CNT.
> + * So data and data_ext are processed in this function to get performanace
> + * value like, bandwidth, latency, etc.
> + */
> +static u64 hisi_pcie_pmu_get_performance(struct perf_event *event, u64 data,
> +					 u64 data_ext)
> +{
> +#define CONVERT_DW_TO_BYTE(x)	(sizeof(u32) * (x))

I don't know what a "DW" is, but this macro adds nothing...

> +	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
> +	u64 us_per_cycle = readl(pcie_pmu->base + HISI_PCIE_REG_FREQ);
> +	u32 idx = hisi_pcie_get_event(event);
> +
> +	if (!data_ext)
> +		return 0;
> +
> +	/* Process data to set unit of bandwidth as "Byte/ms". */
> +	if (is_bw_event(idx)) {
> +
> +		if (!div64_u64(data_ext, 1000))
> +			return 0;
> +
> +		return div64_u64(CONVERT_DW_TO_BYTE(data),

... especially as this is the only use of it.


> +				 div64_u64(data_ext, 1000));
> +	}
> +
> +	/* Process data to set unit of latency as "us". */
> +	if (is_latency_event(idx))
> +		return div64_u64(data * us_per_cycle, data_ext);
> +
> +	if (is_bus_util_event(idx))
> +		return div64_u64(data * us_per_cycle, data_ext);
> +
> +	if (is_buf_util_event(idx))
> +		return div64_u64(data, data_ext * us_per_cycle);

Why do we need to do all this division in the kernel? Can't we just expose
the underlying values and let userspace figure out what it wants to do with
the numbers?

Will
Krzysztof Wilczyński June 11, 2021, 11:33 p.m. UTC | #2
Hi Qi,

Thank you for sending the patch over!

[...]
> +/*
> + * This driver adds support for PCIe PMU RCiEP device. Related
> + * perf events are bandwidth, bandwidth utilization, latency
> + * etc.
> + *
> + * Copyright (C) 2021 HiSilicon Limited
> + * Author: Qi Liu<liuqi115@huawei.com>
> + */

A small nitpick: missing space between your name and the e-mail address.

[...]
> +static ssize_t hisi_pcie_event_sysfs_show(struct device *dev,
> +				   struct device_attribute *attr, char *buf)
> +{
> +	struct dev_ext_attribute *eattr;
> +
> +	eattr = container_of(attr, struct dev_ext_attribute, attr);
> +
> +	return sysfs_emit(buf, "config=0x%lx\n", (unsigned long)eattr->var);
> +}

I am not that familiar with the perf drivers, thus I might be completely
wrong here, but usually for sysfs objects a single value is preferred,
so that this "config=" technically would not be needed, unless this is
somewhat essential to the consumers of this attribute to know what the
value is?  What do you think?

[...]
> +static ssize_t hisi_pcie_identifier_show(struct device *dev,
> +					 struct device_attribute *attr,
> +					 char *buf)
> +{
> +	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
> +
> +	return sysfs_emit(buf, "0x%x\n", pcie_pmu->identifier);
> +}

What about using the "%#x" formatting flag?  It would automatically
add the "0x" prefix, etc.

> +static ssize_t hisi_pcie_bus_show(struct device *dev,
> +				  struct device_attribute *attr, char *buf)
> +{
> +	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
> +
> +	return sysfs_emit(buf, "0x%02x\n", PCI_BUS_NUM(pcie_pmu->bdf_min));
> +}

Same as above, what about "%#02x"?

[...]
> +static bool hisi_pcie_pmu_valid_filter(struct perf_event *event,
> +				       struct hisi_pcie_pmu *pcie_pmu)
> +{
> +	u32 subev_idx = hisi_pcie_get_subevent(event);
> +	u32 event_idx = hisi_pcie_get_event(event);
> +	u32 requester_id = hisi_pcie_get_bdf(event);
> +
> +	if (subev_idx > HISI_PCIE_SUBEVENT_MAX ||
> +	    event_idx > HISI_PCIE_EVENT_MAX) {
> +		pci_err(pcie_pmu->pdev,
> +			"Max event index and max subevent index is: %d, %d.\n",
> +			HISI_PCIE_EVENT_MAX, HISI_PCIE_SUBEVENT_MAX);
> +		return false;
> +	}

Was this error message above intended to be a debug message?  It's a bit
opaque in terms what the error actually is here.  We might need to clear
it up a little.

[...]
> +static bool hisi_pcie_pmu_validate_event_group(struct perf_event *event)
> +{
> +	struct perf_event *sibling, *leader = event->group_leader;
> +	int counters = 1;

How big could this counter become?

Would it ever be greater than HISI_PCIE_MAX_COUNTERS?  I am asking
because, if it could ever be greater, then perhaps unsigned int would be
better to use, and if not, then perhaps something smaller than int?  What
do you think, does this even make sense to change?

[...]
> +static int hisi_pcie_pmu_event_init(struct perf_event *event)
> +{
> +	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
> +
> +	event->cpu = pcie_pmu->on_cpu;
> +
> +	if (event->attr.type != event->pmu->type)
> +		return -ENOENT;
> +
> +	/* Sampling is not supported. */
> +	if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
> +		return -EOPNOTSUPP;
> +
> +	if (!hisi_pcie_pmu_valid_filter(event, pcie_pmu)) {
> +		pci_err(pcie_pmu->pdev, "Invalid filter!\n");
> +		return -EINVAL;
> +	}

[...]
> +/*
> + * The bandwidth, latency, bus utilization and buffer occupancy features are
> + * calculated from data in HISI_PCIE_CNT and extended data in HISI_PCIE_EXT_CNT.
> + * Other features are obtained only by HISI_PCIE_CNT.
> + * So data and data_ext are processed in this function to get performanace
> + * value like, bandwidth, latency, etc.
> + */

A small typo in the word "performance" above.

[...]
> +static u64 hisi_pcie_pmu_get_performance(struct perf_event *event, u64 data,
> +					 u64 data_ext)
> +{
> +#define CONVERT_DW_TO_BYTE(x)	(sizeof(u32) * (x))

I would move this macro to the top alongside other constants and macros,
as here it makes the code harder to read.  What do you think?

[...]
> +static int hisi_pcie_pmu_irq_register(struct pci_dev *pdev,
> +				      struct hisi_pcie_pmu *pcie_pmu)
> +{
> +	int irq, ret;
> +
> +	ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI);
> +	if (ret < 0) {
> +		pci_err(pdev, "Failed to enable MSI vectors, ret = %d!\n", ret);
> +		return ret;
> +	}

This is a nitpick, so feel free to ignore it, but what do you think of
changing this (and also other messages alike) message to be, for
example:

  pci_err(pdev, "Failed to enable MSI vectors: %d\n", ret);

Why?  I personally don't find displaying a return code/value followed by
a punctuation easy to read, especially when looking through a lot of
lines and other messages in the kernel ring buffer.

[...]
> +
> +	irq = pci_irq_vector(pdev, 0);
> +	ret = request_irq(irq, hisi_pcie_pmu_irq,
> +			  IRQF_NOBALANCING | IRQF_NO_THREAD, "hisi_pcie_pmu",
> +			  pcie_pmu);
> +	if (ret) {
> +		pci_err(pdev, "Failed to register irq, ret = %d!\n", ret);
> +		pci_free_irq_vectors(pdev);
> +		return ret;
> +	}

It would be "IRQ" in the error message above.

[...]
> +static int hisi_pcie_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
> +{
> +	struct hisi_pcie_pmu *pcie_pmu = hlist_entry_safe(node,
> +					 struct hisi_pcie_pmu, node);
> +	unsigned int target;
> +
> +	/* Nothing to do if this CPU doesn't own the PMU */
> +	if (pcie_pmu->on_cpu != cpu)
> +		return 0;
> +
> +	/* Choose a new CPU from all online cpus. */
> +	target = cpumask_first(cpu_online_mask);
> +	if (target >= nr_cpu_ids) {
> +		pci_err(pcie_pmu->pdev, "There is no cpu to set!\n");
> +		return 0;
> +	}

To be consistent, it would be "CPUs" and "CPU" in the above.

[...]
> +static struct device_attribute hisi_pcie_pmu_bus_attr =
> +	__ATTR(bus, 0444, hisi_pcie_bus_show, NULL);
[...]
> +static struct device_attribute hisi_pcie_pmu_cpumask_attr =
> +	__ATTR(cpumask, 0444, hisi_pcie_cpumask_show, NULL);
[...]
> +static struct device_attribute hisi_pcie_pmu_identifier_attr =
> +	__ATTR(identifier, 0444, hisi_pcie_identifier_show, NULL);

Would it be at all possible for any of the above __ATTR() macros to be
replaced with the DEVICE_ATTR_RO() macro?  Or perhaps with __ATTR_RO()
if the other one would be a good fit?

[...]
> +static int hisi_pcie_init_dev(struct pci_dev *pdev)
> +{
> +	int ret;
> +
> +	ret = pci_enable_device(pdev);
> +	if (ret) {
> +		pci_err(pdev, "Failed to enable pci device, ret = %d.\n", ret);
> +		return ret;
> +	}
> +
> +	ret = pci_request_mem_regions(pdev, "hisi_pcie_pmu");
> +	if (ret < 0) {
> +		pci_err(pdev, "Failed to request pci mem regions, ret = %d.\n",
> +			ret);
> +		pci_disable_device(pdev);
> +		return ret;
> +	}

It would be "PCI" in both error messages above.

[...]
> +static int __init hisi_pcie_module_init(void)
> +{
> +	int ret;
> +
> +	ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE,
> +				      "AP_PERF_ARM_HISI_PCIE_PMU_ONLINE",
> +				      hisi_pcie_pmu_online_cpu,
> +				      hisi_pcie_pmu_offline_cpu);
> +	if (ret) {
> +		pr_err("Failed to setup PCIE PMU hotplug, ret = %d.\n", ret);
> +		return ret;
> +	}

It would be "PCIe" in the error message above.

	Krzysztof
Jonathan Cameron June 14, 2021, 9:20 a.m. UTC | #3
On Fri, 11 Jun 2021 17:23:48 +0100
Will Deacon <will@kernel.org> wrote:

> On Mon, May 31, 2021 at 09:32:31PM +0800, Qi Liu wrote:
> > PCIe PMU Root Complex Integrated End Point(RCiEP) device is supported
> > to sample bandwidth, latency, buffer occupation etc.
> > 
> > Each PMU RCiEP device monitors multiple Root Ports, and each RCiEP is
> > registered as a PMU in /sys/bus/event_source/devices, so users can
> > select target PMU, and use filter to do further sets.
> > 
> > Filtering options contains:
> > event        - select the event.
> > subevent     - select the subevent.
> > port         - select target Root Ports. Information of Root Ports
> >                are shown under sysfs.
> > bdf          - select requester_id of target EP device.
> > trig_len     - set trigger condition for starting event statistics.
> > trigger_mode - set trigger mode. 0 means starting to statistic when
> >                bigger than trigger condition, and 1 means smaller.
> > thr_len      - set threshold for statistics.
> > thr_mode     - set threshold mode. 0 means count when bigger than
> >                threshold, and 1 means smaller.
> > 
> > Reviewed-by: John Garry <john.garry@huawei.com>
> > Signed-off-by: Qi Liu <liuqi115@huawei.com>
> > ---
> >  MAINTAINERS                                |    6 +
> >  drivers/perf/Kconfig                       |    2 +
> >  drivers/perf/Makefile                      |    1 +
> >  drivers/perf/pci/Kconfig                   |   16 +
> >  drivers/perf/pci/Makefile                  |    2 +
> >  drivers/perf/pci/hisilicon/Makefile        |    3 +
> >  drivers/perf/pci/hisilicon/hisi_pcie_pmu.c | 1019 ++++++++++++++++++++++++++++  
> 
> Can we keep this under drivers/perf/hisilicon/ please? I don't see the
> need to create a 'pci' directory here.

https://lore.kernel.org/linux-pci/20190103154439.GC16311@edgewater-inn.cambridge.arm.com/

Discussion back in 2018 about where to put these...

Though, perf/pci/hisilicon does seem over the top in terms of depth, maybe perf/pci/
or just give up on that plan and put them (for now at least) in per company directories.

Jonathan
liuqi (BA) June 15, 2021, 8:57 a.m. UTC | #4
Hi Will,
Thanks for your review.

On 2021/6/12 0:23, Will Deacon wrote:
> On Mon, May 31, 2021 at 09:32:31PM +0800, Qi Liu wrote:
>> PCIe PMU Root Complex Integrated End Point(RCiEP) device is supported
>> to sample bandwidth, latency, buffer occupation etc.
>>
>> Each PMU RCiEP device monitors multiple Root Ports, and each RCiEP is
>> registered as a PMU in /sys/bus/event_source/devices, so users can
>> select target PMU, and use filter to do further sets.
>>
>> Filtering options contains:
>> event        - select the event.
>> subevent     - select the subevent.
>> port         - select target Root Ports. Information of Root Ports
>>                 are shown under sysfs.
>> bdf          - select requester_id of target EP device.
>> trig_len     - set trigger condition for starting event statistics.
>> trigger_mode - set trigger mode. 0 means starting to statistic when
>>                 bigger than trigger condition, and 1 means smaller.
>> thr_len      - set threshold for statistics.
>> thr_mode     - set threshold mode. 0 means count when bigger than
>>                 threshold, and 1 means smaller.
>>
>> Reviewed-by: John Garry <john.garry@huawei.com>
>> Signed-off-by: Qi Liu <liuqi115@huawei.com>
>> ---
>>   MAINTAINERS                                |    6 +
>>   drivers/perf/Kconfig                       |    2 +
>>   drivers/perf/Makefile                      |    1 +
>>   drivers/perf/pci/Kconfig                   |   16 +
>>   drivers/perf/pci/Makefile                  |    2 +
>>   drivers/perf/pci/hisilicon/Makefile        |    3 +
>>   drivers/perf/pci/hisilicon/hisi_pcie_pmu.c | 1019 ++++++++++++++++++++++++++++
> 
> Can we keep this under drivers/perf/hisilicon/ please? I don't see the
> need to create a 'pci' directory here.
>
So how about drivers/perf/hisilicon/pci, as hisi_pcie_pmu.c does not use
the hisi_uncore_pmu framework?
thanks
>>   include/linux/cpuhotplug.h                 |    1 +
>>   8 files changed, 1050 insertions(+)
>>   create mode 100644 drivers/perf/pci/Kconfig
>>   create mode 100644 drivers/perf/pci/Makefile
>>   create mode 100644 drivers/perf/pci/hisilicon/Makefile
>>   create mode 100644 drivers/perf/pci/hisilicon/hisi_pcie_pmu.c
>>

[...]
>> +
>> +#define HISI_PCIE_PMU_ATTR(_name, _func, _config)			\
>> +	(&((struct dev_ext_attribute[]) {				\
>> +		{ __ATTR(_name, 0444, _func, NULL), (void *)_config }   \
>> +	})[0].attr.attr)
> 
> If you rebase onto my patch queue, then you can use PMU_EVENT_ATTR_ID to
> define this.
> 
ok, will fix this, thanks.
>> +#define HISI_PCIE_PMU_FORMAT_ATTR(_name, _format)			\
>> +	HISI_PCIE_PMU_ATTR(_name, hisi_pcie_format_sysfs_show, (void *)_format)
>> +#define HISI_PCIE_PMU_EVENT_ATTR(_name, _event)			\
>> +	HISI_PCIE_PMU_ATTR(_name, hisi_pcie_event_sysfs_show, (void *)_event)
>> +
>> +static ssize_t hisi_pcie_cpumask_show(struct device *dev,
>> +				      struct device_attribute *attr, char *buf)
>> +{
>> +	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
>> +
>> +	return sysfs_emit(buf, "%d\n", pcie_pmu->on_cpu);
>> +}
> 
> This isn't a cpumask.
> 
got it, I'll use cpumask_of(pcie_pmu->on_cpu) next time, thanks.

>> +
>> +static ssize_t hisi_pcie_identifier_show(struct device *dev,
>> +					 struct device_attribute *attr,
>> +					 char *buf)
>> +{
>> +	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
>> +
>> +	return sysfs_emit(buf, "0x%x\n", pcie_pmu->identifier);
>> +}
>> +
>> +static ssize_t hisi_pcie_bus_show(struct device *dev,
>> +				  struct device_attribute *attr, char *buf)
>> +{
>> +	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
>> +
>> +	return sysfs_emit(buf, "0x%02x\n", PCI_BUS_NUM(pcie_pmu->bdf_min));
>> +}
>> +
>> +static void hisi_pcie_parse_reg_value(struct hisi_pcie_pmu *pcie_pmu,
>> +				      u32 reg_off, u16 *arg0, u16 *arg1)
>> +{
>> +	u32 val = readl(pcie_pmu->base + reg_off);
>> +
>> +	*arg0 = val & 0xffff;
>> +	*arg1 = (val & 0xffff0000) >> 16;
>> +}
> 
> Define a new type for the pair of values and return that directly?
> 
Sorry, I'm not sure about how to fix this, do you mean add a union like 
this?
union reg_val {
	struct {
		u16 arg0;
		u16 arg1;
	}
	u32 val;
}

[...]

>> +static void hisi_pcie_pmu_writeq(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset,
>> +				 u32 idx, u64 val)
>> +{
>> +	u32 offset = hisi_pcie_pmu_get_offset(reg_offset, idx);
>> +
>> +	writeq(val, pcie_pmu->base + offset);
>> +}
> 
> I'm guessing most (all?) of these IO access can be _relaxed() ?
> 

ok, will change this.
>> +
>> +static void hisi_pcie_pmu_config_filter(struct perf_event *event)
>> +{
>> +	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
>> +	struct hw_perf_event *hwc = &event->hw;
>> +	u64 reg = HISI_PCIE_DEFAULT_SET;
>> +	u64 port, trig_len, thr_len;
>> +	u32 idx = hwc->idx;
>> +
>> +	/* Config HISI_PCIE_EVENT_CTRL according to event and subevent. */
>> +	reg |= FIELD_PREP(HISI_PCIE_EVENT_M, hisi_pcie_get_event(event)) |
>> +	       FIELD_PREP(HISI_PCIE_SUBEVENT_M, hisi_pcie_get_subevent(event));
>> +
>> +	/* Config HISI_PCIE_EVENT_CTRL according to ROOT PORT or EP device. */
>> +	port = hisi_pcie_get_port(event);
>> +	if (port)
>> +		reg |= FIELD_PREP(HISI_PCIE_TARGET_M, port);
>> +	else
>> +		reg |= HISI_PCIE_TARGET_EN |
>> +		       FIELD_PREP(HISI_PCIE_TARGET_M, hisi_pcie_get_bdf(event));
> 
> Please use braces for multi-line conditional expressions (same elsewhere).
> 
It is a single statement here; the line is longer than 80 characters, so it is wrapped.
>> +
>> +	/* Config HISI_PCIE_EVENT_CTRL according to trigger condition. */
>> +	trig_len = hisi_pcie_get_trig_len(event);
>> +	if (trig_len)
>> +		reg |= FIELD_PREP(HISI_PCIE_TRIG_M, trig_len) |
>> +		       FIELD_PREP(HISI_PCIE_TRIG_MODE_M,
>> +		       hisi_pcie_get_trig_mode(event)) | HISI_PCIE_TRIG_EN;
> 
> The formatting is very weird here.
> 
will fix this.
>> +
>> +	/* Config HISI_PCIE_EVENT_CTRL according to threshold condition. */
>> +	thr_len = hisi_pcie_get_thr_len(event);
>> +	if (thr_len)
>> +		reg |= FIELD_PREP(HISI_PCIE_THR_M, thr_len) |
>> +		       FIELD_PREP(HISI_PCIE_THR_MODE_M,
>> +		       hisi_pcie_get_thr_mode(event)) | HISI_PCIE_THR_EN;
> 
> and here.
> 

will fix this, thanks.
[...]

>> +
>> +/*
>> + * The bandwidth, latency, bus utilization and buffer occupancy features are
>> + * calculated from data in HISI_PCIE_CNT and extended data in HISI_PCIE_EXT_CNT.
>> + * Other features are obtained only by HISI_PCIE_CNT.
>> + * So data and data_ext are processed in this function to get performanace
>> + * value like, bandwidth, latency, etc.
>> + */
>> +static u64 hisi_pcie_pmu_get_performance(struct perf_event *event, u64 data,
>> +					 u64 data_ext)
>> +{
>> +#define CONVERT_DW_TO_BYTE(x)	(sizeof(u32) * (x))
> 
> I don't know what a "DW" is, but this macro adds nothing...

DW means double word, and 1 DW = 4 bytes. The value in the hardware counter is
in DWs, so I want to convert it into bytes.
So how about using 4 * data here and adding a code comment to explain it?

> 
>> +	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
>> +	u64 us_per_cycle = readl(pcie_pmu->base + HISI_PCIE_REG_FREQ);
>> +	u32 idx = hisi_pcie_get_event(event);
>> +
>> +	if (!data_ext)
>> +		return 0;
>> +
>> +	/* Process data to set unit of bandwidth as "Byte/ms". */
>> +	if (is_bw_event(idx)) {
>> +
>> +		if (!div64_u64(data_ext, 1000))
>> +			return 0;
>> +
>> +		return div64_u64(CONVERT_DW_TO_BYTE(data),
> 
> ... especially as this is the only use of it.
> 
> 
>> +				 div64_u64(data_ext, 1000));
>> +	}
>> +
>> +	/* Process data to set unit of latency as "us". */
>> +	if (is_latency_event(idx))
>> +		return div64_u64(data * us_per_cycle, data_ext);
>> +
>> +	if (is_bus_util_event(idx))
>> +		return div64_u64(data * us_per_cycle, data_ext);
>> +
>> +	if (is_buf_util_event(idx))
>> +		return div64_u64(data, data_ext * us_per_cycle);
> 
> Why do we need to do all this division in the kernel? Can't we just expose
> the underlying values and let userspace figure out what it wants to do with
> the numbers?
> 
> Will
> 
Our PMU hardware supports 8 sets of counters to count bandwidth, latency
and utilization events.

For example, when users select a latency event, the common counter counts
delay cycles, and the extended counter counts the number of PCIe packets
automatically. And we do not have an event number for counting the number of
PCIe packets.

So this division cannot be moved to a userspace tool.

Thanks,
Qi
> .
>
Will Deacon June 15, 2021, 9:26 a.m. UTC | #5
On Mon, Jun 14, 2021 at 10:20:25AM +0100, Jonathan Cameron wrote:
> On Fri, 11 Jun 2021 17:23:48 +0100
> Will Deacon <will@kernel.org> wrote:
> 
> > On Mon, May 31, 2021 at 09:32:31PM +0800, Qi Liu wrote:
> > > PCIe PMU Root Complex Integrated End Point(RCiEP) device is supported
> > > to sample bandwidth, latency, buffer occupation etc.
> > > 
> > > Each PMU RCiEP device monitors multiple Root Ports, and each RCiEP is
> > > registered as a PMU in /sys/bus/event_source/devices, so users can
> > > select target PMU, and use filter to do further sets.
> > > 
> > > Filtering options contains:
> > > event        - select the event.
> > > subevent     - select the subevent.
> > > port         - select target Root Ports. Information of Root Ports
> > >                are shown under sysfs.
> > > bdf          - select requester_id of target EP device.
> > > trig_len     - set trigger condition for starting event statistics.
> > > trigger_mode - set trigger mode. 0 means starting to statistic when
> > >                bigger than trigger condition, and 1 means smaller.
> > > thr_len      - set threshold for statistics.
> > > thr_mode     - set threshold mode. 0 means count when bigger than
> > >                threshold, and 1 means smaller.
> > > 
> > > Reviewed-by: John Garry <john.garry@huawei.com>
> > > Signed-off-by: Qi Liu <liuqi115@huawei.com>
> > > ---
> > >  MAINTAINERS                                |    6 +
> > >  drivers/perf/Kconfig                       |    2 +
> > >  drivers/perf/Makefile                      |    1 +
> > >  drivers/perf/pci/Kconfig                   |   16 +
> > >  drivers/perf/pci/Makefile                  |    2 +
> > >  drivers/perf/pci/hisilicon/Makefile        |    3 +
> > >  drivers/perf/pci/hisilicon/hisi_pcie_pmu.c | 1019 ++++++++++++++++++++++++++++  
> > 
> > Can we keep this under drivers/perf/hisilicon/ please? I don't see the
> > need to create a 'pci' directory here.
> 
> https://lore.kernel.org/linux-pci/20190103154439.GC16311@edgewater-inn.cambridge.arm.com/
> 
> Discussion back in 2018 about where to put these...

I don't remember that at all :)

> Though, perf/pci/hisilicon does seem over the top in terms of depth, maybe perf/pci/
> or just give up on that plan and put them (for now at least) in per company directories.

I think perf/hisilicon makes the most sense. We can always move it later
if we need to.

Will
Will Deacon June 15, 2021, 9:35 a.m. UTC | #6
On Tue, Jun 15, 2021 at 04:57:09PM +0800, liuqi (BA) wrote:
> On 2021/6/12 0:23, Will Deacon wrote:
> > On Mon, May 31, 2021 at 09:32:31PM +0800, Qi Liu wrote:
> > > PCIe PMU Root Complex Integrated End Point(RCiEP) device is supported
> > > to sample bandwidth, latency, buffer occupation etc.
> > > 
> > > Each PMU RCiEP device monitors multiple Root Ports, and each RCiEP is
> > > registered as a PMU in /sys/bus/event_source/devices, so users can
> > > select target PMU, and use filter to do further sets.
> > > 
> > > Filtering options contains:
> > > event        - select the event.
> > > subevent     - select the subevent.
> > > port         - select target Root Ports. Information of Root Ports
> > >                 are shown under sysfs.
> > > bdf          - select requester_id of target EP device.
> > > trig_len     - set trigger condition for starting event statistics.
> > > trigger_mode - set trigger mode. 0 means starting to statistic when
> > >                 bigger than trigger condition, and 1 means smaller.
> > > thr_len      - set threshold for statistics.
> > > thr_mode     - set threshold mode. 0 means count when bigger than
> > >                 threshold, and 1 means smaller.
> > > 
> > > Reviewed-by: John Garry <john.garry@huawei.com>
> > > Signed-off-by: Qi Liu <liuqi115@huawei.com>
> > > ---
> > >   MAINTAINERS                                |    6 +
> > >   drivers/perf/Kconfig                       |    2 +
> > >   drivers/perf/Makefile                      |    1 +
> > >   drivers/perf/pci/Kconfig                   |   16 +
> > >   drivers/perf/pci/Makefile                  |    2 +
> > >   drivers/perf/pci/hisilicon/Makefile        |    3 +
> > >   drivers/perf/pci/hisilicon/hisi_pcie_pmu.c | 1019 ++++++++++++++++++++++++++++
> > 
> > Can we keep this under drivers/perf/hisilicon/ please? I don't see the
> > need to create a 'pci' directory here.
> > 
> So how about drivers/perf/hisilicon/pci? as hisi_pcie_pmu.c do not use
> hisi_uncore_pmu framework.

That's up to you. As long as it's _somewhere_ under drivers/perf/hisilicon/,
then I'm not too fussed.

> > > +static void hisi_pcie_parse_reg_value(struct hisi_pcie_pmu *pcie_pmu,
> > > +				      u32 reg_off, u16 *arg0, u16 *arg1)
> > > +{
> > > +	u32 val = readl(pcie_pmu->base + reg_off);
> > > +
> > > +	*arg0 = val & 0xffff;
> > > +	*arg1 = (val & 0xffff0000) >> 16;
> > > +}
> > 
> > Define a new type for the pair of values and return that directly?
> > 
> Sorry, I'm not sure about how to fix this, do you mean add a union like
> this?
> union reg_val {
> 	struct {
> 		u16 arg0;
> 		u16 arg1;
> 	}
> 	u32 val;
> }

I was just thinking along the lines of:

struct hisi_pcie_reg_pair {
	u16 lo;
	u16 hi;
};

static struct hisi_pcie_reg_pair
hisi_pcie_parse_reg_value(struct hisi_pcie_pmu *pcie_pmu, u32 reg_off)
{
	u32 val = readl_relaxed(pcie_pmu->base + reg_off);
	struct hisi_pcie_reg_pair regs = {
		.lo = val,
		.hi = val >> 16,
	};

	return regs;
}

Does that work?

> > > +/*
> > > + * The bandwidth, latency, bus utilization and buffer occupancy features are
> > > + * calculated from data in HISI_PCIE_CNT and extended data in HISI_PCIE_EXT_CNT.
> > > + * Other features are obtained only by HISI_PCIE_CNT.
> > > + * So data and data_ext are processed in this function to get performanace
> > > + * value like, bandwidth, latency, etc.
> > > + */
> > > +static u64 hisi_pcie_pmu_get_performance(struct perf_event *event, u64 data,
> > > +					 u64 data_ext)
> > > +{
> > > +#define CONVERT_DW_TO_BYTE(x)	(sizeof(u32) * (x))
> > 
> > I don't know what a "DW" is, but this macro adds nothing...
> 
> DW means double words, and 1DW = 4Bytes, value in hardware counter means DW
> so I wanna change it into Byte.
> So how about using 4*data here and adding code comment to explain it.

Just remove the macro and replace its single user with sizeof(u32) * x

> > > +	/* Process data to set unit of latency as "us". */
> > > +	if (is_latency_event(idx))
> > > +		return div64_u64(data * us_per_cycle, data_ext);
> > > +
> > > +	if (is_bus_util_event(idx))
> > > +		return div64_u64(data * us_per_cycle, data_ext);
> > > +
> > > +	if (is_buf_util_event(idx))
> > > +		return div64_u64(data, data_ext * us_per_cycle);
> > 
> > Why do we need to do all this division in the kernel? Can't we just expose
> > the underlying values and let userspace figure out what it wants to do with
> > the numbers?
> > 
> Our PMU hardware support 8 sets of counters to count bandwidth, latency and
> utilization events.
> 
> For example, when users set latency event, common counter will count delay
> cycles, and extern counter count number of PCIe packets automaticly. And we
> do not have a event number for counting number of PCIe packets.
> 
> So this division cannot move to userspace tool.

Why can't you expose the packet counter as an extra event to userspace?

Will
liuqi (BA) June 16, 2021, 1:09 a.m. UTC | #7
Hi Krzysztof,
On 2021/6/12 7:33, Krzysztof Wilczyński wrote:
> Hi Qi,
> 
> Thank you for sending the patch over!
> 
> [...]
>> +/*
>> + * This driver adds support for PCIe PMU RCiEP device. Related
>> + * perf events are bandwidth, bandwidth utilization, latency
>> + * etc.
>> + *
>> + * Copyright (C) 2021 HiSilicon Limited
>> + * Author: Qi Liu<liuqi115@huawei.com>
>> + */
> 
> A small nitpick: missing space between your name and the e-mail address.
> 
thanks, will fix this.
> [...]
>> +static ssize_t hisi_pcie_event_sysfs_show(struct device *dev,
>> +				   struct device_attribute *attr, char *buf)
>> +{
>> +	struct dev_ext_attribute *eattr;
>> +
>> +	eattr = container_of(attr, struct dev_ext_attribute, attr);
>> +
>> +	return sysfs_emit(buf, "config=0x%lx\n", (unsigned long)eattr->var);
>> +}
> 
> I am not that familiar with the perf drivers, thus I might be completely
> wrong here, but usually for sysfs objects a single value is preferred,
> so that this "config=" technically would not be needed, unless this is
> somewhat essential to the consumers of this attribute to know what the
> value is?  What do you think?
"config=" is a supported for userspace tool, it is a kind of alias, so 
cannot be removed here, thanks.
> 
> [...]
>> +static ssize_t hisi_pcie_identifier_show(struct device *dev,
>> +					 struct device_attribute *attr,
>> +					 char *buf)
>> +{
>> +	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
>> +
>> +	return sysfs_emit(buf, "0x%x\n", pcie_pmu->identifier);
>> +}
> 
> What about using the "%#x" formatting flag?  It would automatically
> add the "0x" prefix, etc.
> 
thanks, will fix this.
>> +static ssize_t hisi_pcie_bus_show(struct device *dev,
>> +				  struct device_attribute *attr, char *buf)
>> +{
>> +	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
>> +
>> +	return sysfs_emit(buf, "0x%02x\n", PCI_BUS_NUM(pcie_pmu->bdf_min));
>> +}
> 
> Same as above, what about "%#02x"?
> 
thanks, will fix this.
> [...]
>> +static bool hisi_pcie_pmu_valid_filter(struct perf_event *event,
>> +				       struct hisi_pcie_pmu *pcie_pmu)
>> +{
>> +	u32 subev_idx = hisi_pcie_get_subevent(event);
>> +	u32 event_idx = hisi_pcie_get_event(event);
>> +	u32 requester_id = hisi_pcie_get_bdf(event);
>> +
>> +	if (subev_idx > HISI_PCIE_SUBEVENT_MAX ||
>> +	    event_idx > HISI_PCIE_EVENT_MAX) {
>> +		pci_err(pcie_pmu->pdev,
>> +			"Max event index and max subevent index is: %d, %d.\n",
>> +			HISI_PCIE_EVENT_MAX, HISI_PCIE_SUBEVENT_MAX);
>> +		return false;
>> +	}
> 
> Was this error message above intended to be a debug message?  It's a bit
> opaque in terms what the error actually is here.  We might need to clear
> it up a little.
> 
thanks, will change this message to pci_dbg next time.
> [...]
>> +static bool hisi_pcie_pmu_validate_event_group(struct perf_event *event)
>> +{
>> +	struct perf_event *sibling, *leader = event->group_leader;
>> +	int counters = 1;
> 
> How big this counter could become?
> 
> Would it ever be greater than HISI_PCIE_MAX_COUNTERS?  I am asking, as
> if it would be ever greater, then perhaps unsigned int would be better
> to use, and if not, then perhaps something smaller than int?  What do
> you think, does this even make sense to change?
> 
I think this "counters" variable is used to calculate how many events have
been set on the command line, so it will always be bigger than zero. So int
and u32 seem the same here. Thanks,

> [...]
>> +static int hisi_pcie_pmu_event_init(struct perf_event *event)
>> +{
>> +	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
>> +
>> +	event->cpu = pcie_pmu->on_cpu;
>> +
>> +	if (event->attr.type != event->pmu->type)
>> +		return -ENOENT;
>> +
>> +	/* Sampling is not supported. */
>> +	if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
>> +		return -EOPNOTSUPP;
>> +
>> +	if (!hisi_pcie_pmu_valid_filter(event, pcie_pmu)) {
>> +		pci_err(pcie_pmu->pdev, "Invalid filter!\n");
>> +		return -EINVAL;
>> +	}
> 
> [...]
>> +/*
>> + * The bandwidth, latency, bus utilization and buffer occupancy features are
>> + * calculated from data in HISI_PCIE_CNT and extended data in HISI_PCIE_EXT_CNT.
>> + * Other features are obtained only by HISI_PCIE_CNT.
>> + * So data and data_ext are processed in this function to get performanace
>> + * value like, bandwidth, latency, etc.
>> + */
> 
> A small typo in the word "performance" above.
> 
thanks, will fix this.
> [...]
>> +static u64 hisi_pcie_pmu_get_performance(struct perf_event *event, u64 data,
>> +					 u64 data_ext)
>> +{
>> +#define CONVERT_DW_TO_BYTE(x)	(sizeof(u32) * (x))
> 
> I would move this macro at the top alongside other constants and macros,
> as here it makes the code harder to read.  What do you think?
> 
> [...]
>> +static int hisi_pcie_pmu_irq_register(struct pci_dev *pdev,
>> +				      struct hisi_pcie_pmu *pcie_pmu)
>> +{
>> +	int irq, ret;
>> +
>> +	ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI);
>> +	if (ret < 0) {
>> +		pci_err(pdev, "Failed to enable MSI vectors, ret = %d!\n", ret);
>> +		return ret;
>> +	}
> 
> This is a nitpick, so feel free to ignore it, but what do you think of
> changing this (and also other messages alike) message to be, for
> example:
> 
>    pci_err(pdev, "Failed to enable MSI vectors: %d\n", ret);
> 
> Why?  I personally don't find displaying a return code/value followed by
> a punctuation easy to read, especially when looking through a lot of
> lines and other messages in the kernel ring buffer.
> 

got it, will fix this next time.
> [...]
>> +
>> +	irq = pci_irq_vector(pdev, 0);
>> +	ret = request_irq(irq, hisi_pcie_pmu_irq,
>> +			  IRQF_NOBALANCING | IRQF_NO_THREAD, "hisi_pcie_pmu",
>> +			  pcie_pmu);
>> +	if (ret) {
>> +		pci_err(pdev, "Failed to register irq, ret = %d!\n", ret);
>> +		pci_free_irq_vectors(pdev);
>> +		return ret;
>> +	}
> 
> It would be "IRQ" in the error message above.
> 
ok, will change this, thanks.
> [...]
>> +static int hisi_pcie_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
>> +{
>> +	struct hisi_pcie_pmu *pcie_pmu = hlist_entry_safe(node,
>> +					 struct hisi_pcie_pmu, node);
>> +	unsigned int target;
>> +
>> +	/* Nothing to do if this CPU doesn't own the PMU */
>> +	if (pcie_pmu->on_cpu != cpu)
>> +		return 0;
>> +
>> +	/* Choose a new CPU from all online cpus. */
>> +	target = cpumask_first(cpu_online_mask);
>> +	if (target >= nr_cpu_ids) {
>> +		pci_err(pcie_pmu->pdev, "There is no cpu to set!\n");
>> +		return 0;
>> +	}
> 
> To be consistent, it would be "CPUs" and "CPU" in the above.
> 
> [...]
>> +static struct device_attribute hisi_pcie_pmu_bus_attr =
>> +	__ATTR(bus, 0444, hisi_pcie_bus_show, NULL);
> [...]
>> +static struct device_attribute hisi_pcie_pmu_cpumask_attr =
>> +	__ATTR(cpumask, 0444, hisi_pcie_cpumask_show, NULL);
> [...]
>> +static struct device_attribute hisi_pcie_pmu_identifier_attr =
>> +	__ATTR(identifier, 0444, hisi_pcie_identifier_show, NULL);
> 
> Would it be at possible for any of the above __ATTR() macros to be
> replaced with the DEVICE_ATTR_RO() macro?  Or perhaps with __ATTR_RO()
> if the other one would be a good fit?
> 
yes, DEVICE_ATTR_RO() macro could be used here, thanks.
> [...]
>> +static int hisi_pcie_init_dev(struct pci_dev *pdev)
>> +{
>> +	int ret;
>> +
>> +	ret = pci_enable_device(pdev);
>> +	if (ret) {
>> +		pci_err(pdev, "Failed to enable pci device, ret = %d.\n", ret);
>> +		return ret;
>> +	}
>> +
>> +	ret = pci_request_mem_regions(pdev, "hisi_pcie_pmu");
>> +	if (ret < 0) {
>> +		pci_err(pdev, "Failed to request pci mem regions, ret = %d.\n",
>> +			ret);
>> +		pci_disable_device(pdev);
>> +		return ret;
>> +	}
> 
> It would be "PCI" in both error messages above.
> 
will fix it.

> [...]
>> +static int __init hisi_pcie_module_init(void)
>> +{
>> +	int ret;
>> +
>> +	ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE,
>> +				      "AP_PERF_ARM_HISI_PCIE_PMU_ONLINE",
>> +				      hisi_pcie_pmu_online_cpu,
>> +				      hisi_pcie_pmu_offline_cpu);
>> +	if (ret) {
>> +		pr_err("Failed to setup PCIE PMU hotplug, ret = %d.\n", ret);
>> +		return ret;
>> +	}
> 
> It would be "PCIe" in the error message above.
> 
will fix it.
Thanks,
Qi
> 	Krzysztof
> .
>
liuqi (BA) June 16, 2021, 1:54 a.m. UTC | #8
Hi Will,
On 2021/6/15 17:35, Will Deacon wrote:
> On Tue, Jun 15, 2021 at 04:57:09PM +0800, liuqi (BA) wrote:
>> On 2021/6/12 0:23, Will Deacon wrote:
>>> On Mon, May 31, 2021 at 09:32:31PM +0800, Qi Liu wrote:
>>>> PCIe PMU Root Complex Integrated End Point(RCiEP) device is supported
>>>> to sample bandwidth, latency, buffer occupation etc.
>>>>
>>>> Each PMU RCiEP device monitors multiple Root Ports, and each RCiEP is
>>>> registered as a PMU in /sys/bus/event_source/devices, so users can
>>>> select target PMU, and use filter to do further sets.
>>>>
>>>> Filtering options contains:
>>>> event        - select the event.
>>>> subevent     - select the subevent.
>>>> port         - select target Root Ports. Information of Root Ports
>>>>                  are shown under sysfs.
>>>> bdf          - select requester_id of target EP device.
>>>> trig_len     - set trigger condition for starting event statistics.
>>>> trigger_mode - set trigger mode. 0 means starting to statistic when
>>>>                  bigger than trigger condition, and 1 means smaller.
>>>> thr_len      - set threshold for statistics.
>>>> thr_mode     - set threshold mode. 0 means count when bigger than
>>>>                  threshold, and 1 means smaller.
>>>>
>>>> Reviewed-by: John Garry <john.garry@huawei.com>
>>>> Signed-off-by: Qi Liu <liuqi115@huawei.com>
>>>> ---
>>>>    MAINTAINERS                                |    6 +
>>>>    drivers/perf/Kconfig                       |    2 +
>>>>    drivers/perf/Makefile                      |    1 +
>>>>    drivers/perf/pci/Kconfig                   |   16 +
>>>>    drivers/perf/pci/Makefile                  |    2 +
>>>>    drivers/perf/pci/hisilicon/Makefile        |    3 +
>>>>    drivers/perf/pci/hisilicon/hisi_pcie_pmu.c | 1019 ++++++++++++++++++++++++++++
>>>
>>> Can we keep this under drivers/perf/hisilicon/ please? I don't see the
>>> need to create a 'pci' directory here.
>>>
>> So how about drivers/perf/hisilicon/pci? as hisi_pcie_pmu.c do not use
>> hisi_uncore_pmu framework.
> 
> That's up to you. As long as it's _somewhere_ under drivers/perf/hisilicon/,
> then I'm not too fussed.
> 
ok, got it, will move the driver to perf/hisilicon in next version.
>>>> +static void hisi_pcie_parse_reg_value(struct hisi_pcie_pmu *pcie_pmu,
>>>> +				      u32 reg_off, u16 *arg0, u16 *arg1)
>>>> +{
>>>> +	u32 val = readl(pcie_pmu->base + reg_off);
>>>> +
>>>> +	*arg0 = val & 0xffff;
>>>> +	*arg1 = (val & 0xffff0000) >> 16;
>>>> +}
>>>
>>> Define a new type for the pair of values and return that directly?
>>>
>> Sorry, I'm not sure about how to fix this, do you mean add a union like
>> this?
>> union reg_val {
>> 	struct {
>> 		u16 arg0;
>> 		u16 arg1;
>> 	}
>> 	u32 val;
>> }
> 
> I was just thinking along the lines of:
> 
> struct hisi_pcie_reg_pair {
> 	u16 lo;
> 	u16 hi;
> };
> 
> static struct hisi_pcie_reg_pair
> hisi_pcie_parse_reg_value(struct hisi_pcie_pmu *pcie_pmum u32 reg_off)
> {
> 	u32 val = readl_relaxed(pcie_pmu->base + reg_off);
> 	struct hisi_pcie_reg_pair regs = {
> 		.lo = val,
> 		.hi = val >> 16,
> 	};
> 
> 	return regs;
> }
> 
> Does that work?
> 
yes, will fix this, thanks.
>>>> +/*
>>>> + * The bandwidth, latency, bus utilization and buffer occupancy features are
>>>> + * calculated from data in HISI_PCIE_CNT and extended data in HISI_PCIE_EXT_CNT.
>>>> + * Other features are obtained only by HISI_PCIE_CNT.
>>>> + * So data and data_ext are processed in this function to get performanace
>>>> + * value like, bandwidth, latency, etc.
>>>> + */
>>>> +static u64 hisi_pcie_pmu_get_performance(struct perf_event *event, u64 data,
>>>> +					 u64 data_ext)
>>>> +{
>>>> +#define CONVERT_DW_TO_BYTE(x)	(sizeof(u32) * (x))
>>>
>>> I don't know what a "DW" is, but this macro adds nothing...
>>
>> DW means double words, and 1DW = 4Bytes, value in hardware counter means DW
>> so I wanna change it into Byte.
>> So how about using 4*data here and adding code comment to explain it.
> 
> Just remove the macro and replace it's single user with sizeof(u32) * x
> 
ok, thanks.
>>>> +	/* Process data to set unit of latency as "us". */
>>>> +	if (is_latency_event(idx))
>>>> +		return div64_u64(data * us_per_cycle, data_ext);
>>>> +
>>>> +	if (is_bus_util_event(idx))
>>>> +		return div64_u64(data * us_per_cycle, data_ext);
>>>> +
>>>> +	if (is_buf_util_event(idx))
>>>> +		return div64_u64(data, data_ext * us_per_cycle);
>>>
>>> Why do we need to do all this division in the kernel? Can't we just expose
>>> the underlying values and let userspace figure out what it wants to do with
>>> the numbers?
>>>
>> Our PMU hardware support 8 sets of counters to count bandwidth, latency and
>> utilization events.
>>
>> For example, when users set latency event, common counter will count delay
>> cycles, and extern counter count number of PCIe packets automaticly. And we
>> do not have a event number for counting number of PCIe packets.
>>
>> So this division cannot move to userspace tool.
> 
> Why can't you expose the packet counter as an extra event to userspace?
> 
Maybe I didn’t express it clearly.

As there is no hardware event number for PCIe packet counting, the extended
counter counts packets *automatically* when a latency event is selected by
users.

This means users cannot set "config=0xXX" to start a packet counting
event. So we can only get the values of the counter and the extended counter
in the driver and do the division, then pass the result to userspace.
> Will
> .
>
Will Deacon June 16, 2021, 1:42 p.m. UTC | #9
Hi,

On Wed, Jun 16, 2021 at 09:54:23AM +0800, liuqi (BA) wrote:
> On 2021/6/15 17:35, Will Deacon wrote:
> > On Tue, Jun 15, 2021 at 04:57:09PM +0800, liuqi (BA) wrote:
> > > On 2021/6/12 0:23, Will Deacon wrote:
> > > > On Mon, May 31, 2021 at 09:32:31PM +0800, Qi Liu wrote:
> > > > > +	/* Process data to set unit of latency as "us". */
> > > > > +	if (is_latency_event(idx))
> > > > > +		return div64_u64(data * us_per_cycle, data_ext);
> > > > > +
> > > > > +	if (is_bus_util_event(idx))
> > > > > +		return div64_u64(data * us_per_cycle, data_ext);
> > > > > +
> > > > > +	if (is_buf_util_event(idx))
> > > > > +		return div64_u64(data, data_ext * us_per_cycle);
> > > > 
> > > > Why do we need to do all this division in the kernel? Can't we just expose
> > > > the underlying values and let userspace figure out what it wants to do with
> > > > the numbers?
> > > > 
> > > Our PMU hardware support 8 sets of counters to count bandwidth, latency and
> > > utilization events.
> > > 
> > > For example, when users set latency event, common counter will count delay
> > > cycles, and extern counter count number of PCIe packets automaticly. And we
> > > do not have a event number for counting number of PCIe packets.
> > > 
> > > So this division cannot move to userspace tool.
> > 
> > Why can't you expose the packet counter as an extra event to userspace?
> > 
> Maybe I didn’t express it clearly.
> 
> As there is no hardware event number for PCIe packets counting, extern
> counter count packets *automaticly* when latency events is selected by
> users.
> 
> This means users cannot set "config=0xXX" to start packets counting event.
> So we can only get the value of counter and extern counter in driver and do
> the division, then pass the result to userspace.

I still think it would be ideal if we could expose both values to userspace
rather than combine them somehow. Hmm. Anyway...

I struggled to figure out exactly what's being counted from the
documentation patch (please update that). Please can you explain exactly
what appears in the HISI_PCIE_CNT and HISI_PCIE_EXT_CNT registers for the
different modes of operation? Without that, the ratios you've chosen to
report seem rather arbitrary.

I also couldn't figure out how the latency event works. For example, I was
assuming it would be a filter (a bit like the length), so you could say
things like "I'm only interested in packets with a latency higher than x"
but it doesn't look like it works that way.

Thanks,

Will
Bjorn Helgaas June 16, 2021, 2:14 p.m. UTC | #10
On Sat, Jun 12, 2021 at 01:33:55AM +0200, Krzysztof Wilczyński wrote:

> > +static ssize_t hisi_pcie_bus_show(struct device *dev,
> > +				  struct device_attribute *attr, char *buf)
> > +{
> > +	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
> > +
> > +	return sysfs_emit(buf, "0x%02x\n", PCI_BUS_NUM(pcie_pmu->bdf_min));
> > +}
> 
> Same as above, what about "%#02x"?

I think you'd need "%#04x" because the field width includes the
leading "0x", so printing 1 with "%#02x" would result in "0x1" instead
of "0x01".
Bjorn Helgaas June 16, 2021, 3:23 p.m. UTC | #11
On Wed, Jun 16, 2021 at 09:09:40AM +0800, liuqi (BA) wrote:
> On 2021/6/12 7:33, Krzysztof Wilczyński wrote:

> > > +static ssize_t hisi_pcie_event_sysfs_show(struct device *dev,
> > > +				   struct device_attribute *attr, char *buf)
> > > +{
> > > +	struct dev_ext_attribute *eattr;
> > > +
> > > +	eattr = container_of(attr, struct dev_ext_attribute, attr);
> > > +
> > > +	return sysfs_emit(buf, "config=0x%lx\n", (unsigned long)eattr->var);
> > > +}
> > 
> > I am not that familiar with the perf drivers, thus I might be completely
> > wrong here, but usually for sysfs objects a single value is preferred,
> > so that this "config=" technically would not be needed, unless this is
> > somewhat essential to the consumers of this attribute to know what the
> > value is?  What do you think?
>
> "config=" is a supported for userspace tool, it is a kind of alias, so
> cannot be remover here, thanks.

I don't understand this part.  This is brand-new functionality for the
kernel, so there's no requirement to maintain compatibility for
existing userspace tools.

If there's a similar sysfs show function for other perf drivers, and
you need to be compatible with *that*, fine.  But if this is merely
about being compatible with userspace that uses out-of-tree kernel
functionality, that's not a real factor.

Bjorn
Will Deacon June 16, 2021, 5:27 p.m. UTC | #12
On Wed, Jun 16, 2021 at 10:23:43AM -0500, Bjorn Helgaas wrote:
> On Wed, Jun 16, 2021 at 09:09:40AM +0800, liuqi (BA) wrote:
> > On 2021/6/12 7:33, Krzysztof Wilczyński wrote:
> 
> > > > +static ssize_t hisi_pcie_event_sysfs_show(struct device *dev,
> > > > +				   struct device_attribute *attr, char *buf)
> > > > +{
> > > > +	struct dev_ext_attribute *eattr;
> > > > +
> > > > +	eattr = container_of(attr, struct dev_ext_attribute, attr);
> > > > +
> > > > +	return sysfs_emit(buf, "config=0x%lx\n", (unsigned long)eattr->var);
> > > > +}
> > > 
> > > I am not that familiar with the perf drivers, thus I might be completely
> > > wrong here, but usually for sysfs objects a single value is preferred,
> > > so that this "config=" technically would not be needed, unless this is
> > > somewhat essential to the consumers of this attribute to know what the
> > > value is?  What do you think?
> >
> > "config=" is a supported for userspace tool, it is a kind of alias, so
> > cannot be remover here, thanks.
> 
> I don't understand this part.  This is brand-new functionality for the
> kernel, so there's no requirement to maintain compatibility for
> existing userspace tools.
> 
> If there's a similar sysfs show function for other perf drivers, and
> you need to be compatible with *that*, fine.  But if this is merely
> about being compatible with userspace that uses out-of-tree kernel
> functionality, that's not a real factor.

Right, I think this is standard for all perf drivers as it is how the perf
tool figures out how to select a given event in the perf_event_attr (which
has a 'config' field, which is what this refers to).

Will
liuqi (BA) June 17, 2021, 11 a.m. UTC | #13
On 2021/6/16 21:42, Will Deacon wrote:
> Hi,
> 
> On Wed, Jun 16, 2021 at 09:54:23AM +0800, liuqi (BA) wrote:
>> On 2021/6/15 17:35, Will Deacon wrote:
>>> On Tue, Jun 15, 2021 at 04:57:09PM +0800, liuqi (BA) wrote:
>>>> On 2021/6/12 0:23, Will Deacon wrote:
>>>>> On Mon, May 31, 2021 at 09:32:31PM +0800, Qi Liu wrote:
>>>>>> +	/* Process data to set unit of latency as "us". */
>>>>>> +	if (is_latency_event(idx))
>>>>>> +		return div64_u64(data * us_per_cycle, data_ext);
>>>>>> +
>>>>>> +	if (is_bus_util_event(idx))
>>>>>> +		return div64_u64(data * us_per_cycle, data_ext);
>>>>>> +
>>>>>> +	if (is_buf_util_event(idx))
>>>>>> +		return div64_u64(data, data_ext * us_per_cycle);
>>>>>
>>>>> Why do we need to do all this division in the kernel? Can't we just expose
>>>>> the underlying values and let userspace figure out what it wants to do with
>>>>> the numbers?
>>>>>
>>>> Our PMU hardware support 8 sets of counters to count bandwidth, latency and
>>>> utilization events.
>>>>
>>>> For example, when users set latency event, common counter will count delay
>>>> cycles, and extern counter count number of PCIe packets automaticly. And we
>>>> do not have a event number for counting number of PCIe packets.
>>>>
>>>> So this division cannot move to userspace tool.
>>>
>>> Why can't you expose the packet counter as an extra event to userspace?
>>>
>> Maybe I didn’t express it clearly.
>>
>> As there is no hardware event number for PCIe packets counting, extern
>> counter count packets *automaticly* when latency events is selected by
>> users.
>>
>> This means users cannot set "config=0xXX" to start packets counting event.
>> So we can only get the value of counter and extern counter in driver and do
>> the division, then pass the result to userspace.
> 
> I still think it would be ideal if we could expose both values to userspace
> rather than combine them somehow. Hmm. Anyway...
> 
> I struggled to figure out exactly what's being counted from the
> documentation patch (please update that). Please can you explain exactly
> what appears in the HISI_PCIE_CNT and HISI_PCIE_EXT_CNT registers for the
> different modes of operation? Without that, the ratios you've chosen to
> report seem rather arbitrary.
> 

Hi Will,

PCIe PMU events can be divided into 2 types: one type is counted by
HISI_PCIE_CNT alone; the other type is counted by both HISI_PCIE_CNT and
HISI_PCIE_EXT_CNT, and includes bandwidth events, latency events, buffer
utilization and bus utilization.

If the user sets "event=0x10, subevent=0x02", this means "latency of RX
memory read" is selected. HISI_PCIE_CNT counts the total delay cycles and
HISI_PCIE_EXT_CNT counts the number of PCIe packets at the same time. So the
PMU driver can obtain the average latency by calculating: HISI_PCIE_CNT /
HISI_PCIE_EXT_CNT.

If the user sets "event=0x04, subevent=0x01", this means "bandwidth of RX
memory read" is selected. HISI_PCIE_CNT counts the total packet data volume
and HISI_PCIE_EXT_CNT counts cycles, so the PMU driver can obtain the average
bandwidth by calculating: HISI_PCIE_CNT / HISI_PCIE_EXT_CNT.

The same logic is used when calculating bus utilization and buffer
utilization. It seems I should add this part to the documentation patch;
I'll do this in the next version, thanks.

> I also couldn't figure out how the latency event works. For example, I was
> assuming it would be a filter (a bit like the length), so you could say
> things like "I'm only interested in packets with a latency higher than x"
> but it doesn't look like it works that way.
> 
> Thanks,
> 
Latency is not a filter. The PCIe PMU has a group of latency events whose
event numbers are within latency_events_list, and the explanation above
describes how latency events work.

The PMU driver has a TLP length filter for bandwidth events, so users can say
things like "I'm only interested in the bandwidth of packets with a TLP
length bigger than x".

Thanks,
Qi

> Will
> .
>
Will Deacon June 17, 2021, 5:57 p.m. UTC | #14
On Thu, Jun 17, 2021 at 07:00:26PM +0800, liuqi (BA) wrote:
> 
> 
> On 2021/6/16 21:42, Will Deacon wrote:
> > Hi,
> > 
> > On Wed, Jun 16, 2021 at 09:54:23AM +0800, liuqi (BA) wrote:
> > > On 2021/6/15 17:35, Will Deacon wrote:
> > > > On Tue, Jun 15, 2021 at 04:57:09PM +0800, liuqi (BA) wrote:
> > > > > On 2021/6/12 0:23, Will Deacon wrote:
> > > > > > On Mon, May 31, 2021 at 09:32:31PM +0800, Qi Liu wrote:
> > > > > > > +	/* Process data to set unit of latency as "us". */
> > > > > > > +	if (is_latency_event(idx))
> > > > > > > +		return div64_u64(data * us_per_cycle, data_ext);
> > > > > > > +
> > > > > > > +	if (is_bus_util_event(idx))
> > > > > > > +		return div64_u64(data * us_per_cycle, data_ext);
> > > > > > > +
> > > > > > > +	if (is_buf_util_event(idx))
> > > > > > > +		return div64_u64(data, data_ext * us_per_cycle);
> > > > > > 
> > > > > > Why do we need to do all this division in the kernel? Can't we just expose
> > > > > > the underlying values and let userspace figure out what it wants to do with
> > > > > > the numbers?
> > > > > > 
> > > > > Our PMU hardware support 8 sets of counters to count bandwidth, latency and
> > > > > utilization events.
> > > > > 
> > > > > For example, when users set latency event, common counter will count delay
> > > > > cycles, and extern counter count number of PCIe packets automaticly. And we
> > > > > do not have a event number for counting number of PCIe packets.
> > > > > 
> > > > > So this division cannot move to userspace tool.
> > > > 
> > > > Why can't you expose the packet counter as an extra event to userspace?
> > > > 
> > > Maybe I didn’t express it clearly.
> > > 
> > > As there is no hardware event number for PCIe packets counting, extern
> > > counter count packets *automaticly* when latency events is selected by
> > > users.
> > > 
> > > This means users cannot set "config=0xXX" to start packets counting event.
> > > So we can only get the value of counter and extern counter in driver and do
> > > the division, then pass the result to userspace.
> > 
> > I still think it would be ideal if we could expose both values to userspace
> > rather than combine them somehow. Hmm. Anyway...
> > 
> > I struggled to figure out exactly what's being counted from the
> > documentation patch (please update that). Please can you explain exactly
> > what appears in the HISI_PCIE_CNT and HISI_PCIE_EXT_CNT registers for the
> > different modes of operation? Without that, the ratios you've chosen to
> > report seem rather arbitrary.
> > 
> 
> PCIe PMU events can be devided into 2 types: one type is counted by
> HISI_PCIE_CNT, the other type is counted by HISI_PCIE_EXT_CNT and
> HISI_PCIE_CNT, including bandwidth events, latency events, buffer
> utilization and bus utilization.
> 
> if user sets "event=0x10, subevent=0x02", this means "latency of RX memory
> read" is selected. HISI_PCIE_CNT counts total delay cycles and
> HISI_PCIE_EXT_CNT counts PCIe packets number at the same time. So PMU driver
> could obtain average latency by caculating: HISI_PCIE_CNT /
> HISI_PCIE_EXT_CNT.
> 
> if users sets "event=0x04, subevent=0x01", this means bandwidth of RX memory
> read is selected. HISI_PCIE_CNT counts total packet data volume and
> HISI_PCIE_EXT_CNT counts cycles, so PMU driver could obtain average
> bandwidth by caculating: HISI_PCIE_CNT / HISI_PCIE_EXT_CNT.
> 
> The same logic is used when calculating bus utilization and buffer
> utilization. Seems I should add this part in Document patch,I 'll do this in
> next version, thanks.
> 
> > I also couldn't figure out how the latency event works. For example, I was
> > assuming it would be a filter (a bit like the length), so you could say
> > things like "I'm only interested in packets with a latency higher than x"
> > but it doesn't look like it works that way.
> > 
> > Thanks,
> > 
> latency is not a filter, PCIe PMU has a group of lactency events, their
> event number are within the latency_events_list, and the above explains how
> latency events work.
> 
> PMU drivers have TLP length filter for bandwidth events, users could set
> like "I only interested in bandwidth of packets with TLP length bigger than
> x".

Thanks for the explanations, I think I get it a bit better now. But I still
think we should be exposing both of the values to userspace instead of
reporting the ratio from which the individual counters are then
unrecoverable.

It will complicate the driver slightly, but can we instead expose the
events independently and then allowing scheduling some of them in groups?

That way we just treat HISI_PCIE_CNT and HISI_PCIE_EXT_CNT as separate
counters, but with a scheduling constraint that events in a register pair
must be in the same group.

Will
liuqi (BA) June 18, 2021, 9:32 a.m. UTC | #15
On 2021/6/18 1:57, Will Deacon wrote:
> On Thu, Jun 17, 2021 at 07:00:26PM +0800, liuqi (BA) wrote:
>>
>>
>> On 2021/6/16 21:42, Will Deacon wrote:
>>> Hi,
>>>
>>> On Wed, Jun 16, 2021 at 09:54:23AM +0800, liuqi (BA) wrote:
>>>> On 2021/6/15 17:35, Will Deacon wrote:
>>>>> On Tue, Jun 15, 2021 at 04:57:09PM +0800, liuqi (BA) wrote:
>>>>>> On 2021/6/12 0:23, Will Deacon wrote:
>>>>>>> On Mon, May 31, 2021 at 09:32:31PM +0800, Qi Liu wrote:
>>>>>>>> +	/* Process data to set unit of latency as "us". */
>>>>>>>> +	if (is_latency_event(idx))
>>>>>>>> +		return div64_u64(data * us_per_cycle, data_ext);
>>>>>>>> +
>>>>>>>> +	if (is_bus_util_event(idx))
>>>>>>>> +		return div64_u64(data * us_per_cycle, data_ext);
>>>>>>>> +
>>>>>>>> +	if (is_buf_util_event(idx))
>>>>>>>> +		return div64_u64(data, data_ext * us_per_cycle);
>>>>>>>
>>>>>>> Why do we need to do all this division in the kernel? Can't we just expose
>>>>>>> the underlying values and let userspace figure out what it wants to do with
>>>>>>> the numbers?
>>>>>>>
>>>>>> Our PMU hardware support 8 sets of counters to count bandwidth, latency and
>>>>>> utilization events.
>>>>>>
>>>>>> For example, when users set latency event, common counter will count delay
>>>>>> cycles, and extern counter count number of PCIe packets automaticly. And we
>>>>>> do not have a event number for counting number of PCIe packets.
>>>>>>
>>>>>> So this division cannot move to userspace tool.
>>>>>
>>>>> Why can't you expose the packet counter as an extra event to userspace?
>>>>>
>>>> Maybe I didn’t express it clearly.
>>>>
>>>> As there is no hardware event number for PCIe packets counting, extern
>>>> counter count packets *automaticly* when latency events is selected by
>>>> users.
>>>>
>>>> This means users cannot set "config=0xXX" to start packets counting event.
>>>> So we can only get the value of counter and extern counter in driver and do
>>>> the division, then pass the result to userspace.
>>>
>>> I still think it would be ideal if we could expose both values to userspace
>>> rather than combine them somehow. Hmm. Anyway...
>>>
>>> I struggled to figure out exactly what's being counted from the
>>> documentation patch (please update that). Please can you explain exactly
>>> what appears in the HISI_PCIE_CNT and HISI_PCIE_EXT_CNT registers for the
>>> different modes of operation? Without that, the ratios you've chosen to
>>> report seem rather arbitrary.
>>>
>>
>> PCIe PMU events can be devided into 2 types: one type is counted by
>> HISI_PCIE_CNT, the other type is counted by HISI_PCIE_EXT_CNT and
>> HISI_PCIE_CNT, including bandwidth events, latency events, buffer
>> utilization and bus utilization.
>>
>> if user sets "event=0x10, subevent=0x02", this means "latency of RX memory
>> read" is selected. HISI_PCIE_CNT counts total delay cycles and
>> HISI_PCIE_EXT_CNT counts PCIe packets number at the same time. So PMU driver
>> could obtain average latency by caculating: HISI_PCIE_CNT /
>> HISI_PCIE_EXT_CNT.
>>
>> if users sets "event=0x04, subevent=0x01", this means bandwidth of RX memory
>> read is selected. HISI_PCIE_CNT counts total packet data volume and
>> HISI_PCIE_EXT_CNT counts cycles, so PMU driver could obtain average
>> bandwidth by caculating: HISI_PCIE_CNT / HISI_PCIE_EXT_CNT.
>>
>> The same logic is used when calculating bus utilization and buffer
>> utilization. Seems I should add this part in Document patch,I 'll do this in
>> next version, thanks.
>>
>>> I also couldn't figure out how the latency event works. For example, I was
>>> assuming it would be a filter (a bit like the length), so you could say
>>> things like "I'm only interested in packets with a latency higher than x"
>>> but it doesn't look like it works that way.
>>>
>>> Thanks,
>>>
>> latency is not a filter, PCIe PMU has a group of lactency events, their
>> event number are within the latency_events_list, and the above explains how
>> latency events work.
>>
>> PMU drivers have TLP length filter for bandwidth events, users could set
>> like "I only interested in bandwidth of packets with TLP length bigger than
>> x".
> 
> Thanks for the explanations, I think I get it a bit better now. But I still
> think we should be exposing both of the values to userspace instead of
> reporting the ratio from which the individual counters are then
> unrecoverable.
> 
> It will complicate the driver slightly, but can we instead expose the
> events independently and then allowing scheduling some of them in groups?
> 
> That way we just treat HISI_PCIE_CNT and HISI_PCIE_EXT_CNT as separate
> counters, but with a scheduling constraint that events in a register pair
> must be in the same group.
> 
> Will

Hi Will,

I got what you mean; treating HISI_PCIE_CNT and HISI_PCIE_EXT_CNT as
separate counters is a great idea, but there is a hardware limitation.

The behavior of HISI_PCIE_EXT_CNT is controlled by hardware logic, so
HISI_PCIE_EXT_CNT only works when a latency/bandwidth/... event number
is set in HISI_PCIE_EVENT_CTRL. So the driver cannot separate these two
counters; they must work together because of this hardware limitation.

We tried to expose the values of both counters at the same time, but there
seems to be only one "event->count" through which the driver can expose a
value. Is there any method to do this?

Thanks,
Qi

> .
>
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 81e1ede..dd5c62d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8233,6 +8233,12 @@  W:	http://www.hisilicon.com
 F:	Documentation/admin-guide/perf/hisi-pmu.rst
 F:	drivers/perf/hisilicon
 
+HISILICON PCIE PMU DRIVER
+M:	Qi Liu <liuqi115@huawei.com>
+S:	Maintained
+F:	Documentation/admin-guide/perf/hisi-pcie-pmu.rst
+F:	drivers/perf/pci/hisilicon/hisi_pcie_pmu.c
+
 HISILICON QM AND ZIP Controller DRIVER
 M:	Zhou Wang <wangzhou1@hisilicon.com>
 L:	linux-crypto@vger.kernel.org
diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 77522e5..ddd82fa 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -139,4 +139,6 @@  config ARM_DMC620_PMU
 
 source "drivers/perf/hisilicon/Kconfig"
 
+source "drivers/perf/pci/Kconfig"
+
 endmenu
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index 5260b11..1208c08 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -14,3 +14,4 @@  obj-$(CONFIG_THUNDERX2_PMU) += thunderx2_pmu.o
 obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o
 obj-$(CONFIG_ARM_SPE_PMU) += arm_spe_pmu.o
 obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o
+obj-y += pci/
diff --git a/drivers/perf/pci/Kconfig b/drivers/perf/pci/Kconfig
new file mode 100644
index 0000000..36b430f
--- /dev/null
+++ b/drivers/perf/pci/Kconfig
@@ -0,0 +1,16 @@ 
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# PCIe Performance Monitor Drivers
+#
+menu "PCIe Performance Monitor"
+
+config HISI_PCIE_PMU
+	tristate "HiSilicon PCIE PERF PMU"
+	depends on PCI && (ARM64 || COMPILE_TEST)
+	help
+	  Provide support for HiSilicon PCIe performance monitoring unit (PMU)
+	  RCiEP devices.
+	  Adds the PCIe PMU into perf events system for monitoring latency,
+	  bandwidth etc.
+
+endmenu
diff --git a/drivers/perf/pci/Makefile b/drivers/perf/pci/Makefile
new file mode 100644
index 0000000..a56b1a9
--- /dev/null
+++ b/drivers/perf/pci/Makefile
@@ -0,0 +1,2 @@ 
+# SPDX-License-Identifier: GPL-2.0-only
+obj-y += hisilicon/
diff --git a/drivers/perf/pci/hisilicon/Makefile b/drivers/perf/pci/hisilicon/Makefile
new file mode 100644
index 0000000..65b0bd7
--- /dev/null
+++ b/drivers/perf/pci/hisilicon/Makefile
@@ -0,0 +1,3 @@ 
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_HISI_PCIE_PMU) += hisi_pcie_pmu.o
diff --git a/drivers/perf/pci/hisilicon/hisi_pcie_pmu.c b/drivers/perf/pci/hisilicon/hisi_pcie_pmu.c
new file mode 100644
index 0000000..ed411dd
--- /dev/null
+++ b/drivers/perf/pci/hisilicon/hisi_pcie_pmu.c
@@ -0,0 +1,1019 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This driver adds support for PCIe PMU RCiEP device. Related
+ * perf events are bandwidth, bandwidth utilization, latency
+ * etc.
+ *
+ * Copyright (C) 2021 HiSilicon Limited
+ * Author: Qi Liu<liuqi115@huawei.com>
+ */
+#include <linux/bitfield.h>
+#include <linux/bitmap.h>
+#include <linux/bug.h>
+#include <linux/cpuhotplug.h>
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/io-64-nonatomic-hi-lo.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/perf_event.h>
+
+#include <asm/div64.h>
+
+/*
+ * Register offsets, relative to the PMU's mapped base address.
+ * HISI_PCIE_EVENT_CTRL, HISI_PCIE_CNT, HISI_PCIE_EXT_CNT and
+ * HISI_PCIE_INT_MASK are accessed per counter: the driver adds
+ * HISI_PCIE_REG_STEP * idx to these offsets.
+ */
+#define HISI_PCIE_GLOBAL_CTRL		0x00
+#define HISI_PCIE_EVENT_CTRL		0x010
+#define HISI_PCIE_CNT			0x090
+#define HISI_PCIE_EXT_CNT		0x110
+#define HISI_PCIE_INT_STAT		0x150
+#define HISI_PCIE_INT_MASK		0x154
+#define HISI_PCIE_REG_BDF		0xfe0
+#define HISI_PCIE_REG_VERSION		0xfe4
+#define HISI_PCIE_REG_INFO		0xfe8
+#define HISI_PCIE_REG_FREQ		0xfec
+
+/* Define PCIE CTRL CMD */
+#define HISI_PCIE_GLOBAL_EN		0x01
+#define HISI_PCIE_GLOBAL_NONE		0
+#define HISI_PCIE_EVENT_EN		BIT_ULL(20)
+#define HISI_PCIE_RESET_CNT		BIT_ULL(22)
+#define HISI_PCIE_DEFAULT_SET		BIT_ULL(34)
+#define HISI_PCIE_THR_EN		BIT_ULL(26)
+#define HISI_PCIE_TARGET_EN		BIT_ULL(32)
+#define HISI_PCIE_TRIG_EN		BIT_ULL(52)
+
+/* Field masks within the 64-bit event ctrl register */
+#define HISI_PCIE_EVENT_M		GENMASK_ULL(7, 0)
+#define HISI_PCIE_SUBEVENT_M		GENMASK_ULL(15, 8)
+#define HISI_PCIE_THR_MODE_M		GENMASK_ULL(27, 27)
+#define HISI_PCIE_THR_M			GENMASK_ULL(31, 28)
+#define HISI_PCIE_TARGET_M		GENMASK_ULL(52, 36)
+#define HISI_PCIE_TRIG_MODE_M		GENMASK_ULL(53, 53)
+#define HISI_PCIE_TRIG_M		GENMASK_ULL(59, 56)
+
+/* Counter geometry and upper bounds used when validating user filters */
+#define HISI_PCIE_MAX_COUNTERS		8
+/* Byte stride between the per-counter copies of a register */
+#define HISI_PCIE_REG_STEP		8
+#define HISI_PCIE_EVENT_MAX		0xa2
+#define HISI_PCIE_SUBEVENT_MAX		0x20
+#define HISI_PCIE_THR_MAX_VAL		10
+#define HISI_PCIE_TRIG_MAX_VAL		10
+#define HISI_PCIE_COUNTER_BITS		64
+#define HISI_PCIE_MAX_PERIOD		BIT_ULL(63)
+
+/* Driver state for one PCIe PMU device. */
+struct hisi_pcie_pmu {
+	/* Event occupying each hardware counter; a NULL slot is free */
+	struct perf_event *hw_events[HISI_PCIE_MAX_COUNTERS];
+	/* NOTE(review): presumably the CPU hotplug instance node - confirm */
+	struct hlist_node node;
+	/* PCI device of the PMU; used for logging and PCI domain lookup */
+	struct pci_dev *pdev;
+	/* Embedded perf PMU; to_pcie_pmu() maps it back to this structure */
+	struct pmu pmu;
+	/* Base of the memory-mapped PMU registers */
+	void __iomem *base;
+	int irq;
+	/* Value reported by the "identifier" sysfs show routine */
+	u32 identifier;
+	/* Minimum and maximum bdf of root ports monitored by PMU */
+	u16 bdf_min;
+	u16 bdf_max;
+	/* CPU that every event of this PMU is bound to in event_init */
+	int on_cpu;
+};
+
+/* Map a pointer to the embedded struct pmu back to its hisi_pcie_pmu. */
+#define to_pcie_pmu(p)  (container_of((p), struct hisi_pcie_pmu, pmu))
+/* Low 8 bits of a bdf value, i.e. the devfn part. */
+#define GET_PCI_DEVFN(bdf)  ((bdf) & 0xff)
+
+/*
+ * Generate hisi_pcie_get_<_name>(), which returns bits [_hi:_lo] of the
+ * event's attr.<_config> field.
+ */
+#define HISI_PCIE_PMU_FILTER_ATTR(_name, _config, _hi, _lo)		  \
+	static u64 hisi_pcie_get_##_name(struct perf_event *event)	  \
+	{								  \
+		return FIELD_GET(GENMASK(_hi, _lo), event->attr._config); \
+	}								  \
+
+/*
+ * Layout of the user-supplied attributes:
+ *   config:  [7:0] event, [15:8] subevent
+ *   config1: [3:0] thr_len, [4] thr_mode, [8:5] trig_len, [9] trig_mode
+ *   config2: [15:0] port, [31:16] bdf
+ */
+HISI_PCIE_PMU_FILTER_ATTR(event, config, 7, 0);
+HISI_PCIE_PMU_FILTER_ATTR(subevent, config, 15, 8);
+HISI_PCIE_PMU_FILTER_ATTR(thr_len, config1, 3, 0);
+HISI_PCIE_PMU_FILTER_ATTR(thr_mode, config1, 4, 4);
+HISI_PCIE_PMU_FILTER_ATTR(trig_len, config1, 8, 5);
+HISI_PCIE_PMU_FILTER_ATTR(trig_mode, config1, 9, 9);
+HISI_PCIE_PMU_FILTER_ATTR(port, config2, 15, 0);
+HISI_PCIE_PMU_FILTER_ATTR(bdf, config2, 31, 16);
+
+/*
+ * Generate is_<name>_event(idx): true when idx lies in the inclusive range
+ * [<name>_events_list[0], <name>_events_list[1]] or equals
+ * <name>_events_list[2].
+ */
+#define HISI_PCIE_BUILD_EVENTS(name)					\
+	static bool is_##name##_event(u32 idx)				\
+	{								\
+		return (idx >= name##_events_list[0] &&			\
+			idx <= name##_events_list[1]) ||		\
+			idx == name##_events_list[2];			\
+	}								\
+
+/*
+ * The first element of event list is minimum index of TL-layer events
+ * and the second element is maximum index. The third element is index
+ * of a DL-layer event.
+ */
+static const u32 bw_events_list[] = {0x04, 0x08, 0x84};
+static const u32 latency_events_list[] = {0x10, 0x15, 0x85};
+static const u32 bus_util_events_list[] = {0x20, 0x24, 0x09};
+static const u32 buf_util_events_list[] = {0x28, 0x2a, 0x33};
+
+/* Instantiate one classifier per event family listed above. */
+HISI_PCIE_BUILD_EVENTS(bw);
+HISI_PCIE_BUILD_EVENTS(latency);
+HISI_PCIE_BUILD_EVENTS(bus_util);
+HISI_PCIE_BUILD_EVENTS(buf_util);
+
+/* sysfs show routine: emit the format string stored in the extended attribute. */
+static ssize_t hisi_pcie_format_sysfs_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct dev_ext_attribute *ext_attr =
+		container_of(attr, struct dev_ext_attribute, attr);
+
+	return sysfs_emit(buf, "%s\n", (char *)ext_attr->var);
+}
+
+/*
+ * sysfs show routine: emit the event code stored in the extended attribute,
+ * prefixed with "config=".
+ */
+static ssize_t hisi_pcie_event_sysfs_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct dev_ext_attribute *ext_attr =
+		container_of(attr, struct dev_ext_attribute, attr);
+
+	return sysfs_emit(buf, "config=0x%lx\n", (unsigned long)ext_attr->var);
+}
+
+/*
+ * Build an anonymous read-only (0444) dev_ext_attribute whose ->var carries
+ * _config, and evaluate to a pointer to its embedded struct attribute.
+ */
+#define HISI_PCIE_PMU_ATTR(_name, _func, _config)			\
+	(&((struct dev_ext_attribute[]) {				\
+		{ __ATTR(_name, 0444, _func, NULL), (void *)_config }   \
+	})[0].attr.attr)
+
+/* Attribute shown via hisi_pcie_format_sysfs_show(); _format is its string. */
+#define HISI_PCIE_PMU_FORMAT_ATTR(_name, _format)			\
+	HISI_PCIE_PMU_ATTR(_name, hisi_pcie_format_sysfs_show, (void *)_format)
+/* Attribute shown via hisi_pcie_event_sysfs_show(); _event is its code. */
+#define HISI_PCIE_PMU_EVENT_ATTR(_name, _event)			\
+	HISI_PCIE_PMU_ATTR(_name, hisi_pcie_event_sysfs_show, (void *)_event)
+
+/* sysfs show routine: emit the CPU number this PMU's events are bound to. */
+static ssize_t hisi_pcie_cpumask_show(struct device *dev,
+				      struct device_attribute *attr, char *buf)
+{
+	struct hisi_pcie_pmu *pmu_dev;
+
+	pmu_dev = to_pcie_pmu(dev_get_drvdata(dev));
+
+	return sysfs_emit(buf, "%d\n", pmu_dev->on_cpu);
+}
+
+/* sysfs show routine: emit the PMU identifier in hexadecimal. */
+static ssize_t hisi_pcie_identifier_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	struct hisi_pcie_pmu *pmu_dev;
+
+	pmu_dev = to_pcie_pmu(dev_get_drvdata(dev));
+
+	return sysfs_emit(buf, "0x%x\n", pmu_dev->identifier);
+}
+
+/* sysfs show routine: emit the bus number taken from the minimum monitored bdf. */
+static ssize_t hisi_pcie_bus_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	struct hisi_pcie_pmu *pmu_dev;
+
+	pmu_dev = to_pcie_pmu(dev_get_drvdata(dev));
+
+	return sysfs_emit(buf, "0x%02x\n", PCI_BUS_NUM(pmu_dev->bdf_min));
+}
+
+/*
+ * Read the 32-bit register at @reg_off and split it: the low 16 bits are
+ * stored in @arg0 and the high 16 bits in @arg1.
+ */
+static void hisi_pcie_parse_reg_value(struct hisi_pcie_pmu *pcie_pmu,
+				      u32 reg_off, u16 *arg0, u16 *arg1)
+{
+	u32 raw;
+
+	raw = readl(pcie_pmu->base + reg_off);
+	*arg0 = raw & 0xffff;
+	*arg1 = raw >> 16;
+}
+
+/* Offset of the @idx-th per-counter copy of the register at @offset. */
+static u32 hisi_pcie_pmu_get_offset(u32 offset, u32 idx)
+{
+	u32 stride = HISI_PCIE_REG_STEP * idx;
+
+	return offset + stride;
+}
+
+/* 32-bit read of the per-counter register @reg_offset for counter @idx. */
+static u32 hisi_pcie_pmu_readl(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset,
+			       u32 idx)
+{
+	void __iomem *reg;
+
+	reg = pcie_pmu->base + hisi_pcie_pmu_get_offset(reg_offset, idx);
+
+	return readl(reg);
+}
+
+/* 32-bit write of @val to the per-counter register @reg_offset for counter @idx. */
+static void hisi_pcie_pmu_writel(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset,
+				 u32 idx, u32 val)
+{
+	void __iomem *reg;
+
+	reg = pcie_pmu->base + hisi_pcie_pmu_get_offset(reg_offset, idx);
+
+	writel(val, reg);
+}
+
+/* 64-bit read of the per-counter register @reg_offset for counter @idx. */
+static u64 hisi_pcie_pmu_readq(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset,
+			       u32 idx)
+{
+	void __iomem *reg;
+
+	reg = pcie_pmu->base + hisi_pcie_pmu_get_offset(reg_offset, idx);
+
+	return readq(reg);
+}
+
+/* 64-bit write of @val to the per-counter register @reg_offset for counter @idx. */
+static void hisi_pcie_pmu_writeq(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset,
+				 u32 idx, u64 val)
+{
+	void __iomem *reg;
+
+	reg = pcie_pmu->base + hisi_pcie_pmu_get_offset(reg_offset, idx);
+
+	writeq(val, reg);
+}
+
+/*
+ * Assemble the HISI_PCIE_EVENT_CTRL value for @event from its event,
+ * subevent, target (port or bdf), trigger and threshold attributes, and
+ * write it to the control register of the event's counter.
+ */
+static void hisi_pcie_pmu_config_filter(struct perf_event *event)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	u64 port = hisi_pcie_get_port(event);
+	u64 trig_len = hisi_pcie_get_trig_len(event);
+	u64 thr_len = hisi_pcie_get_thr_len(event);
+	u64 ctrl = HISI_PCIE_DEFAULT_SET;
+
+	/* Event and subevent selection. */
+	ctrl |= FIELD_PREP(HISI_PCIE_EVENT_M, hisi_pcie_get_event(event));
+	ctrl |= FIELD_PREP(HISI_PCIE_SUBEVENT_M, hisi_pcie_get_subevent(event));
+
+	/* Target: a non-zero port wins, otherwise fall back to the bdf. */
+	if (port) {
+		ctrl |= FIELD_PREP(HISI_PCIE_TARGET_M, port);
+	} else {
+		ctrl |= HISI_PCIE_TARGET_EN;
+		ctrl |= FIELD_PREP(HISI_PCIE_TARGET_M, hisi_pcie_get_bdf(event));
+	}
+
+	/* Trigger condition, only when a trigger length was requested. */
+	if (trig_len) {
+		ctrl |= HISI_PCIE_TRIG_EN;
+		ctrl |= FIELD_PREP(HISI_PCIE_TRIG_M, trig_len);
+		ctrl |= FIELD_PREP(HISI_PCIE_TRIG_MODE_M,
+				   hisi_pcie_get_trig_mode(event));
+	}
+
+	/* Threshold condition, only when a threshold length was requested. */
+	if (thr_len) {
+		ctrl |= HISI_PCIE_THR_EN;
+		ctrl |= FIELD_PREP(HISI_PCIE_THR_M, thr_len);
+		ctrl |= FIELD_PREP(HISI_PCIE_THR_MODE_M,
+				   hisi_pcie_get_thr_mode(event));
+	}
+
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, event->hw.idx,
+			     ctrl);
+}
+
+/* Restore the event's control register to HISI_PCIE_DEFAULT_SET. */
+static void hisi_pcie_pmu_clear_filter(struct perf_event *event)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	u64 ctrl = HISI_PCIE_DEFAULT_SET;
+
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, event->hw.idx,
+			     ctrl);
+}
+
+/* True when @rp_bdf lies within the PMU's inclusive [bdf_min, bdf_max] range. */
+static bool hisi_pcie_pmu_valid_port(struct hisi_pcie_pmu *pcie_pmu, u16 rp_bdf)
+{
+	if (rp_bdf < pcie_pmu->bdf_min)
+		return false;
+
+	return rp_bdf <= pcie_pmu->bdf_max;
+}
+
+/*
+ * Check that the device identified by @bdf (in the PMU's PCI domain) sits
+ * below a Root Port monitored by @pcie_pmu.
+ *
+ * Returns false if the device does not exist, has no Root Port, or its
+ * Root Port is outside the PMU's bdf range.
+ */
+static bool hisi_pcie_pmu_valid_requester_id(struct hisi_pcie_pmu *pcie_pmu,
+					    u32 bdf)
+{
+	struct pci_dev *root_port, *pdev;
+	bool valid = false;
+
+	/* On success this takes a reference on pdev that must be dropped. */
+	pdev = pci_get_domain_bus_and_slot(pci_domain_nr(pcie_pmu->pdev->bus),
+					   PCI_BUS_NUM(bdf),
+					   GET_PCI_DEVFN(bdf));
+	if (!pdev)
+		return false;
+
+	/*
+	 * Evaluate the Root Port while the reference is still held:
+	 * root_port may be pdev itself, and no separate reference is
+	 * taken on it here.
+	 */
+	root_port = pcie_find_root_port(pdev);
+	if (root_port)
+		valid = hisi_pcie_pmu_valid_port(pcie_pmu,
+						 pci_dev_id(root_port));
+
+	/* Drop the reference on every path, including the no-root-port one. */
+	pci_dev_put(pdev);
+
+	return valid;
+}
+
+/*
+ * Validate the user-supplied filter of @event: event and subevent indices,
+ * threshold and trigger lengths, and (when non-zero) the requester id.
+ */
+static bool hisi_pcie_pmu_valid_filter(struct perf_event *event,
+				       struct hisi_pcie_pmu *pcie_pmu)
+{
+	u32 ev = hisi_pcie_get_event(event);
+	u32 subev = hisi_pcie_get_subevent(event);
+	u32 bdf = hisi_pcie_get_bdf(event);
+
+	if (subev > HISI_PCIE_SUBEVENT_MAX || ev > HISI_PCIE_EVENT_MAX) {
+		pci_err(pcie_pmu->pdev,
+			"Max event index and max subevent index is: %d, %d.\n",
+			HISI_PCIE_EVENT_MAX, HISI_PCIE_SUBEVENT_MAX);
+		return false;
+	}
+
+	if (hisi_pcie_get_thr_len(event) > HISI_PCIE_THR_MAX_VAL ||
+	    hisi_pcie_get_trig_len(event) > HISI_PCIE_TRIG_MAX_VAL)
+		return false;
+
+	/* A zero requester id means no EP filter was requested. */
+	return !bdf || hisi_pcie_pmu_valid_requester_id(pcie_pmu, bdf);
+}
+
+/*
+ * Check that @event's group holds only software events and events of this
+ * PMU, and that the non-software events (including @event itself) do not
+ * exceed HISI_PCIE_MAX_COUNTERS.
+ */
+static bool hisi_pcie_pmu_validate_event_group(struct perf_event *event)
+{
+	struct perf_event *leader = event->group_leader;
+	struct perf_event *pos;
+	int nr_hw = 1;	/* @event itself */
+
+	if (!is_software_event(leader)) {
+		if (leader->pmu != event->pmu)
+			return false;
+
+		/* Count the leader only when it is not @event. */
+		nr_hw += (leader != event);
+	}
+
+	for_each_sibling_event(pos, leader) {
+		if (!is_software_event(pos)) {
+			if (pos->pmu != event->pmu)
+				return false;
+
+			nr_hw++;
+		}
+	}
+
+	return nr_hw <= HISI_PCIE_MAX_COUNTERS;
+}
+
+/*
+ * Validate a new perf event for this PMU and bind it to the PMU's CPU.
+ *
+ * Returns 0 on success, -ENOENT if the event belongs to another PMU type,
+ * -EOPNOTSUPP for sampling or per-task events, and -EINVAL for an invalid
+ * filter or an event group that cannot be scheduled.
+ */
+static int hisi_pcie_pmu_event_init(struct perf_event *event)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+
+	/*
+	 * Check the type before touching the event: an event that is not
+	 * ours must be handed back unmodified.
+	 */
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	/* Sampling is not supported. */
+	if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+		return -EOPNOTSUPP;
+
+	if (!hisi_pcie_pmu_valid_filter(event, pcie_pmu)) {
+		pci_err(pcie_pmu->pdev, "Invalid filter!\n");
+		return -EINVAL;
+	}
+
+	if (!hisi_pcie_pmu_validate_event_group(event))
+		return -EINVAL;
+
+	/* Only an accepted event gets bound to the CPU serving this PMU. */
+	event->cpu = pcie_pmu->on_cpu;
+
+	return 0;
+}
+
+/*
+ * The bandwidth, latency, bus utilization and buffer occupancy features are
+ * calculated from data in HISI_PCIE_CNT and extended data in HISI_PCIE_EXT_CNT.
+ * Other features are obtained only by HISI_PCIE_CNT.
+ * So data and data_ext are processed in this function to get a performance
+ * value such as bandwidth, latency, etc.
+ *
+ * @data is the HISI_PCIE_CNT delta and @data_ext the HISI_PCIE_EXT_CNT delta.
+ * Returns 0 whenever the divisor of the selected ratio would be zero, so a
+ * zero extended count or a zero HISI_PCIE_REG_FREQ value can never cause a
+ * division by zero.
+ */
+static u64 hisi_pcie_pmu_get_performance(struct perf_event *event, u64 data,
+					 u64 data_ext)
+{
+#define CONVERT_DW_TO_BYTE(x)	(sizeof(u32) * (x))
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	u64 us_per_cycle = readl(pcie_pmu->base + HISI_PCIE_REG_FREQ);
+	u32 idx = hisi_pcie_get_event(event);
+	u64 divisor;
+
+	if (!data_ext)
+		return 0;
+
+	/* Process data to set unit of bandwidth as "Byte/ms". */
+	if (is_bw_event(idx)) {
+		divisor = div64_u64(data_ext, 1000);
+		if (!divisor)
+			return 0;
+
+		return div64_u64(CONVERT_DW_TO_BYTE(data), divisor);
+	}
+
+	/*
+	 * Process data to set unit of latency as "us"; bus utilization
+	 * uses the same ratio.
+	 */
+	if (is_latency_event(idx) || is_bus_util_event(idx))
+		return div64_u64(data * us_per_cycle, data_ext);
+
+	if (is_buf_util_event(idx)) {
+		/* The product is zero if the frequency register reads 0. */
+		divisor = data_ext * us_per_cycle;
+		if (!divisor)
+			return 0;
+
+		return div64_u64(data, divisor);
+	}
+
+	return data;
+}
+
+/*
+ * Read the event's counter pair: HISI_PCIE_CNT into @cnt first, then
+ * HISI_PCIE_EXT_CNT into @cnt_ext.
+ */
+static void hisi_pcie_pmu_read_counter(struct perf_event *event, u64 *cnt,
+				       u64 *cnt_ext)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+
+	*cnt = hisi_pcie_pmu_readq(pcie_pmu, HISI_PCIE_CNT, event->hw.idx);
+	*cnt_ext = hisi_pcie_pmu_readq(pcie_pmu, HISI_PCIE_EXT_CNT,
+				       event->hw.idx);
+}
+
+/*
+ * Program the event's counter pair: @val into HISI_PCIE_CNT first, then
+ * @val_ext into HISI_PCIE_EXT_CNT.
+ */
+static void hisi_pcie_pmu_write_counter(struct perf_event *event, u64 val,
+					u64 val_ext)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_CNT, event->hw.idx, val);
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EXT_CNT, event->hw.idx,
+			     val_ext);
+}
+
+/*
+ * Return the index of the first counter with no event attached, or -EINVAL
+ * when all HISI_PCIE_MAX_COUNTERS slots are occupied.
+ */
+static int hisi_pcie_pmu_get_event_idx(struct hisi_pcie_pmu *pcie_pmu)
+{
+	int slot = 0;
+
+	while (slot < HISI_PCIE_MAX_COUNTERS) {
+		if (pcie_pmu->hw_events[slot] == NULL)
+			return slot;
+		slot++;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Fold the counter movement since the last update into event->count.
+ * The previous HISI_PCIE_CNT value lives in hw.prev_count and the previous
+ * HISI_PCIE_EXT_CNT value in hw.extra_reg.config.
+ */
+static void hisi_pcie_pmu_event_update(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *ext = &hwc->extra_reg;
+	u64 cnt, cnt_ext, last, last_ext, delta;
+
+	/* Retry until prev_count is swapped without a concurrent change. */
+	do {
+		last = local64_read(&hwc->prev_count);
+		last_ext = ext->config;
+		hisi_pcie_pmu_read_counter(event, &cnt, &cnt_ext);
+	} while (local64_cmpxchg(&hwc->prev_count, last, cnt) != last);
+
+	ext->config = cnt_ext;
+
+	delta = hisi_pcie_pmu_get_performance(event, cnt - last,
+					      cnt_ext - last_ext);
+	local64_add(delta, &event->count);
+}
+
+/* pmu::read callback: bring event->count up to date with the hardware. */
+static void hisi_pcie_pmu_read(struct perf_event *event)
+{
+	hisi_pcie_pmu_event_update(event);
+}
+
+/*
+ * Re-arm the counters of @event: the main counter is preset to the value
+ * with only bit (HISI_PCIE_COUNTER_BITS - 1) set and the extended counter
+ * to zero. The software snapshots are set to the same values.
+ */
+static void hisi_pcie_pmu_set_period(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u64 start = BIT_ULL(HISI_PCIE_COUNTER_BITS - 1);
+
+	local64_set(&hwc->prev_count, start);
+	hwc->extra_reg.config = 0;
+	hisi_pcie_pmu_write_counter(event, start, 0);
+}
+
+/*
+ * Set HISI_PCIE_EVENT_EN in the event control register of the slot used by
+ * @hwc, leaving the other control bits untouched.
+ */
+static void hisi_pcie_pmu_enable_counter(struct hisi_pcie_pmu *pcie_pmu,
+					 struct hw_perf_event *hwc)
+{
+	u32 slot = hwc->idx;
+	u64 ctrl = hisi_pcie_pmu_readq(pcie_pmu, HISI_PCIE_EVENT_CTRL, slot);
+
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, slot,
+			     ctrl | HISI_PCIE_EVENT_EN);
+}
+
+/*
+ * Clear HISI_PCIE_EVENT_EN in the event control register of the slot used
+ * by @hwc, leaving the other control bits untouched.
+ */
+static void hisi_pcie_pmu_disable_counter(struct hisi_pcie_pmu *pcie_pmu,
+					  struct hw_perf_event *hwc)
+{
+	u32 slot = hwc->idx;
+	u64 ctrl = hisi_pcie_pmu_readq(pcie_pmu, HISI_PCIE_EVENT_CTRL, slot);
+
+	ctrl &= ~HISI_PCIE_EVENT_EN;
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, slot, ctrl);
+}
+
+/* Write 0 to the interrupt mask register of the slot used by @hwc. */
+static void hisi_pcie_pmu_enable_int(struct hisi_pcie_pmu *pcie_pmu,
+				     struct hw_perf_event *hwc)
+{
+	const u32 slot = hwc->idx;
+
+	hisi_pcie_pmu_writel(pcie_pmu, HISI_PCIE_INT_MASK, slot, 0);
+}
+
+/* Write 1 to the interrupt mask register of the slot used by @hwc. */
+static void hisi_pcie_pmu_disable_int(struct hisi_pcie_pmu *pcie_pmu,
+				      struct hw_perf_event *hwc)
+{
+	const u32 slot = hwc->idx;
+
+	hisi_pcie_pmu_writel(pcie_pmu, HISI_PCIE_INT_MASK, slot, 1);
+}
+
+/*
+ * Reset the slot used by @hwc: write HISI_PCIE_RESET_CNT to its event
+ * control register, then restore the register to HISI_PCIE_DEFAULT_SET.
+ */
+static void hisi_pcie_pmu_reset_counter(struct hisi_pcie_pmu *pcie_pmu,
+					struct hw_perf_event *hwc)
+{
+	const u32 slot = hwc->idx;
+
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, slot,
+			     HISI_PCIE_RESET_CNT);
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, slot,
+			     HISI_PCIE_DEFAULT_SET);
+}
+
+/*
+ * pmu::start callback.
+ *
+ * The event must currently be stopped; otherwise a one-time warning is
+ * printed and nothing is done. On the normal path the filter is programmed,
+ * the counter and its interrupt are enabled and the counters are re-armed.
+ * With PERF_EF_RELOAD the saved snapshots are written to the hardware
+ * counters once more.
+ */
+static void hisi_pcie_pmu_start(struct perf_event *event, int flags)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+		return;
+
+	WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+	hwc->state = 0;
+
+	hisi_pcie_pmu_config_filter(event);
+	hisi_pcie_pmu_enable_counter(pcie_pmu, hwc);
+	hisi_pcie_pmu_enable_int(pcie_pmu, hwc);
+	hisi_pcie_pmu_set_period(event);
+
+	if (flags & PERF_EF_RELOAD) {
+		u64 saved = local64_read(&hwc->prev_count);
+		u64 saved_ext = hwc->extra_reg.config;
+
+		hisi_pcie_pmu_write_counter(event, saved, saved_ext);
+	}
+
+	perf_event_update_userpage(event);
+}
+
+/*
+ * pmu::stop callback.
+ *
+ * Takes a final reading of the counters, then masks the interrupt, disables
+ * the counter and clears the filter. The event ends up marked both
+ * PERF_HES_STOPPED and PERF_HES_UPTODATE; a one-time warning is printed if
+ * it was already stopped.
+ */
+static void hisi_pcie_pmu_stop(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+
+	hisi_pcie_pmu_event_update(event);
+	hisi_pcie_pmu_disable_int(pcie_pmu, hwc);
+	hisi_pcie_pmu_disable_counter(pcie_pmu, hwc);
+	hisi_pcie_pmu_clear_filter(event);
+
+	WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+	hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+}
+
+/*
+ * pmu::add callback: claim a free counter slot for @event, reset that
+ * counter and, when PERF_EF_START is set in @flags, start counting.
+ *
+ * Returns 0 on success or the negative error code from
+ * hisi_pcie_pmu_get_event_idx() when no slot is free.
+ */
+static int hisi_pcie_pmu_add(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	int slot;
+
+	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
+	slot = hisi_pcie_pmu_get_event_idx(pcie_pmu);
+	if (slot < 0)
+		return slot;
+
+	pcie_pmu->hw_events[slot] = event;
+	hwc->idx = slot;
+
+	/* Reset Counter to avoid interference caused by previous statistic. */
+	hisi_pcie_pmu_reset_counter(pcie_pmu, hwc);
+
+	if (!(flags & PERF_EF_START))
+		return 0;
+
+	hisi_pcie_pmu_start(event, PERF_EF_RELOAD);
+
+	return 0;
+}
+
+/*
+ * pmu::del callback: stop @event, give its counter slot back and refresh
+ * the user-visible page.
+ */
+static void hisi_pcie_pmu_del(struct perf_event *event, int flags)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+
+	hisi_pcie_pmu_stop(event, PERF_EF_UPDATE);
+
+	/* Mark the slot as free again. */
+	pcie_pmu->hw_events[event->hw.idx] = NULL;
+
+	perf_event_update_userpage(event);
+}
+
+/*
+ * pmu::pmu_enable callback: write HISI_PCIE_GLOBAL_EN to the global control
+ * register, but only if at least one counter slot has an event attached.
+ */
+static void hisi_pcie_pmu_enable(struct pmu *pmu)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(pmu);
+	int slot;
+
+	for (slot = 0; slot < HISI_PCIE_MAX_COUNTERS; slot++) {
+		if (!pcie_pmu->hw_events[slot])
+			continue;
+
+		writel(HISI_PCIE_GLOBAL_EN,
+		       pcie_pmu->base + HISI_PCIE_GLOBAL_CTRL);
+		return;
+	}
+}
+
+/*
+ * pmu::pmu_disable callback: write HISI_PCIE_GLOBAL_NONE to the global
+ * control register.
+ */
+static void hisi_pcie_pmu_disable(struct pmu *pmu)
+{
+	void __iomem *global_ctrl;
+
+	global_ctrl = to_pcie_pmu(pmu)->base + HISI_PCIE_GLOBAL_CTRL;
+	writel(HISI_PCIE_GLOBAL_NONE, global_ctrl);
+}
+
+/*
+ * Interrupt handler. @data is the struct hisi_pcie_pmu passed to
+ * request_irq().
+ *
+ * Every counter slot with a non-zero interrupt status gets its status
+ * cleared; if an event is attached to the slot, the event count is updated
+ * and the counter re-armed.
+ *
+ * Returns IRQ_HANDLED if at least one attached event was serviced,
+ * IRQ_NONE otherwise.
+ */
+static irqreturn_t hisi_pcie_pmu_irq(int irq, void *data)
+{
+	struct hisi_pcie_pmu *pcie_pmu = data;
+	irqreturn_t handled = IRQ_NONE;
+	struct perf_event *event;
+	int slot;
+
+	for (slot = 0; slot < HISI_PCIE_MAX_COUNTERS; slot++) {
+		if (!hisi_pcie_pmu_readl(pcie_pmu, HISI_PCIE_INT_STAT, slot))
+			continue;
+
+		/* Clear status of interrupt. */
+		hisi_pcie_pmu_writel(pcie_pmu, HISI_PCIE_INT_STAT, slot, 1);
+
+		event = pcie_pmu->hw_events[slot];
+		if (event) {
+			hisi_pcie_pmu_event_update(event);
+			hisi_pcie_pmu_set_period(event);
+			handled = IRQ_HANDLED;
+		}
+	}
+
+	return handled;
+}
+
+/*
+ * Allocate exactly one MSI vector for @pdev and install
+ * hisi_pcie_pmu_irq() on it with @pcie_pmu as handler data. On success the
+ * Linux IRQ number is stored in pcie_pmu->irq.
+ *
+ * Returns 0 on success or a negative error code; on failure no vector is
+ * left allocated.
+ */
+static int hisi_pcie_pmu_irq_register(struct pci_dev *pdev,
+				      struct hisi_pcie_pmu *pcie_pmu)
+{
+	int vec, rc;
+
+	rc = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI);
+	if (rc < 0) {
+		pci_err(pdev, "Failed to enable MSI vectors, ret = %d!\n", rc);
+		return rc;
+	}
+
+	vec = pci_irq_vector(pdev, 0);
+	rc = request_irq(vec, hisi_pcie_pmu_irq,
+			 IRQF_NOBALANCING | IRQF_NO_THREAD, "hisi_pcie_pmu",
+			 pcie_pmu);
+	if (!rc) {
+		pcie_pmu->irq = vec;
+		return 0;
+	}
+
+	pci_err(pdev, "Failed to register irq, ret = %d!\n", rc);
+	pci_free_irq_vectors(pdev);
+
+	return rc;
+}
+
+/*
+ * Undo hisi_pcie_pmu_irq_register(): release the handler first, then free
+ * the MSI vector of @pdev.
+ */
+static void hisi_pcie_pmu_irq_unregister(struct pci_dev *pdev,
+					 struct hisi_pcie_pmu *pcie_pmu)
+{
+	free_irq(pcie_pmu->irq, pcie_pmu);
+	pci_free_irq_vectors(pdev);
+}
+
+/*
+ * CPU hotplug startup callback: if the PMU has no owning CPU yet
+ * (on_cpu == -1), make @cpu the owner and route the PMU interrupt to it.
+ *
+ * Always returns 0.
+ */
+static int hisi_pcie_pmu_online_cpu(unsigned int cpu, struct hlist_node *node)
+{
+	struct hisi_pcie_pmu *pmu_priv;
+
+	pmu_priv = hlist_entry_safe(node, struct hisi_pcie_pmu, node);
+
+	/* The PMU already has an owner: leave it where it is. */
+	if (pmu_priv->on_cpu != -1)
+		return 0;
+
+	pmu_priv->on_cpu = cpu;
+	WARN_ON(irq_set_affinity(pmu_priv->irq, cpumask_of(cpu)));
+
+	return 0;
+}
+
+/*
+ * CPU hotplug teardown callback: if @cpu currently owns this PMU, hand the
+ * perf context and the PMU interrupt over to another online CPU.
+ *
+ * If no other CPU is available the PMU is left without an owner
+ * (on_cpu == -1) so that hisi_pcie_pmu_online_cpu() can claim it later.
+ *
+ * Always returns 0.
+ */
+static int hisi_pcie_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
+{
+	struct hisi_pcie_pmu *pcie_pmu = hlist_entry_safe(node,
+					 struct hisi_pcie_pmu, node);
+	unsigned int target;
+
+	/* Nothing to do if this CPU doesn't own the PMU */
+	if (pcie_pmu->on_cpu != cpu)
+		return 0;
+
+	/* Release ownership before looking for a replacement. */
+	pcie_pmu->on_cpu = -1;
+
+	/*
+	 * Choose a new CPU from the online CPUs, excluding the outgoing one:
+	 * @cpu may still be set in cpu_online_mask while this callback runs.
+	 */
+	target = cpumask_any_but(cpu_online_mask, cpu);
+	if (target >= nr_cpu_ids) {
+		pci_err(pcie_pmu->pdev, "There is no cpu to set!\n");
+		return 0;
+	}
+
+	perf_pmu_migrate_context(&pcie_pmu->pmu, cpu, target);
+	/* Use this CPU for event counting */
+	pcie_pmu->on_cpu = target;
+	WARN_ON(irq_set_affinity(pcie_pmu->irq, cpumask_of(target)));
+
+	return 0;
+}
+
+/*
+ * Events with the "dl" suffix in their names count performance in DL layer,
+ * otherwise, events count performance in TL layer.
+ *
+ * NOTE(review): the second macro argument is presumably the perf config
+ * value; per the format attributes, bits 0-7 would be the event code and
+ * bits 8-15 the subevent code. Confirm against HISI_PCIE_PMU_EVENT_ATTR.
+ */
+static struct attribute *hisi_pcie_pmu_events_attr[] = {
+	HISI_PCIE_PMU_EVENT_ATTR(bw_rx_mwr, 0x0104),
+	HISI_PCIE_PMU_EVENT_ATTR(bw_rx_mrd, 0x1005),
+	HISI_PCIE_PMU_EVENT_ATTR(bw_tx_mwr, 0x0105),
+	HISI_PCIE_PMU_EVENT_ATTR(bw_tx_mrd, 0x2004),
+	HISI_PCIE_PMU_EVENT_ATTR(lat_rx_mwr, 0x0010),
+	HISI_PCIE_PMU_EVENT_ATTR(lat_rx_mrd, 0x0210),
+	HISI_PCIE_PMU_EVENT_ATTR(lat_tx_mrd, 0x0011),
+	HISI_PCIE_PMU_EVENT_ATTR(lat_tx_cfg, 0x0111),
+	HISI_PCIE_PMU_EVENT_ATTR(bw_rx_dl, 0x0184),
+	HISI_PCIE_PMU_EVENT_ATTR(bw_tx_dl, 0x0384),
+	NULL
+};
+
+/* sysfs group "events": one attribute per named event listed above. */
+static struct attribute_group hisi_pcie_pmu_events_group = {
+	.name = "events",
+	.attrs = hisi_pcie_pmu_events_attr,
+};
+
+/*
+ * Bit layout of the user-supplied perf attributes:
+ *   config  - event and subevent selection
+ *   config1 - threshold and trigger length/mode
+ *   config2 - target Root Port mask and requester BDF
+ */
+static struct attribute *hisi_pcie_pmu_format_attr[] = {
+	HISI_PCIE_PMU_FORMAT_ATTR(event, "config:0-7"),
+	HISI_PCIE_PMU_FORMAT_ATTR(subevent, "config:8-15"),
+	HISI_PCIE_PMU_FORMAT_ATTR(thr_len, "config1:0-3"),
+	HISI_PCIE_PMU_FORMAT_ATTR(thr_mode, "config1:4"),
+	HISI_PCIE_PMU_FORMAT_ATTR(trig_len, "config1:5-8"),
+	HISI_PCIE_PMU_FORMAT_ATTR(trig_mode, "config1:9"),
+	HISI_PCIE_PMU_FORMAT_ATTR(port, "config2:0-15"),
+	HISI_PCIE_PMU_FORMAT_ATTR(bdf, "config2:16-31"),
+	NULL
+};
+
+/* sysfs group "format": exposes the bit layout described above. */
+static struct attribute_group hisi_pcie_pmu_format_group = {
+	.name = "format",
+	.attrs = hisi_pcie_pmu_format_attr,
+};
+
+/* Read-only "bus" attribute, rendered by hisi_pcie_bus_show(). */
+static struct device_attribute hisi_pcie_pmu_bus_attr =
+	__ATTR(bus, 0444, hisi_pcie_bus_show, NULL);
+
+static struct attribute *hisi_pcie_pmu_bus_attrs[] = {
+	&hisi_pcie_pmu_bus_attr.attr,
+	NULL
+};
+
+/* Unnamed group: its attribute sits directly in the PMU device directory. */
+static struct attribute_group hisi_pcie_pmu_bus_attr_group = {
+	.attrs = hisi_pcie_pmu_bus_attrs,
+};
+
+/* Read-only "cpumask" attribute, rendered by hisi_pcie_cpumask_show(). */
+static struct device_attribute hisi_pcie_pmu_cpumask_attr =
+	__ATTR(cpumask, 0444, hisi_pcie_cpumask_show, NULL);
+
+static struct attribute *hisi_pcie_pmu_cpumask_attrs[] = {
+	&hisi_pcie_pmu_cpumask_attr.attr,
+	NULL
+};
+
+/* Unnamed group: its attribute sits directly in the PMU device directory. */
+static struct attribute_group hisi_pcie_pmu_cpumask_attr_group = {
+	.attrs = hisi_pcie_pmu_cpumask_attrs,
+};
+
+/*
+ * Read-only "identifier" attribute, rendered by
+ * hisi_pcie_identifier_show().
+ */
+static struct device_attribute hisi_pcie_pmu_identifier_attr =
+	__ATTR(identifier, 0444, hisi_pcie_identifier_show, NULL);
+
+static struct attribute *hisi_pcie_pmu_identifier_attrs[] = {
+	&hisi_pcie_pmu_identifier_attr.attr,
+	NULL
+};
+
+/* Unnamed group: its attribute sits directly in the PMU device directory. */
+static struct attribute_group hisi_pcie_pmu_identifier_attr_group = {
+	.attrs = hisi_pcie_pmu_identifier_attrs,
+};
+
+/*
+ * NULL-terminated list of all sysfs groups of the PMU; installed as
+ * pmu::attr_groups in hisi_pcie_alloc_pmu().
+ */
+static const struct attribute_group *hisi_pcie_pmu_attr_groups[] = {
+	&hisi_pcie_pmu_events_group,
+	&hisi_pcie_pmu_format_group,
+	&hisi_pcie_pmu_bus_attr_group,
+	&hisi_pcie_pmu_cpumask_attr_group,
+	&hisi_pcie_pmu_identifier_attr_group,
+	NULL
+};
+
+/*
+ * Fill in the software state of @pcie_pmu: the BDF range and the PMU name
+ * are derived from hardware registers, the owning CPU starts out unset (-1)
+ * and the struct pmu callbacks are wired up. pcie_pmu->base must already be
+ * mapped.
+ *
+ * Returns 0 on success, -ENOMEM if the name cannot be allocated.
+ */
+static int hisi_pcie_alloc_pmu(struct pci_dev *pdev,
+			       struct hisi_pcie_pmu *pcie_pmu)
+{
+	u16 dev_id, sicl;
+	char *pmu_name;
+
+	hisi_pcie_parse_reg_value(pcie_pmu, HISI_PCIE_REG_BDF,
+				  &pcie_pmu->bdf_min, &pcie_pmu->bdf_max);
+	hisi_pcie_parse_reg_value(pcie_pmu, HISI_PCIE_REG_INFO, &dev_id,
+				  &sicl);
+
+	/* The name is device-managed, so no explicit free is needed. */
+	pmu_name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_pcie%u_%u",
+				  sicl, dev_id);
+	if (!pmu_name)
+		return -ENOMEM;
+
+	pcie_pmu->pmu = (struct pmu) {
+		.name		= pmu_name,
+		.module		= THIS_MODULE,
+		.event_init	= hisi_pcie_pmu_event_init,
+		.pmu_enable	= hisi_pcie_pmu_enable,
+		.pmu_disable	= hisi_pcie_pmu_disable,
+		.add		= hisi_pcie_pmu_add,
+		.del		= hisi_pcie_pmu_del,
+		.start		= hisi_pcie_pmu_start,
+		.stop		= hisi_pcie_pmu_stop,
+		.read		= hisi_pcie_pmu_read,
+		.task_ctx_nr	= perf_invalid_context,
+		.attr_groups	= hisi_pcie_pmu_attr_groups,
+		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
+	};
+	pcie_pmu->identifier = readl(pcie_pmu->base + HISI_PCIE_REG_VERSION);
+	pcie_pmu->on_cpu = -1;
+	pcie_pmu->pdev = pdev;
+
+	return 0;
+}
+
+/*
+ * Bring up the PMU behind @pdev: map BAR 2, fill in @pcie_pmu, install the
+ * interrupt handler, add the CPU hotplug instance and finally register the
+ * PMU with perf.
+ *
+ * Returns 0 on success or a negative error code. On failure every step
+ * already taken is undone, including the BAR mapping.
+ */
+static int hisi_pcie_init_pmu(struct pci_dev *pdev,
+			      struct hisi_pcie_pmu *pcie_pmu)
+{
+	int ret;
+
+	pcie_pmu->base = pci_ioremap_bar(pdev, 2);
+	if (!pcie_pmu->base) {
+		pci_err(pdev, "Ioremap failed for pcie_pmu resource.\n");
+		return -ENOMEM;
+	}
+
+	/* The BAR is mapped from here on: every failure must unmap it. */
+	ret = hisi_pcie_alloc_pmu(pdev, pcie_pmu);
+	if (ret)
+		goto err_iounmap;
+
+	ret = hisi_pcie_pmu_irq_register(pdev, pcie_pmu);
+	if (ret)
+		goto err_iounmap;
+
+	ret = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE,
+				       &pcie_pmu->node);
+	if (ret) {
+		pci_err(pdev, "Failed to register hotplug, ret = %d.\n", ret);
+		goto err_irq_unregister;
+	}
+
+	ret = perf_pmu_register(&pcie_pmu->pmu, pcie_pmu->pmu.name, -1);
+	if (ret) {
+		pci_err(pdev, "Failed to register PCIe PMU, ret = %d.\n", ret);
+		goto err_hotplug_unregister;
+	}
+
+	return ret;
+
+err_hotplug_unregister:
+	cpuhp_state_remove_instance_nocalls(
+		CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, &pcie_pmu->node);
+
+err_irq_unregister:
+	hisi_pcie_pmu_irq_unregister(pdev, pcie_pmu);
+
+err_iounmap:
+	iounmap(pcie_pmu->base);
+
+	return ret;
+}
+
+/*
+ * Tear down everything hisi_pcie_init_pmu() set up, in reverse order. The
+ * PMU is looked up through the driver data of @pdev.
+ */
+static void hisi_pcie_uninit_pmu(struct pci_dev *pdev)
+{
+	struct hisi_pcie_pmu *pcie_pmu = pci_get_drvdata(pdev);
+
+	perf_pmu_unregister(&pcie_pmu->pmu);
+	cpuhp_state_remove_instance_nocalls(
+		CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, &pcie_pmu->node);
+	hisi_pcie_pmu_irq_unregister(pdev, pcie_pmu);
+	iounmap(pcie_pmu->base);
+}
+
+/*
+ * Basic PCI bring-up of @pdev: enable the device, claim its memory regions
+ * and turn on bus mastering.
+ *
+ * Returns 0 on success or a negative error code; on failure the device is
+ * left disabled.
+ */
+static int hisi_pcie_init_dev(struct pci_dev *pdev)
+{
+	int rc;
+
+	rc = pci_enable_device(pdev);
+	if (rc) {
+		pci_err(pdev, "Failed to enable pci device, ret = %d.\n", rc);
+		return rc;
+	}
+
+	rc = pci_request_mem_regions(pdev, "hisi_pcie_pmu");
+	if (rc < 0)
+		goto err_disable;
+
+	pci_set_master(pdev);
+
+	return 0;
+
+err_disable:
+	pci_err(pdev, "Failed to request pci mem regions, ret = %d.\n",
+		rc);
+	pci_disable_device(pdev);
+
+	return rc;
+}
+
+/* Undo hisi_pcie_init_dev() in reverse order. */
+static void hisi_pcie_uninit_dev(struct pci_dev *pdev)
+{
+	pci_clear_master(pdev);
+	pci_release_mem_regions(pdev);
+	pci_disable_device(pdev);
+}
+
+/*
+ * PCI probe callback: allocate the device-managed PMU structure, bring up
+ * the PCI device and the PMU, and on success publish the PMU as driver data
+ * of @pdev.
+ *
+ * Returns 0 on success or a negative error code. On failure the PCI device
+ * is torn down again and no driver data is set.
+ */
+static int hisi_pcie_pmu_probe(struct pci_dev *pdev,
+			       const struct pci_device_id *id)
+{
+	struct hisi_pcie_pmu *pcie_pmu;
+	int ret;
+
+	pcie_pmu = devm_kzalloc(&pdev->dev, sizeof(*pcie_pmu), GFP_KERNEL);
+	if (!pcie_pmu)
+		return -ENOMEM;
+
+	ret = hisi_pcie_init_dev(pdev);
+	if (ret)
+		return ret;
+
+	ret = hisi_pcie_init_pmu(pdev, pcie_pmu);
+	if (ret) {
+		hisi_pcie_uninit_dev(pdev);
+		return ret;
+	}
+
+	/* Only publish a PMU that is fully initialised. */
+	pci_set_drvdata(pdev, pcie_pmu);
+
+	return 0;
+}
+
+/*
+ * PCI remove callback: tear down the PMU first (it still needs the driver
+ * data and the mapped BAR), then the PCI device, then drop the driver data.
+ */
+static void hisi_pcie_pmu_remove(struct pci_dev *pdev)
+{
+	hisi_pcie_uninit_pmu(pdev);
+	hisi_pcie_uninit_dev(pdev);
+	pci_set_drvdata(pdev, NULL);
+}
+
+/* Devices handled by this driver: Huawei vendor ID, device ID 0xa12d. */
+static const struct pci_device_id hisi_pcie_pmu_ids[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, 0xa12d) },
+	{ 0, }
+};
+MODULE_DEVICE_TABLE(pci, hisi_pcie_pmu_ids);
+
+/* PCI driver glue: binds the ID table to the probe/remove callbacks. */
+static struct pci_driver hisi_pcie_pmu_driver = {
+	.name = "hisi_pcie_pmu",
+	.id_table = hisi_pcie_pmu_ids,
+	.probe = hisi_pcie_pmu_probe,
+	.remove = hisi_pcie_pmu_remove,
+};
+
+/*
+ * Module entry point: set up the multi-instance CPU hotplug state first,
+ * since probe adds instances to it, then register the PCI driver. The
+ * hotplug state is removed again if driver registration fails.
+ */
+static int __init hisi_pcie_module_init(void)
+{
+	int rc;
+
+	rc = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE,
+				     "AP_PERF_ARM_HISI_PCIE_PMU_ONLINE",
+				     hisi_pcie_pmu_online_cpu,
+				     hisi_pcie_pmu_offline_cpu);
+	if (rc) {
+		pr_err("Failed to setup PCIE PMU hotplug, ret = %d.\n", rc);
+		return rc;
+	}
+
+	rc = pci_register_driver(&hisi_pcie_pmu_driver);
+	if (!rc)
+		return 0;
+
+	cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE);
+
+	return rc;
+}
+module_init(hisi_pcie_module_init);
+
+/*
+ * Module exit: unregister the PCI driver before removing the hotplug state,
+ * the reverse of the order used in hisi_pcie_module_init().
+ */
+static void __exit hisi_pcie_module_exit(void)
+{
+	pci_unregister_driver(&hisi_pcie_pmu_driver);
+	cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE);
+}
+module_exit(hisi_pcie_module_exit);
+
+MODULE_DESCRIPTION("HiSilicon PCIe PMU driver");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Qi Liu <liuqi115@huawei.com>");
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 4a62b39..a9776a2 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -180,6 +180,7 @@  enum cpuhp_state {
 	CPUHP_AP_PERF_ARM_HISI_L3_ONLINE,
 	CPUHP_AP_PERF_ARM_HISI_PA_ONLINE,
 	CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE,
+	CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE,
 	CPUHP_AP_PERF_ARM_L2X0_ONLINE,
 	CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE,
 	CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE,