diff mbox series

[v9,3/4] drivers/perf: add DesignWare PCIe PMU driver

Message ID 20231020134230.53342-4-xueshuai@linux.alibaba.com (mailing list archive)
State New, archived
Headers show
Series drivers/perf: add Synopsys DesignWare PCIe PMU driver support | expand

Commit Message

Shuai Xue Oct. 20, 2023, 1:42 p.m. UTC
This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
Core controller IP which provides statistics feature. The PMU is a PCIe
configuration space register block provided by each PCIe Root Port in a
Vendor-Specific Extended Capability named RAS D.E.S (Debug, Error
injection, and Statistics).

To facilitate collection of statistics the controller provides the
following two features for each Root Port:

- one 64-bit counter for Time Based Analysis (RX/TX data throughput and
  time spent in each low-power LTSSM state) and
- one 32-bit counter for Event Counting (error and non-error events for
  a specified lane)

Note: There is no interrupt for counter overflow.

This driver adds PMU devices for each PCIe Root Port. And the PMU device is
named based the BDF of Root Port. For example,

    30:03.0 PCI bridge: Device 1ded:8000 (rev 01)

the PMU device name for this Root Port is dwc_rootport_3018.

Example usage of counting PCIe RX TLP data payload (Units of bytes)::

    $# perf stat -a -e dwc_rootport_3018/Rx_PCIe_TLP_Data_Payload/

average RX bandwidth can be calculated like this:

    PCIe TX Bandwidth = Rx_PCIe_TLP_Data_Payload / Measure_Time_Window

Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
---
 drivers/perf/Kconfig        |   7 +
 drivers/perf/Makefile       |   1 +
 drivers/perf/dwc_pcie_pmu.c | 770 ++++++++++++++++++++++++++++++++++++
 3 files changed, 778 insertions(+)
 create mode 100644 drivers/perf/dwc_pcie_pmu.c

Comments

Jonathan Cameron Oct. 20, 2023, 4:49 p.m. UTC | #1
On Fri, 20 Oct 2023 21:42:29 +0800
Shuai Xue <xueshuai@linux.alibaba.com> wrote:

> This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
> for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
> Core controller IP which provides statistics feature. The PMU is a PCIe
> configuration space register block provided by each PCIe Root Port in a
> Vendor-Specific Extended Capability named RAS D.E.S (Debug, Error
> injection, and Statistics).
> 
> To facilitate collection of statistics the controller provides the
> following two features for each Root Port:
> 
> - one 64-bit counter for Time Based Analysis (RX/TX data throughput and
>   time spent in each low-power LTSSM state) and
> - one 32-bit counter for Event Counting (error and non-error events for
>   a specified lane)
> 
> Note: There is no interrupt for counter overflow.
> 
> This driver adds PMU devices for each PCIe Root Port. And the PMU device is
> named based the BDF of Root Port. For example,
> 
>     30:03.0 PCI bridge: Device 1ded:8000 (rev 01)
> 
> the PMU device name for this Root Port is dwc_rootport_3018.
> 
> Example usage of counting PCIe RX TLP data payload (Units of bytes)::
> 
>     $# perf stat -a -e dwc_rootport_3018/Rx_PCIe_TLP_Data_Payload/
> 
> average RX bandwidth can be calculated like this:
> 
>     PCIe TX Bandwidth = Rx_PCIe_TLP_Data_Payload / Measure_Time_Window
> 
> Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
LGTM other than some really trivial stuff inline if you are doing a v10

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>


> +static int dwc_pcie_pmu_probe(struct platform_device *plat_dev)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct dwc_pcie_pmu *pcie_pmu;
> +	bool notify = false;
> +	char *name;
> +	u32 bdf;
> +	int ret;
> +
> +	/* Match the rootport with VSEC_RAS_DES_ID, and register a PMU for it */
> +	for_each_pci_dev(pdev) {
> +		u16 vsec;
> +		u32 val;
> +
> +		if (!(pci_is_pcie(pdev) &&
> +		      pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT))
> +			continue;
> +
> +		vsec = pci_find_vsec_capability(pdev, PCI_VENDOR_ID_ALIBABA,
> +						DWC_PCIE_VSEC_RAS_DES_ID);
> +		if (!vsec)
> +			continue;
> +
> +		pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &val);
> +		if (PCI_VNDR_HEADER_REV(val) != 0x04)
> +			continue;
> +		pci_dbg(pdev,
> +			"Detected PCIe Vendor-Specific Extended Capability RAS DES\n");
> +
> +		bdf = PCI_DEVID(pdev->bus->number, pdev->devfn);
> +		name = devm_kasprintf(&plat_dev->dev, GFP_KERNEL, "dwc_rootport_%x",
> +				      bdf);
> +		if (!name) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		/* All checks passed, go go go */
> +		pcie_pmu = devm_kzalloc(&plat_dev->dev, sizeof(*pcie_pmu), GFP_KERNEL);
> +		if (!pcie_pmu) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		pcie_pmu->pdev = pdev;
> +		pcie_pmu->ras_des_offset = vsec;
> +		pcie_pmu->nr_lanes = pcie_get_width_cap(pdev);
> +		pcie_pmu->on_cpu = -1;
> +		pcie_pmu->pmu = (struct pmu){
> +			.module		= THIS_MODULE,
> +			.attr_groups	= dwc_pcie_attr_groups,
> +			.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
> +			.task_ctx_nr	= perf_invalid_context,
> +			.event_init	= dwc_pcie_pmu_event_init,
> +			.add		= dwc_pcie_pmu_event_add,
> +			.del		= dwc_pcie_pmu_event_del,
> +			.start		= dwc_pcie_pmu_event_start,
> +			.stop		= dwc_pcie_pmu_event_stop,
> +			.read		= dwc_pcie_pmu_event_update,
> +		};
> +
> +		/* Add this instance to the list used by the offline callback */
> +		ret = cpuhp_state_add_instance(dwc_pcie_pmu_hp_state,
> +					       &pcie_pmu->cpuhp_node);
> +		if (ret) {
> +			pci_err(pdev,
> +				"Error %d registering hotplug @%x\n", ret, bdf);
> +			goto out;
> +		}
> +
> +		/* Unwind when platform driver removes */
> +		ret = devm_add_action_or_reset(
> +			&plat_dev->dev, dwc_pcie_pmu_remove_cpuhp_instance,
> +			&pcie_pmu->cpuhp_node);
> +		if (ret)
> +			goto out;
> +
> +		ret = perf_pmu_register(&pcie_pmu->pmu, name, -1);
> +		if (ret) {
> +			pci_err(pdev,
> +				"Error %d registering PMU @%x\n", ret, bdf);
> +			goto out;
> +		}
> +
> +		/* Cache PMU to handle pci device hotplug */
> +		list_add(&pcie_pmu->pmu_node, &dwc_pcie_pmu_head);
> +		pcie_pmu->registered = true;
> +		notify = true;
> +
> +		ret = devm_add_action_or_reset(
> +			&plat_dev->dev, dwc_pcie_pmu_unregister_pmu, pcie_pmu);

line wrapping is a bit ugly - I would move the &plat_dev->dev to previous line.
and I think you can get away with aligning the rest just after the (


> +		if (ret)
> +			goto out;
> +	}
> +
> +	if (notify && !bus_register_notifier(&pci_bus_type, &dwc_pcie_pmu_nb))
> +		return devm_add_action_or_reset(
> +			&plat_dev->dev, dwc_pcie_pmu_unregister_nb, NULL);
> +
> +	return 0;
> +
> +out:
> +	pci_dev_put(pdev);
> +
> +	return ret;
> +}



> +static int __init dwc_pcie_pmu_init(void)
> +{
> +	int ret;
> +
> +	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
> +				      "perf/dwc_pcie_pmu:online",
> +				      dwc_pcie_pmu_online_cpu,
> +				      dwc_pcie_pmu_offline_cpu);
> +	if (ret < 0)
> +		return ret;
> +
> +	dwc_pcie_pmu_hp_state = ret;
> +
> +	ret = platform_driver_register(&dwc_pcie_pmu_driver);
> +	if (ret)
> +		goto platform_driver_register_err;
> +
> +	dwc_pcie_pmu_dev = platform_device_register_simple(
> +				"dwc_pcie_pmu", PLATFORM_DEVID_NONE, NULL, 0);
> +	if (IS_ERR(dwc_pcie_pmu_dev)) {
> +		ret = PTR_ERR(dwc_pcie_pmu_dev);
> +		goto platform_device_register_error;
> +	}
> +
> +	return 0;
> +
> +platform_device_register_error:

Trivial but I'd standardize on err or error, not mix them.

> +	platform_driver_unregister(&dwc_pcie_pmu_driver);
> +platform_driver_register_err:
> +	cpuhp_remove_multi_state(dwc_pcie_pmu_hp_state);
> +
> +	return ret;
> +}
Shuai Xue Oct. 22, 2023, 7:27 a.m. UTC | #2
On 2023/10/21 00:49, Jonathan Cameron wrote:
> On Fri, 20 Oct 2023 21:42:29 +0800
> Shuai Xue <xueshuai@linux.alibaba.com> wrote:
> 
>> This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
>> for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
>> Core controller IP which provides statistics feature. The PMU is a PCIe
>> configuration space register block provided by each PCIe Root Port in a
>> Vendor-Specific Extended Capability named RAS D.E.S (Debug, Error
>> injection, and Statistics).
>>
>> To facilitate collection of statistics the controller provides the
>> following two features for each Root Port:
>>
>> - one 64-bit counter for Time Based Analysis (RX/TX data throughput and
>>   time spent in each low-power LTSSM state) and
>> - one 32-bit counter for Event Counting (error and non-error events for
>>   a specified lane)
>>
>> Note: There is no interrupt for counter overflow.
>>
>> This driver adds PMU devices for each PCIe Root Port. And the PMU device is
>> named based the BDF of Root Port. For example,
>>
>>     30:03.0 PCI bridge: Device 1ded:8000 (rev 01)
>>
>> the PMU device name for this Root Port is dwc_rootport_3018.
>>
>> Example usage of counting PCIe RX TLP data payload (Units of bytes)::
>>
>>     $# perf stat -a -e dwc_rootport_3018/Rx_PCIe_TLP_Data_Payload/
>>
>> average RX bandwidth can be calculated like this:
>>
>>     PCIe TX Bandwidth = Rx_PCIe_TLP_Data_Payload / Measure_Time_Window
>>
>> Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
> LGTM other than some really trivial stuff inline if you are doing a v10
> 
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

Hi, Jonathan,

Thank you for your prompt response.

The fact that this [3/4] patch has received a Reviewed-by tag from the
community is a momentous milestone for both the patchset itself and for me
personally. I am deeply grateful for the time and effort you have dedicated
to providing valuable comments, as it has significantly improved the code's
quality.

I will address the inline comments in the upcoming version v10. However, I
kindly request some time to wait for feedback from esteemed reviewers such
as Yicong, Bjorn, Will, or anyone else who may find this patchset
intriguing.

Best Regards and Cheers.
Shuai


>> +		}
>> +
>> +		/* Unwind when platform driver removes */
>> +		ret = devm_add_action_or_reset(
>> +			&plat_dev->dev, dwc_pcie_pmu_remove_cpuhp_instance,
>> +			&pcie_pmu->cpuhp_node);
>> +		if (ret)
>> +			goto out;
>> +
>> +		ret = perf_pmu_register(&pcie_pmu->pmu, name, -1);
>> +		if (ret) {
>> +			pci_err(pdev,
>> +				"Error %d registering PMU @%x\n", ret, bdf);
>> +			goto out;
>> +		}
>> +
>> +		/* Cache PMU to handle pci device hotplug */
>> +		list_add(&pcie_pmu->pmu_node, &dwc_pcie_pmu_head);
>> +		pcie_pmu->registered = true;
>> +		notify = true;
>> +
>> +		ret = devm_add_action_or_reset(
>> +			&plat_dev->dev, dwc_pcie_pmu_unregister_pmu, pcie_pmu);
> 
> line wrapping is a bit ugly - I would move the &plat_dev->dev to previous line.
> and I think you can get away with aligning the rest just after the (
> 

Actually, I warp this with VSCode automaticlly :(
Will move the &plat_dev->dev to previous line.


> 
> 
> 
>> +static int __init dwc_pcie_pmu_init(void)
>> +{
>> +	int ret;
>> +
>> +	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
>> +				      "perf/dwc_pcie_pmu:online",
>> +				      dwc_pcie_pmu_online_cpu,
>> +				      dwc_pcie_pmu_offline_cpu);
>> +	if (ret < 0)
>> +		return ret;
>> +
>> +	dwc_pcie_pmu_hp_state = ret;
>> +
>> +	ret = platform_driver_register(&dwc_pcie_pmu_driver);
>> +	if (ret)
>> +		goto platform_driver_register_err;
>> +
>> +	dwc_pcie_pmu_dev = platform_device_register_simple(
>> +				"dwc_pcie_pmu", PLATFORM_DEVID_NONE, NULL, 0);
>> +	if (IS_ERR(dwc_pcie_pmu_dev)) {
>> +		ret = PTR_ERR(dwc_pcie_pmu_dev);
>> +		goto platform_device_register_error;
>> +	}
>> +
>> +	return 0;
>> +
>> +platform_device_register_error:
> 
> Trivial but I'd standardize on err or error, not mix them.

Sorry, will fix it.

> 
>> +	platform_driver_unregister(&dwc_pcie_pmu_driver);
>> +platform_driver_register_err:
>> +	cpuhp_remove_multi_state(dwc_pcie_pmu_hp_state);
>> +
>> +	return ret;
>> +}
Shuai Xue Oct. 22, 2023, 7:47 a.m. UTC | #3
Hi, Baolin,

I droped your Revivewed-by tag due to that I made significant changes to this
patch previously, please explicty give me Revivewed-by tag again if you are
happy with the changes.

Thank you.
Best Regards,
Shuai

On 2023/10/20 21:42, Shuai Xue wrote:
> This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
> for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
> Core controller IP which provides statistics feature. The PMU is a PCIe
> configuration space register block provided by each PCIe Root Port in a
> Vendor-Specific Extended Capability named RAS D.E.S (Debug, Error
> injection, and Statistics).
> 
> To facilitate collection of statistics the controller provides the
> following two features for each Root Port:
> 
> - one 64-bit counter for Time Based Analysis (RX/TX data throughput and
>   time spent in each low-power LTSSM state) and
> - one 32-bit counter for Event Counting (error and non-error events for
>   a specified lane)
> 
> Note: There is no interrupt for counter overflow.
> 
> This driver adds PMU devices for each PCIe Root Port. And the PMU device is
> named based the BDF of Root Port. For example,
> 
>     30:03.0 PCI bridge: Device 1ded:8000 (rev 01)
> 
> the PMU device name for this Root Port is dwc_rootport_3018.
> 
> Example usage of counting PCIe RX TLP data payload (Units of bytes)::
> 
>     $# perf stat -a -e dwc_rootport_3018/Rx_PCIe_TLP_Data_Payload/
> 
> average RX bandwidth can be calculated like this:
> 
>     PCIe TX Bandwidth = Rx_PCIe_TLP_Data_Payload / Measure_Time_Window
> 
> Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
> ---
>  drivers/perf/Kconfig        |   7 +
>  drivers/perf/Makefile       |   1 +
>  drivers/perf/dwc_pcie_pmu.c | 770 ++++++++++++++++++++++++++++++++++++
>  3 files changed, 778 insertions(+)
>  create mode 100644 drivers/perf/dwc_pcie_pmu.c
> 
> diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
> index 273d67ecf6d2..ec6e0d9194a1 100644
> --- a/drivers/perf/Kconfig
> +++ b/drivers/perf/Kconfig
> @@ -217,6 +217,13 @@ config MARVELL_CN10K_DDR_PMU
>  	  Enable perf support for Marvell DDR Performance monitoring
>  	  event on CN10K platform.
>  
> +config DWC_PCIE_PMU
> +	tristate "Synopsys DesignWare PCIe PMU"
> +	depends on PCI
> +	help
> +	  Enable perf support for Synopsys DesignWare PCIe PMU Performance
> +	  monitoring event on platform including the Alibaba Yitian 710.
> +
>  source "drivers/perf/arm_cspmu/Kconfig"
>  
>  source "drivers/perf/amlogic/Kconfig"
> diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
> index 16b3ec4db916..a06338e3401c 100644
> --- a/drivers/perf/Makefile
> +++ b/drivers/perf/Makefile
> @@ -23,6 +23,7 @@ obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o
>  obj-$(CONFIG_MARVELL_CN10K_DDR_PMU) += marvell_cn10k_ddr_pmu.o
>  obj-$(CONFIG_APPLE_M1_CPU_PMU) += apple_m1_cpu_pmu.o
>  obj-$(CONFIG_ALIBABA_UNCORE_DRW_PMU) += alibaba_uncore_drw_pmu.o
> +obj-$(CONFIG_DWC_PCIE_PMU) += dwc_pcie_pmu.o
>  obj-$(CONFIG_ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += arm_cspmu/
>  obj-$(CONFIG_MESON_DDR_PMU) += amlogic/
>  obj-$(CONFIG_CXL_PMU) += cxl_pmu.o
> diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c
> new file mode 100644
> index 000000000000..ddb06d763b0c
> --- /dev/null
> +++ b/drivers/perf/dwc_pcie_pmu.c
> @@ -0,0 +1,770 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Synopsys DesignWare PCIe PMU driver
> + *
> + * Copyright (C) 2021-2023 Alibaba Inc.
> + */
> +
> +#include <linux/bitfield.h>
> +#include <linux/bitops.h>
> +#include <linux/cpuhotplug.h>
> +#include <linux/cpumask.h>
> +#include <linux/device.h>
> +#include <linux/errno.h>
> +#include <linux/kernel.h>
> +#include <linux/list.h>
> +#include <linux/perf_event.h>
> +#include <linux/pci.h>
> +#include <linux/platform_device.h>
> +#include <linux/smp.h>
> +#include <linux/sysfs.h>
> +#include <linux/types.h>
> +
> +#define DWC_PCIE_VSEC_RAS_DES_ID		0x02
> +#define DWC_PCIE_EVENT_CNT_CTL			0x8
> +
> +/*
> + * Event Counter Data Select includes two parts:
> + * - 27-24: Group number(4-bit: 0..0x7)
> + * - 23-16: Event number(8-bit: 0..0x13) within the Group
> + *
> + * Put them together as in TRM.
> + */
> +#define DWC_PCIE_CNT_EVENT_SEL			GENMASK(27, 16)
> +#define DWC_PCIE_CNT_LANE_SEL			GENMASK(11, 8)
> +#define DWC_PCIE_CNT_STATUS			BIT(7)
> +#define DWC_PCIE_CNT_ENABLE			GENMASK(4, 2)
> +#define DWC_PCIE_PER_EVENT_OFF			0x1
> +#define DWC_PCIE_PER_EVENT_ON			0x3
> +#define DWC_PCIE_EVENT_CLEAR			GENMASK(1, 0)
> +#define DWC_PCIE_EVENT_PER_CLEAR		0x1
> +
> +#define DWC_PCIE_EVENT_CNT_DATA			0xC
> +
> +#define DWC_PCIE_TIME_BASED_ANAL_CTL		0x10
> +#define DWC_PCIE_TIME_BASED_REPORT_SEL		GENMASK(31, 24)
> +#define DWC_PCIE_TIME_BASED_DURATION_SEL	GENMASK(15, 8)
> +#define DWC_PCIE_DURATION_MANUAL_CTL		0x0
> +#define DWC_PCIE_DURATION_1MS			0x1
> +#define DWC_PCIE_DURATION_10MS			0x2
> +#define DWC_PCIE_DURATION_100MS			0x3
> +#define DWC_PCIE_DURATION_1S			0x4
> +#define DWC_PCIE_DURATION_2S			0x5
> +#define DWC_PCIE_DURATION_4S			0x6
> +#define DWC_PCIE_DURATION_4US			0xFF
> +#define DWC_PCIE_TIME_BASED_TIMER_START		BIT(0)
> +#define DWC_PCIE_TIME_BASED_CNT_ENABLE		0x1
> +
> +#define DWC_PCIE_TIME_BASED_ANAL_DATA_REG_LOW	0x14
> +#define DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH	0x18
> +
> +/* Event attributes */
> +#define DWC_PCIE_CONFIG_EVENTID			GENMASK(15, 0)
> +#define DWC_PCIE_CONFIG_TYPE			GENMASK(19, 16)
> +#define DWC_PCIE_CONFIG_LANE			GENMASK(27, 20)
> +
> +#define DWC_PCIE_EVENT_ID(event)	FIELD_GET(DWC_PCIE_CONFIG_EVENTID, (event)->attr.config)
> +#define DWC_PCIE_EVENT_TYPE(event)	FIELD_GET(DWC_PCIE_CONFIG_TYPE, (event)->attr.config)
> +#define DWC_PCIE_EVENT_LANE(event)	FIELD_GET(DWC_PCIE_CONFIG_LANE, (event)->attr.config)
> +
> +enum dwc_pcie_event_type {
> +	DWC_PCIE_TIME_BASE_EVENT,
> +	DWC_PCIE_LANE_EVENT,
> +	DWC_PCIE_EVENT_TYPE_MAX,
> +};
> +
> +#define DWC_PCIE_LANE_EVENT_MAX_PERIOD		GENMASK_ULL(31, 0)
> +#define DWC_PCIE_TIME_BASED_EVENT_MAX_PERIOD	GENMASK_ULL(63, 0)
> +
> +struct dwc_pcie_pmu {
> +	struct pmu		pmu;
> +	struct pci_dev		*pdev;		/* Root Port device */
> +	u16			ras_des_offset;
> +	u32			nr_lanes;
> +
> +	struct list_head	pmu_node;
> +	struct hlist_node	cpuhp_node;
> +	struct perf_event	*event[DWC_PCIE_EVENT_TYPE_MAX];
> +	int			on_cpu;
> +	bool			registered;
> +};
> +
> +#define to_dwc_pcie_pmu(p) (container_of(p, struct dwc_pcie_pmu, pmu))
> +
> +static struct platform_device *dwc_pcie_pmu_dev;
> +static int dwc_pcie_pmu_hp_state;
> +static struct list_head dwc_pcie_pmu_head = LIST_HEAD_INIT(dwc_pcie_pmu_head);
> +
> +static ssize_t cpumask_show(struct device *dev,
> +					 struct device_attribute *attr,
> +					 char *buf)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(dev_get_drvdata(dev));
> +
> +	return cpumap_print_to_pagebuf(true, buf, cpumask_of(pcie_pmu->on_cpu));
> +}
> +static DEVICE_ATTR_RO(cpumask);
> +
> +static struct attribute *dwc_pcie_pmu_cpumask_attrs[] = {
> +	&dev_attr_cpumask.attr,
> +	NULL
> +};
> +
> +static struct attribute_group dwc_pcie_cpumask_attr_group = {
> +	.attrs = dwc_pcie_pmu_cpumask_attrs,
> +};
> +
> +struct dwc_pcie_format_attr {
> +	struct device_attribute attr;
> +	u64 field;
> +	int config;
> +};
> +
> +static ssize_t dwc_pcie_pmu_format_show(struct device *dev,
> +					struct device_attribute *attr,
> +					char *buf)
> +{
> +	struct dwc_pcie_format_attr *fmt = container_of(attr, typeof(*fmt), attr);
> +	int lo = __ffs(fmt->field), hi = __fls(fmt->field);
> +
> +	return sysfs_emit(buf, "config:%d-%d\n", lo, hi);
> +}
> +
> +#define _dwc_pcie_format_attr(_name, _cfg, _fld)			    \
> +	(&((struct dwc_pcie_format_attr[]) {{				    \
> +		.attr = __ATTR(_name, 0444, dwc_pcie_pmu_format_show, NULL),\
> +		.config = _cfg,						    \
> +		.field = _fld,						    \
> +	}})[0].attr.attr)
> +
> +#define dwc_pcie_format_attr(_name, _fld)	_dwc_pcie_format_attr(_name, 0, _fld)
> +
> +static struct attribute *dwc_pcie_format_attrs[] = {
> +	dwc_pcie_format_attr(type, DWC_PCIE_CONFIG_TYPE),
> +	dwc_pcie_format_attr(eventid, DWC_PCIE_CONFIG_EVENTID),
> +	dwc_pcie_format_attr(lane, DWC_PCIE_CONFIG_LANE),
> +	NULL,
> +};
> +
> +static struct attribute_group dwc_pcie_format_attrs_group = {
> +	.name = "format",
> +	.attrs = dwc_pcie_format_attrs,
> +};
> +
> +struct dwc_pcie_event_attr {
> +	struct device_attribute attr;
> +	enum dwc_pcie_event_type type;
> +	u16 eventid;
> +	u8 lane;
> +};
> +
> +static ssize_t dwc_pcie_event_show(struct device *dev,
> +				struct device_attribute *attr, char *buf)
> +{
> +	struct dwc_pcie_event_attr *eattr;
> +
> +	eattr = container_of(attr, typeof(*eattr), attr);
> +
> +	if (eattr->type == DWC_PCIE_LANE_EVENT)
> +		return sysfs_emit(buf, "eventid=0x%x,type=0x%x,lane=?\n",
> +				  eattr->eventid, eattr->type);
> +	else if (eattr->type == DWC_PCIE_TIME_BASE_EVENT)
> +		return sysfs_emit(buf, "eventid=0x%x,type=0x%x\n",
> +				  eattr->eventid, eattr->type);
> +
> +	return 0;
> +}
> +
> +#define DWC_PCIE_EVENT_ATTR(_name, _type, _eventid, _lane)		\
> +	(&((struct dwc_pcie_event_attr[]) {{				\
> +		.attr = __ATTR(_name, 0444, dwc_pcie_event_show, NULL),	\
> +		.type = _type,						\
> +		.eventid = _eventid,					\
> +		.lane = _lane,						\
> +	}})[0].attr.attr)
> +
> +#define DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(_name, _eventid)		\
> +	DWC_PCIE_EVENT_ATTR(_name, DWC_PCIE_TIME_BASE_EVENT, _eventid, 0)
> +#define DWC_PCIE_PMU_LANE_EVENT_ATTR(_name, _eventid)			\
> +	DWC_PCIE_EVENT_ATTR(_name, DWC_PCIE_LANE_EVENT, _eventid, 0)
> +
> +static struct attribute *dwc_pcie_pmu_time_event_attrs[] = {
> +	/* Group #0 */
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(one_cycle, 0x00),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(TX_L0S, 0x01),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(RX_L0S, 0x02),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L0, 0x03),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1, 0x04),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1_1, 0x05),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1_2, 0x06),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(CFG_RCVRY, 0x07),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(TX_RX_L0S, 0x08),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1_AUX, 0x09),
> +
> +	/* Group #1 */
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Tx_PCIe_TLP_Data_Payload, 0x20),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Rx_PCIe_TLP_Data_Payload, 0x21),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Tx_CCIX_TLP_Data_Payload, 0x22),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Rx_CCIX_TLP_Data_Payload, 0x23),
> +
> +	/*
> +	 * Leave it to the user to specify the lane ID to avoid generating
> +	 * a list of hundreds of events.
> +	 */
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_ack_dllp, 0x600),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_update_fc_dllp, 0x601),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_ack_dllp, 0x602),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_update_fc_dllp, 0x603),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_nulified_tlp, 0x604),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_nulified_tlp, 0x605),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_duplicate_tl, 0x606),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_memory_write, 0x700),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_memory_read, 0x701),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_configuration_write, 0x702),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_configuration_read, 0x703),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_io_write, 0x704),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_io_read, 0x705),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_completion_without_data, 0x706),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_completion_with_data, 0x707),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_message_tlp, 0x708),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_atomic, 0x709),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_tlp_with_prefix, 0x70A),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_memory_write, 0x70B),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_memory_read, 0x70C),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_io_write, 0x70F),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_io_read, 0x710),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_completion_without_data, 0x711),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_completion_with_data, 0x712),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_message_tlp, 0x713),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_atomic, 0x714),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_tlp_with_prefix, 0x715),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_ccix_tlp, 0x716),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_ccix_tlp, 0x717),
> +	NULL
> +};
> +
> +static const struct attribute_group dwc_pcie_event_attrs_group = {
> +	.name = "events",
> +	.attrs = dwc_pcie_pmu_time_event_attrs,
> +};
> +
> +static const struct attribute_group *dwc_pcie_attr_groups[] = {
> +	&dwc_pcie_event_attrs_group,
> +	&dwc_pcie_format_attrs_group,
> +	&dwc_pcie_cpumask_attr_group,
> +	NULL
> +};
> +
> +static void dwc_pcie_pmu_lane_event_enable(struct dwc_pcie_pmu *pcie_pmu,
> +					   bool enable)
> +{
> +	struct pci_dev *pdev = pcie_pmu->pdev;
> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
> +	u32 val;
> +
> +	pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, &val);
> +
> +	/* Clear DWC_PCIE_CNT_ENABLE field first */
> +	val &= ~DWC_PCIE_CNT_ENABLE;
> +	if (enable)
> +		val |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_ON);
> +	else
> +		val |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF);
> +
> +	pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, val);
> +}
> +
> +static void dwc_pcie_pmu_time_based_event_enable(struct dwc_pcie_pmu *pcie_pmu,
> +					  bool enable)
> +{
> +	struct pci_dev *pdev = pcie_pmu->pdev;
> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
> +	u32 val;
> +
> +	pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL,
> +			      &val);
> +
> +	if (enable)
> +		val |= DWC_PCIE_TIME_BASED_CNT_ENABLE;
> +	else
> +		val &= ~DWC_PCIE_TIME_BASED_CNT_ENABLE;
> +
> +	pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL,
> +			       val);
> +}
> +
> +static u64 dwc_pcie_pmu_read_lane_event_counter(struct perf_event *event)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	struct pci_dev *pdev = pcie_pmu->pdev;
> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
> +	u32 val;
> +
> +	pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_DATA, &val);
> +
> +	return val;
> +}
> +
> +static u64 dwc_pcie_pmu_read_time_based_counter(struct perf_event *event)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	struct pci_dev *pdev = pcie_pmu->pdev;
> +	int event_id = DWC_PCIE_EVENT_ID(event);
> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
> +	u32 lo, hi, ss;
> +
> +	/*
> +	 * The 64-bit value of the data counter is spread across two
> +	 * registers that are not synchronized. In order to read them
> +	 * atomically, ensure that the high 32 bits match before and after
> +	 * reading the low 32 bits.
> +	 */
> +	pci_read_config_dword(pdev, ras_des_offset +
> +		DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH, &hi);
> +	do {
> +		/* snapshot the high 32 bits */
> +		ss = hi;
> +
> +		pci_read_config_dword(
> +			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_DATA_REG_LOW,
> +			&lo);
> +		pci_read_config_dword(
> +			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH,
> +			&hi);
> +	} while (hi != ss);
> +
> +	/*
> +	 * The Group#1 event measures the amount of data processed in 16-byte
> +	 * units. Simplify the end-user interface by multiplying the counter
> +	 * at the point of read.
> +	 */
> +	if (event_id >= 0x20 && event_id <= 0x23)
> +		return (((u64)hi << 32) | lo) << 4;
> +	else
> +		return (((u64)hi << 32) | lo);
> +}
> +
> +static void dwc_pcie_pmu_event_update(struct perf_event *event)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +	u64 delta, prev, now;
> +
> +	do {
> +		prev = local64_read(&hwc->prev_count);
> +
> +		if (type == DWC_PCIE_LANE_EVENT)
> +			now = dwc_pcie_pmu_read_lane_event_counter(event);
> +		else if (type == DWC_PCIE_TIME_BASE_EVENT)
> +			now = dwc_pcie_pmu_read_time_based_counter(event);
> +
> +	} while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
> +
> +	if (type == DWC_PCIE_LANE_EVENT)
> +		delta = (now - prev) & DWC_PCIE_LANE_EVENT_MAX_PERIOD;
> +	else if (type == DWC_PCIE_TIME_BASE_EVENT)
> +		delta = (now - prev) & DWC_PCIE_TIME_BASED_EVENT_MAX_PERIOD;
> +
> +	local64_add(delta, &event->count);
> +}
> +
> +static int dwc_pcie_pmu_event_init(struct perf_event *event)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +	struct perf_event *sibling;
> +	u32 lane;
> +
> +	if (event->attr.type != event->pmu->type)
> +		return -ENOENT;
> +
> +	/* We don't support sampling */
> +	if (is_sampling_event(event))
> +		return -EINVAL;
> +
> +	/* We cannot support task bound events */
> +	if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK)
> +		return -EINVAL;
> +
> +	if (event->group_leader != event &&
> +	    !is_software_event(event->group_leader))
> +		return -EINVAL;
> +
> +	for_each_sibling_event(sibling, event->group_leader) {
> +		if (sibling->pmu != event->pmu && !is_software_event(sibling))
> +			return -EINVAL;
> +	}
> +
> +	if (type == DWC_PCIE_LANE_EVENT) {
> +		lane = DWC_PCIE_EVENT_LANE(event);
> +		if (lane < 0 || lane >= pcie_pmu->nr_lanes)
> +			return -EINVAL;
> +	}
> +
> +	event->cpu = pcie_pmu->on_cpu;
> +
> +	return 0;
> +}
> +
> +static void dwc_pcie_pmu_set_period(struct hw_perf_event *hwc)
> +{
> +	local64_set(&hwc->prev_count, 0);
> +}
> +
> +static void dwc_pcie_pmu_event_start(struct perf_event *event, int flags)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +
> +	hwc->state = 0;
> +	dwc_pcie_pmu_set_period(hwc);
> +
> +	if (type == DWC_PCIE_LANE_EVENT)
> +		dwc_pcie_pmu_lane_event_enable(pcie_pmu, true);
> +	else if (type == DWC_PCIE_TIME_BASE_EVENT)
> +		dwc_pcie_pmu_time_based_event_enable(pcie_pmu, true);
> +}
> +
> +static void dwc_pcie_pmu_event_stop(struct perf_event *event, int flags)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +	struct hw_perf_event *hwc = &event->hw;
> +
> +	if (event->hw.state & PERF_HES_STOPPED)
> +		return;
> +
> +	if (type == DWC_PCIE_LANE_EVENT)
> +		dwc_pcie_pmu_lane_event_enable(pcie_pmu, false);
> +	else if (type == DWC_PCIE_TIME_BASE_EVENT)
> +		dwc_pcie_pmu_time_based_event_enable(pcie_pmu, false);
> +
> +	dwc_pcie_pmu_event_update(event);
> +	hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
> +}
> +
> +static int dwc_pcie_pmu_event_add(struct perf_event *event, int flags)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	struct pci_dev *pdev = pcie_pmu->pdev;
> +	struct hw_perf_event *hwc = &event->hw;
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +	int event_id = DWC_PCIE_EVENT_ID(event);
> +	int lane = DWC_PCIE_EVENT_LANE(event);
> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
> +	u32 ctrl;
> +
> +	/* one counter for each type and it is in use */
> +	if (pcie_pmu->event[type])
> +		return -ENOSPC;
> +
> +	pcie_pmu->event[type] = event;
> +	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
> +
> +	if (type == DWC_PCIE_LANE_EVENT) {
> +		/* EVENT_COUNTER_DATA_REG needs clear manually */
> +		ctrl = FIELD_PREP(DWC_PCIE_CNT_EVENT_SEL, event_id) |
> +			FIELD_PREP(DWC_PCIE_CNT_LANE_SEL, lane) |
> +			FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF) |
> +			FIELD_PREP(DWC_PCIE_EVENT_CLEAR, DWC_PCIE_EVENT_PER_CLEAR);
> +		pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL,
> +				       ctrl);
> +	} else if (type == DWC_PCIE_TIME_BASE_EVENT) {
> +		/*
> +		 * TIME_BASED_ANAL_DATA_REG is a 64 bit register, we can safely
> +		 * use it with any manually controlled duration. And it is
> +		 * cleared when next measurement starts.
> +		 */
> +		ctrl = FIELD_PREP(DWC_PCIE_TIME_BASED_REPORT_SEL, event_id) |
> +			FIELD_PREP(DWC_PCIE_TIME_BASED_DURATION_SEL,
> +				   DWC_PCIE_DURATION_MANUAL_CTL) |
> +			DWC_PCIE_TIME_BASED_CNT_ENABLE;
> +		pci_write_config_dword(
> +			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL, ctrl);
> +	}
> +
> +	if (flags & PERF_EF_START)
> +		dwc_pcie_pmu_event_start(event, PERF_EF_RELOAD);
> +
> +	perf_event_update_userpage(event);
> +
> +	return 0;
> +}
> +
> +static void dwc_pcie_pmu_event_del(struct perf_event *event, int flags)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +
> +	dwc_pcie_pmu_event_stop(event, flags | PERF_EF_UPDATE);
> +	perf_event_update_userpage(event);
> +	pcie_pmu->event[type] = NULL;
> +}
> +
> +static void dwc_pcie_pmu_remove_cpuhp_instance(void *hotplug_node)
> +{
> +	cpuhp_state_remove_instance_nocalls(dwc_pcie_pmu_hp_state, hotplug_node);
> +}
> +
> +/*
> + * Find the PMU of a PCI device.
> + * @pdev: The PCI device.
> + */
> +static struct dwc_pcie_pmu *dwc_pcie_find_dev_pmu(struct pci_dev *pdev)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu;
> +
> +	list_for_each_entry(pcie_pmu, &dwc_pcie_pmu_head, pmu_node)
> +		if (pcie_pmu->pdev == pdev)
> +			return pcie_pmu;
> +
> +	return NULL;
> +}
> +
> +static void dwc_pcie_pmu_unregister_pmu(void *data)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = data;
> +
> +	if (!pcie_pmu->registered)
> +		return;
> +
> +	pcie_pmu->registered = false;
> +	list_del(&pcie_pmu->pmu_node);
> +	perf_pmu_unregister(&pcie_pmu->pmu);
> +}
> +
> +static int dwc_pcie_pmu_notifier(struct notifier_block *nb,
> +				     unsigned long action, void *data)
> +{
> +	struct device *dev = data;
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +	struct dwc_pcie_pmu *pcie_pmu;
> +
> +	/* Unregister the PMU when the device is going to be deleted. */
> +	if (action != BUS_NOTIFY_DEL_DEVICE)
> +		return NOTIFY_DONE;
> +
> +	pcie_pmu = dwc_pcie_find_dev_pmu(pdev);
> +	if (!pcie_pmu)
> +		return NOTIFY_DONE;
> +
> +	dwc_pcie_pmu_unregister_pmu(pcie_pmu);
> +
> +	return NOTIFY_OK;
> +}
> +
> +static struct notifier_block dwc_pcie_pmu_nb = {
> +	.notifier_call = dwc_pcie_pmu_notifier,
> +};
> +
> +static void dwc_pcie_pmu_unregister_nb(void *data)
> +{
> +	bus_unregister_notifier(&pci_bus_type, &dwc_pcie_pmu_nb);
> +}
> +
> +static int dwc_pcie_pmu_probe(struct platform_device *plat_dev)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct dwc_pcie_pmu *pcie_pmu;
> +	bool notify = false;
> +	char *name;
> +	u32 bdf;
> +	int ret;
> +
> +	/* Match the rootport with VSEC_RAS_DES_ID, and register a PMU for it */
> +	for_each_pci_dev(pdev) {
> +		u16 vsec;
> +		u32 val;
> +
> +		if (!(pci_is_pcie(pdev) &&
> +		      pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT))
> +			continue;
> +
> +		vsec = pci_find_vsec_capability(pdev, PCI_VENDOR_ID_ALIBABA,
> +						DWC_PCIE_VSEC_RAS_DES_ID);
> +		if (!vsec)
> +			continue;
> +
> +		pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &val);
> +		if (PCI_VNDR_HEADER_REV(val) != 0x04)
> +			continue;
> +		pci_dbg(pdev,
> +			"Detected PCIe Vendor-Specific Extended Capability RAS DES\n");
> +
> +		bdf = PCI_DEVID(pdev->bus->number, pdev->devfn);
> +		name = devm_kasprintf(&plat_dev->dev, GFP_KERNEL, "dwc_rootport_%x",
> +				      bdf);
> +		if (!name) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		/* All checks passed, go go go */
> +		pcie_pmu = devm_kzalloc(&plat_dev->dev, sizeof(*pcie_pmu), GFP_KERNEL);
> +		if (!pcie_pmu) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		pcie_pmu->pdev = pdev;
> +		pcie_pmu->ras_des_offset = vsec;
> +		pcie_pmu->nr_lanes = pcie_get_width_cap(pdev);
> +		pcie_pmu->on_cpu = -1;
> +		pcie_pmu->pmu = (struct pmu){
> +			.module		= THIS_MODULE,
> +			.attr_groups	= dwc_pcie_attr_groups,
> +			.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
> +			.task_ctx_nr	= perf_invalid_context,
> +			.event_init	= dwc_pcie_pmu_event_init,
> +			.add		= dwc_pcie_pmu_event_add,
> +			.del		= dwc_pcie_pmu_event_del,
> +			.start		= dwc_pcie_pmu_event_start,
> +			.stop		= dwc_pcie_pmu_event_stop,
> +			.read		= dwc_pcie_pmu_event_update,
> +		};
> +
> +		/* Add this instance to the list used by the offline callback */
> +		ret = cpuhp_state_add_instance(dwc_pcie_pmu_hp_state,
> +					       &pcie_pmu->cpuhp_node);
> +		if (ret) {
> +			pci_err(pdev,
> +				"Error %d registering hotplug @%x\n", ret, bdf);
> +			goto out;
> +		}
> +
> +		/* Unwind when platform driver removes */
> +		ret = devm_add_action_or_reset(
> +			&plat_dev->dev, dwc_pcie_pmu_remove_cpuhp_instance,
> +			&pcie_pmu->cpuhp_node);
> +		if (ret)
> +			goto out;
> +
> +		ret = perf_pmu_register(&pcie_pmu->pmu, name, -1);
> +		if (ret) {
> +			pci_err(pdev,
> +				"Error %d registering PMU @%x\n", ret, bdf);
> +			goto out;
> +		}
> +
> +		/* Cache PMU to handle pci device hotplug */
> +		list_add(&pcie_pmu->pmu_node, &dwc_pcie_pmu_head);
> +		pcie_pmu->registered = true;
> +		notify = true;
> +
> +		ret = devm_add_action_or_reset(
> +			&plat_dev->dev, dwc_pcie_pmu_unregister_pmu, pcie_pmu);
> +		if (ret)
> +			goto out;
> +	}
> +
> +	if (notify && !bus_register_notifier(&pci_bus_type, &dwc_pcie_pmu_nb))
> +		return devm_add_action_or_reset(
> +			&plat_dev->dev, dwc_pcie_pmu_unregister_nb, NULL);
> +
> +	return 0;
> +
> +out:
> +	pci_dev_put(pdev);
> +
> +	return ret;
> +}
> +
> +static int dwc_pcie_pmu_online_cpu(unsigned int cpu, struct hlist_node *cpuhp_node)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu;
> +
> +	pcie_pmu = hlist_entry_safe(cpuhp_node, struct dwc_pcie_pmu, cpuhp_node);
> +	if (pcie_pmu->on_cpu == -1)
> +		pcie_pmu->on_cpu = cpumask_local_spread(
> +			0, dev_to_node(&pcie_pmu->pdev->dev));
> +
> +	return 0;
> +}
> +
> +static int dwc_pcie_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_node)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu;
> +	struct pci_dev *pdev;
> +	int node;
> +	cpumask_t mask;
> +	unsigned int target;
> +
> +	pcie_pmu = hlist_entry_safe(cpuhp_node, struct dwc_pcie_pmu, cpuhp_node);
> +	/* Nothing to do if this CPU doesn't own the PMU */
> +	if (cpu != pcie_pmu->on_cpu)
> +		return 0;
> +
> +	pcie_pmu->on_cpu = -1;
> +	pdev = pcie_pmu->pdev;
> +	node = dev_to_node(&pdev->dev);
> +	if (cpumask_and(&mask, cpumask_of_node(node), cpu_online_mask) &&
> +	    cpumask_andnot(&mask, &mask, cpumask_of(cpu)))
> +		target = cpumask_any(&mask);
> +	else
> +		target = cpumask_any_but(cpu_online_mask, cpu);
> +
> +	if (target >= nr_cpu_ids) {
> +		pci_err(pdev, "There is no CPU to set\n");
> +		return 0;
> +	}
> +
> +	/* This PMU does NOT support interrupt, just migrate context. */
> +	perf_pmu_migrate_context(&pcie_pmu->pmu, cpu, target);
> +	pcie_pmu->on_cpu = target;
> +
> +	return 0;
> +}
> +
> +static struct platform_driver dwc_pcie_pmu_driver = {
> +	.probe = dwc_pcie_pmu_probe,
> +	.driver = {.name = "dwc_pcie_pmu",},
> +};
> +
> +static int __init dwc_pcie_pmu_init(void)
> +{
> +	int ret;
> +
> +	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
> +				      "perf/dwc_pcie_pmu:online",
> +				      dwc_pcie_pmu_online_cpu,
> +				      dwc_pcie_pmu_offline_cpu);
> +	if (ret < 0)
> +		return ret;
> +
> +	dwc_pcie_pmu_hp_state = ret;
> +
> +	ret = platform_driver_register(&dwc_pcie_pmu_driver);
> +	if (ret)
> +		goto platform_driver_register_err;
> +
> +	dwc_pcie_pmu_dev = platform_device_register_simple(
> +				"dwc_pcie_pmu", PLATFORM_DEVID_NONE, NULL, 0);
> +	if (IS_ERR(dwc_pcie_pmu_dev)) {
> +		ret = PTR_ERR(dwc_pcie_pmu_dev);
> +		goto platform_device_register_error;
> +	}
> +
> +	return 0;
> +
> +platform_device_register_error:
> +	platform_driver_unregister(&dwc_pcie_pmu_driver);
> +platform_driver_register_err:
> +	cpuhp_remove_multi_state(dwc_pcie_pmu_hp_state);
> +
> +	return ret;
> +}
> +
> +static void __exit dwc_pcie_pmu_exit(void)
> +{
> +	platform_device_unregister(dwc_pcie_pmu_dev);
> +	platform_driver_unregister(&dwc_pcie_pmu_driver);
> +	cpuhp_remove_multi_state(dwc_pcie_pmu_hp_state);
> +}
> +
> +module_init(dwc_pcie_pmu_init);
> +module_exit(dwc_pcie_pmu_exit);
> +
> +MODULE_DESCRIPTION("PMU driver for DesignWare Cores PCI Express Controller");
> +MODULE_AUTHOR("Shuai Xue <xueshuai@linux.alibaba.com>");
> +MODULE_LICENSE("GPL v2");
Baolin Wang Oct. 23, 2023, 2:05 a.m. UTC | #4
On 10/22/2023 3:47 PM, Shuai Xue wrote:
> Hi, Baolin,
> 
> I droped your Revivewed-by tag due to that I made significant changes to this
> patch previously, please explicty give me Revivewed-by tag again if you are
> happy with the changes.

Yes, I am happy with this version (just some nits as below), and thanks 
for the review from other guys. Please feel free to add:

Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>

> On 2023/10/20 21:42, Shuai Xue wrote:
>> This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
>> for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
>> Core controller IP which provides statistics feature. The PMU is a PCIe
>> configuration space register block provided by each PCIe Root Port in a
>> Vendor-Specific Extended Capability named RAS D.E.S (Debug, Error
>> injection, and Statistics).
>>
>> To facilitate collection of statistics the controller provides the
>> following two features for each Root Port:
>>
>> - one 64-bit counter for Time Based Analysis (RX/TX data throughput and
>>    time spent in each low-power LTSSM state) and
>> - one 32-bit counter for Event Counting (error and non-error events for
>>    a specified lane)
>>
>> Note: There is no interrupt for counter overflow.
>>
>> This driver adds PMU devices for each PCIe Root Port. And the PMU device is
>> named based the BDF of Root Port. For example,
>>
>>      30:03.0 PCI bridge: Device 1ded:8000 (rev 01)
>>
>> the PMU device name for this Root Port is dwc_rootport_3018.
>>
>> Example usage of counting PCIe RX TLP data payload (Units of bytes)::
>>
>>      $# perf stat -a -e dwc_rootport_3018/Rx_PCIe_TLP_Data_Payload/
>>
>> average RX bandwidth can be calculated like this:
>>
>>      PCIe TX Bandwidth = Rx_PCIe_TLP_Data_Payload / Measure_Time_Window
>>
>> Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
>> ---

[snip]

>> +static u64 dwc_pcie_pmu_read_time_based_counter(struct perf_event *event)
>> +{
>> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
>> +	struct pci_dev *pdev = pcie_pmu->pdev;
>> +	int event_id = DWC_PCIE_EVENT_ID(event);
>> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
>> +	u32 lo, hi, ss;
>> +
>> +	/*
>> +	 * The 64-bit value of the data counter is spread across two
>> +	 * registers that are not synchronized. In order to read them
>> +	 * atomically, ensure that the high 32 bits match before and after
>> +	 * reading the low 32 bits.
>> +	 */
>> +	pci_read_config_dword(pdev, ras_des_offset +
>> +		DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH, &hi);
>> +	do {
>> +		/* snapshot the high 32 bits */
>> +		ss = hi;
>> +
>> +		pci_read_config_dword(
>> +			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_DATA_REG_LOW,
>> +			&lo);
>> +		pci_read_config_dword(
>> +			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH,
>> +			&hi);
>> +	} while (hi != ss);
>> +
>> +	/*
>> +	 * The Group#1 event measures the amount of data processed in 16-byte
>> +	 * units. Simplify the end-user interface by multiplying the counter
>> +	 * at the point of read.
>> +	 */
>> +	if (event_id >= 0x20 && event_id <= 0x23)
>> +		return (((u64)hi << 32) | lo) << 4;
>> +	else

You can drop the 'else'.

>> +		return (((u64)hi << 32) | lo);
>> +}
>> +
>> +static void dwc_pcie_pmu_event_update(struct perf_event *event)
>> +{
>> +	struct hw_perf_event *hwc = &event->hw;
>> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>> +	u64 delta, prev, now;
>> +
>> +	do {
>> +		prev = local64_read(&hwc->prev_count);
>> +
>> +		if (type == DWC_PCIE_LANE_EVENT)
>> +			now = dwc_pcie_pmu_read_lane_event_counter(event);
>> +		else if (type == DWC_PCIE_TIME_BASE_EVENT)
>> +			now = dwc_pcie_pmu_read_time_based_counter(event);
>> +
>> +	} while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
>> +
>> +	if (type == DWC_PCIE_LANE_EVENT)
>> +		delta = (now - prev) & DWC_PCIE_LANE_EVENT_MAX_PERIOD;
>> +	else if (type == DWC_PCIE_TIME_BASE_EVENT)
>> +		delta = (now - prev) & DWC_PCIE_TIME_BASED_EVENT_MAX_PERIOD;
>> +
>> +	local64_add(delta, &event->count);
>> +}
>> +
>> +static int dwc_pcie_pmu_event_init(struct perf_event *event)
>> +{
>> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
>> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>> +	struct perf_event *sibling;
>> +	u32 lane;
>> +
>> +	if (event->attr.type != event->pmu->type)
>> +		return -ENOENT;
>> +
>> +	/* We don't support sampling */
>> +	if (is_sampling_event(event))
>> +		return -EINVAL;
>> +
>> +	/* We cannot support task bound events */
>> +	if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK)
>> +		return -EINVAL;
>> +
>> +	if (event->group_leader != event &&
>> +	    !is_software_event(event->group_leader))
>> +		return -EINVAL;
>> +
>> +	for_each_sibling_event(sibling, event->group_leader) {
>> +		if (sibling->pmu != event->pmu && !is_software_event(sibling))
>> +			return -EINVAL;
>> +	}
>> +
>> +	if (type == DWC_PCIE_LANE_EVENT) {
>> +		lane = DWC_PCIE_EVENT_LANE(event);
>> +		if (lane < 0 || lane >= pcie_pmu->nr_lanes)
>> +			return -EINVAL;
>> +	}
>> +
>> +	event->cpu = pcie_pmu->on_cpu;
>> +
>> +	return 0;
>> +}
>> +
>> +static void dwc_pcie_pmu_set_period(struct hw_perf_event *hwc)
>> +{
>> +	local64_set(&hwc->prev_count, 0);
>> +}

Only dwc_pcie_pmu_event_start() will call this small function, why just 
remove this function and move local64_set() into dwc_pcie_pmu_event_start()?

>> +
>> +static void dwc_pcie_pmu_event_start(struct perf_event *event, int flags)
>> +{
>> +	struct hw_perf_event *hwc = &event->hw;
>> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
>> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>> +
>> +	hwc->state = 0;
>> +	dwc_pcie_pmu_set_period(hwc);
>> +
>> +	if (type == DWC_PCIE_LANE_EVENT)
>> +		dwc_pcie_pmu_lane_event_enable(pcie_pmu, true);
>> +	else if (type == DWC_PCIE_TIME_BASE_EVENT)
>> +		dwc_pcie_pmu_time_based_event_enable(pcie_pmu, true);
>> +}
>> +
Yicong Yang Oct. 23, 2023, 9:13 a.m. UTC | #5
Hi Shuai,

On 2023/10/20 21:42, Shuai Xue wrote:
> This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
> for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
> Core controller IP which provides statistics feature. The PMU is a PCIe
> configuration space register block provided by each PCIe Root Port in a
> Vendor-Specific Extended Capability named RAS D.E.S (Debug, Error
> injection, and Statistics).
> 
> To facilitate collection of statistics the controller provides the
> following two features for each Root Port:
> 
> - one 64-bit counter for Time Based Analysis (RX/TX data throughput and
>   time spent in each low-power LTSSM state) and
> - one 32-bit counter for Event Counting (error and non-error events for
>   a specified lane)
> 
> Note: There is no interrupt for counter overflow.
> 
> This driver adds PMU devices for each PCIe Root Port. And the PMU device is
> named based the BDF of Root Port. For example,
> 
>     30:03.0 PCI bridge: Device 1ded:8000 (rev 01)
> 
> the PMU device name for this Root Port is dwc_rootport_3018.
> 
> Example usage of counting PCIe RX TLP data payload (Units of bytes)::
> 
>     $# perf stat -a -e dwc_rootport_3018/Rx_PCIe_TLP_Data_Payload/
> 
> average RX bandwidth can be calculated like this:
> 
>     PCIe TX Bandwidth = Rx_PCIe_TLP_Data_Payload / Measure_Time_Window
> 
> Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>

Just one nit below. Otherwise looks good to me,

Reviewed-by: Yicong Yang <yangyicong@hisilicon.com>

> ---
>  drivers/perf/Kconfig        |   7 +
>  drivers/perf/Makefile       |   1 +
>  drivers/perf/dwc_pcie_pmu.c | 770 ++++++++++++++++++++++++++++++++++++
>  3 files changed, 778 insertions(+)
>  create mode 100644 drivers/perf/dwc_pcie_pmu.c
> 
> diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
> index 273d67ecf6d2..ec6e0d9194a1 100644
> --- a/drivers/perf/Kconfig
> +++ b/drivers/perf/Kconfig
> @@ -217,6 +217,13 @@ config MARVELL_CN10K_DDR_PMU
>  	  Enable perf support for Marvell DDR Performance monitoring
>  	  event on CN10K platform.
>  
> +config DWC_PCIE_PMU
> +	tristate "Synopsys DesignWare PCIe PMU"
> +	depends on PCI
> +	help
> +	  Enable perf support for Synopsys DesignWare PCIe PMU Performance
> +	  monitoring event on platform including the Alibaba Yitian 710.
> +
>  source "drivers/perf/arm_cspmu/Kconfig"
>  
>  source "drivers/perf/amlogic/Kconfig"
> diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
> index 16b3ec4db916..a06338e3401c 100644
> --- a/drivers/perf/Makefile
> +++ b/drivers/perf/Makefile
> @@ -23,6 +23,7 @@ obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o
>  obj-$(CONFIG_MARVELL_CN10K_DDR_PMU) += marvell_cn10k_ddr_pmu.o
>  obj-$(CONFIG_APPLE_M1_CPU_PMU) += apple_m1_cpu_pmu.o
>  obj-$(CONFIG_ALIBABA_UNCORE_DRW_PMU) += alibaba_uncore_drw_pmu.o
> +obj-$(CONFIG_DWC_PCIE_PMU) += dwc_pcie_pmu.o
>  obj-$(CONFIG_ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += arm_cspmu/
>  obj-$(CONFIG_MESON_DDR_PMU) += amlogic/
>  obj-$(CONFIG_CXL_PMU) += cxl_pmu.o
> diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c
> new file mode 100644
> index 000000000000..ddb06d763b0c
> --- /dev/null
> +++ b/drivers/perf/dwc_pcie_pmu.c
> @@ -0,0 +1,770 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Synopsys DesignWare PCIe PMU driver
> + *
> + * Copyright (C) 2021-2023 Alibaba Inc.
> + */
> +
> +#include <linux/bitfield.h>
> +#include <linux/bitops.h>
> +#include <linux/cpuhotplug.h>
> +#include <linux/cpumask.h>
> +#include <linux/device.h>
> +#include <linux/errno.h>
> +#include <linux/kernel.h>
> +#include <linux/list.h>
> +#include <linux/perf_event.h>
> +#include <linux/pci.h>
> +#include <linux/platform_device.h>
> +#include <linux/smp.h>
> +#include <linux/sysfs.h>
> +#include <linux/types.h>
> +
> +#define DWC_PCIE_VSEC_RAS_DES_ID		0x02
> +#define DWC_PCIE_EVENT_CNT_CTL			0x8
> +
> +/*
> + * Event Counter Data Select includes two parts:
> + * - 27-24: Group number(4-bit: 0..0x7)
> + * - 23-16: Event number(8-bit: 0..0x13) within the Group
> + *
> + * Put them together as in TRM.
> + */
> +#define DWC_PCIE_CNT_EVENT_SEL			GENMASK(27, 16)
> +#define DWC_PCIE_CNT_LANE_SEL			GENMASK(11, 8)
> +#define DWC_PCIE_CNT_STATUS			BIT(7)
> +#define DWC_PCIE_CNT_ENABLE			GENMASK(4, 2)
> +#define DWC_PCIE_PER_EVENT_OFF			0x1
> +#define DWC_PCIE_PER_EVENT_ON			0x3
> +#define DWC_PCIE_EVENT_CLEAR			GENMASK(1, 0)
> +#define DWC_PCIE_EVENT_PER_CLEAR		0x1
> +
> +#define DWC_PCIE_EVENT_CNT_DATA			0xC
> +
> +#define DWC_PCIE_TIME_BASED_ANAL_CTL		0x10
> +#define DWC_PCIE_TIME_BASED_REPORT_SEL		GENMASK(31, 24)
> +#define DWC_PCIE_TIME_BASED_DURATION_SEL	GENMASK(15, 8)
> +#define DWC_PCIE_DURATION_MANUAL_CTL		0x0
> +#define DWC_PCIE_DURATION_1MS			0x1
> +#define DWC_PCIE_DURATION_10MS			0x2
> +#define DWC_PCIE_DURATION_100MS			0x3
> +#define DWC_PCIE_DURATION_1S			0x4
> +#define DWC_PCIE_DURATION_2S			0x5
> +#define DWC_PCIE_DURATION_4S			0x6
> +#define DWC_PCIE_DURATION_4US			0xFF
> +#define DWC_PCIE_TIME_BASED_TIMER_START		BIT(0)
> +#define DWC_PCIE_TIME_BASED_CNT_ENABLE		0x1
> +
> +#define DWC_PCIE_TIME_BASED_ANAL_DATA_REG_LOW	0x14
> +#define DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH	0x18
> +
> +/* Event attributes */
> +#define DWC_PCIE_CONFIG_EVENTID			GENMASK(15, 0)
> +#define DWC_PCIE_CONFIG_TYPE			GENMASK(19, 16)
> +#define DWC_PCIE_CONFIG_LANE			GENMASK(27, 20)
> +
> +#define DWC_PCIE_EVENT_ID(event)	FIELD_GET(DWC_PCIE_CONFIG_EVENTID, (event)->attr.config)
> +#define DWC_PCIE_EVENT_TYPE(event)	FIELD_GET(DWC_PCIE_CONFIG_TYPE, (event)->attr.config)
> +#define DWC_PCIE_EVENT_LANE(event)	FIELD_GET(DWC_PCIE_CONFIG_LANE, (event)->attr.config)
> +
> +enum dwc_pcie_event_type {
> +	DWC_PCIE_TIME_BASE_EVENT,
> +	DWC_PCIE_LANE_EVENT,
> +	DWC_PCIE_EVENT_TYPE_MAX,
> +};
> +
> +#define DWC_PCIE_LANE_EVENT_MAX_PERIOD		GENMASK_ULL(31, 0)
> +#define DWC_PCIE_TIME_BASED_EVENT_MAX_PERIOD	GENMASK_ULL(63, 0)
> +
> +struct dwc_pcie_pmu {
> +	struct pmu		pmu;
> +	struct pci_dev		*pdev;		/* Root Port device */
> +	u16			ras_des_offset;
> +	u32			nr_lanes;
> +
> +	struct list_head	pmu_node;
> +	struct hlist_node	cpuhp_node;
> +	struct perf_event	*event[DWC_PCIE_EVENT_TYPE_MAX];
> +	int			on_cpu;
> +	bool			registered;
> +};
> +
> +#define to_dwc_pcie_pmu(p) (container_of(p, struct dwc_pcie_pmu, pmu))
> +
> +static struct platform_device *dwc_pcie_pmu_dev;
> +static int dwc_pcie_pmu_hp_state;
> +static struct list_head dwc_pcie_pmu_head = LIST_HEAD_INIT(dwc_pcie_pmu_head);
> +
> +static ssize_t cpumask_show(struct device *dev,
> +					 struct device_attribute *attr,
> +					 char *buf)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(dev_get_drvdata(dev));
> +
> +	return cpumap_print_to_pagebuf(true, buf, cpumask_of(pcie_pmu->on_cpu));
> +}
> +static DEVICE_ATTR_RO(cpumask);
> +
> +static struct attribute *dwc_pcie_pmu_cpumask_attrs[] = {
> +	&dev_attr_cpumask.attr,
> +	NULL
> +};
> +
> +static struct attribute_group dwc_pcie_cpumask_attr_group = {
> +	.attrs = dwc_pcie_pmu_cpumask_attrs,
> +};
> +
> +struct dwc_pcie_format_attr {
> +	struct device_attribute attr;
> +	u64 field;
> +	int config;
> +};
> +
> +static ssize_t dwc_pcie_pmu_format_show(struct device *dev,
> +					struct device_attribute *attr,
> +					char *buf)
> +{
> +	struct dwc_pcie_format_attr *fmt = container_of(attr, typeof(*fmt), attr);
> +	int lo = __ffs(fmt->field), hi = __fls(fmt->field);
> +
> +	return sysfs_emit(buf, "config:%d-%d\n", lo, hi);
> +}
> +
> +#define _dwc_pcie_format_attr(_name, _cfg, _fld)			    \
> +	(&((struct dwc_pcie_format_attr[]) {{				    \
> +		.attr = __ATTR(_name, 0444, dwc_pcie_pmu_format_show, NULL),\
> +		.config = _cfg,						    \
> +		.field = _fld,						    \
> +	}})[0].attr.attr)
> +
> +#define dwc_pcie_format_attr(_name, _fld)	_dwc_pcie_format_attr(_name, 0, _fld)
> +
> +static struct attribute *dwc_pcie_format_attrs[] = {
> +	dwc_pcie_format_attr(type, DWC_PCIE_CONFIG_TYPE),
> +	dwc_pcie_format_attr(eventid, DWC_PCIE_CONFIG_EVENTID),
> +	dwc_pcie_format_attr(lane, DWC_PCIE_CONFIG_LANE),
> +	NULL,
> +};
> +
> +static struct attribute_group dwc_pcie_format_attrs_group = {
> +	.name = "format",
> +	.attrs = dwc_pcie_format_attrs,
> +};
> +
> +struct dwc_pcie_event_attr {
> +	struct device_attribute attr;
> +	enum dwc_pcie_event_type type;
> +	u16 eventid;
> +	u8 lane;
> +};
> +
> +static ssize_t dwc_pcie_event_show(struct device *dev,
> +				struct device_attribute *attr, char *buf)
> +{
> +	struct dwc_pcie_event_attr *eattr;
> +
> +	eattr = container_of(attr, typeof(*eattr), attr);
> +
> +	if (eattr->type == DWC_PCIE_LANE_EVENT)
> +		return sysfs_emit(buf, "eventid=0x%x,type=0x%x,lane=?\n",
> +				  eattr->eventid, eattr->type);
> +	else if (eattr->type == DWC_PCIE_TIME_BASE_EVENT)
> +		return sysfs_emit(buf, "eventid=0x%x,type=0x%x\n",
> +				  eattr->eventid, eattr->type);
> +
> +	return 0;
> +}
> +
> +#define DWC_PCIE_EVENT_ATTR(_name, _type, _eventid, _lane)		\
> +	(&((struct dwc_pcie_event_attr[]) {{				\
> +		.attr = __ATTR(_name, 0444, dwc_pcie_event_show, NULL),	\
> +		.type = _type,						\
> +		.eventid = _eventid,					\
> +		.lane = _lane,						\
> +	}})[0].attr.attr)
> +
> +#define DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(_name, _eventid)		\
> +	DWC_PCIE_EVENT_ATTR(_name, DWC_PCIE_TIME_BASE_EVENT, _eventid, 0)
> +#define DWC_PCIE_PMU_LANE_EVENT_ATTR(_name, _eventid)			\
> +	DWC_PCIE_EVENT_ATTR(_name, DWC_PCIE_LANE_EVENT, _eventid, 0)
> +
> +static struct attribute *dwc_pcie_pmu_time_event_attrs[] = {
> +	/* Group #0 */
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(one_cycle, 0x00),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(TX_L0S, 0x01),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(RX_L0S, 0x02),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L0, 0x03),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1, 0x04),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1_1, 0x05),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1_2, 0x06),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(CFG_RCVRY, 0x07),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(TX_RX_L0S, 0x08),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1_AUX, 0x09),
> +
> +	/* Group #1 */
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Tx_PCIe_TLP_Data_Payload, 0x20),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Rx_PCIe_TLP_Data_Payload, 0x21),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Tx_CCIX_TLP_Data_Payload, 0x22),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Rx_CCIX_TLP_Data_Payload, 0x23),
> +
> +	/*
> +	 * Leave it to the user to specify the lane ID to avoid generating
> +	 * a list of hundreds of events.
> +	 */
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_ack_dllp, 0x600),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_update_fc_dllp, 0x601),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_ack_dllp, 0x602),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_update_fc_dllp, 0x603),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_nulified_tlp, 0x604),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_nulified_tlp, 0x605),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_duplicate_tl, 0x606),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_memory_write, 0x700),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_memory_read, 0x701),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_configuration_write, 0x702),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_configuration_read, 0x703),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_io_write, 0x704),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_io_read, 0x705),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_completion_without_data, 0x706),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_completion_with_data, 0x707),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_message_tlp, 0x708),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_atomic, 0x709),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_tlp_with_prefix, 0x70A),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_memory_write, 0x70B),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_memory_read, 0x70C),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_io_write, 0x70F),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_io_read, 0x710),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_completion_without_data, 0x711),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_completion_with_data, 0x712),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_message_tlp, 0x713),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_atomic, 0x714),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_tlp_with_prefix, 0x715),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_ccix_tlp, 0x716),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_ccix_tlp, 0x717),
> +	NULL
> +};
> +
> +static const struct attribute_group dwc_pcie_event_attrs_group = {
> +	.name = "events",
> +	.attrs = dwc_pcie_pmu_time_event_attrs,
> +};
> +
> +static const struct attribute_group *dwc_pcie_attr_groups[] = {
> +	&dwc_pcie_event_attrs_group,
> +	&dwc_pcie_format_attrs_group,
> +	&dwc_pcie_cpumask_attr_group,
> +	NULL
> +};
> +
> +static void dwc_pcie_pmu_lane_event_enable(struct dwc_pcie_pmu *pcie_pmu,
> +					   bool enable)
> +{
> +	struct pci_dev *pdev = pcie_pmu->pdev;
> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
> +	u32 val;
> +
> +	pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, &val);
> +
> +	/* Clear DWC_PCIE_CNT_ENABLE field first */
> +	val &= ~DWC_PCIE_CNT_ENABLE;
> +	if (enable)
> +		val |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_ON);
> +	else
> +		val |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF);
> +
> +	pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, val);
> +}
> +
> +static void dwc_pcie_pmu_time_based_event_enable(struct dwc_pcie_pmu *pcie_pmu,
> +					  bool enable)
> +{
> +	struct pci_dev *pdev = pcie_pmu->pdev;
> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
> +	u32 val;
> +
> +	pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL,
> +			      &val);
> +
> +	if (enable)
> +		val |= DWC_PCIE_TIME_BASED_CNT_ENABLE;
> +	else
> +		val &= ~DWC_PCIE_TIME_BASED_CNT_ENABLE;
> +
> +	pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL,
> +			       val);
> +}
> +
> +static u64 dwc_pcie_pmu_read_lane_event_counter(struct perf_event *event)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	struct pci_dev *pdev = pcie_pmu->pdev;
> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
> +	u32 val;
> +
> +	pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_DATA, &val);
> +
> +	return val;
> +}
> +
> +static u64 dwc_pcie_pmu_read_time_based_counter(struct perf_event *event)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	struct pci_dev *pdev = pcie_pmu->pdev;
> +	int event_id = DWC_PCIE_EVENT_ID(event);
> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
> +	u32 lo, hi, ss;
> +
> +	/*
> +	 * The 64-bit value of the data counter is spread across two
> +	 * registers that are not synchronized. In order to read them
> +	 * atomically, ensure that the high 32 bits match before and after
> +	 * reading the low 32 bits.
> +	 */
> +	pci_read_config_dword(pdev, ras_des_offset +
> +		DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH, &hi);
> +	do {
> +		/* snapshot the high 32 bits */
> +		ss = hi;
> +
> +		pci_read_config_dword(
> +			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_DATA_REG_LOW,
> +			&lo);
> +		pci_read_config_dword(
> +			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH,
> +			&hi);
> +	} while (hi != ss);
> +
> +	/*
> +	 * The Group#1 event measures the amount of data processed in 16-byte
> +	 * units. Simplify the end-user interface by multiplying the counter
> +	 * at the point of read.
> +	 */
> +	if (event_id >= 0x20 && event_id <= 0x23)
> +		return (((u64)hi << 32) | lo) << 4;
> +	else
> +		return (((u64)hi << 32) | lo);
> +}
> +
> +static void dwc_pcie_pmu_event_update(struct perf_event *event)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +	u64 delta, prev, now;
> +
> +	do {
> +		prev = local64_read(&hwc->prev_count);
> +
> +		if (type == DWC_PCIE_LANE_EVENT)
> +			now = dwc_pcie_pmu_read_lane_event_counter(event);
> +		else if (type == DWC_PCIE_TIME_BASE_EVENT)
> +			now = dwc_pcie_pmu_read_time_based_counter(event);
> +
> +	} while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
> +
> +	if (type == DWC_PCIE_LANE_EVENT)
> +		delta = (now - prev) & DWC_PCIE_LANE_EVENT_MAX_PERIOD;
> +	else if (type == DWC_PCIE_TIME_BASE_EVENT)
> +		delta = (now - prev) & DWC_PCIE_TIME_BASED_EVENT_MAX_PERIOD;
> +
> +	local64_add(delta, &event->count);
> +}
> +
> +static int dwc_pcie_pmu_event_init(struct perf_event *event)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +	struct perf_event *sibling;
> +	u32 lane;
> +
> +	if (event->attr.type != event->pmu->type)
> +		return -ENOENT;
> +
> +	/* We don't support sampling */
> +	if (is_sampling_event(event))
> +		return -EINVAL;
> +
> +	/* We cannot support task bound events */
> +	if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK)
> +		return -EINVAL;
> +
> +	if (event->group_leader != event &&
> +	    !is_software_event(event->group_leader))
> +		return -EINVAL;
> +
> +	for_each_sibling_event(sibling, event->group_leader) {
> +		if (sibling->pmu != event->pmu && !is_software_event(sibling))
> +			return -EINVAL;
> +	}
> +
> +	if (type == DWC_PCIE_LANE_EVENT) {
> +		lane = DWC_PCIE_EVENT_LANE(event);
> +		if (lane < 0 || lane >= pcie_pmu->nr_lanes)
> +			return -EINVAL;
> +	}
> +
> +	event->cpu = pcie_pmu->on_cpu;
> +
> +	return 0;
> +}
> +
> +static void dwc_pcie_pmu_set_period(struct hw_perf_event *hwc)
> +{
> +	local64_set(&hwc->prev_count, 0);
> +}
> +
> +static void dwc_pcie_pmu_event_start(struct perf_event *event, int flags)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +
> +	hwc->state = 0;
> +	dwc_pcie_pmu_set_period(hwc);
> +
> +	if (type == DWC_PCIE_LANE_EVENT)
> +		dwc_pcie_pmu_lane_event_enable(pcie_pmu, true);
> +	else if (type == DWC_PCIE_TIME_BASE_EVENT)
> +		dwc_pcie_pmu_time_based_event_enable(pcie_pmu, true);
> +}
> +
> +static void dwc_pcie_pmu_event_stop(struct perf_event *event, int flags)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +	struct hw_perf_event *hwc = &event->hw;
> +
> +	if (event->hw.state & PERF_HES_STOPPED)
> +		return;
> +
> +	if (type == DWC_PCIE_LANE_EVENT)
> +		dwc_pcie_pmu_lane_event_enable(pcie_pmu, false);
> +	else if (type == DWC_PCIE_TIME_BASE_EVENT)
> +		dwc_pcie_pmu_time_based_event_enable(pcie_pmu, false);
> +
> +	dwc_pcie_pmu_event_update(event);
> +	hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
> +}
> +
> +static int dwc_pcie_pmu_event_add(struct perf_event *event, int flags)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	struct pci_dev *pdev = pcie_pmu->pdev;
> +	struct hw_perf_event *hwc = &event->hw;
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +	int event_id = DWC_PCIE_EVENT_ID(event);
> +	int lane = DWC_PCIE_EVENT_LANE(event);
> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
> +	u32 ctrl;
> +
> +	/* one counter for each type and it is in use */
> +	if (pcie_pmu->event[type])
> +		return -ENOSPC;
> +
> +	pcie_pmu->event[type] = event;
> +	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
> +
> +	if (type == DWC_PCIE_LANE_EVENT) {
> +		/* EVENT_COUNTER_DATA_REG needs clear manually */
> +		ctrl = FIELD_PREP(DWC_PCIE_CNT_EVENT_SEL, event_id) |
> +			FIELD_PREP(DWC_PCIE_CNT_LANE_SEL, lane) |
> +			FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF) |
> +			FIELD_PREP(DWC_PCIE_EVENT_CLEAR, DWC_PCIE_EVENT_PER_CLEAR);
> +		pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL,
> +				       ctrl);
> +	} else if (type == DWC_PCIE_TIME_BASE_EVENT) {
> +		/*
> +		 * TIME_BASED_ANAL_DATA_REG is a 64 bit register, we can safely
> +		 * use it with any manually controlled duration. And it is
> +		 * cleared when next measurement starts.
> +		 */
> +		ctrl = FIELD_PREP(DWC_PCIE_TIME_BASED_REPORT_SEL, event_id) |
> +			FIELD_PREP(DWC_PCIE_TIME_BASED_DURATION_SEL,
> +				   DWC_PCIE_DURATION_MANUAL_CTL) |
> +			DWC_PCIE_TIME_BASED_CNT_ENABLE;
> +		pci_write_config_dword(
> +			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL, ctrl);
> +	}
> +
> +	if (flags & PERF_EF_START)
> +		dwc_pcie_pmu_event_start(event, PERF_EF_RELOAD);
> +
> +	perf_event_update_userpage(event);
> +
> +	return 0;
> +}
> +
> +static void dwc_pcie_pmu_event_del(struct perf_event *event, int flags)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +
> +	dwc_pcie_pmu_event_stop(event, flags | PERF_EF_UPDATE);
> +	perf_event_update_userpage(event);
> +	pcie_pmu->event[type] = NULL;
> +}
> +
> +static void dwc_pcie_pmu_remove_cpuhp_instance(void *hotplug_node)
> +{
> +	cpuhp_state_remove_instance_nocalls(dwc_pcie_pmu_hp_state, hotplug_node);
> +}
> +
> +/*
> + * Find the PMU of a PCI device.
> + * @pdev: The PCI device.
> + */
> +static struct dwc_pcie_pmu *dwc_pcie_find_dev_pmu(struct pci_dev *pdev)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu;
> +
> +	list_for_each_entry(pcie_pmu, &dwc_pcie_pmu_head, pmu_node)
> +		if (pcie_pmu->pdev == pdev)
> +			return pcie_pmu;
> +
> +	return NULL;
> +}
> +
> +static void dwc_pcie_pmu_unregister_pmu(void *data)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = data;
> +
> +	if (!pcie_pmu->registered)
> +		return;
> +
> +	pcie_pmu->registered = false;
> +	list_del(&pcie_pmu->pmu_node);
> +	perf_pmu_unregister(&pcie_pmu->pmu);
> +}
> +
> +static int dwc_pcie_pmu_notifier(struct notifier_block *nb,
> +				     unsigned long action, void *data)
> +{
> +	struct device *dev = data;
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +	struct dwc_pcie_pmu *pcie_pmu;
> +
> +	/* Unregister the PMU when the device is going to be deleted. */
> +	if (action != BUS_NOTIFY_DEL_DEVICE)
> +		return NOTIFY_DONE;
> +
> +	pcie_pmu = dwc_pcie_find_dev_pmu(pdev);
> +	if (!pcie_pmu)
> +		return NOTIFY_DONE;
> +
> +	dwc_pcie_pmu_unregister_pmu(pcie_pmu);
> +
> +	return NOTIFY_OK;
> +}
> +
> +static struct notifier_block dwc_pcie_pmu_nb = {
> +	.notifier_call = dwc_pcie_pmu_notifier,
> +};
> +
> +static void dwc_pcie_pmu_unregister_nb(void *data)
> +{
> +	bus_unregister_notifier(&pci_bus_type, &dwc_pcie_pmu_nb);
> +}
> +
> +static int dwc_pcie_pmu_probe(struct platform_device *plat_dev)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct dwc_pcie_pmu *pcie_pmu;
> +	bool notify = false;
> +	char *name;
> +	u32 bdf;
> +	int ret;
> +
> +	/* Match the rootport with VSEC_RAS_DES_ID, and register a PMU for it */
> +	for_each_pci_dev(pdev) {
> +		u16 vsec;
> +		u32 val;
> +
> +		if (!(pci_is_pcie(pdev) &&
> +		      pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT))
> +			continue;
> +
> +		vsec = pci_find_vsec_capability(pdev, PCI_VENDOR_ID_ALIBABA,
> +						DWC_PCIE_VSEC_RAS_DES_ID);
> +		if (!vsec)
> +			continue;
> +
> +		pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &val);
> +		if (PCI_VNDR_HEADER_REV(val) != 0x04)
> +			continue;
> +		pci_dbg(pdev,
> +			"Detected PCIe Vendor-Specific Extended Capability RAS DES\n");
> +
> +		bdf = PCI_DEVID(pdev->bus->number, pdev->devfn);
> +		name = devm_kasprintf(&plat_dev->dev, GFP_KERNEL, "dwc_rootport_%x",
> +				      bdf);
> +		if (!name) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		/* All checks passed, go go go */
> +		pcie_pmu = devm_kzalloc(&plat_dev->dev, sizeof(*pcie_pmu), GFP_KERNEL);
> +		if (!pcie_pmu) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		pcie_pmu->pdev = pdev;
> +		pcie_pmu->ras_des_offset = vsec;
> +		pcie_pmu->nr_lanes = pcie_get_width_cap(pdev);
> +		pcie_pmu->on_cpu = -1;
> +		pcie_pmu->pmu = (struct pmu){
> +			.module		= THIS_MODULE,
> +			.attr_groups	= dwc_pcie_attr_groups,
> +			.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
> +			.task_ctx_nr	= perf_invalid_context,
> +			.event_init	= dwc_pcie_pmu_event_init,
> +			.add		= dwc_pcie_pmu_event_add,
> +			.del		= dwc_pcie_pmu_event_del,
> +			.start		= dwc_pcie_pmu_event_start,
> +			.stop		= dwc_pcie_pmu_event_stop,
> +			.read		= dwc_pcie_pmu_event_update,
> +		};
> +
> +		/* Add this instance to the list used by the offline callback */
> +		ret = cpuhp_state_add_instance(dwc_pcie_pmu_hp_state,
> +					       &pcie_pmu->cpuhp_node);
> +		if (ret) {
> +			pci_err(pdev,
> +				"Error %d registering hotplug @%x\n", ret, bdf);
> +			goto out;
> +		}
> +
> +		/* Unwind when platform driver removes */
> +		ret = devm_add_action_or_reset(
> +			&plat_dev->dev, dwc_pcie_pmu_remove_cpuhp_instance,
> +			&pcie_pmu->cpuhp_node);
> +		if (ret)
> +			goto out;
> +
> +		ret = perf_pmu_register(&pcie_pmu->pmu, name, -1);
> +		if (ret) {
> +			pci_err(pdev,
> +				"Error %d registering PMU @%x\n", ret, bdf);
> +			goto out;
> +		}
> +
> +		/* Cache PMU to handle pci device hotplug */
> +		list_add(&pcie_pmu->pmu_node, &dwc_pcie_pmu_head);
> +		pcie_pmu->registered = true;
> +		notify = true;
> +
> +		ret = devm_add_action_or_reset(
> +			&plat_dev->dev, dwc_pcie_pmu_unregister_pmu, pcie_pmu);
> +		if (ret)
> +			goto out;
> +	}
> +
> +	if (notify && !bus_register_notifier(&pci_bus_type, &dwc_pcie_pmu_nb))
> +		return devm_add_action_or_reset(
> +			&plat_dev->dev, dwc_pcie_pmu_unregister_nb, NULL);

Maybe you can register the notifier firstly in the probe(). It'll be unregistered
once failed to add the PMU. If no PMU registered it also should be ok since the
PMU list will be empty and notifier callback will do nothing.

This may address one potential race on driver removal. Since the notifier will be
unregistered firstly but the PMU's still registered and may have chance to
access pointer to the root port. However it's so extreme so may never happen.

> +
> +	return 0;
> +
> +out:
> +	pci_dev_put(pdev);
> +
> +	return ret;
> +}
> +
> +static int dwc_pcie_pmu_online_cpu(unsigned int cpu, struct hlist_node *cpuhp_node)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu;
> +
> +	pcie_pmu = hlist_entry_safe(cpuhp_node, struct dwc_pcie_pmu, cpuhp_node);
> +	if (pcie_pmu->on_cpu == -1)
> +		pcie_pmu->on_cpu = cpumask_local_spread(
> +			0, dev_to_node(&pcie_pmu->pdev->dev));
> +
> +	return 0;
> +}
> +
> +static int dwc_pcie_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_node)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu;
> +	struct pci_dev *pdev;
> +	int node;
> +	cpumask_t mask;
> +	unsigned int target;
> +
> +	pcie_pmu = hlist_entry_safe(cpuhp_node, struct dwc_pcie_pmu, cpuhp_node);
> +	/* Nothing to do if this CPU doesn't own the PMU */
> +	if (cpu != pcie_pmu->on_cpu)
> +		return 0;
> +
> +	pcie_pmu->on_cpu = -1;
> +	pdev = pcie_pmu->pdev;
> +	node = dev_to_node(&pdev->dev);
> +	if (cpumask_and(&mask, cpumask_of_node(node), cpu_online_mask) &&
> +	    cpumask_andnot(&mask, &mask, cpumask_of(cpu)))
> +		target = cpumask_any(&mask);
> +	else
> +		target = cpumask_any_but(cpu_online_mask, cpu);
> +
> +	if (target >= nr_cpu_ids) {
> +		pci_err(pdev, "There is no CPU to set\n");
> +		return 0;
> +	}
> +
> +	/* This PMU does NOT support interrupt, just migrate context. */
> +	perf_pmu_migrate_context(&pcie_pmu->pmu, cpu, target);
> +	pcie_pmu->on_cpu = target;
> +
> +	return 0;
> +}
> +
> +static struct platform_driver dwc_pcie_pmu_driver = {
> +	.probe = dwc_pcie_pmu_probe,
> +	.driver = {.name = "dwc_pcie_pmu",},
> +};
> +
> +static int __init dwc_pcie_pmu_init(void)
> +{
> +	int ret;
> +
> +	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
> +				      "perf/dwc_pcie_pmu:online",
> +				      dwc_pcie_pmu_online_cpu,
> +				      dwc_pcie_pmu_offline_cpu);
> +	if (ret < 0)
> +		return ret;
> +
> +	dwc_pcie_pmu_hp_state = ret;
> +
> +	ret = platform_driver_register(&dwc_pcie_pmu_driver);
> +	if (ret)
> +		goto platform_driver_register_err;
> +
> +	dwc_pcie_pmu_dev = platform_device_register_simple(
> +				"dwc_pcie_pmu", PLATFORM_DEVID_NONE, NULL, 0);
> +	if (IS_ERR(dwc_pcie_pmu_dev)) {
> +		ret = PTR_ERR(dwc_pcie_pmu_dev);
> +		goto platform_device_register_error;
> +	}
> +
> +	return 0;
> +
> +platform_device_register_error:
> +	platform_driver_unregister(&dwc_pcie_pmu_driver);
> +platform_driver_register_err:
> +	cpuhp_remove_multi_state(dwc_pcie_pmu_hp_state);
> +
> +	return ret;
> +}
> +
> +static void __exit dwc_pcie_pmu_exit(void)
> +{
> +	platform_device_unregister(dwc_pcie_pmu_dev);
> +	platform_driver_unregister(&dwc_pcie_pmu_driver);
> +	cpuhp_remove_multi_state(dwc_pcie_pmu_hp_state);
> +}
> +
> +module_init(dwc_pcie_pmu_init);
> +module_exit(dwc_pcie_pmu_exit);
> +
> +MODULE_DESCRIPTION("PMU driver for DesignWare Cores PCI Express Controller");
> +MODULE_AUTHOR("Shuai Xue <xueshuai@linux.alibaba.com>");
> +MODULE_LICENSE("GPL v2");
>
Shuai Xue Oct. 23, 2023, 9:39 a.m. UTC | #6
On 2023/10/23 10:05, Baolin Wang wrote:
> 
> 
> On 10/22/2023 3:47 PM, Shuai Xue wrote:
>> Hi, Baolin,
>>
>> I droped your Revivewed-by tag due to that I made significant changes to this
>> patch previously, please explicty give me Revivewed-by tag again if you are
>> happy with the changes.
> 
> Yes, I am happy with this version (just some nits as below), and thanks for the review from other guys. Please feel free to add:
> 
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>


Thank you.

Best Regards,
Shuai

> 
>> On 2023/10/20 21:42, Shuai Xue wrote:
>>> This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
>>> for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
>>> Core controller IP which provides statistics feature. The PMU is a PCIe
>>> configuration space register block provided by each PCIe Root Port in a
>>> Vendor-Specific Extended Capability named RAS D.E.S (Debug, Error
>>> injection, and Statistics).
>>>
>>> To facilitate collection of statistics the controller provides the
>>> following two features for each Root Port:
>>>
>>> - one 64-bit counter for Time Based Analysis (RX/TX data throughput and
>>>    time spent in each low-power LTSSM state) and
>>> - one 32-bit counter for Event Counting (error and non-error events for
>>>    a specified lane)
>>>
>>> Note: There is no interrupt for counter overflow.
>>>
>>> This driver adds PMU devices for each PCIe Root Port. And the PMU device is
>>> named based the BDF of Root Port. For example,
>>>
>>>      30:03.0 PCI bridge: Device 1ded:8000 (rev 01)
>>>
>>> the PMU device name for this Root Port is dwc_rootport_3018.
>>>
>>> Example usage of counting PCIe RX TLP data payload (Units of bytes)::
>>>
>>>      $# perf stat -a -e dwc_rootport_3018/Rx_PCIe_TLP_Data_Payload/
>>>
>>> average RX bandwidth can be calculated like this:
>>>
>>>      PCIe TX Bandwidth = Rx_PCIe_TLP_Data_Payload / Measure_Time_Window
>>>
>>> Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
>>> ---
> 
> [snip]
> 
>>> +static u64 dwc_pcie_pmu_read_time_based_counter(struct perf_event *event)
>>> +{
>>> +    struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
>>> +    struct pci_dev *pdev = pcie_pmu->pdev;
>>> +    int event_id = DWC_PCIE_EVENT_ID(event);
>>> +    u16 ras_des_offset = pcie_pmu->ras_des_offset;
>>> +    u32 lo, hi, ss;
>>> +
>>> +    /*
>>> +     * The 64-bit value of the data counter is spread across two
>>> +     * registers that are not synchronized. In order to read them
>>> +     * atomically, ensure that the high 32 bits match before and after
>>> +     * reading the low 32 bits.
>>> +     */
>>> +    pci_read_config_dword(pdev, ras_des_offset +
>>> +        DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH, &hi);
>>> +    do {
>>> +        /* snapshot the high 32 bits */
>>> +        ss = hi;
>>> +
>>> +        pci_read_config_dword(
>>> +            pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_DATA_REG_LOW,
>>> +            &lo);
>>> +        pci_read_config_dword(
>>> +            pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH,
>>> +            &hi);
>>> +    } while (hi != ss);
>>> +
>>> +    /*
>>> +     * The Group#1 event measures the amount of data processed in 16-byte
>>> +     * units. Simplify the end-user interface by multiplying the counter
>>> +     * at the point of read.
>>> +     */
>>> +    if (event_id >= 0x20 && event_id <= 0x23)
>>> +        return (((u64)hi << 32) | lo) << 4;
>>> +    else
> 
> You can drop the 'else'.

Agreed, will fix it in next version.


>>> +
>>> +    event->cpu = pcie_pmu->on_cpu;
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static void dwc_pcie_pmu_set_period(struct hw_perf_event *hwc)
>>> +{
>>> +    local64_set(&hwc->prev_count, 0);
>>> +}
> 
> Only dwc_pcie_pmu_event_start() will call this small function, why just remove this function and move local64_set() into dwc_pcie_pmu_event_start()?

Good suggestion, will fix it.
Shuai Xue Oct. 23, 2023, 12:26 p.m. UTC | #7
On 2023/10/23 17:13, Yicong Yang wrote:
> Hi Shuai,
> 
> On 2023/10/20 21:42, Shuai Xue wrote:
>> This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
>> for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
>> Core controller IP which provides statistics feature. The PMU is a PCIe
>> configuration space register block provided by each PCIe Root Port in a
>> Vendor-Specific Extended Capability named RAS D.E.S (Debug, Error
>> injection, and Statistics).
>>
>> To facilitate collection of statistics the controller provides the
>> following two features for each Root Port:
>>
>> - one 64-bit counter for Time Based Analysis (RX/TX data throughput and
>>   time spent in each low-power LTSSM state) and
>> - one 32-bit counter for Event Counting (error and non-error events for
>>   a specified lane)
>>
>> Note: There is no interrupt for counter overflow.
>>
>> This driver adds PMU devices for each PCIe Root Port. And the PMU device is
>> named based the BDF of Root Port. For example,
>>
>>     30:03.0 PCI bridge: Device 1ded:8000 (rev 01)
>>
>> the PMU device name for this Root Port is dwc_rootport_3018.
>>
>> Example usage of counting PCIe RX TLP data payload (Units of bytes)::
>>
>>     $# perf stat -a -e dwc_rootport_3018/Rx_PCIe_TLP_Data_Payload/
>>
>> average RX bandwidth can be calculated like this:
>>
>>     PCIe TX Bandwidth = Rx_PCIe_TLP_Data_Payload / Measure_Time_Window
>>
>> Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
> 
> Just one nit below. Otherwise looks good to me,
> 
> Reviewed-by: Yicong Yang <yangyicong@hisilicon.com>


Hi, Yicong,

Time flies indeed. I am pleasantly surprised to realize that it has been
over a year since you sayed "Glad to see another PCIe PMU device!" in the
initial version. I want to express my deepest gratitude for the significant
time and effort you have invested in providing invaluable comments. Your
dedication has played a pivotal role in greatly enhancing the quality of
the code.

Thank you so much.

Best Regards,
Shuai

>> +
>> +static int dwc_pcie_pmu_probe(struct platform_device *plat_dev)
>> +{
>> +	struct pci_dev *pdev = NULL;
>> +	struct dwc_pcie_pmu *pcie_pmu;
>> +	bool notify = false;
>> +	char *name;
>> +	u32 bdf;
>> +	int ret;
>> +
>> +	/* Match the rootport with VSEC_RAS_DES_ID, and register a PMU for it */
>> +	for_each_pci_dev(pdev) {
>> +		u16 vsec;
>> +		u32 val;
>> +
>> +		if (!(pci_is_pcie(pdev) &&
>> +		      pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT))
>> +			continue;
>> +
>> +		vsec = pci_find_vsec_capability(pdev, PCI_VENDOR_ID_ALIBABA,
>> +						DWC_PCIE_VSEC_RAS_DES_ID);
>> +		if (!vsec)
>> +			continue;
>> +
>> +		pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &val);
>> +		if (PCI_VNDR_HEADER_REV(val) != 0x04)
>> +			continue;
>> +		pci_dbg(pdev,
>> +			"Detected PCIe Vendor-Specific Extended Capability RAS DES\n");
>> +
>> +		bdf = PCI_DEVID(pdev->bus->number, pdev->devfn);
>> +		name = devm_kasprintf(&plat_dev->dev, GFP_KERNEL, "dwc_rootport_%x",
>> +				      bdf);
>> +		if (!name) {
>> +			ret = -ENOMEM;
>> +			goto out;
>> +		}
>> +
>> +		/* All checks passed, go go go */
>> +		pcie_pmu = devm_kzalloc(&plat_dev->dev, sizeof(*pcie_pmu), GFP_KERNEL);
>> +		if (!pcie_pmu) {
>> +			ret = -ENOMEM;
>> +			goto out;
>> +		}
>> +
>> +		pcie_pmu->pdev = pdev;
>> +		pcie_pmu->ras_des_offset = vsec;
>> +		pcie_pmu->nr_lanes = pcie_get_width_cap(pdev);
>> +		pcie_pmu->on_cpu = -1;
>> +		pcie_pmu->pmu = (struct pmu){
>> +			.module		= THIS_MODULE,
>> +			.attr_groups	= dwc_pcie_attr_groups,
>> +			.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
>> +			.task_ctx_nr	= perf_invalid_context,
>> +			.event_init	= dwc_pcie_pmu_event_init,
>> +			.add		= dwc_pcie_pmu_event_add,
>> +			.del		= dwc_pcie_pmu_event_del,
>> +			.start		= dwc_pcie_pmu_event_start,
>> +			.stop		= dwc_pcie_pmu_event_stop,
>> +			.read		= dwc_pcie_pmu_event_update,
>> +		};
>> +
>> +		/* Add this instance to the list used by the offline callback */
>> +		ret = cpuhp_state_add_instance(dwc_pcie_pmu_hp_state,
>> +					       &pcie_pmu->cpuhp_node);
>> +		if (ret) {
>> +			pci_err(pdev,
>> +				"Error %d registering hotplug @%x\n", ret, bdf);
>> +			goto out;
>> +		}
>> +
>> +		/* Unwind when platform driver removes */
>> +		ret = devm_add_action_or_reset(
>> +			&plat_dev->dev, dwc_pcie_pmu_remove_cpuhp_instance,
>> +			&pcie_pmu->cpuhp_node);
>> +		if (ret)
>> +			goto out;
>> +
>> +		ret = perf_pmu_register(&pcie_pmu->pmu, name, -1);
>> +		if (ret) {
>> +			pci_err(pdev,
>> +				"Error %d registering PMU @%x\n", ret, bdf);
>> +			goto out;
>> +		}
>> +
>> +		/* Cache PMU to handle pci device hotplug */
>> +		list_add(&pcie_pmu->pmu_node, &dwc_pcie_pmu_head);
>> +		pcie_pmu->registered = true;
>> +		notify = true;
>> +
>> +		ret = devm_add_action_or_reset(
>> +			&plat_dev->dev, dwc_pcie_pmu_unregister_pmu, pcie_pmu);
>> +		if (ret)
>> +			goto out;
>> +	}
>> +
>> +	if (notify && !bus_register_notifier(&pci_bus_type, &dwc_pcie_pmu_nb))
>> +		return devm_add_action_or_reset(
>> +			&plat_dev->dev, dwc_pcie_pmu_unregister_nb, NULL);
> 
> Maybe you can register the notifier firstly in the probe(). It'll be unregistered
> once failed to add the PMU. If no PMU registered it also should be ok since the
> PMU list will be empty and notifier callback will do nothing.
> 
> This may address one potential race on driver removal. Since the notifier will be
> unregistered firstly 

You are right, the added action will be released in reverse order.

> but the PMU's still registered and may have chance to
> access pointer to the root port. However it's so extreme so may never happen.
> 

Good point, I will move to bus_register_notifier() to first order in probe().
Will Deacon Oct. 23, 2023, 12:32 p.m. UTC | #8
On Fri, Oct 20, 2023 at 09:42:29PM +0800, Shuai Xue wrote:
> This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
> for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
> Core controller IP which provides statistics feature. The PMU is a PCIe
> configuration space register block provided by each PCIe Root Port in a
> Vendor-Specific Extended Capability named RAS D.E.S (Debug, Error
> injection, and Statistics).

Thanks for this. It all looks pretty well written to me, especially the
documentation (thanks again!).

I just have a few comments inline...

> To facilitate collection of statistics the controller provides the
> following two features for each Root Port:
> 
> - one 64-bit counter for Time Based Analysis (RX/TX data throughput and
>   time spent in each low-power LTSSM state) and
> - one 32-bit counter for Event Counting (error and non-error events for
>   a specified lane)
> 
> Note: There is no interrupt for counter overflow.
> 
> This driver adds PMU devices for each PCIe Root Port. And the PMU device is
> named based the BDF of Root Port. For example,
> 
>     30:03.0 PCI bridge: Device 1ded:8000 (rev 01)
> 
> the PMU device name for this Root Port is dwc_rootport_3018.

Why not print this in b:d.f formatting then? For example,

	dwc_rootport_30:03.0

Does that confuse perf?

Also, should the segment/domain be factored in as well, in case we get
multiple instances of the IP and a resulting name collision?

> +struct dwc_pcie_format_attr {
> +	struct device_attribute attr;
> +	u64 field;
> +	int config;
> +};
> +
> +static ssize_t dwc_pcie_pmu_format_show(struct device *dev,
> +					struct device_attribute *attr,
> +					char *buf)
> +{
> +	struct dwc_pcie_format_attr *fmt = container_of(attr, typeof(*fmt), attr);
> +	int lo = __ffs(fmt->field), hi = __fls(fmt->field);
> +
> +	return sysfs_emit(buf, "config:%d-%d\n", lo, hi);
> +}
> +
> +#define _dwc_pcie_format_attr(_name, _cfg, _fld)			    \
> +	(&((struct dwc_pcie_format_attr[]) {{				    \
> +		.attr = __ATTR(_name, 0444, dwc_pcie_pmu_format_show, NULL),\
> +		.config = _cfg,						    \
> +		.field = _fld,						    \
> +	}})[0].attr.attr)
> +
> +#define dwc_pcie_format_attr(_name, _fld)	_dwc_pcie_format_attr(_name, 0, _fld)
> +
> +static struct attribute *dwc_pcie_format_attrs[] = {
> +	dwc_pcie_format_attr(type, DWC_PCIE_CONFIG_TYPE),
> +	dwc_pcie_format_attr(eventid, DWC_PCIE_CONFIG_EVENTID),
> +	dwc_pcie_format_attr(lane, DWC_PCIE_CONFIG_LANE),
> +	NULL,
> +};
> +
> +static struct attribute_group dwc_pcie_format_attrs_group = {
> +	.name = "format",
> +	.attrs = dwc_pcie_format_attrs,
> +};
> +
> +struct dwc_pcie_event_attr {
> +	struct device_attribute attr;
> +	enum dwc_pcie_event_type type;
> +	u16 eventid;
> +	u8 lane;
> +};

There are a bunch of helpers in linux/perf_event.h for handling some of
this sysfs stuff. For example, have a look at PMU_FORMAT_ATTR() and
friends to see if they work for you (some of the other PMU drivers under
drivers/perf/ use these).

> +static void dwc_pcie_pmu_lane_event_enable(struct dwc_pcie_pmu *pcie_pmu,
> +					   bool enable)
> +{
> +	struct pci_dev *pdev = pcie_pmu->pdev;
> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
> +	u32 val;
> +
> +	pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, &val);
> +
> +	/* Clear DWC_PCIE_CNT_ENABLE field first */
> +	val &= ~DWC_PCIE_CNT_ENABLE;
> +	if (enable)
> +		val |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_ON);
> +	else
> +		val |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF);
> +
> +	pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, val);
> +}
> +
> +static void dwc_pcie_pmu_time_based_event_enable(struct dwc_pcie_pmu *pcie_pmu,
> +					  bool enable)
> +{
> +	struct pci_dev *pdev = pcie_pmu->pdev;
> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
> +	u32 val;
> +
> +	pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL,
> +			      &val);
> +
> +	if (enable)
> +		val |= DWC_PCIE_TIME_BASED_CNT_ENABLE;
> +	else
> +		val &= ~DWC_PCIE_TIME_BASED_CNT_ENABLE;
> +
> +	pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL,
> +			       val);
> +}

I think you could implement both of these _enable() functions as simple
wrappers around something like pci_clear_and_set_dword() -- maybe that
could move into a header out of aspm.c?

> +static u64 dwc_pcie_pmu_read_lane_event_counter(struct perf_event *event)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	struct pci_dev *pdev = pcie_pmu->pdev;
> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
> +	u32 val;
> +
> +	pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_DATA, &val);
> +
> +	return val;
> +}
> +
> +static u64 dwc_pcie_pmu_read_time_based_counter(struct perf_event *event)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	struct pci_dev *pdev = pcie_pmu->pdev;
> +	int event_id = DWC_PCIE_EVENT_ID(event);
> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
> +	u32 lo, hi, ss;
> +
> +	/*
> +	 * The 64-bit value of the data counter is spread across two
> +	 * registers that are not synchronized. In order to read them
> +	 * atomically, ensure that the high 32 bits match before and after
> +	 * reading the low 32 bits.
> +	 */
> +	pci_read_config_dword(pdev, ras_des_offset +
> +		DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH, &hi);
> +	do {
> +		/* snapshot the high 32 bits */
> +		ss = hi;
> +
> +		pci_read_config_dword(
> +			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_DATA_REG_LOW,
> +			&lo);
> +		pci_read_config_dword(
> +			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH,
> +			&hi);
> +	} while (hi != ss);

I think it would be a good idea to bound this loop based on either number of
retries or a timeout. If the hardware wedges for whatever reason, we're
going to get stuck in here.

> +
> +	/*
> +	 * The Group#1 event measures the amount of data processed in 16-byte
> +	 * units. Simplify the end-user interface by multiplying the counter
> +	 * at the point of read.
> +	 */
> +	if (event_id >= 0x20 && event_id <= 0x23)
> +		return (((u64)hi << 32) | lo) << 4;
> +	else
> +		return (((u64)hi << 32) | lo);

nit, but I think it would be clearer to do:

	ret = ((u64)hi << 32) | lo;

	/* ... */
	if (event_id >= 0x20 && event_id <= 0x23)
		ret <<= 4;

	return ret;

> +}
> +
> +static void dwc_pcie_pmu_event_update(struct perf_event *event)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +	u64 delta, prev, now;
> +
> +	do {
> +		prev = local64_read(&hwc->prev_count);
> +
> +		if (type == DWC_PCIE_LANE_EVENT)
> +			now = dwc_pcie_pmu_read_lane_event_counter(event);
> +		else if (type == DWC_PCIE_TIME_BASE_EVENT)
> +			now = dwc_pcie_pmu_read_time_based_counter(event);
> +
> +	} while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
> +
> +	if (type == DWC_PCIE_LANE_EVENT)
> +		delta = (now - prev) & DWC_PCIE_LANE_EVENT_MAX_PERIOD;
> +	else if (type == DWC_PCIE_TIME_BASE_EVENT)
> +		delta = (now - prev) & DWC_PCIE_TIME_BASED_EVENT_MAX_PERIOD;

Similarly here, I think it would be clearer to construct a 'u64 max_period'
variable and then just unconditionally mask against that. In general, you
have quite a lot of 'if (type == LANE) ... else if (type == TIME) ...'
code in this driver. I think that's probably fine as long as we have two
event types, but if this extends in the future then it's probably worth
looking at having separate 'ops' structures for the event types and
dispatching to them directly.

> +static int dwc_pcie_pmu_event_init(struct perf_event *event)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +	struct perf_event *sibling;
> +	u32 lane;
> +
> +	if (event->attr.type != event->pmu->type)
> +		return -ENOENT;
> +
> +	/* We don't support sampling */
> +	if (is_sampling_event(event))
> +		return -EINVAL;
> +
> +	/* We cannot support task bound events */
> +	if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK)
> +		return -EINVAL;
> +
> +	if (event->group_leader != event &&
> +	    !is_software_event(event->group_leader))
> +		return -EINVAL;
> +
> +	for_each_sibling_event(sibling, event->group_leader) {
> +		if (sibling->pmu != event->pmu && !is_software_event(sibling))
> +			return -EINVAL;
> +	}
> +
> +	if (type == DWC_PCIE_LANE_EVENT) {
> +		lane = DWC_PCIE_EVENT_LANE(event);
> +		if (lane < 0 || lane >= pcie_pmu->nr_lanes)
> +			return -EINVAL;
> +	}
> +
> +	event->cpu = pcie_pmu->on_cpu;
> +
> +	return 0;
> +}
> +
> +static void dwc_pcie_pmu_set_period(struct hw_perf_event *hwc)
> +{
> +	local64_set(&hwc->prev_count, 0);
> +}
> +
> +static void dwc_pcie_pmu_event_start(struct perf_event *event, int flags)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +
> +	hwc->state = 0;
> +	dwc_pcie_pmu_set_period(hwc);
> +
> +	if (type == DWC_PCIE_LANE_EVENT)
> +		dwc_pcie_pmu_lane_event_enable(pcie_pmu, true);
> +	else if (type == DWC_PCIE_TIME_BASE_EVENT)
> +		dwc_pcie_pmu_time_based_event_enable(pcie_pmu, true);
> +}
> +
> +static void dwc_pcie_pmu_event_stop(struct perf_event *event, int flags)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +	struct hw_perf_event *hwc = &event->hw;
> +
> +	if (event->hw.state & PERF_HES_STOPPED)
> +		return;
> +
> +	if (type == DWC_PCIE_LANE_EVENT)
> +		dwc_pcie_pmu_lane_event_enable(pcie_pmu, false);
> +	else if (type == DWC_PCIE_TIME_BASE_EVENT)
> +		dwc_pcie_pmu_time_based_event_enable(pcie_pmu, false);
> +
> +	dwc_pcie_pmu_event_update(event);
> +	hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
> +}
> +
> +static int dwc_pcie_pmu_event_add(struct perf_event *event, int flags)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	struct pci_dev *pdev = pcie_pmu->pdev;
> +	struct hw_perf_event *hwc = &event->hw;
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +	int event_id = DWC_PCIE_EVENT_ID(event);
> +	int lane = DWC_PCIE_EVENT_LANE(event);
> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
> +	u32 ctrl;
> +
> +	/* one counter for each type and it is in use */
> +	if (pcie_pmu->event[type])
> +		return -ENOSPC;

I'm a bit worried about this -- isn't the type basically funneled in
directly from userspace? If so, it's not safe to use it as index like
this. It's probably better to sanitise the input early in
dwc_pcie_pmu_event_init(), so that we know we have either a lane or a
time event everywhere else.

If you haven't tried it, there's a decent fuzzing tool for perf, so it's
probably worth taking that for a spin (it might need educating about your
driver):

https://web.eece.maine.edu/~vweaver/projects/perf_events/fuzzer/

> +	if (type == DWC_PCIE_LANE_EVENT) {
> +		/* EVENT_COUNTER_DATA_REG needs clear manually */
> +		ctrl = FIELD_PREP(DWC_PCIE_CNT_EVENT_SEL, event_id) |
> +			FIELD_PREP(DWC_PCIE_CNT_LANE_SEL, lane) |
> +			FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF) |
> +			FIELD_PREP(DWC_PCIE_EVENT_CLEAR, DWC_PCIE_EVENT_PER_CLEAR);
> +		pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL,
> +				       ctrl);
> +	} else if (type == DWC_PCIE_TIME_BASE_EVENT) {
> +		/*
> +		 * TIME_BASED_ANAL_DATA_REG is a 64 bit register, we can safely
> +		 * use it with any manually controlled duration. And it is
> +		 * cleared when next measurement starts.
> +		 */
> +		ctrl = FIELD_PREP(DWC_PCIE_TIME_BASED_REPORT_SEL, event_id) |
> +			FIELD_PREP(DWC_PCIE_TIME_BASED_DURATION_SEL,
> +				   DWC_PCIE_DURATION_MANUAL_CTL) |
> +			DWC_PCIE_TIME_BASED_CNT_ENABLE;
> +		pci_write_config_dword(
> +			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL, ctrl);

Maybe move these into separate lane/time helpers rather than clutter this
function with the field definitions?

> +static void dwc_pcie_pmu_event_del(struct perf_event *event, int flags)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +
> +	dwc_pcie_pmu_event_stop(event, flags | PERF_EF_UPDATE);
> +	perf_event_update_userpage(event);
> +	pcie_pmu->event[type] = NULL;
> +}
> +
> +static void dwc_pcie_pmu_remove_cpuhp_instance(void *hotplug_node)
> +{
> +	cpuhp_state_remove_instance_nocalls(dwc_pcie_pmu_hp_state, hotplug_node);
> +}
> +
> +/*
> + * Find the PMU of a PCI device.
> + * @pdev: The PCI device.
> + */
> +static struct dwc_pcie_pmu *dwc_pcie_find_dev_pmu(struct pci_dev *pdev)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu;
> +
> +	list_for_each_entry(pcie_pmu, &dwc_pcie_pmu_head, pmu_node)
> +		if (pcie_pmu->pdev == pdev)
> +			return pcie_pmu;
> +
> +	return NULL;
> +}
> +
> +static void dwc_pcie_pmu_unregister_pmu(void *data)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = data;
> +
> +	if (!pcie_pmu->registered)
> +		return;
> +
> +	pcie_pmu->registered = false;
> +	list_del(&pcie_pmu->pmu_node);
> +	perf_pmu_unregister(&pcie_pmu->pmu);

Do you not need any locking here? The cpu hotplug callbacks are still live
and I'm not seeing how you prevent them from picking up the PMU from the
list right before you unregister it.

> +}
> +
> +static int dwc_pcie_pmu_notifier(struct notifier_block *nb,
> +				     unsigned long action, void *data)
> +{
> +	struct device *dev = data;
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +	struct dwc_pcie_pmu *pcie_pmu;
> +
> +	/* Unregister the PMU when the device is going to be deleted. */
> +	if (action != BUS_NOTIFY_DEL_DEVICE)
> +		return NOTIFY_DONE;
> +
> +	pcie_pmu = dwc_pcie_find_dev_pmu(pdev);
> +	if (!pcie_pmu)
> +		return NOTIFY_DONE;
> +
> +	dwc_pcie_pmu_unregister_pmu(pcie_pmu);
> +
> +	return NOTIFY_OK;
> +}
> +
> +static struct notifier_block dwc_pcie_pmu_nb = {
> +	.notifier_call = dwc_pcie_pmu_notifier,
> +};
> +
> +static void dwc_pcie_pmu_unregister_nb(void *data)
> +{
> +	bus_unregister_notifier(&pci_bus_type, &dwc_pcie_pmu_nb);
> +}
> +
> +static int dwc_pcie_pmu_probe(struct platform_device *plat_dev)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct dwc_pcie_pmu *pcie_pmu;
> +	bool notify = false;
> +	char *name;
> +	u32 bdf;
> +	int ret;
> +
> +	/* Match the rootport with VSEC_RAS_DES_ID, and register a PMU for it */
> +	for_each_pci_dev(pdev) {
> +		u16 vsec;
> +		u32 val;
> +
> +		if (!(pci_is_pcie(pdev) &&
> +		      pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT))
> +			continue;
> +
> +		vsec = pci_find_vsec_capability(pdev, PCI_VENDOR_ID_ALIBABA,
> +						DWC_PCIE_VSEC_RAS_DES_ID);
> +		if (!vsec)
> +			continue;
> +
> +		pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &val);
> +		if (PCI_VNDR_HEADER_REV(val) != 0x04)
> +			continue;
> +		pci_dbg(pdev,
> +			"Detected PCIe Vendor-Specific Extended Capability RAS DES\n");
> +
> +		bdf = PCI_DEVID(pdev->bus->number, pdev->devfn);
> +		name = devm_kasprintf(&plat_dev->dev, GFP_KERNEL, "dwc_rootport_%x",
> +				      bdf);
> +		if (!name) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		/* All checks passed, go go go */
> +		pcie_pmu = devm_kzalloc(&plat_dev->dev, sizeof(*pcie_pmu), GFP_KERNEL);
> +		if (!pcie_pmu) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		pcie_pmu->pdev = pdev;
> +		pcie_pmu->ras_des_offset = vsec;
> +		pcie_pmu->nr_lanes = pcie_get_width_cap(pdev);
> +		pcie_pmu->on_cpu = -1;
> +		pcie_pmu->pmu = (struct pmu){
> +			.module		= THIS_MODULE,
> +			.attr_groups	= dwc_pcie_attr_groups,
> +			.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
> +			.task_ctx_nr	= perf_invalid_context,
> +			.event_init	= dwc_pcie_pmu_event_init,
> +			.add		= dwc_pcie_pmu_event_add,
> +			.del		= dwc_pcie_pmu_event_del,
> +			.start		= dwc_pcie_pmu_event_start,
> +			.stop		= dwc_pcie_pmu_event_stop,
> +			.read		= dwc_pcie_pmu_event_update,
> +		};
> +
> +		/* Add this instance to the list used by the offline callback */
> +		ret = cpuhp_state_add_instance(dwc_pcie_pmu_hp_state,
> +					       &pcie_pmu->cpuhp_node);
> +		if (ret) {
> +			pci_err(pdev,
> +				"Error %d registering hotplug @%x\n", ret, bdf);
> +			goto out;
> +		}
> +
> +		/* Unwind when platform driver removes */
> +		ret = devm_add_action_or_reset(
> +			&plat_dev->dev, dwc_pcie_pmu_remove_cpuhp_instance,
> +			&pcie_pmu->cpuhp_node);
> +		if (ret)
> +			goto out;
> +
> +		ret = perf_pmu_register(&pcie_pmu->pmu, name, -1);
> +		if (ret) {
> +			pci_err(pdev,
> +				"Error %d registering PMU @%x\n", ret, bdf);
> +			goto out;
> +		}
> +
> +		/* Cache PMU to handle pci device hotplug */
> +		list_add(&pcie_pmu->pmu_node, &dwc_pcie_pmu_head);
> +		pcie_pmu->registered = true;
> +		notify = true;
> +
> +		ret = devm_add_action_or_reset(
> +			&plat_dev->dev, dwc_pcie_pmu_unregister_pmu, pcie_pmu);
> +		if (ret)
> +			goto out;

Hmm, why do you need the PCI bus notifier on BUS_NOTIFY_DEL_DEVICE if you
register this action callback? I'm struggling to get my head around how the
following interact:

  - Driver loading/unloading
  - CPU hotplug events
  - PCI device add/del events

as well as the lifetime of the platform device relative to the PCI device.

> +	}
> +
> +	if (notify && !bus_register_notifier(&pci_bus_type, &dwc_pcie_pmu_nb))
> +		return devm_add_action_or_reset(
> +			&plat_dev->dev, dwc_pcie_pmu_unregister_nb, NULL);
> +
> +	return 0;
> +
> +out:
> +	pci_dev_put(pdev);
> +
> +	return ret;
> +}
> +
> +static int dwc_pcie_pmu_online_cpu(unsigned int cpu, struct hlist_node *cpuhp_node)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu;
> +
> +	pcie_pmu = hlist_entry_safe(cpuhp_node, struct dwc_pcie_pmu, cpuhp_node);
> +	if (pcie_pmu->on_cpu == -1)
> +		pcie_pmu->on_cpu = cpumask_local_spread(
> +			0, dev_to_node(&pcie_pmu->pdev->dev));
> +
> +	return 0;
> +}
> +
> +static int dwc_pcie_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_node)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu;
> +	struct pci_dev *pdev;
> +	int node;
> +	cpumask_t mask;
> +	unsigned int target;
> +
> +	pcie_pmu = hlist_entry_safe(cpuhp_node, struct dwc_pcie_pmu, cpuhp_node);
> +	/* Nothing to do if this CPU doesn't own the PMU */
> +	if (cpu != pcie_pmu->on_cpu)
> +		return 0;
> +
> +	pcie_pmu->on_cpu = -1;
> +	pdev = pcie_pmu->pdev;
> +	node = dev_to_node(&pdev->dev);
> +	if (cpumask_and(&mask, cpumask_of_node(node), cpu_online_mask) &&
> +	    cpumask_andnot(&mask, &mask, cpumask_of(cpu)))
> +		target = cpumask_any(&mask);
> +	else
> +		target = cpumask_any_but(cpu_online_mask, cpu);
> +
> +	if (target >= nr_cpu_ids) {
> +		pci_err(pdev, "There is no CPU to set\n");
> +		return 0;
> +	}
> +
> +	/* This PMU does NOT support interrupt, just migrate context. */
> +	perf_pmu_migrate_context(&pcie_pmu->pmu, cpu, target);
> +	pcie_pmu->on_cpu = target;
> +
> +	return 0;
> +}
> +
> +static struct platform_driver dwc_pcie_pmu_driver = {
> +	.probe = dwc_pcie_pmu_probe,
> +	.driver = {.name = "dwc_pcie_pmu",},
> +};
> +
> +static int __init dwc_pcie_pmu_init(void)
> +{
> +	int ret;
> +
> +	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
> +				      "perf/dwc_pcie_pmu:online",
> +				      dwc_pcie_pmu_online_cpu,
> +				      dwc_pcie_pmu_offline_cpu);
> +	if (ret < 0)
> +		return ret;
> +
> +	dwc_pcie_pmu_hp_state = ret;
> +
> +	ret = platform_driver_register(&dwc_pcie_pmu_driver);
> +	if (ret)
> +		goto platform_driver_register_err;
> +
> +	dwc_pcie_pmu_dev = platform_device_register_simple(
> +				"dwc_pcie_pmu", PLATFORM_DEVID_NONE, NULL, 0);
> +	if (IS_ERR(dwc_pcie_pmu_dev)) {
> +		ret = PTR_ERR(dwc_pcie_pmu_dev);
> +		goto platform_device_register_error;
> +	}

I'm a bit confused as to why you're having to create a platform device
for a PCI device -- is this because the main designware driver has already
bound to it? A comment here explaining why you need to do this would be
very helpful. In particular, is there any dependency on another driver
to make sure that e.g. config space accesses work properly? If so, we
probably need to enforce module load ordering or something like that.

Will
Robin Murphy Oct. 23, 2023, 6:51 p.m. UTC | #9
On 2023-10-23 13:32, Will Deacon wrote:
[...]
>> +
>> +	/*
>> +	 * The Group#1 event measures the amount of data processed in 16-byte
>> +	 * units. Simplify the end-user interface by multiplying the counter
>> +	 * at the point of read.
>> +	 */
>> +	if (event_id >= 0x20 && event_id <= 0x23)
>> +		return (((u64)hi << 32) | lo) << 4;
>> +	else
>> +		return (((u64)hi << 32) | lo);
> 
> nit, but I think it would be clearer to do:
> 
> 	ret = ((u64)hi << 32) | lo;
> 
> 	/* ... */
> 	if (event_id >= 0x20 && event_id <= 0x23)
> 		ret <<= 4;

Nit: "ret *= 16;" since the comment says it's multiplying a value, not 
moving a bitfield. The compiler already knows the most efficient way to 
implement constant multiplication.

> 
> 	return ret;
> 
[...]
>> +static int __init dwc_pcie_pmu_init(void)
>> +{
>> +	int ret;
>> +
>> +	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
>> +				      "perf/dwc_pcie_pmu:online",
>> +				      dwc_pcie_pmu_online_cpu,
>> +				      dwc_pcie_pmu_offline_cpu);
>> +	if (ret < 0)
>> +		return ret;
>> +
>> +	dwc_pcie_pmu_hp_state = ret;
>> +
>> +	ret = platform_driver_register(&dwc_pcie_pmu_driver);
>> +	if (ret)
>> +		goto platform_driver_register_err;
>> +
>> +	dwc_pcie_pmu_dev = platform_device_register_simple(
>> +				"dwc_pcie_pmu", PLATFORM_DEVID_NONE, NULL, 0);
>> +	if (IS_ERR(dwc_pcie_pmu_dev)) {
>> +		ret = PTR_ERR(dwc_pcie_pmu_dev);
>> +		goto platform_device_register_error;
>> +	}
> 
> I'm a bit confused as to why you're having to create a platform device
> for a PCI device -- is this because the main designware driver has already
> bound to it? A comment here explaining why you need to do this would be
> very helpful. In particular, is there any dependency on another driver
> to make sure that e.g. config space accesses work properly? If so, we
> probably need to enforce module load ordering or something like that.

AFAICS the platform device/driver serve no purpose other than being a 
hilariously roundabout way to run the for_each_pci_dev() loop in 
dwc_pcie_pmu_probe() upon module init, and to save explicitly freeing 
the PMU name/data. Furthermore the devres action for 
dwc_pcie_pmu_remove_cpuhp_instance() is apparently going for even more 
style points at module exit by not even relying on the corresponding 
.remove callback of the tenuous platform driver to undo what its .probe 
did, but (ab)using the device's devres list to avoid having to keep 
track of an explicit list of PMU instances at all.

Frankly I think it would be a lot more straightforward to just maintain 
that explicit list of PMU instances, do the PMU creation directly in 
dwc_pcie_pmu_init(), then unregister and free them in 
dwc_pcie_pmu_exit(). Not every driver has to contain a literal struct 
device_driver.

It also smells a bit odd that it handles PCI hot-remove but not hot-add 
- if the underlying device really is hotpluggable, wouldn't we also want 
to handle new ones turning up after module load? Conversely if it isn't, 
why pretend to handle it being removed? Even if it's not to do with 
physical hotplug of the PMU but with the user unloading the PCI 
controller driver itself (since there's no module/driver-level 
dependency enforced) and thus tearing down the whole PCI bus, then the 
same point still applies - if that *can* happen, then what if the user 
then re-loads it again, or indeed if this module loads first to begin 
with; wouldn't we want to be able to (re-)discover the PMUs rather than 
leave the whole PMU driver degraded to a useless state?

Thanks,
Robin.
Shuai Xue Oct. 24, 2023, 8:27 a.m. UTC | #10
Hi, Will,

On 2023/10/23 20:32, Will Deacon wrote:
> On Fri, Oct 20, 2023 at 09:42:29PM +0800, Shuai Xue wrote:
>> This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
>> for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
>> Core controller IP which provides statistics feature. The PMU is a PCIe
>> configuration space register block provided by each PCIe Root Port in a
>> Vendor-Specific Extended Capability named RAS D.E.S (Debug, Error
>> injection, and Statistics).
> 
> Thanks for this. It all looks pretty well written to me, especially the
> documentation (thanks again!).


Thank you :)

> 
> I just have a few comments inline...
> 
>> To facilitate collection of statistics the controller provides the
>> following two features for each Root Port:
>>
>> - one 64-bit counter for Time Based Analysis (RX/TX data throughput and
>>   time spent in each low-power LTSSM state) and
>> - one 32-bit counter for Event Counting (error and non-error events for
>>   a specified lane)
>>
>> Note: There is no interrupt for counter overflow.
>>
>> This driver adds PMU devices for each PCIe Root Port. And the PMU device is
>> named based the BDF of Root Port. For example,
>>
>>     30:03.0 PCI bridge: Device 1ded:8000 (rev 01)
>>
>> the PMU device name for this Root Port is dwc_rootport_3018.
> 
> Why not print this in b:d.f formatting then? For example,
> 
> 	dwc_rootport_30:03.0
> 
> Does that confuse perf?

I am afraid, yes. The perf tool can not parse "b:d.f" format,


    Reading a token: Next token is token PE_VALUE (1.18: )
    Error: popping token ':' (1.17: )
    Stack now 0 1 9 52
    Error: popping token PE_NAME (1.0: )
    Stack now 0 1 9
    Error: popping token PE_EVENT_NAME (1.0: )
    Stack now 0 1
    Error: popping token PE_START_EVENTS (1.1: )
    Stack now 0
    Cleanup: discarding lookahead token PE_VALUE (1.18: )
    Stack now 0
    event syntax error: '..otport_0000:30:03.0/Rx_PCIe_TLP_Data_Payload/'
                                      \___ parser error
    Run 'perf list' for a list of valid events

":" may not be legal. I am not familiar with perf parser, +@Ian for help.


> 
> Also, should the segment/domain be factored in as well, in case we get
> multiple instances of the IP and a resulting name collision?

Each instance has different BDF, so IMHO, it will not result name collision.

    #ls /sys/bus/event_source/devices/ | grep dwc
    dwc_rootport_0
    dwc_rootport_10
    dwc_rootport_1000
    dwc_rootport_18
    dwc_rootport_3000
    dwc_rootport_3008
    dwc_rootport_3010
    dwc_rootport_3018
    dwc_rootport_8
    dwc_rootport_8000
    dwc_rootport_9800
    dwc_rootport_9808
    dwc_rootport_9810
    dwc_rootport_9818
    dwc_rootport_b000

I used to use `dwc_rootport_300300` in v1, the subfix is kind of "b:d.f"
format created by:

	+#define DWC_PCIE_CREATE_BDF(seg, bus, dev, func)	\
	+	(((seg) << 24) | (((bus) & 0xFF) << 16) | (((dev) & 0xFF) << 8) | (func))

> 
> - `dwc` indicates the PMU is for Synopsys DesignWare Cores PCIe controller IP
> - `rootport` indicates the PMU is for a root port device
> - `100000` indicates the device address

But Robin and Jonathan suggested to use the standard bdf address. Are you
ask me to change back? I would like to check back :)

> 
>> +struct dwc_pcie_format_attr {
>> +	struct device_attribute attr;
>> +	u64 field;
>> +	int config;
>> +};
>> +
>> +static ssize_t dwc_pcie_pmu_format_show(struct device *dev,
>> +					struct device_attribute *attr,
>> +					char *buf)
>> +{
>> +	struct dwc_pcie_format_attr *fmt = container_of(attr, typeof(*fmt), attr);
>> +	int lo = __ffs(fmt->field), hi = __fls(fmt->field);
>> +
>> +	return sysfs_emit(buf, "config:%d-%d\n", lo, hi);
>> +}
>> +
>> +#define _dwc_pcie_format_attr(_name, _cfg, _fld)			    \
>> +	(&((struct dwc_pcie_format_attr[]) {{				    \
>> +		.attr = __ATTR(_name, 0444, dwc_pcie_pmu_format_show, NULL),\
>> +		.config = _cfg,						    \
>> +		.field = _fld,						    \
>> +	}})[0].attr.attr)
>> +
>> +#define dwc_pcie_format_attr(_name, _fld)	_dwc_pcie_format_attr(_name, 0, _fld)
>> +
>> +static struct attribute *dwc_pcie_format_attrs[] = {
>> +	dwc_pcie_format_attr(type, DWC_PCIE_CONFIG_TYPE),
>> +	dwc_pcie_format_attr(eventid, DWC_PCIE_CONFIG_EVENTID),
>> +	dwc_pcie_format_attr(lane, DWC_PCIE_CONFIG_LANE),
>> +	NULL,
>> +};
>> +
>> +static struct attribute_group dwc_pcie_format_attrs_group = {
>> +	.name = "format",
>> +	.attrs = dwc_pcie_format_attrs,
>> +};
>> +
>> +struct dwc_pcie_event_attr {
>> +	struct device_attribute attr;
>> +	enum dwc_pcie_event_type type;
>> +	u16 eventid;
>> +	u8 lane;
>> +};
> 
> There are a bunch of helpers in linux/perf_event.h for handling some of
> this sysfs stuff. For example, have a look at PMU_FORMAT_ATTR() and
> friends to see if they work for you (some of the other PMU drivers under
> drivers/perf/ use these).

I will PMU_FORMAT_ATTR to simplify format sysfs stuff, thank you.

perf_pmu_events_attr is quite simple and only one `id` filed, I have to
extend a `type` filed to distinguish two types (DWC_PCIE_LANE_EVENT,
DWC_PCIE_TIME_BASE_EVENT) of DWC PMU, so I will not use PMU_EVENT_ATTR().

> 
>> +static void dwc_pcie_pmu_lane_event_enable(struct dwc_pcie_pmu *pcie_pmu,
>> +					   bool enable)
>> +{
>> +	struct pci_dev *pdev = pcie_pmu->pdev;
>> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
>> +	u32 val;
>> +
>> +	pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, &val);
>> +
>> +	/* Clear DWC_PCIE_CNT_ENABLE field first */
>> +	val &= ~DWC_PCIE_CNT_ENABLE;
>> +	if (enable)
>> +		val |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_ON);
>> +	else
>> +		val |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF);
>> +
>> +	pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, val);
>> +}
>> +
>> +static void dwc_pcie_pmu_time_based_event_enable(struct dwc_pcie_pmu *pcie_pmu,
>> +					  bool enable)
>> +{
>> +	struct pci_dev *pdev = pcie_pmu->pdev;
>> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
>> +	u32 val;
>> +
>> +	pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL,
>> +			      &val);
>> +
>> +	if (enable)
>> +		val |= DWC_PCIE_TIME_BASED_CNT_ENABLE;
>> +	else
>> +		val &= ~DWC_PCIE_TIME_BASED_CNT_ENABLE;
>> +
>> +	pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL,
>> +			       val);
>> +}
> 
> I think you could implement both of these _enable() functions as simple
> wrappers around something like pci_clear_and_set_dword() -- maybe that
> could move into a header out of aspm.c?

Agreed, I will add a separate patch to move pci_clear_and_set_dword() out
of aspm.c and then use it to simplify these two _enable() functions.

> 
>> +static u64 dwc_pcie_pmu_read_lane_event_counter(struct perf_event *event)
>> +{
>> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
>> +	struct pci_dev *pdev = pcie_pmu->pdev;
>> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
>> +	u32 val;
>> +
>> +	pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_DATA, &val);
>> +
>> +	return val;
>> +}
>> +
>> +static u64 dwc_pcie_pmu_read_time_based_counter(struct perf_event *event)
>> +{
>> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
>> +	struct pci_dev *pdev = pcie_pmu->pdev;
>> +	int event_id = DWC_PCIE_EVENT_ID(event);
>> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
>> +	u32 lo, hi, ss;
>> +
>> +	/*
>> +	 * The 64-bit value of the data counter is spread across two
>> +	 * registers that are not synchronized. In order to read them
>> +	 * atomically, ensure that the high 32 bits match before and after
>> +	 * reading the low 32 bits.
>> +	 */
>> +	pci_read_config_dword(pdev, ras_des_offset +
>> +		DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH, &hi);
>> +	do {
>> +		/* snapshot the high 32 bits */
>> +		ss = hi;
>> +
>> +		pci_read_config_dword(
>> +			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_DATA_REG_LOW,
>> +			&lo);
>> +		pci_read_config_dword(
>> +			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH,
>> +			&hi);
>> +	} while (hi != ss);
> 
> I think it would be a good idea to bound this loop based on either number of
> retries or a timeout. If the hardware wedges for whatever reason, we're
> going to get stuck in here.

I looked all drivers in kernel which use similar trick, but did not find
example implementation.

Do we really need it?

> 
>> +
>> +	/*
>> +	 * The Group#1 event measures the amount of data processed in 16-byte
>> +	 * units. Simplify the end-user interface by multiplying the counter
>> +	 * at the point of read.
>> +	 */
>> +	if (event_id >= 0x20 && event_id <= 0x23)
>> +		return (((u64)hi << 32) | lo) << 4;
>> +	else
>> +		return (((u64)hi << 32) | lo);
> 
> nit, but I think it would be clearer to do:
> 
> 	ret = ((u64)hi << 32) | lo;
> 
> 	/* ... */
> 	if (event_id >= 0x20 && event_id <= 0x23)
> 		ret <<= 4;
> 
> 	return ret;
> 

Quite beautiful, will fix it.

>> +}
>> +
>> +static void dwc_pcie_pmu_event_update(struct perf_event *event)
>> +{
>> +	struct hw_perf_event *hwc = &event->hw;
>> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>> +	u64 delta, prev, now;
>> +
>> +	do {
>> +		prev = local64_read(&hwc->prev_count);
>> +
>> +		if (type == DWC_PCIE_LANE_EVENT)
>> +			now = dwc_pcie_pmu_read_lane_event_counter(event);
>> +		else if (type == DWC_PCIE_TIME_BASE_EVENT)
>> +			now = dwc_pcie_pmu_read_time_based_counter(event);
>> +
>> +	} while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
>> +
>> +	if (type == DWC_PCIE_LANE_EVENT)
>> +		delta = (now - prev) & DWC_PCIE_LANE_EVENT_MAX_PERIOD;
>> +	else if (type == DWC_PCIE_TIME_BASE_EVENT)
>> +		delta = (now - prev) & DWC_PCIE_TIME_BASED_EVENT_MAX_PERIOD;
> 
> Similarly here, I think it would be clearer to construct a 'u64 max_period'
> variable and then just unconditionally mask against that. 

Will fix it.

> In general, you
> have quite a lot of 'if (type == LANE) ... else if (type == TIME) ...'
> code in this driver. I think that's probably fine as long as we have two
> event types, but if this extends in the future then it's probably worth
> looking at having separate 'ops' structures for the event types and
> dispatching to them directly.

Agreed, will dispatch separately if more types are added in the future.

> 
>> +static int dwc_pcie_pmu_event_init(struct perf_event *event)
>> +{
>> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
>> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>> +	struct perf_event *sibling;
>> +	u32 lane;
>> +
>> +	if (event->attr.type != event->pmu->type)
>> +		return -ENOENT;
>> +
>> +	/* We don't support sampling */
>> +	if (is_sampling_event(event))
>> +		return -EINVAL;
>> +
>> +	/* We cannot support task bound events */
>> +	if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK)
>> +		return -EINVAL;
>> +
>> +	if (event->group_leader != event &&
>> +	    !is_software_event(event->group_leader))
>> +		return -EINVAL;
>> +
>> +	for_each_sibling_event(sibling, event->group_leader) {
>> +		if (sibling->pmu != event->pmu && !is_software_event(sibling))
>> +			return -EINVAL;
>> +	}
>> +
>> +	if (type == DWC_PCIE_LANE_EVENT) {
>> +		lane = DWC_PCIE_EVENT_LANE(event);
>> +		if (lane < 0 || lane >= pcie_pmu->nr_lanes)
>> +			return -EINVAL;
>> +	}
>> +
>> +	event->cpu = pcie_pmu->on_cpu;
>> +
>> +	return 0;
>> +}
>> +
>> +static void dwc_pcie_pmu_set_period(struct hw_perf_event *hwc)
>> +{
>> +	local64_set(&hwc->prev_count, 0);
>> +}
>> +
>> +static void dwc_pcie_pmu_event_start(struct perf_event *event, int flags)
>> +{
>> +	struct hw_perf_event *hwc = &event->hw;
>> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
>> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>> +
>> +	hwc->state = 0;
>> +	dwc_pcie_pmu_set_period(hwc);
>> +
>> +	if (type == DWC_PCIE_LANE_EVENT)
>> +		dwc_pcie_pmu_lane_event_enable(pcie_pmu, true);
>> +	else if (type == DWC_PCIE_TIME_BASE_EVENT)
>> +		dwc_pcie_pmu_time_based_event_enable(pcie_pmu, true);
>> +}
>> +
>> +static void dwc_pcie_pmu_event_stop(struct perf_event *event, int flags)
>> +{
>> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
>> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>> +	struct hw_perf_event *hwc = &event->hw;
>> +
>> +	if (event->hw.state & PERF_HES_STOPPED)
>> +		return;
>> +
>> +	if (type == DWC_PCIE_LANE_EVENT)
>> +		dwc_pcie_pmu_lane_event_enable(pcie_pmu, false);
>> +	else if (type == DWC_PCIE_TIME_BASE_EVENT)
>> +		dwc_pcie_pmu_time_based_event_enable(pcie_pmu, false);
>> +
>> +	dwc_pcie_pmu_event_update(event);
>> +	hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
>> +}
>> +
>> +static int dwc_pcie_pmu_event_add(struct perf_event *event, int flags)
>> +{
>> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
>> +	struct pci_dev *pdev = pcie_pmu->pdev;
>> +	struct hw_perf_event *hwc = &event->hw;
>> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>> +	int event_id = DWC_PCIE_EVENT_ID(event);
>> +	int lane = DWC_PCIE_EVENT_LANE(event);
>> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
>> +	u32 ctrl;
>> +
>> +	/* one counter for each type and it is in use */
>> +	if (pcie_pmu->event[type])
>> +		return -ENOSPC;
> 
> I'm a bit worried about this -- isn't the type basically funneled in
> directly from userspace? If so, it's not safe to use it as index like
> this. It's probably better to sanitise the input early in
> dwc_pcie_pmu_event_init(), so that we know we have either a lane or a
> time event everywhere else.

Good catch, I will sanitise it in dwc_pcie_pmu_event_init().

> 
> If you haven't tried it, there's a decent fuzzing tool for perf, so it's
> probably worth taking that for a spin (it might need educating about your
> driver):
> 
> https://web.eece.maine.edu/~vweaver/projects/perf_events/fuzzer/

Sorry, I haven't. I will spin before a new version sended.

> 
>> +	if (type == DWC_PCIE_LANE_EVENT) {
>> +		/* EVENT_COUNTER_DATA_REG needs clear manually */
>> +		ctrl = FIELD_PREP(DWC_PCIE_CNT_EVENT_SEL, event_id) |
>> +			FIELD_PREP(DWC_PCIE_CNT_LANE_SEL, lane) |
>> +			FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF) |
>> +			FIELD_PREP(DWC_PCIE_EVENT_CLEAR, DWC_PCIE_EVENT_PER_CLEAR);
>> +		pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL,
>> +				       ctrl);
>> +	} else if (type == DWC_PCIE_TIME_BASE_EVENT) {
>> +		/*
>> +		 * TIME_BASED_ANAL_DATA_REG is a 64 bit register, we can safely
>> +		 * use it with any manually controlled duration. And it is
>> +		 * cleared when next measurement starts.
>> +		 */
>> +		ctrl = FIELD_PREP(DWC_PCIE_TIME_BASED_REPORT_SEL, event_id) |
>> +			FIELD_PREP(DWC_PCIE_TIME_BASED_DURATION_SEL,
>> +				   DWC_PCIE_DURATION_MANUAL_CTL) |
>> +			DWC_PCIE_TIME_BASED_CNT_ENABLE;
>> +		pci_write_config_dword(
>> +			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL, ctrl);
> 
> Maybe move these into separate lane/time helpers rather than clutter this
> function with the field definitions?

Aha, I used to. Robin complained that the helpers were already confusing enough
so warp out control register configuration from sub-function to .add().

> 
>> +static void dwc_pcie_pmu_event_del(struct perf_event *event, int flags)
>> +{
>> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
>> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>> +
>> +	dwc_pcie_pmu_event_stop(event, flags | PERF_EF_UPDATE);
>> +	perf_event_update_userpage(event);
>> +	pcie_pmu->event[type] = NULL;
>> +}
>> +
>> +static void dwc_pcie_pmu_remove_cpuhp_instance(void *hotplug_node)
>> +{
>> +	cpuhp_state_remove_instance_nocalls(dwc_pcie_pmu_hp_state, hotplug_node);
>> +}
>> +
>> +/*
>> + * Find the PMU of a PCI device.
>> + * @pdev: The PCI device.
>> + */
>> +static struct dwc_pcie_pmu *dwc_pcie_find_dev_pmu(struct pci_dev *pdev)
>> +{
>> +	struct dwc_pcie_pmu *pcie_pmu;
>> +
>> +	list_for_each_entry(pcie_pmu, &dwc_pcie_pmu_head, pmu_node)
>> +		if (pcie_pmu->pdev == pdev)
>> +			return pcie_pmu;
>> +
>> +	return NULL;
>> +}
>> +
>> +static void dwc_pcie_pmu_unregister_pmu(void *data)
>> +{
>> +	struct dwc_pcie_pmu *pcie_pmu = data;
>> +
>> +	if (!pcie_pmu->registered)
>> +		return;
>> +
>> +	pcie_pmu->registered = false;
>> +	list_del(&pcie_pmu->pmu_node);
>> +	perf_pmu_unregister(&pcie_pmu->pmu);
> 
> Do you not need any locking here? The cpu hotplug callbacks are still live
> and I'm not seeing how you prevent them from picking up the PMU from the
> list right before you unregister it.

The hotplug callball also try to pick up the PMU to unregister, but if
the PMU is already unregistered here, pcie_pmu->registered will be set as
false, so the PMU will not unregistered again.

So, I think pcie_pmu->registered is some kind of lock? Please correct me if
I missed anything else.

> 
>> +}
>> +
>> +static int dwc_pcie_pmu_notifier(struct notifier_block *nb,
>> +				     unsigned long action, void *data)
>> +{
>> +	struct device *dev = data;
>> +	struct pci_dev *pdev = to_pci_dev(dev);
>> +	struct dwc_pcie_pmu *pcie_pmu;
>> +
>> +	/* Unregister the PMU when the device is going to be deleted. */
>> +	if (action != BUS_NOTIFY_DEL_DEVICE)
>> +		return NOTIFY_DONE;
>> +
>> +	pcie_pmu = dwc_pcie_find_dev_pmu(pdev);
>> +	if (!pcie_pmu)
>> +		return NOTIFY_DONE;
>> +
>> +	dwc_pcie_pmu_unregister_pmu(pcie_pmu);
>> +
>> +	return NOTIFY_OK;
>> +}
>> +
>> +static struct notifier_block dwc_pcie_pmu_nb = {
>> +	.notifier_call = dwc_pcie_pmu_notifier,
>> +};
>> +
>> +static void dwc_pcie_pmu_unregister_nb(void *data)
>> +{
>> +	bus_unregister_notifier(&pci_bus_type, &dwc_pcie_pmu_nb);
>> +}
>> +
>> +static int dwc_pcie_pmu_probe(struct platform_device *plat_dev)
>> +{
>> +	struct pci_dev *pdev = NULL;
>> +	struct dwc_pcie_pmu *pcie_pmu;
>> +	bool notify = false;
>> +	char *name;
>> +	u32 bdf;
>> +	int ret;
>> +
>> +	/* Match the rootport with VSEC_RAS_DES_ID, and register a PMU for it */
>> +	for_each_pci_dev(pdev) {
>> +		u16 vsec;
>> +		u32 val;
>> +
>> +		if (!(pci_is_pcie(pdev) &&
>> +		      pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT))
>> +			continue;
>> +
>> +		vsec = pci_find_vsec_capability(pdev, PCI_VENDOR_ID_ALIBABA,
>> +						DWC_PCIE_VSEC_RAS_DES_ID);
>> +		if (!vsec)
>> +			continue;
>> +
>> +		pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &val);
>> +		if (PCI_VNDR_HEADER_REV(val) != 0x04)
>> +			continue;
>> +		pci_dbg(pdev,
>> +			"Detected PCIe Vendor-Specific Extended Capability RAS DES\n");
>> +
>> +		bdf = PCI_DEVID(pdev->bus->number, pdev->devfn);
>> +		name = devm_kasprintf(&plat_dev->dev, GFP_KERNEL, "dwc_rootport_%x",
>> +				      bdf);
>> +		if (!name) {
>> +			ret = -ENOMEM;
>> +			goto out;
>> +		}
>> +
>> +		/* All checks passed, go go go */
>> +		pcie_pmu = devm_kzalloc(&plat_dev->dev, sizeof(*pcie_pmu), GFP_KERNEL);
>> +		if (!pcie_pmu) {
>> +			ret = -ENOMEM;
>> +			goto out;
>> +		}
>> +
>> +		pcie_pmu->pdev = pdev;
>> +		pcie_pmu->ras_des_offset = vsec;
>> +		pcie_pmu->nr_lanes = pcie_get_width_cap(pdev);
>> +		pcie_pmu->on_cpu = -1;
>> +		pcie_pmu->pmu = (struct pmu){
>> +			.module		= THIS_MODULE,
>> +			.attr_groups	= dwc_pcie_attr_groups,
>> +			.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
>> +			.task_ctx_nr	= perf_invalid_context,
>> +			.event_init	= dwc_pcie_pmu_event_init,
>> +			.add		= dwc_pcie_pmu_event_add,
>> +			.del		= dwc_pcie_pmu_event_del,
>> +			.start		= dwc_pcie_pmu_event_start,
>> +			.stop		= dwc_pcie_pmu_event_stop,
>> +			.read		= dwc_pcie_pmu_event_update,
>> +		};
>> +
>> +		/* Add this instance to the list used by the offline callback */
>> +		ret = cpuhp_state_add_instance(dwc_pcie_pmu_hp_state,
>> +					       &pcie_pmu->cpuhp_node);
>> +		if (ret) {
>> +			pci_err(pdev,
>> +				"Error %d registering hotplug @%x\n", ret, bdf);
>> +			goto out;
>> +		}
>> +
>> +		/* Unwind when platform driver removes */
>> +		ret = devm_add_action_or_reset(
>> +			&plat_dev->dev, dwc_pcie_pmu_remove_cpuhp_instance,
>> +			&pcie_pmu->cpuhp_node);
>> +		if (ret)
>> +			goto out;
>> +
>> +		ret = perf_pmu_register(&pcie_pmu->pmu, name, -1);
>> +		if (ret) {
>> +			pci_err(pdev,
>> +				"Error %d registering PMU @%x\n", ret, bdf);
>> +			goto out;
>> +		}
>> +
>> +		/* Cache PMU to handle pci device hotplug */
>> +		list_add(&pcie_pmu->pmu_node, &dwc_pcie_pmu_head);
>> +		pcie_pmu->registered = true;
>> +		notify = true;
>> +
>> +		ret = devm_add_action_or_reset(
>> +			&plat_dev->dev, dwc_pcie_pmu_unregister_pmu, pcie_pmu);
>> +		if (ret)
>> +			goto out;
> 
> Hmm, why do you need the PCI bus notifier on BUS_NOTIFY_DEL_DEVICE if you
> register this action callback? I'm struggling to get my head around how the
> following interact:
> 
>   - Driver loading/unloading
>   - CPU hotplug events
>   - PCI device add/del events
> 
> as well as the lifetime of the platform device relative to the PCI device.

Yes, they are a bit complex.

The event triggers of the above three parts of PMU, CPU and PCI device are
quite independent,

 - Driver loading/unloading: the lifetime of platform device
	insmod/rmmod module of this driver
 - CPU hotplug events:
	echo 0 > /sys/devices/system/cpu/cpu0/online
	echo 1 > /sys/devices/system/cpu/cpu0/online
 - PCI device add/del events (a.k.a PCI hotplug events), e.g
	echo 1 > /sys/bus/pci/devices/0000\:30\:02.0/remove
	echo 1 > /sys/bus/pci/rescan

The lifecycles of PMU, CPU, and PCI devices have mutual influence on each other.

1. The CPU hotplug just as other PMUs in drivers/perf, let's talk about it
   first.

   The PMU context is binded to a CPU picked from the same NUMA node of PCI
   device, so if the picked CPU is offlined at runtime, we need to migate
   the context to another local online CPU in the same NUMA node.

2. The Driver loading/unloading is independent, for exmaple, rmmod module
   if not built in or unbinds the driver. Then all PMUs of PCI device will
   be unregistered as expected, and the PCI device is not affected.

3. The PMU holds the PCI device to which it belongs, so that it can access
   the PCI DES capability. If the PCI device is unplugged at runtime, the
   PMU should also be unregistered.  It's the basic idea suggested by
   @Yicong, just as x86 does in uncore_bus_notify().
	


> 
>> +	}
>> +
>> +	if (notify && !bus_register_notifier(&pci_bus_type, &dwc_pcie_pmu_nb))
>> +		return devm_add_action_or_reset(
>> +			&plat_dev->dev, dwc_pcie_pmu_unregister_nb, NULL);
>> +
>> +	return 0;
>> +
>> +out:
>> +	pci_dev_put(pdev);
>> +
>> +	return ret;
>> +}
>> +
>> +static int dwc_pcie_pmu_online_cpu(unsigned int cpu, struct hlist_node *cpuhp_node)
>> +{
>> +	struct dwc_pcie_pmu *pcie_pmu;
>> +
>> +	pcie_pmu = hlist_entry_safe(cpuhp_node, struct dwc_pcie_pmu, cpuhp_node);
>> +	if (pcie_pmu->on_cpu == -1)
>> +		pcie_pmu->on_cpu = cpumask_local_spread(
>> +			0, dev_to_node(&pcie_pmu->pdev->dev));
>> +
>> +	return 0;
>> +}
>> +
>> +static int dwc_pcie_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_node)
>> +{
>> +	struct dwc_pcie_pmu *pcie_pmu;
>> +	struct pci_dev *pdev;
>> +	int node;
>> +	cpumask_t mask;
>> +	unsigned int target;
>> +
>> +	pcie_pmu = hlist_entry_safe(cpuhp_node, struct dwc_pcie_pmu, cpuhp_node);
>> +	/* Nothing to do if this CPU doesn't own the PMU */
>> +	if (cpu != pcie_pmu->on_cpu)
>> +		return 0;
>> +
>> +	pcie_pmu->on_cpu = -1;
>> +	pdev = pcie_pmu->pdev;
>> +	node = dev_to_node(&pdev->dev);
>> +	if (cpumask_and(&mask, cpumask_of_node(node), cpu_online_mask) &&
>> +	    cpumask_andnot(&mask, &mask, cpumask_of(cpu)))
>> +		target = cpumask_any(&mask);
>> +	else
>> +		target = cpumask_any_but(cpu_online_mask, cpu);
>> +
>> +	if (target >= nr_cpu_ids) {
>> +		pci_err(pdev, "There is no CPU to set\n");
>> +		return 0;
>> +	}
>> +
>> +	/* This PMU does NOT support interrupt, just migrate context. */
>> +	perf_pmu_migrate_context(&pcie_pmu->pmu, cpu, target);
>> +	pcie_pmu->on_cpu = target;
>> +
>> +	return 0;
>> +}
>> +
>> +static struct platform_driver dwc_pcie_pmu_driver = {
>> +	.probe = dwc_pcie_pmu_probe,
>> +	.driver = {.name = "dwc_pcie_pmu",},
>> +};
>> +
>> +static int __init dwc_pcie_pmu_init(void)
>> +{
>> +	int ret;
>> +
>> +	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
>> +				      "perf/dwc_pcie_pmu:online",
>> +				      dwc_pcie_pmu_online_cpu,
>> +				      dwc_pcie_pmu_offline_cpu);
>> +	if (ret < 0)
>> +		return ret;
>> +
>> +	dwc_pcie_pmu_hp_state = ret;
>> +
>> +	ret = platform_driver_register(&dwc_pcie_pmu_driver);
>> +	if (ret)
>> +		goto platform_driver_register_err;
>> +
>> +	dwc_pcie_pmu_dev = platform_device_register_simple(
>> +				"dwc_pcie_pmu", PLATFORM_DEVID_NONE, NULL, 0);
>> +	if (IS_ERR(dwc_pcie_pmu_dev)) {
>> +		ret = PTR_ERR(dwc_pcie_pmu_dev);
>> +		goto platform_device_register_error;
>> +	}
> 
> I'm a bit confused as to why you're having to create a platform device
> for a PCI device -- is this because the main designware driver has already
> bound to it? A comment here explaining why you need to do this would be
> very helpful. 

The problem here is that we need to do that fundamental redesign of the
way the PCI ports drivers work so that the PCIe VSEC/DVSEC capability, e.g
RAS_DES PMU here could probe and remove, hotplug and unhotplug more gracefully.
I think we have discussed the current limitation in the previous version[1].

>> Given that we have a appropriate way to tear down the PMUs via devm_add_action_or_reset(),
>> I am going to remove the redundant probe/remove framework via platform_driver_{un}register().
>> for_each probe process in __dwc_pcie_pmu_probe() will be move into dwc_pcie_pmu_init().
>> Is it a better way?
>
> I think I'd prefer to see a standard driver creation / probe flow even if you could in theory
avoid it. [2]

I discussed with @Jonathan about the probe flow. Jonathan prefers the standard driver
creation/probe flow. What's your opinion?

If you are happy with the current implementation flow, I will just add a comment.


> In particular, is there any dependency on another driver
> to make sure that e.g. config space accesses work properly? If so, we
> probably need to enforce module load ordering or something like that.

Of course, at least it depends on
	- pci_driver_init called by postcore_initcall, early order 2
	- acpi_pci_init called by arch_initcall, early order 3

so I think module_init called by device_initcall, early order 6 is ok?


Thank you for valuable comments,
Best Regards,
Shuai

[1] https://lore.kernel.org/lkml/634f4762-cf2e-4535-f369-4032d65093f0@linux.alibaba.com/t/#ma82c49a12d579c2e497b321f46f3f56789be5d2c
[2] https://lore.kernel.org/lkml/634f4762-cf2e-4535-f369-4032d65093f0@linux.alibaba.com/t/#m595e169995b1d61a2737e67925468929cf0dba6a
[3] https://lore.kernel.org/lkml/20230522035428.69441-5-xueshuai@linux.alibaba.com/T/#m8f5aec1cb50b42825739a5977629c8ea98710a6e
Shuai Xue Oct. 24, 2023, 9:29 a.m. UTC | #11
+ Will, Jonathan, Bjorn and Yicong for probe and hotplug handing.

On 2023/10/24 02:51, Robin Murphy wrote:
> On 2023-10-23 13:32, Will Deacon wrote:
> [...]
>>> +
>>> +    /*
>>> +     * The Group#1 event measures the amount of data processed in 16-byte
>>> +     * units. Simplify the end-user interface by multiplying the counter
>>> +     * at the point of read.
>>> +     */
>>> +    if (event_id >= 0x20 && event_id <= 0x23)
>>> +        return (((u64)hi << 32) | lo) << 4;
>>> +    else
>>> +        return (((u64)hi << 32) | lo);
>>
>> nit, but I think it would be clearer to do:
>>
>>     ret = ((u64)hi << 32) | lo;
>>
>>     /* ... */
>>     if (event_id >= 0x20 && event_id <= 0x23)
>>         ret <<= 4;
> 
> Nit: "ret *= 16;" since the comment says it's multiplying a value, not moving a bitfield. The compiler already knows the most efficient way to implement constant multiplication.

Cool, will use multiplication directly.

> 
>>
>>     return ret;
>>
> [...]
>>> +static int __init dwc_pcie_pmu_init(void)
>>> +{
>>> +    int ret;
>>> +
>>> +    ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
>>> +                      "perf/dwc_pcie_pmu:online",
>>> +                      dwc_pcie_pmu_online_cpu,
>>> +                      dwc_pcie_pmu_offline_cpu);
>>> +    if (ret < 0)
>>> +        return ret;
>>> +
>>> +    dwc_pcie_pmu_hp_state = ret;
>>> +
>>> +    ret = platform_driver_register(&dwc_pcie_pmu_driver);
>>> +    if (ret)
>>> +        goto platform_driver_register_err;
>>> +
>>> +    dwc_pcie_pmu_dev = platform_device_register_simple(
>>> +                "dwc_pcie_pmu", PLATFORM_DEVID_NONE, NULL, 0);
>>> +    if (IS_ERR(dwc_pcie_pmu_dev)) {
>>> +        ret = PTR_ERR(dwc_pcie_pmu_dev);
>>> +        goto platform_device_register_error;
>>> +    }
>>
>> I'm a bit confused as to why you're having to create a platform device
>> for a PCI device -- is this because the main designware driver has already
>> bound to it? A comment here explaining why you need to do this would be
>> very helpful. In particular, is there any dependency on another driver
>> to make sure that e.g. config space accesses work properly? If so, we
>> probably need to enforce module load ordering or something like that.
> 
> AFAICS the platform device/driver serve no purpose other than being a hilariously roundabout way to run the for_each_pci_dev() loop in dwc_pcie_pmu_probe() upon module init, and to save explicitly freeing the PMU name/data. Furthermore the devres action for dwc_pcie_pmu_remove_cpuhp_instance() is apparently going for even more style points at module exit by not even relying on the corresponding .remove callback of the tenuous platform driver to undo what its .probe did, but (ab)using the device's devres list to avoid having to keep track of an explicit list of PMU instances at all.

You are right.

> 
> Frankly I think it would be a lot more straightforward to just maintain that explicit list of PMU instances, do the PMU creation directly in dwc_pcie_pmu_init(), then unregister and free them in dwc_pcie_pmu_exit(). Not every driver has to contain a literal struct device_driver.

Agreed, it might be more straightforward. But personally speaking, I prefer
current implementation.

    - standard driver creation / probe flow is more normal
    - it avoid maintaining list of PMU instances
    - IMHO, both of them are temporary solution, if PCI core addes a
      standard mechanism to discover and enbale PCIe VSEC/DVSEC capability,
      the driver will use the standard way.


> 
> It also smells a bit odd that it handles PCI hot-remove but not hot-add - if the underlying device really is hotpluggable, wouldn't we also want to handle new ones turning up after module load? Conversely if it isn't, why pretend to handle it being removed? Even if it's not to do with physical hotplug of the PMU but with the user unloading the PCI controller driver itself (since there's no module/driver-level dependency enforced) and thus tearing down the whole PCI bus, then the same point still applies - if that *can* happen, then what if the user then re-loads it again, or indeed if this module loads first to begin with; wouldn't we want to be able to (re-)discover the PMUs rather than leave the whole PMU driver degraded to a useless state?
> 

I see you point, there are three casees:
1. hot-remove PCI root port firstly and then load the PMU module, the PMU
of the removed PCI device will not be registered.

I think it is the expected behavior.

2. load the PMU module firstly and then hot-remove PCI root port, the PMU
of the removed PCI device will be unregistered.

it is what the dwc_pcie_pmu_unregister_nb() does upon BUS_NOTIFY_DEL_DEVICE

3. load the PMU module firstly, hot-remove PCI root port, and then hot-plug
the PCI root port the PMU of the hot-pluged device will not load again by
current design upon BUS_NOTIFY_DEL_DEVICE.

I guess it is the really problem. It can be workaround be reload the PMU
module. It has been a bit complex around how the following interact:

    - Driver loading/unloading
    - CPU hotplug events
    - PCI device add/del events

We can also add action for BUS_NOTIFY_ADD_DEVICE to address the problem,

    - scan all PCI device
     - check RAS_DES cap
     - check cached PMU node
     - registers its PMU

But I prefer leave as it is, just as x86 does in uncore_bus_notify().
Certainly, if the community deems it necessary to implement
BUS_NOTIFY_ADD_DEVICE action, I also would like to extend it.
Jonathan Cameron Oct. 26, 2023, 1:44 p.m. UTC | #12
On Tue, 24 Oct 2023 17:29:34 +0800
Shuai Xue <xueshuai@linux.alibaba.com> wrote:

> + Will, Jonathan, Bjorn and Yicong for probe and hotplug handing.
> 
> On 2023/10/24 02:51, Robin Murphy wrote:
> > On 2023-10-23 13:32, Will Deacon wrote:
> > [...]  
> >>> +
> >>> +    /*
> >>> +     * The Group#1 event measures the amount of data processed in 16-byte
> >>> +     * units. Simplify the end-user interface by multiplying the counter
> >>> +     * at the point of read.
> >>> +     */
> >>> +    if (event_id >= 0x20 && event_id <= 0x23)
> >>> +        return (((u64)hi << 32) | lo) << 4;
> >>> +    else
> >>> +        return (((u64)hi << 32) | lo);  
> >>
> >> nit, but I think it would be clearer to do:
> >>
> >>     ret = ((u64)hi << 32) | lo;
> >>
> >>     /* ... */
> >>     if (event_id >= 0x20 && event_id <= 0x23)
> >>         ret <<= 4;  
> > 
> > Nit: "ret *= 16;" since the comment says it's multiplying a value, not moving a bitfield. The compiler already knows the most efficient way to implement constant multiplication.  
> 
> Cool, will use multiplication directly.
> 
> >   
> >>
> >>     return ret;
> >>  
> > [...]  
> >>> +static int __init dwc_pcie_pmu_init(void)
> >>> +{
> >>> +    int ret;
> >>> +
> >>> +    ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
> >>> +                      "perf/dwc_pcie_pmu:online",
> >>> +                      dwc_pcie_pmu_online_cpu,
> >>> +                      dwc_pcie_pmu_offline_cpu);
> >>> +    if (ret < 0)
> >>> +        return ret;
> >>> +
> >>> +    dwc_pcie_pmu_hp_state = ret;
> >>> +
> >>> +    ret = platform_driver_register(&dwc_pcie_pmu_driver);
> >>> +    if (ret)
> >>> +        goto platform_driver_register_err;
> >>> +
> >>> +    dwc_pcie_pmu_dev = platform_device_register_simple(
> >>> +                "dwc_pcie_pmu", PLATFORM_DEVID_NONE, NULL, 0);
> >>> +    if (IS_ERR(dwc_pcie_pmu_dev)) {
> >>> +        ret = PTR_ERR(dwc_pcie_pmu_dev);
> >>> +        goto platform_device_register_error;
> >>> +    }  
> >>
> >> I'm a bit confused as to why you're having to create a platform device
> >> for a PCI device -- is this because the main designware driver has already
> >> bound to it? A comment here explaining why you need to do this would be
> >> very helpful. In particular, is there any dependency on another driver
> >> to make sure that e.g. config space accesses work properly? If so, we
> >> probably need to enforce module load ordering or something like that.  
> > 
> > AFAICS the platform device/driver serve no purpose other than being a hilariously roundabout way to run the for_each_pci_dev() loop in dwc_pcie_pmu_probe() upon module init, and to save explicitly freeing the PMU name/data. Furthermore the devres action for dwc_pcie_pmu_remove_cpuhp_instance() is apparently going for even more style points at module exit by not even relying on the corresponding .remove callback of the tenuous platform driver to undo what its .probe did, but (ab)using the device's devres list to avoid having to keep track of an explicit list of PMU instances at all.  
> 
> You are right.

Also provides a (potential) parent for the PMU devices which is something
we were trying to clean up for existing PMUs (which end up in the
wrong directly in sysfs because they typically don't have parents).


> 
> > 
> > Frankly I think it would be a lot more straightforward to just maintain that explicit list of PMU instances, do the PMU creation directly in dwc_pcie_pmu_init(), then unregister and free them in dwc_pcie_pmu_exit(). Not every driver has to contain a literal struct device_driver.  
> 
> Agreed, it might be more straightforward. But personally speaking, I prefer
> current implementation.
> 
>     - standard driver creation / probe flow is more normal
>     - it avoid maintaining list of PMU instances
>     - IMHO, both of them are temporary solution, if PCI core addes a
>       standard mechanism to discover and enbale PCIe VSEC/DVSEC capability,
>       the driver will use the standard way.
> 
> 
> > 
> > It also smells a bit odd that it handles PCI hot-remove but not hot-add - if the underlying device really is hotpluggable, wouldn't we also want to handle new ones turning up after module load? Conversely if it isn't, why pretend to handle it being removed? Even if it's not to do with physical hotplug of the PMU but with the user unloading the PCI controller driver itself (since there's no module/driver-level dependency enforced) and thus tearing down the whole PCI bus, then the same point still applies - if that *can* happen, then what if the user then re-loads it again, or indeed if this module loads first to begin with; wouldn't we want to be able to (re-)discover the PMUs rather than leave the whole PMU driver degraded to a useless state?
> >   
> 
> I see you point, there are three casees:
> 1. hot-remove PCI root port firstly and then load the PMU module, the PMU
> of the removed PCI device will not be registered.
> 
> I think it is the expected behavior.
> 
> 2. load the PMU module firstly and then hot-remove PCI root port, the PMU
> of the removed PCI device will be unregistered.
> 
> it is what the dwc_pcie_pmu_unregister_nb() does upon BUS_NOTIFY_DEL_DEVICE
> 
> 3. load the PMU module firstly, hot-remove PCI root port, and then hot-plug
> the PCI root port the PMU of the hot-pluged device will not load again by
> current design upon BUS_NOTIFY_DEL_DEVICE.
> 
> I guess it is the really problem. It can be workaround be reload the PMU
> module. It has been a bit complex around how the following interact:
> 
>     - Driver loading/unloading
>     - CPU hotplug events
>     - PCI device add/del events
> 
> We can also add action for BUS_NOTIFY_ADD_DEVICE to address the problem,
> 
>     - scan all PCI device
>      - check RAS_DES cap
>      - check cached PMU node
>      - registers its PMU
> 
> But I prefer leave as it is, just as x86 does in uncore_bus_notify().
> Certainly, if the community deems it necessary to implement
> BUS_NOTIFY_ADD_DEVICE action, I also would like to extend it.
>
Robin Murphy Oct. 26, 2023, 4:52 p.m. UTC | #13
On 26/10/2023 2:44 pm, Jonathan Cameron wrote:
> On Tue, 24 Oct 2023 17:29:34 +0800
> Shuai Xue <xueshuai@linux.alibaba.com> wrote:
> 
>> + Will, Jonathan, Bjorn and Yicong for probe and hotplug handing.
>>
>> On 2023/10/24 02:51, Robin Murphy wrote:
>>> On 2023-10-23 13:32, Will Deacon wrote:
>>> [...]
>>>>> +
>>>>> +    /*
>>>>> +     * The Group#1 event measures the amount of data processed in 16-byte
>>>>> +     * units. Simplify the end-user interface by multiplying the counter
>>>>> +     * at the point of read.
>>>>> +     */
>>>>> +    if (event_id >= 0x20 && event_id <= 0x23)
>>>>> +        return (((u64)hi << 32) | lo) << 4;
>>>>> +    else
>>>>> +        return (((u64)hi << 32) | lo);
>>>>
>>>> nit, but I think it would be clearer to do:
>>>>
>>>>      ret = ((u64)hi << 32) | lo;
>>>>
>>>>      /* ... */
>>>>      if (event_id >= 0x20 && event_id <= 0x23)
>>>>          ret <<= 4;
>>>
>>> Nit: "ret *= 16;" since the comment says it's multiplying a value, not moving a bitfield. The compiler already knows the most efficient way to implement constant multiplication.
>>
>> Cool, will use multiplication directly.
>>
>>>    
>>>>
>>>>      return ret;
>>>>   
>>> [...]
>>>>> +static int __init dwc_pcie_pmu_init(void)
>>>>> +{
>>>>> +    int ret;
>>>>> +
>>>>> +    ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
>>>>> +                      "perf/dwc_pcie_pmu:online",
>>>>> +                      dwc_pcie_pmu_online_cpu,
>>>>> +                      dwc_pcie_pmu_offline_cpu);
>>>>> +    if (ret < 0)
>>>>> +        return ret;
>>>>> +
>>>>> +    dwc_pcie_pmu_hp_state = ret;
>>>>> +
>>>>> +    ret = platform_driver_register(&dwc_pcie_pmu_driver);
>>>>> +    if (ret)
>>>>> +        goto platform_driver_register_err;
>>>>> +
>>>>> +    dwc_pcie_pmu_dev = platform_device_register_simple(
>>>>> +                "dwc_pcie_pmu", PLATFORM_DEVID_NONE, NULL, 0);
>>>>> +    if (IS_ERR(dwc_pcie_pmu_dev)) {
>>>>> +        ret = PTR_ERR(dwc_pcie_pmu_dev);
>>>>> +        goto platform_device_register_error;
>>>>> +    }
>>>>
>>>> I'm a bit confused as to why you're having to create a platform device
>>>> for a PCI device -- is this because the main designware driver has already
>>>> bound to it? A comment here explaining why you need to do this would be
>>>> very helpful. In particular, is there any dependency on another driver
>>>> to make sure that e.g. config space accesses work properly? If so, we
>>>> probably need to enforce module load ordering or something like that.
>>>
>>> AFAICS the platform device/driver serve no purpose other than being a hilariously roundabout way to run the for_each_pci_dev() loop in dwc_pcie_pmu_probe() upon module init, and to save explicitly freeing the PMU name/data. Furthermore the devres action for dwc_pcie_pmu_remove_cpuhp_instance() is apparently going for even more style points at module exit by not even relying on the corresponding .remove callback of the tenuous platform driver to undo what its .probe did, but (ab)using the device's devres list to avoid having to keep track of an explicit list of PMU instances at all.
>>
>> You are right.
> 
> Also provides a (potential) parent for the PMU devices which is something
> we were trying to clean up for existing PMUs (which end up in the
> wrong directly in sysfs because they typically don't have parents).

Surely the relevant PCI device would be an even more appropriate parent, 
though, since that's the true topology?

Thanks,
Robin.
Robin Murphy Oct. 26, 2023, 6:06 p.m. UTC | #14
On 24/10/2023 10:29 am, Shuai Xue wrote:
> + Will, Jonathan, Bjorn and Yicong for probe and hotplug handing.
> 
> On 2023/10/24 02:51, Robin Murphy wrote:
>> On 2023-10-23 13:32, Will Deacon wrote:
>> [...]
>>>> +
>>>> +    /*
>>>> +     * The Group#1 event measures the amount of data processed in 16-byte
>>>> +     * units. Simplify the end-user interface by multiplying the counter
>>>> +     * at the point of read.
>>>> +     */
>>>> +    if (event_id >= 0x20 && event_id <= 0x23)
>>>> +        return (((u64)hi << 32) | lo) << 4;
>>>> +    else
>>>> +        return (((u64)hi << 32) | lo);
>>>
>>> nit, but I think it would be clearer to do:
>>>
>>>      ret = ((u64)hi << 32) | lo;
>>>
>>>      /* ... */
>>>      if (event_id >= 0x20 && event_id <= 0x23)
>>>          ret <<= 4;
>>
>> Nit: "ret *= 16;" since the comment says it's multiplying a value, not moving a bitfield. The compiler already knows the most efficient way to implement constant multiplication.
> 
> Cool, will use multiplication directly.
> 
>>
>>>
>>>      return ret;
>>>
>> [...]
>>>> +static int __init dwc_pcie_pmu_init(void)
>>>> +{
>>>> +    int ret;
>>>> +
>>>> +    ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
>>>> +                      "perf/dwc_pcie_pmu:online",
>>>> +                      dwc_pcie_pmu_online_cpu,
>>>> +                      dwc_pcie_pmu_offline_cpu);
>>>> +    if (ret < 0)
>>>> +        return ret;
>>>> +
>>>> +    dwc_pcie_pmu_hp_state = ret;
>>>> +
>>>> +    ret = platform_driver_register(&dwc_pcie_pmu_driver);
>>>> +    if (ret)
>>>> +        goto platform_driver_register_err;
>>>> +
>>>> +    dwc_pcie_pmu_dev = platform_device_register_simple(
>>>> +                "dwc_pcie_pmu", PLATFORM_DEVID_NONE, NULL, 0);
>>>> +    if (IS_ERR(dwc_pcie_pmu_dev)) {
>>>> +        ret = PTR_ERR(dwc_pcie_pmu_dev);
>>>> +        goto platform_device_register_error;
>>>> +    }
>>>
>>> I'm a bit confused as to why you're having to create a platform device
>>> for a PCI device -- is this because the main designware driver has already
>>> bound to it? A comment here explaining why you need to do this would be
>>> very helpful. In particular, is there any dependency on another driver
>>> to make sure that e.g. config space accesses work properly? If so, we
>>> probably need to enforce module load ordering or something like that.
>>
>> AFAICS the platform device/driver serve no purpose other than being a hilariously roundabout way to run the for_each_pci_dev() loop in dwc_pcie_pmu_probe() upon module init, and to save explicitly freeing the PMU name/data. Furthermore the devres action for dwc_pcie_pmu_remove_cpuhp_instance() is apparently going for even more style points at module exit by not even relying on the corresponding .remove callback of the tenuous platform driver to undo what its .probe did, but (ab)using the device's devres list to avoid having to keep track of an explicit list of PMU instances at all.
> 
> You are right.
> 
>>
>> Frankly I think it would be a lot more straightforward to just maintain that explicit list of PMU instances, do the PMU creation directly in dwc_pcie_pmu_init(), then unregister and free them in dwc_pcie_pmu_exit(). Not every driver has to contain a literal struct device_driver.
> 
> Agreed, it might be more straightforward. But personally speaking, I prefer
> current implementation.
> 
>      - standard driver creation / probe flow is more normal

It's really not, though. We have a weird singleton platform device 
appearing out of nowhere which effectively represents the module being 
loaded, rather than anything about the actual underlying hardware. If 
you want this to look like "normal" driver model usage, then create a 
separate platform device for each physical PCI PMU instance you discover 
(potentially via both a one-time scan at module_init and an ADD_DEVICE 
hotplug notifier later), then have the platform driver just register the 
corresponding PMU device in .probe and unregister it in .remove, without 
confusing devres action tricks.

Thanks,
Robin.

>      - it avoid maintaining list of PMU instances
>      - IMHO, both of them are temporary solution, if PCI core addes a
>        standard mechanism to discover and enbale PCIe VSEC/DVSEC capability,
>        the driver will use the standard way.
> 
> 
>>
>> It also smells a bit odd that it handles PCI hot-remove but not hot-add - if the underlying device really is hotpluggable, wouldn't we also want to handle new ones turning up after module load? Conversely if it isn't, why pretend to handle it being removed? Even if it's not to do with physical hotplug of the PMU but with the user unloading the PCI controller driver itself (since there's no module/driver-level dependency enforced) and thus tearing down the whole PCI bus, then the same point still applies - if that *can* happen, then what if the user then re-loads it again, or indeed if this module loads first to begin with; wouldn't we want to be able to (re-)discover the PMUs rather than leave the whole PMU driver degraded to a useless state?
>>
> 
> I see you point, there are three casees:
> 1. hot-remove PCI root port firstly and then load the PMU module, the PMU
> of the removed PCI device will not be registered.
> 
> I think it is the expected behavior.
> 
> 2. load the PMU module firstly and then hot-remove PCI root port, the PMU
> of the removed PCI device will be unregistered.
> 
> it is what the dwc_pcie_pmu_unregister_nb() does upon BUS_NOTIFY_DEL_DEVICE
> 
> 3. load the PMU module firstly, hot-remove PCI root port, and then hot-plug
> the PCI root port the PMU of the hot-pluged device will not load again by
> current design upon BUS_NOTIFY_DEL_DEVICE.
> 
> I guess it is the really problem. It can be workaround be reload the PMU
> module. It has been a bit complex around how the following interact:
> 
>      - Driver loading/unloading
>      - CPU hotplug events
>      - PCI device add/del events
> 
> We can also add action for BUS_NOTIFY_ADD_DEVICE to address the problem,
> 
>      - scan all PCI device
>       - check RAS_DES cap
>       - check cached PMU node
>       - registers its PMU
> 
> But I prefer leave as it is, just as x86 does in uncore_bus_notify().
> Certainly, if the community deems it necessary to implement
> BUS_NOTIFY_ADD_DEVICE action, I also would like to extend it.
Shuai Xue Oct. 30, 2023, 3:53 a.m. UTC | #15
On 2023/10/27 02:06, Robin Murphy wrote:
...
>>>>      return ret;
>>>>
>>> [...]
>>>>> +static int __init dwc_pcie_pmu_init(void)
>>>>> +{
>>>>> +    int ret;
>>>>> +
>>>>> +    ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
>>>>> +                      "perf/dwc_pcie_pmu:online",
>>>>> +                      dwc_pcie_pmu_online_cpu,
>>>>> +                      dwc_pcie_pmu_offline_cpu);
>>>>> +    if (ret < 0)
>>>>> +        return ret;
>>>>> +
>>>>> +    dwc_pcie_pmu_hp_state = ret;
>>>>> +
>>>>> +    ret = platform_driver_register(&dwc_pcie_pmu_driver);
>>>>> +    if (ret)
>>>>> +        goto platform_driver_register_err;
>>>>> +
>>>>> +    dwc_pcie_pmu_dev = platform_device_register_simple(
>>>>> +                "dwc_pcie_pmu", PLATFORM_DEVID_NONE, NULL, 0);
>>>>> +    if (IS_ERR(dwc_pcie_pmu_dev)) {
>>>>> +        ret = PTR_ERR(dwc_pcie_pmu_dev);
>>>>> +        goto platform_device_register_error;
>>>>> +    }
>>>>
>>>> I'm a bit confused as to why you're having to create a platform device
>>>> for a PCI device -- is this because the main designware driver has already
>>>> bound to it? A comment here explaining why you need to do this would be
>>>> very helpful. In particular, is there any dependency on another driver
>>>> to make sure that e.g. config space accesses work properly? If so, we
>>>> probably need to enforce module load ordering or something like that.
>>>
>>> AFAICS the platform device/driver serve no purpose other than being a hilariously roundabout way to run the for_each_pci_dev() loop in dwc_pcie_pmu_probe() upon module init, and to save explicitly freeing the PMU name/data. Furthermore the devres action for dwc_pcie_pmu_remove_cpuhp_instance() is apparently going for even more style points at module exit by not even relying on the corresponding .remove callback of the tenuous platform driver to undo what its .probe did, but (ab)using the device's devres list to avoid having to keep track of an explicit list of PMU instances at all.
>>
>> You are right.
>>
>>>
>>> Frankly I think it would be a lot more straightforward to just maintain that explicit list of PMU instances, do the PMU creation directly in dwc_pcie_pmu_init(), then unregister and free them in dwc_pcie_pmu_exit(). Not every driver has to contain a literal struct device_driver.
>>
>> Agreed, it might be more straightforward. But personally speaking, I prefer
>> current implementation.
>>
>>      - standard driver creation / probe flow is more normal
> 
> It's really not, though. We have a weird singleton platform device appearing out of nowhere which effectively represents the module being loaded, rather than anything about the actual underlying hardware. If you want this to look like "normal" driver model usage, then create a separate platform device for each physical PCI PMU instance you discover (potentially via both a one-time scan at module_init and an ADD_DEVICE hotplug notifier later), then have the platform driver just register the corresponding PMU device in .probe and unregister it in .remove, without confusing devres action tricks.
> 

Got it. If IIUC, I should register a platform device for each matched pci
device in module_init() or when BUS_NOTIFY_ADD_DEVICE event triggered, and
unwind it in module exit() and when BUS_NOTIFY_DEL_DEVICE event triggered.

Thank you for valuable comments.

Best Regards,
Shuai
Shuai Xue Oct. 30, 2023, 4:54 a.m. UTC | #16
On 2023/10/24 16:27, Shuai Xue wrote:
> 
> Hi, Will,
> 
> On 2023/10/23 20:32, Will Deacon wrote:
>> On Fri, Oct 20, 2023 at 09:42:29PM +0800, Shuai Xue wrote:
>>> This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
>>> for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
>>> Core controller IP which provides statistics feature. The PMU is a PCIe
>>> configuration space register block provided by each PCIe Root Port in a
>>> Vendor-Specific Extended Capability named RAS D.E.S (Debug, Error
>>> injection, and Statistics).
>>
>> Thanks for this. It all looks pretty well written to me, especially the
>> documentation (thanks again!).
> 
> 
> Thank you :)
> 
>>
>> I just have a few comments inline...
>>
>>> To facilitate collection of statistics the controller provides the
>>> following two features for each Root Port:
>>>
>>> - one 64-bit counter for Time Based Analysis (RX/TX data throughput and
>>>   time spent in each low-power LTSSM state) and
>>> - one 32-bit counter for Event Counting (error and non-error events for
>>>   a specified lane)
>>>
>>> Note: There is no interrupt for counter overflow.
>>>
>>> This driver adds PMU devices for each PCIe Root Port. And the PMU device is
>>> named based the BDF of Root Port. For example,
>>>
>>>     30:03.0 PCI bridge: Device 1ded:8000 (rev 01)
>>>
>>> the PMU device name for this Root Port is dwc_rootport_3018.
>>
>> Why not print this in b:d.f formatting then? For example,
>>
>> 	dwc_rootport_30:03.0
>>
>> Does that confuse perf?
> 
> I am afraid, yes. The perf tool can not parse "b:d.f" format,
> 
> 
>     Reading a token: Next token is token PE_VALUE (1.18: )
>     Error: popping token ':' (1.17: )
>     Stack now 0 1 9 52
>     Error: popping token PE_NAME (1.0: )
>     Stack now 0 1 9
>     Error: popping token PE_EVENT_NAME (1.0: )
>     Stack now 0 1
>     Error: popping token PE_START_EVENTS (1.1: )
>     Stack now 0
>     Cleanup: discarding lookahead token PE_VALUE (1.18: )
>     Stack now 0
>     event syntax error: '..otport_0000:30:03.0/Rx_PCIe_TLP_Data_Payload/'
>                                       \___ parser error
>     Run 'perf list' for a list of valid events
> 
> ":" may not be legal. I am not familiar with perf parser, +@Ian for help.
> 
> 
>>
>> Also, should the segment/domain be factored in as well, in case we get
>> multiple instances of the IP and a resulting name collision?
> 
> Each instance has different BDF, so IMHO, it will not result name collision.
> 
>     #ls /sys/bus/event_source/devices/ | grep dwc
>     dwc_rootport_0
>     dwc_rootport_10
>     dwc_rootport_1000
>     dwc_rootport_18
>     dwc_rootport_3000
>     dwc_rootport_3008
>     dwc_rootport_3010
>     dwc_rootport_3018
>     dwc_rootport_8
>     dwc_rootport_8000
>     dwc_rootport_9800
>     dwc_rootport_9808
>     dwc_rootport_9810
>     dwc_rootport_9818
>     dwc_rootport_b000
> 
> I used to use `dwc_rootport_300300` in v1, the subfix is kind of "b:d.f"
> format created by:
> 
> 	+#define DWC_PCIE_CREATE_BDF(seg, bus, dev, func)	\
> 	+	(((seg) << 24) | (((bus) & 0xFF) << 16) | (((dev) & 0xFF) << 8) | (func))
> 
>>
>> - `dwc` indicates the PMU is for Synopsys DesignWare Cores PCIe controller IP
>> - `rootport` indicates the PMU is for a root port device
>> - `100000` indicates the device address
> 
> But Robin and Jonathan suggested to use the standard bdf address. Are you
> ask me to change back? I would like to check back :)
> 
>>
>>> +struct dwc_pcie_format_attr {
>>> +	struct device_attribute attr;
>>> +	u64 field;
>>> +	int config;
>>> +};
>>> +
>>> +static ssize_t dwc_pcie_pmu_format_show(struct device *dev,
>>> +					struct device_attribute *attr,
>>> +					char *buf)
>>> +{
>>> +	struct dwc_pcie_format_attr *fmt = container_of(attr, typeof(*fmt), attr);
>>> +	int lo = __ffs(fmt->field), hi = __fls(fmt->field);
>>> +
>>> +	return sysfs_emit(buf, "config:%d-%d\n", lo, hi);
>>> +}
>>> +
>>> +#define _dwc_pcie_format_attr(_name, _cfg, _fld)			    \
>>> +	(&((struct dwc_pcie_format_attr[]) {{				    \
>>> +		.attr = __ATTR(_name, 0444, dwc_pcie_pmu_format_show, NULL),\
>>> +		.config = _cfg,						    \
>>> +		.field = _fld,						    \
>>> +	}})[0].attr.attr)
>>> +
>>> +#define dwc_pcie_format_attr(_name, _fld)	_dwc_pcie_format_attr(_name, 0, _fld)
>>> +
>>> +static struct attribute *dwc_pcie_format_attrs[] = {
>>> +	dwc_pcie_format_attr(type, DWC_PCIE_CONFIG_TYPE),
>>> +	dwc_pcie_format_attr(eventid, DWC_PCIE_CONFIG_EVENTID),
>>> +	dwc_pcie_format_attr(lane, DWC_PCIE_CONFIG_LANE),
>>> +	NULL,
>>> +};
>>> +
>>> +static struct attribute_group dwc_pcie_format_attrs_group = {
>>> +	.name = "format",
>>> +	.attrs = dwc_pcie_format_attrs,
>>> +};
>>> +
>>> +struct dwc_pcie_event_attr {
>>> +	struct device_attribute attr;
>>> +	enum dwc_pcie_event_type type;
>>> +	u16 eventid;
>>> +	u8 lane;
>>> +};
>>
>> There are a bunch of helpers in linux/perf_event.h for handling some of
>> this sysfs stuff. For example, have a look at PMU_FORMAT_ATTR() and
>> friends to see if they work for you (some of the other PMU drivers under
>> drivers/perf/ use these).
> 
> I will PMU_FORMAT_ATTR to simplify format sysfs stuff, thank you.
> 
> perf_pmu_events_attr is quite simple and only one `id` filed, I have to
> extend a `type` filed to distinguish two types (DWC_PCIE_LANE_EVENT,
> DWC_PCIE_TIME_BASE_EVENT) of DWC PMU, so I will not use PMU_EVENT_ATTR().
> 
>>
>>> +static void dwc_pcie_pmu_lane_event_enable(struct dwc_pcie_pmu *pcie_pmu,
>>> +					   bool enable)
>>> +{
>>> +	struct pci_dev *pdev = pcie_pmu->pdev;
>>> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
>>> +	u32 val;
>>> +
>>> +	pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, &val);
>>> +
>>> +	/* Clear DWC_PCIE_CNT_ENABLE field first */
>>> +	val &= ~DWC_PCIE_CNT_ENABLE;
>>> +	if (enable)
>>> +		val |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_ON);
>>> +	else
>>> +		val |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF);
>>> +
>>> +	pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, val);
>>> +}
>>> +
>>> +static void dwc_pcie_pmu_time_based_event_enable(struct dwc_pcie_pmu *pcie_pmu,
>>> +					  bool enable)
>>> +{
>>> +	struct pci_dev *pdev = pcie_pmu->pdev;
>>> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
>>> +	u32 val;
>>> +
>>> +	pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL,
>>> +			      &val);
>>> +
>>> +	if (enable)
>>> +		val |= DWC_PCIE_TIME_BASED_CNT_ENABLE;
>>> +	else
>>> +		val &= ~DWC_PCIE_TIME_BASED_CNT_ENABLE;
>>> +
>>> +	pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL,
>>> +			       val);
>>> +}
>>
>> I think you could implement both of these _enable() functions as simple
>> wrappers around something like pci_clear_and_set_dword() -- maybe that
>> could move into a header out of aspm.c?
> 
> Agreed, I will add a separate patch to move pci_clear_and_set_dword() out
> of aspm.c and then use it to simplify these two _enable() functions.
> 
>>
>>> +static u64 dwc_pcie_pmu_read_lane_event_counter(struct perf_event *event)
>>> +{
>>> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
>>> +	struct pci_dev *pdev = pcie_pmu->pdev;
>>> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
>>> +	u32 val;
>>> +
>>> +	pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_DATA, &val);
>>> +
>>> +	return val;
>>> +}
>>> +
>>> +static u64 dwc_pcie_pmu_read_time_based_counter(struct perf_event *event)
>>> +{
>>> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
>>> +	struct pci_dev *pdev = pcie_pmu->pdev;
>>> +	int event_id = DWC_PCIE_EVENT_ID(event);
>>> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
>>> +	u32 lo, hi, ss;
>>> +
>>> +	/*
>>> +	 * The 64-bit value of the data counter is spread across two
>>> +	 * registers that are not synchronized. In order to read them
>>> +	 * atomically, ensure that the high 32 bits match before and after
>>> +	 * reading the low 32 bits.
>>> +	 */
>>> +	pci_read_config_dword(pdev, ras_des_offset +
>>> +		DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH, &hi);
>>> +	do {
>>> +		/* snapshot the high 32 bits */
>>> +		ss = hi;
>>> +
>>> +		pci_read_config_dword(
>>> +			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_DATA_REG_LOW,
>>> +			&lo);
>>> +		pci_read_config_dword(
>>> +			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH,
>>> +			&hi);
>>> +	} while (hi != ss);
>>
>> I think it would be a good idea to bound this loop based on either number of
>> retries or a timeout. If the hardware wedges for whatever reason, we're
>> going to get stuck in here.
> 
> I looked all drivers in kernel which use similar trick, but did not find
> example implementation.
> 
> Do we really need it?
> 
>>
>>> +
>>> +	/*
>>> +	 * The Group#1 event measures the amount of data processed in 16-byte
>>> +	 * units. Simplify the end-user interface by multiplying the counter
>>> +	 * at the point of read.
>>> +	 */
>>> +	if (event_id >= 0x20 && event_id <= 0x23)
>>> +		return (((u64)hi << 32) | lo) << 4;
>>> +	else
>>> +		return (((u64)hi << 32) | lo);
>>
>> nit, but I think it would be clearer to do:
>>
>> 	ret = ((u64)hi << 32) | lo;
>>
>> 	/* ... */
>> 	if (event_id >= 0x20 && event_id <= 0x23)
>> 		ret <<= 4;
>>
>> 	return ret;
>>
> 
> Quite beautiful, will fix it.
> 
>>> +}
>>> +
>>> +static void dwc_pcie_pmu_event_update(struct perf_event *event)
>>> +{
>>> +	struct hw_perf_event *hwc = &event->hw;
>>> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>>> +	u64 delta, prev, now;
>>> +
>>> +	do {
>>> +		prev = local64_read(&hwc->prev_count);
>>> +
>>> +		if (type == DWC_PCIE_LANE_EVENT)
>>> +			now = dwc_pcie_pmu_read_lane_event_counter(event);
>>> +		else if (type == DWC_PCIE_TIME_BASE_EVENT)
>>> +			now = dwc_pcie_pmu_read_time_based_counter(event);
>>> +
>>> +	} while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
>>> +
>>> +	if (type == DWC_PCIE_LANE_EVENT)
>>> +		delta = (now - prev) & DWC_PCIE_LANE_EVENT_MAX_PERIOD;
>>> +	else if (type == DWC_PCIE_TIME_BASE_EVENT)
>>> +		delta = (now - prev) & DWC_PCIE_TIME_BASED_EVENT_MAX_PERIOD;
>>
>> Similarly here, I think it would be clearer to construct a 'u64 max_period'
>> variable and then just unconditionally mask against that. 
> 
> Will fix it.
> 
>> In general, you
>> have quite a lot of 'if (type == LANE) ... else if (type == TIME) ...'
>> code in this driver. I think that's probably fine as long as we have two
>> event types, but if this extends in the future then it's probably worth
>> looking at having separate 'ops' structures for the event types and
>> dispatching to them directly.
> 
> Agreed, will dispatch separately if more types are added in the future.
> 
>>
>>> +static int dwc_pcie_pmu_event_init(struct perf_event *event)
>>> +{
>>> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
>>> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>>> +	struct perf_event *sibling;
>>> +	u32 lane;
>>> +
>>> +	if (event->attr.type != event->pmu->type)
>>> +		return -ENOENT;
>>> +
>>> +	/* We don't support sampling */
>>> +	if (is_sampling_event(event))
>>> +		return -EINVAL;
>>> +
>>> +	/* We cannot support task bound events */
>>> +	if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK)
>>> +		return -EINVAL;
>>> +
>>> +	if (event->group_leader != event &&
>>> +	    !is_software_event(event->group_leader))
>>> +		return -EINVAL;
>>> +
>>> +	for_each_sibling_event(sibling, event->group_leader) {
>>> +		if (sibling->pmu != event->pmu && !is_software_event(sibling))
>>> +			return -EINVAL;
>>> +	}
>>> +
>>> +	if (type == DWC_PCIE_LANE_EVENT) {
>>> +		lane = DWC_PCIE_EVENT_LANE(event);
>>> +		if (lane < 0 || lane >= pcie_pmu->nr_lanes)
>>> +			return -EINVAL;
>>> +	}
>>> +
>>> +	event->cpu = pcie_pmu->on_cpu;
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static void dwc_pcie_pmu_set_period(struct hw_perf_event *hwc)
>>> +{
>>> +	local64_set(&hwc->prev_count, 0);
>>> +}
>>> +
>>> +static void dwc_pcie_pmu_event_start(struct perf_event *event, int flags)
>>> +{
>>> +	struct hw_perf_event *hwc = &event->hw;
>>> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
>>> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>>> +
>>> +	hwc->state = 0;
>>> +	dwc_pcie_pmu_set_period(hwc);
>>> +
>>> +	if (type == DWC_PCIE_LANE_EVENT)
>>> +		dwc_pcie_pmu_lane_event_enable(pcie_pmu, true);
>>> +	else if (type == DWC_PCIE_TIME_BASE_EVENT)
>>> +		dwc_pcie_pmu_time_based_event_enable(pcie_pmu, true);
>>> +}
>>> +
>>> +static void dwc_pcie_pmu_event_stop(struct perf_event *event, int flags)
>>> +{
>>> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
>>> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>>> +	struct hw_perf_event *hwc = &event->hw;
>>> +
>>> +	if (event->hw.state & PERF_HES_STOPPED)
>>> +		return;
>>> +
>>> +	if (type == DWC_PCIE_LANE_EVENT)
>>> +		dwc_pcie_pmu_lane_event_enable(pcie_pmu, false);
>>> +	else if (type == DWC_PCIE_TIME_BASE_EVENT)
>>> +		dwc_pcie_pmu_time_based_event_enable(pcie_pmu, false);
>>> +
>>> +	dwc_pcie_pmu_event_update(event);
>>> +	hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
>>> +}
>>> +
>>> +static int dwc_pcie_pmu_event_add(struct perf_event *event, int flags)
>>> +{
>>> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
>>> +	struct pci_dev *pdev = pcie_pmu->pdev;
>>> +	struct hw_perf_event *hwc = &event->hw;
>>> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>>> +	int event_id = DWC_PCIE_EVENT_ID(event);
>>> +	int lane = DWC_PCIE_EVENT_LANE(event);
>>> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
>>> +	u32 ctrl;
>>> +
>>> +	/* one counter for each type and it is in use */
>>> +	if (pcie_pmu->event[type])
>>> +		return -ENOSPC;
>>
>> I'm a bit worried about this -- isn't the type basically funneled in
>> directly from userspace? If so, it's not safe to use it as index like
>> this. It's probably better to sanitise the input early in
>> dwc_pcie_pmu_event_init(), so that we know we have either a lane or a
>> time event everywhere else.
> 
> Good catch, I will sanitise it in dwc_pcie_pmu_event_init().
> 
>>
>> If you haven't tried it, there's a decent fuzzing tool for perf, so it's
>> probably worth taking that for a spin (it might need educating about your
>> driver):
>>
>> https://web.eece.maine.edu/~vweaver/projects/perf_events/fuzzer/
> 
> Sorry, I haven't. I will spin before a new version sended.
> 
>>
>>> +	if (type == DWC_PCIE_LANE_EVENT) {
>>> +		/* EVENT_COUNTER_DATA_REG needs clear manually */
>>> +		ctrl = FIELD_PREP(DWC_PCIE_CNT_EVENT_SEL, event_id) |
>>> +			FIELD_PREP(DWC_PCIE_CNT_LANE_SEL, lane) |
>>> +			FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF) |
>>> +			FIELD_PREP(DWC_PCIE_EVENT_CLEAR, DWC_PCIE_EVENT_PER_CLEAR);
>>> +		pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL,
>>> +				       ctrl);
>>> +	} else if (type == DWC_PCIE_TIME_BASE_EVENT) {
>>> +		/*
>>> +		 * TIME_BASED_ANAL_DATA_REG is a 64 bit register, we can safely
>>> +		 * use it with any manually controlled duration. And it is
>>> +		 * cleared when next measurement starts.
>>> +		 */
>>> +		ctrl = FIELD_PREP(DWC_PCIE_TIME_BASED_REPORT_SEL, event_id) |
>>> +			FIELD_PREP(DWC_PCIE_TIME_BASED_DURATION_SEL,
>>> +				   DWC_PCIE_DURATION_MANUAL_CTL) |
>>> +			DWC_PCIE_TIME_BASED_CNT_ENABLE;
>>> +		pci_write_config_dword(
>>> +			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL, ctrl);
>>
>> Maybe move these into separate lane/time helpers rather than clutter this
>> function with the field definitions?
> 
> Aha, I used to. Robin complained that the helpers were already confusing enough
> so warp out control register configuration from sub-function to .add().
> 
>>
>>> +static void dwc_pcie_pmu_event_del(struct perf_event *event, int flags)
>>> +{
>>> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
>>> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
>>> +
>>> +	dwc_pcie_pmu_event_stop(event, flags | PERF_EF_UPDATE);
>>> +	perf_event_update_userpage(event);
>>> +	pcie_pmu->event[type] = NULL;
>>> +}
>>> +
>>> +static void dwc_pcie_pmu_remove_cpuhp_instance(void *hotplug_node)
>>> +{
>>> +	cpuhp_state_remove_instance_nocalls(dwc_pcie_pmu_hp_state, hotplug_node);
>>> +}
>>> +
>>> +/*
>>> + * Find the PMU of a PCI device.
>>> + * @pdev: The PCI device.
>>> + */
>>> +static struct dwc_pcie_pmu *dwc_pcie_find_dev_pmu(struct pci_dev *pdev)
>>> +{
>>> +	struct dwc_pcie_pmu *pcie_pmu;
>>> +
>>> +	list_for_each_entry(pcie_pmu, &dwc_pcie_pmu_head, pmu_node)
>>> +		if (pcie_pmu->pdev == pdev)
>>> +			return pcie_pmu;
>>> +
>>> +	return NULL;
>>> +}
>>> +
>>> +static void dwc_pcie_pmu_unregister_pmu(void *data)
>>> +{
>>> +	struct dwc_pcie_pmu *pcie_pmu = data;
>>> +
>>> +	if (!pcie_pmu->registered)
>>> +		return;
>>> +
>>> +	pcie_pmu->registered = false;
>>> +	list_del(&pcie_pmu->pmu_node);
>>> +	perf_pmu_unregister(&pcie_pmu->pmu);
>>
>> Do you not need any locking here? The cpu hotplug callbacks are still live
>> and I'm not seeing how you prevent them from picking up the PMU from the
>> list right before you unregister it.
> 
> The hotplug callball also try to pick up the PMU to unregister, but if
> the PMU is already unregistered here, pcie_pmu->registered will be set as
> false, so the PMU will not unregistered again.
> 
> So, I think pcie_pmu->registered is some kind of lock? Please correct me if
> I missed anything else.
> 
>>
>>> +}
>>> +
>>> +static int dwc_pcie_pmu_notifier(struct notifier_block *nb,
>>> +				     unsigned long action, void *data)
>>> +{
>>> +	struct device *dev = data;
>>> +	struct pci_dev *pdev = to_pci_dev(dev);
>>> +	struct dwc_pcie_pmu *pcie_pmu;
>>> +
>>> +	/* Unregister the PMU when the device is going to be deleted. */
>>> +	if (action != BUS_NOTIFY_DEL_DEVICE)
>>> +		return NOTIFY_DONE;
>>> +
>>> +	pcie_pmu = dwc_pcie_find_dev_pmu(pdev);
>>> +	if (!pcie_pmu)
>>> +		return NOTIFY_DONE;
>>> +
>>> +	dwc_pcie_pmu_unregister_pmu(pcie_pmu);
>>> +
>>> +	return NOTIFY_OK;
>>> +}
>>> +
>>> +static struct notifier_block dwc_pcie_pmu_nb = {
>>> +	.notifier_call = dwc_pcie_pmu_notifier,
>>> +};
>>> +
>>> +static void dwc_pcie_pmu_unregister_nb(void *data)
>>> +{
>>> +	bus_unregister_notifier(&pci_bus_type, &dwc_pcie_pmu_nb);
>>> +}
>>> +
>>> +static int dwc_pcie_pmu_probe(struct platform_device *plat_dev)
>>> +{
>>> +	struct pci_dev *pdev = NULL;
>>> +	struct dwc_pcie_pmu *pcie_pmu;
>>> +	bool notify = false;
>>> +	char *name;
>>> +	u32 bdf;
>>> +	int ret;
>>> +
>>> +	/* Match the rootport with VSEC_RAS_DES_ID, and register a PMU for it */
>>> +	for_each_pci_dev(pdev) {
>>> +		u16 vsec;
>>> +		u32 val;
>>> +
>>> +		if (!(pci_is_pcie(pdev) &&
>>> +		      pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT))
>>> +			continue;
>>> +
>>> +		vsec = pci_find_vsec_capability(pdev, PCI_VENDOR_ID_ALIBABA,
>>> +						DWC_PCIE_VSEC_RAS_DES_ID);
>>> +		if (!vsec)
>>> +			continue;
>>> +
>>> +		pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &val);
>>> +		if (PCI_VNDR_HEADER_REV(val) != 0x04)
>>> +			continue;
>>> +		pci_dbg(pdev,
>>> +			"Detected PCIe Vendor-Specific Extended Capability RAS DES\n");
>>> +
>>> +		bdf = PCI_DEVID(pdev->bus->number, pdev->devfn);
>>> +		name = devm_kasprintf(&plat_dev->dev, GFP_KERNEL, "dwc_rootport_%x",
>>> +				      bdf);
>>> +		if (!name) {
>>> +			ret = -ENOMEM;
>>> +			goto out;
>>> +		}
>>> +
>>> +		/* All checks passed, go go go */
>>> +		pcie_pmu = devm_kzalloc(&plat_dev->dev, sizeof(*pcie_pmu), GFP_KERNEL);
>>> +		if (!pcie_pmu) {
>>> +			ret = -ENOMEM;
>>> +			goto out;
>>> +		}
>>> +
>>> +		pcie_pmu->pdev = pdev;
>>> +		pcie_pmu->ras_des_offset = vsec;
>>> +		pcie_pmu->nr_lanes = pcie_get_width_cap(pdev);
>>> +		pcie_pmu->on_cpu = -1;
>>> +		pcie_pmu->pmu = (struct pmu){
>>> +			.module		= THIS_MODULE,
>>> +			.attr_groups	= dwc_pcie_attr_groups,
>>> +			.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
>>> +			.task_ctx_nr	= perf_invalid_context,
>>> +			.event_init	= dwc_pcie_pmu_event_init,
>>> +			.add		= dwc_pcie_pmu_event_add,
>>> +			.del		= dwc_pcie_pmu_event_del,
>>> +			.start		= dwc_pcie_pmu_event_start,
>>> +			.stop		= dwc_pcie_pmu_event_stop,
>>> +			.read		= dwc_pcie_pmu_event_update,
>>> +		};
>>> +
>>> +		/* Add this instance to the list used by the offline callback */
>>> +		ret = cpuhp_state_add_instance(dwc_pcie_pmu_hp_state,
>>> +					       &pcie_pmu->cpuhp_node);
>>> +		if (ret) {
>>> +			pci_err(pdev,
>>> +				"Error %d registering hotplug @%x\n", ret, bdf);
>>> +			goto out;
>>> +		}
>>> +
>>> +		/* Unwind when platform driver removes */
>>> +		ret = devm_add_action_or_reset(
>>> +			&plat_dev->dev, dwc_pcie_pmu_remove_cpuhp_instance,
>>> +			&pcie_pmu->cpuhp_node);
>>> +		if (ret)
>>> +			goto out;
>>> +
>>> +		ret = perf_pmu_register(&pcie_pmu->pmu, name, -1);
>>> +		if (ret) {
>>> +			pci_err(pdev,
>>> +				"Error %d registering PMU @%x\n", ret, bdf);
>>> +			goto out;
>>> +		}
>>> +
>>> +		/* Cache PMU to handle pci device hotplug */
>>> +		list_add(&pcie_pmu->pmu_node, &dwc_pcie_pmu_head);
>>> +		pcie_pmu->registered = true;
>>> +		notify = true;
>>> +
>>> +		ret = devm_add_action_or_reset(
>>> +			&plat_dev->dev, dwc_pcie_pmu_unregister_pmu, pcie_pmu);
>>> +		if (ret)
>>> +			goto out;
>>
>> Hmm, why do you need the PCI bus notifier on BUS_NOTIFY_DEL_DEVICE if you
>> register this action callback? I'm struggling to get my head around how the
>> following interact:
>>
>>   - Driver loading/unloading
>>   - CPU hotplug events
>>   - PCI device add/del events
>>
>> as well as the lifetime of the platform device relative to the PCI device.
> 
> Yes, they are a bit complex.
> 
> The event triggers of the above three parts of PMU, CPU and PCI device are
> quite independent,
> 
>  - Driver loading/unloading: the lifetime of platform device
> 	insmod/rmmod module of this driver
>  - CPU hotplug events:
> 	echo 0 > /sys/devices/system/cpu/cpu0/online
> 	echo 1 > /sys/devices/system/cpu/cpu0/online
>  - PCI device add/del events (a.k.a PCI hotplug events), e.g
> 	echo 1 > /sys/bus/pci/devices/0000\:30\:02.0/remove
> 	echo 1 > /sys/bus/pci/rescan
> 
> The lifecycles of PMU, CPU, and PCI devices have mutual influence on each other.
> 
> 1. The CPU hotplug just as other PMUs in drivers/perf, let's talk about it
>    first.
> 
>    The PMU context is binded to a CPU picked from the same NUMA node of PCI
>    device, so if the picked CPU is offlined at runtime, we need to migate
>    the context to another local online CPU in the same NUMA node.
> 
> 2. The Driver loading/unloading is independent, for exmaple, rmmod module
>    if not built in or unbinds the driver. Then all PMUs of PCI device will
>    be unregistered as expected, and the PCI device is not affected.
> 
> 3. The PMU holds the PCI device to which it belongs, so that it can access
>    the PCI DES capability. If the PCI device is unplugged at runtime, the
>    PMU should also be unregistered.  It's the basic idea suggested by
>    @Yicong, just as x86 does in uncore_bus_notify().
> 	
> 
> 
>>
>>> +	}
>>> +
>>> +	if (notify && !bus_register_notifier(&pci_bus_type, &dwc_pcie_pmu_nb))
>>> +		return devm_add_action_or_reset(
>>> +			&plat_dev->dev, dwc_pcie_pmu_unregister_nb, NULL);
>>> +
>>> +	return 0;
>>> +
>>> +out:
>>> +	pci_dev_put(pdev);
>>> +
>>> +	return ret;
>>> +}
>>> +
>>> +static int dwc_pcie_pmu_online_cpu(unsigned int cpu, struct hlist_node *cpuhp_node)
>>> +{
>>> +	struct dwc_pcie_pmu *pcie_pmu;
>>> +
>>> +	pcie_pmu = hlist_entry_safe(cpuhp_node, struct dwc_pcie_pmu, cpuhp_node);
>>> +	if (pcie_pmu->on_cpu == -1)
>>> +		pcie_pmu->on_cpu = cpumask_local_spread(
>>> +			0, dev_to_node(&pcie_pmu->pdev->dev));
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static int dwc_pcie_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_node)
>>> +{
>>> +	struct dwc_pcie_pmu *pcie_pmu;
>>> +	struct pci_dev *pdev;
>>> +	int node;
>>> +	cpumask_t mask;
>>> +	unsigned int target;
>>> +
>>> +	pcie_pmu = hlist_entry_safe(cpuhp_node, struct dwc_pcie_pmu, cpuhp_node);
>>> +	/* Nothing to do if this CPU doesn't own the PMU */
>>> +	if (cpu != pcie_pmu->on_cpu)
>>> +		return 0;
>>> +
>>> +	pcie_pmu->on_cpu = -1;
>>> +	pdev = pcie_pmu->pdev;
>>> +	node = dev_to_node(&pdev->dev);
>>> +	if (cpumask_and(&mask, cpumask_of_node(node), cpu_online_mask) &&
>>> +	    cpumask_andnot(&mask, &mask, cpumask_of(cpu)))
>>> +		target = cpumask_any(&mask);
>>> +	else
>>> +		target = cpumask_any_but(cpu_online_mask, cpu);
>>> +
>>> +	if (target >= nr_cpu_ids) {
>>> +		pci_err(pdev, "There is no CPU to set\n");
>>> +		return 0;
>>> +	}
>>> +
>>> +	/* This PMU does NOT support interrupt, just migrate context. */
>>> +	perf_pmu_migrate_context(&pcie_pmu->pmu, cpu, target);
>>> +	pcie_pmu->on_cpu = target;
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static struct platform_driver dwc_pcie_pmu_driver = {
>>> +	.probe = dwc_pcie_pmu_probe,
>>> +	.driver = {.name = "dwc_pcie_pmu",},
>>> +};
>>> +
>>> +static int __init dwc_pcie_pmu_init(void)
>>> +{
>>> +	int ret;
>>> +
>>> +	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
>>> +				      "perf/dwc_pcie_pmu:online",
>>> +				      dwc_pcie_pmu_online_cpu,
>>> +				      dwc_pcie_pmu_offline_cpu);
>>> +	if (ret < 0)
>>> +		return ret;
>>> +
>>> +	dwc_pcie_pmu_hp_state = ret;
>>> +
>>> +	ret = platform_driver_register(&dwc_pcie_pmu_driver);
>>> +	if (ret)
>>> +		goto platform_driver_register_err;
>>> +
>>> +	dwc_pcie_pmu_dev = platform_device_register_simple(
>>> +				"dwc_pcie_pmu", PLATFORM_DEVID_NONE, NULL, 0);
>>> +	if (IS_ERR(dwc_pcie_pmu_dev)) {
>>> +		ret = PTR_ERR(dwc_pcie_pmu_dev);
>>> +		goto platform_device_register_error;
>>> +	}
>>
>> I'm a bit confused as to why you're having to create a platform device
>> for a PCI device -- is this because the main designware driver has already
>> bound to it? A comment here explaining why you need to do this would be
>> very helpful. 
> 
> The problem here is that we need to do that fundamental redesign of the
> way the PCI ports drivers work so that the PCIe VSEC/DVSEC capability, e.g
> RAS_DES PMU here could probe and remove, hotplug and unhotplug more gracefully.
> I think we have discussed the current limitation in the previous version[1].
> 
>>> Given that we have a appropriate way to tear down the PMUs via devm_add_action_or_reset(),
>>> I am going to remove the redundant probe/remove framework via platform_driver_{un}register().
>>> for_each probe process in __dwc_pcie_pmu_probe() will be move into dwc_pcie_pmu_init().
>>> Is it a better way?
>>
>> I think I'd prefer to see a standard driver creation / probe flow even if you could in theory
> avoid it. [2]
> 
> I discussed with @Jonathan about the probe flow. Jonathan prefers the standard driver
> creation/probe flow. What's your opinion?
> 
> If you are happy with the current implementation flow, I will just add a comment.
> 
> 
>> In particular, is there any dependency on another driver
>> to make sure that e.g. config space accesses work properly? If so, we
>> probably need to enforce module load ordering or something like that.
> 
> Of course, at least it depends on
> 	- pci_driver_init called by postcore_initcall, early order 2
> 	- acpi_pci_init called by arch_initcall, early order 3
> 
> so I think module_init called by device_initcall, early order 6 is ok?
> 
> 
> Thank you for valuable comments,
> Best Regards,
> Shuai
> 
> [1] https://lore.kernel.org/lkml/634f4762-cf2e-4535-f369-4032d65093f0@linux.alibaba.com/t/#ma82c49a12d579c2e497b321f46f3f56789be5d2c
> [2] https://lore.kernel.org/lkml/634f4762-cf2e-4535-f369-4032d65093f0@linux.alibaba.com/t/#m595e169995b1d61a2737e67925468929cf0dba6a
> [3] https://lore.kernel.org/lkml/20230522035428.69441-5-xueshuai@linux.alibaba.com/T/#m8f5aec1cb50b42825739a5977629c8ea98710a6e


Hi, Will,

Any feedback?

Thank you.
Best Regards,
Shuai
Krishna chaitanya chundru Oct. 30, 2023, 6:28 a.m. UTC | #17
On 10/20/2023 7:12 PM, Shuai Xue wrote:
> This commit adds the PCIe Performance Monitoring Unit (PMU) driver support
> for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express
> Core controller IP which provides statistics feature. The PMU is a PCIe
> configuration space register block provided by each PCIe Root Port in a
> Vendor-Specific Extended Capability named RAS D.E.S (Debug, Error
> injection, and Statistics).
>
> To facilitate collection of statistics the controller provides the
> following two features for each Root Port:
>
> - one 64-bit counter for Time Based Analysis (RX/TX data throughput and
>    time spent in each low-power LTSSM state) and
> - one 32-bit counter for Event Counting (error and non-error events for
>    a specified lane)
>
> Note: There is no interrupt for counter overflow.
>
> This driver adds PMU devices for each PCIe Root Port. And the PMU device is
> named based the BDF of Root Port. For example,
>
>      30:03.0 PCI bridge: Device 1ded:8000 (rev 01)
>
> the PMU device name for this Root Port is dwc_rootport_3018.
>
> Example usage of counting PCIe RX TLP data payload (Units of bytes)::
>
>      $# perf stat -a -e dwc_rootport_3018/Rx_PCIe_TLP_Data_Payload/
>
> average RX bandwidth can be calculated like this:
>
>      PCIe TX Bandwidth = Rx_PCIe_TLP_Data_Payload / Measure_Time_Window
>
> Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
> ---
>   drivers/perf/Kconfig        |   7 +
>   drivers/perf/Makefile       |   1 +
>   drivers/perf/dwc_pcie_pmu.c | 770 ++++++++++++++++++++++++++++++++++++
>   3 files changed, 778 insertions(+)
>   create mode 100644 drivers/perf/dwc_pcie_pmu.c
>
> diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
> index 273d67ecf6d2..ec6e0d9194a1 100644
> --- a/drivers/perf/Kconfig
> +++ b/drivers/perf/Kconfig
> @@ -217,6 +217,13 @@ config MARVELL_CN10K_DDR_PMU
>   	  Enable perf support for Marvell DDR Performance monitoring
>   	  event on CN10K platform.
>   
> +config DWC_PCIE_PMU
> +	tristate "Synopsys DesignWare PCIe PMU"
> +	depends on PCI
> +	help
> +	  Enable perf support for Synopsys DesignWare PCIe PMU Performance
> +	  monitoring event on platform including the Alibaba Yitian 710.
> +
>   source "drivers/perf/arm_cspmu/Kconfig"
>   
>   source "drivers/perf/amlogic/Kconfig"
> diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
> index 16b3ec4db916..a06338e3401c 100644
> --- a/drivers/perf/Makefile
> +++ b/drivers/perf/Makefile
> @@ -23,6 +23,7 @@ obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o
>   obj-$(CONFIG_MARVELL_CN10K_DDR_PMU) += marvell_cn10k_ddr_pmu.o
>   obj-$(CONFIG_APPLE_M1_CPU_PMU) += apple_m1_cpu_pmu.o
>   obj-$(CONFIG_ALIBABA_UNCORE_DRW_PMU) += alibaba_uncore_drw_pmu.o
> +obj-$(CONFIG_DWC_PCIE_PMU) += dwc_pcie_pmu.o
>   obj-$(CONFIG_ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += arm_cspmu/
>   obj-$(CONFIG_MESON_DDR_PMU) += amlogic/
>   obj-$(CONFIG_CXL_PMU) += cxl_pmu.o
> diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c
> new file mode 100644
> index 000000000000..ddb06d763b0c
> --- /dev/null
> +++ b/drivers/perf/dwc_pcie_pmu.c
> @@ -0,0 +1,770 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Synopsys DesignWare PCIe PMU driver
> + *
> + * Copyright (C) 2021-2023 Alibaba Inc.
> + */
> +
> +#include <linux/bitfield.h>
> +#include <linux/bitops.h>
> +#include <linux/cpuhotplug.h>
> +#include <linux/cpumask.h>
> +#include <linux/device.h>
> +#include <linux/errno.h>
> +#include <linux/kernel.h>
> +#include <linux/list.h>
> +#include <linux/perf_event.h>
> +#include <linux/pci.h>
> +#include <linux/platform_device.h>
> +#include <linux/smp.h>
> +#include <linux/sysfs.h>
> +#include <linux/types.h>
> +
> +#define DWC_PCIE_VSEC_RAS_DES_ID		0x02
> +#define DWC_PCIE_EVENT_CNT_CTL			0x8
> +
> +/*
> + * Event Counter Data Select includes two parts:
> + * - 27-24: Group number(4-bit: 0..0x7)
> + * - 23-16: Event number(8-bit: 0..0x13) within the Group
> + *
> + * Put them together as in TRM.
> + */
> +#define DWC_PCIE_CNT_EVENT_SEL			GENMASK(27, 16)
> +#define DWC_PCIE_CNT_LANE_SEL			GENMASK(11, 8)
> +#define DWC_PCIE_CNT_STATUS			BIT(7)
> +#define DWC_PCIE_CNT_ENABLE			GENMASK(4, 2)
> +#define DWC_PCIE_PER_EVENT_OFF			0x1
> +#define DWC_PCIE_PER_EVENT_ON			0x3
> +#define DWC_PCIE_EVENT_CLEAR			GENMASK(1, 0)
> +#define DWC_PCIE_EVENT_PER_CLEAR		0x1
> +
> +#define DWC_PCIE_EVENT_CNT_DATA			0xC
> +
> +#define DWC_PCIE_TIME_BASED_ANAL_CTL		0x10
> +#define DWC_PCIE_TIME_BASED_REPORT_SEL		GENMASK(31, 24)
> +#define DWC_PCIE_TIME_BASED_DURATION_SEL	GENMASK(15, 8)
> +#define DWC_PCIE_DURATION_MANUAL_CTL		0x0
> +#define DWC_PCIE_DURATION_1MS			0x1
> +#define DWC_PCIE_DURATION_10MS			0x2
> +#define DWC_PCIE_DURATION_100MS			0x3
> +#define DWC_PCIE_DURATION_1S			0x4
> +#define DWC_PCIE_DURATION_2S			0x5
> +#define DWC_PCIE_DURATION_4S			0x6
> +#define DWC_PCIE_DURATION_4US			0xFF
> +#define DWC_PCIE_TIME_BASED_TIMER_START		BIT(0)
> +#define DWC_PCIE_TIME_BASED_CNT_ENABLE		0x1
> +
> +#define DWC_PCIE_TIME_BASED_ANAL_DATA_REG_LOW	0x14
> +#define DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH	0x18
> +
> +/* Event attributes */
> +#define DWC_PCIE_CONFIG_EVENTID			GENMASK(15, 0)
> +#define DWC_PCIE_CONFIG_TYPE			GENMASK(19, 16)
> +#define DWC_PCIE_CONFIG_LANE			GENMASK(27, 20)
> +
> +#define DWC_PCIE_EVENT_ID(event)	FIELD_GET(DWC_PCIE_CONFIG_EVENTID, (event)->attr.config)
> +#define DWC_PCIE_EVENT_TYPE(event)	FIELD_GET(DWC_PCIE_CONFIG_TYPE, (event)->attr.config)
> +#define DWC_PCIE_EVENT_LANE(event)	FIELD_GET(DWC_PCIE_CONFIG_LANE, (event)->attr.config)
> +
> +enum dwc_pcie_event_type {
> +	DWC_PCIE_TIME_BASE_EVENT,
> +	DWC_PCIE_LANE_EVENT,
> +	DWC_PCIE_EVENT_TYPE_MAX,
> +};
> +
> +#define DWC_PCIE_LANE_EVENT_MAX_PERIOD		GENMASK_ULL(31, 0)
> +#define DWC_PCIE_TIME_BASED_EVENT_MAX_PERIOD	GENMASK_ULL(63, 0)
> +
> +struct dwc_pcie_pmu {
> +	struct pmu		pmu;
> +	struct pci_dev		*pdev;		/* Root Port device */
> +	u16			ras_des_offset;
> +	u32			nr_lanes;
> +
> +	struct list_head	pmu_node;
> +	struct hlist_node	cpuhp_node;
> +	struct perf_event	*event[DWC_PCIE_EVENT_TYPE_MAX];
> +	int			on_cpu;
> +	bool			registered;
> +};
> +
> +#define to_dwc_pcie_pmu(p) (container_of(p, struct dwc_pcie_pmu, pmu))
> +
> +static struct platform_device *dwc_pcie_pmu_dev;
> +static int dwc_pcie_pmu_hp_state;
> +static struct list_head dwc_pcie_pmu_head = LIST_HEAD_INIT(dwc_pcie_pmu_head);
> +
> +static ssize_t cpumask_show(struct device *dev,
> +					 struct device_attribute *attr,
> +					 char *buf)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(dev_get_drvdata(dev));
> +
> +	return cpumap_print_to_pagebuf(true, buf, cpumask_of(pcie_pmu->on_cpu));
> +}
> +static DEVICE_ATTR_RO(cpumask);
> +
> +static struct attribute *dwc_pcie_pmu_cpumask_attrs[] = {
> +	&dev_attr_cpumask.attr,
> +	NULL
> +};
> +
> +static struct attribute_group dwc_pcie_cpumask_attr_group = {
> +	.attrs = dwc_pcie_pmu_cpumask_attrs,
> +};
> +
> +struct dwc_pcie_format_attr {
> +	struct device_attribute attr;
> +	u64 field;
> +	int config;
> +};
> +
> +static ssize_t dwc_pcie_pmu_format_show(struct device *dev,
> +					struct device_attribute *attr,
> +					char *buf)
> +{
> +	struct dwc_pcie_format_attr *fmt = container_of(attr, typeof(*fmt), attr);
> +	int lo = __ffs(fmt->field), hi = __fls(fmt->field);
> +
> +	return sysfs_emit(buf, "config:%d-%d\n", lo, hi);
> +}
> +
> +#define _dwc_pcie_format_attr(_name, _cfg, _fld)			    \
> +	(&((struct dwc_pcie_format_attr[]) {{				    \
> +		.attr = __ATTR(_name, 0444, dwc_pcie_pmu_format_show, NULL),\
> +		.config = _cfg,						    \
> +		.field = _fld,						    \
> +	}})[0].attr.attr)
> +
> +#define dwc_pcie_format_attr(_name, _fld)	_dwc_pcie_format_attr(_name, 0, _fld)
> +
> +static struct attribute *dwc_pcie_format_attrs[] = {
> +	dwc_pcie_format_attr(type, DWC_PCIE_CONFIG_TYPE),
> +	dwc_pcie_format_attr(eventid, DWC_PCIE_CONFIG_EVENTID),
> +	dwc_pcie_format_attr(lane, DWC_PCIE_CONFIG_LANE),
> +	NULL,
> +};
> +
> +static struct attribute_group dwc_pcie_format_attrs_group = {
> +	.name = "format",
> +	.attrs = dwc_pcie_format_attrs,
> +};
> +
> +struct dwc_pcie_event_attr {
> +	struct device_attribute attr;
> +	enum dwc_pcie_event_type type;
> +	u16 eventid;
> +	u8 lane;
> +};
> +
> +static ssize_t dwc_pcie_event_show(struct device *dev,
> +				struct device_attribute *attr, char *buf)
> +{
> +	struct dwc_pcie_event_attr *eattr;
> +
> +	eattr = container_of(attr, typeof(*eattr), attr);
> +
> +	if (eattr->type == DWC_PCIE_LANE_EVENT)
> +		return sysfs_emit(buf, "eventid=0x%x,type=0x%x,lane=?\n",
> +				  eattr->eventid, eattr->type);
> +	else if (eattr->type == DWC_PCIE_TIME_BASE_EVENT)
> +		return sysfs_emit(buf, "eventid=0x%x,type=0x%x\n",
> +				  eattr->eventid, eattr->type);
> +
> +	return 0;
> +}
> +
> +#define DWC_PCIE_EVENT_ATTR(_name, _type, _eventid, _lane)		\
> +	(&((struct dwc_pcie_event_attr[]) {{				\
> +		.attr = __ATTR(_name, 0444, dwc_pcie_event_show, NULL),	\
> +		.type = _type,						\
> +		.eventid = _eventid,					\
> +		.lane = _lane,						\
> +	}})[0].attr.attr)
> +
> +#define DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(_name, _eventid)		\
> +	DWC_PCIE_EVENT_ATTR(_name, DWC_PCIE_TIME_BASE_EVENT, _eventid, 0)
> +#define DWC_PCIE_PMU_LANE_EVENT_ATTR(_name, _eventid)			\
> +	DWC_PCIE_EVENT_ATTR(_name, DWC_PCIE_LANE_EVENT, _eventid, 0)
> +
> +static struct attribute *dwc_pcie_pmu_time_event_attrs[] = {
> +	/* Group #0 */
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(one_cycle, 0x00),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(TX_L0S, 0x01),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(RX_L0S, 0x02),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L0, 0x03),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1, 0x04),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1_1, 0x05),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1_2, 0x06),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(CFG_RCVRY, 0x07),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(TX_RX_L0S, 0x08),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1_AUX, 0x09),
> +
> +	/* Group #1 */
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Tx_PCIe_TLP_Data_Payload, 0x20),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Rx_PCIe_TLP_Data_Payload, 0x21),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Tx_CCIX_TLP_Data_Payload, 0x22),
> +	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Rx_CCIX_TLP_Data_Payload, 0x23),
> +
> +	/*
> +	 * Leave it to the user to specify the lane ID to avoid generating
> +	 * a list of hundreds of events.
> +	 */
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_ack_dllp, 0x600),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_update_fc_dllp, 0x601),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_ack_dllp, 0x602),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_update_fc_dllp, 0x603),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_nulified_tlp, 0x604),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_nulified_tlp, 0x605),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_duplicate_tl, 0x606),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_memory_write, 0x700),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_memory_read, 0x701),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_configuration_write, 0x702),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_configuration_read, 0x703),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_io_write, 0x704),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_io_read, 0x705),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_completion_without_data, 0x706),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_completion_with_data, 0x707),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_message_tlp, 0x708),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_atomic, 0x709),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_tlp_with_prefix, 0x70A),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_memory_write, 0x70B),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_memory_read, 0x70C),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_io_write, 0x70F),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_io_read, 0x710),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_completion_without_data, 0x711),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_completion_with_data, 0x712),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_message_tlp, 0x713),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_atomic, 0x714),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_tlp_with_prefix, 0x715),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_ccix_tlp, 0x716),
> +	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_ccix_tlp, 0x717),
> +	NULL
> +};
> +
> +static const struct attribute_group dwc_pcie_event_attrs_group = {
> +	.name = "events",
> +	.attrs = dwc_pcie_pmu_time_event_attrs,
> +};
> +
> +static const struct attribute_group *dwc_pcie_attr_groups[] = {
> +	&dwc_pcie_event_attrs_group,
> +	&dwc_pcie_format_attrs_group,
> +	&dwc_pcie_cpumask_attr_group,
> +	NULL
> +};
> +
> +static void dwc_pcie_pmu_lane_event_enable(struct dwc_pcie_pmu *pcie_pmu,
> +					   bool enable)
> +{
> +	struct pci_dev *pdev = pcie_pmu->pdev;
> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
> +	u32 val;
> +
> +	pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, &val);
> +
> +	/* Clear DWC_PCIE_CNT_ENABLE field first */
> +	val &= ~DWC_PCIE_CNT_ENABLE;
> +	if (enable)
> +		val |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_ON);
> +	else
> +		val |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF);
> +
> +	pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, val);
> +}
> +
> +static void dwc_pcie_pmu_time_based_event_enable(struct dwc_pcie_pmu *pcie_pmu,
> +					  bool enable)
> +{
> +	struct pci_dev *pdev = pcie_pmu->pdev;
> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
> +	u32 val;
> +
> +	pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL,
> +			      &val);
> +
> +	if (enable)
> +		val |= DWC_PCIE_TIME_BASED_CNT_ENABLE;
> +	else
> +		val &= ~DWC_PCIE_TIME_BASED_CNT_ENABLE;
> +
> +	pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL,
> +			       val);
> +}
> +
> +static u64 dwc_pcie_pmu_read_lane_event_counter(struct perf_event *event)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	struct pci_dev *pdev = pcie_pmu->pdev;
> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
> +	u32 val;
> +
> +	pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_DATA, &val);
> +
> +	return val;
> +}
> +
> +static u64 dwc_pcie_pmu_read_time_based_counter(struct perf_event *event)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	struct pci_dev *pdev = pcie_pmu->pdev;
> +	int event_id = DWC_PCIE_EVENT_ID(event);
> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
> +	u32 lo, hi, ss;
> +
> +	/*
> +	 * The 64-bit value of the data counter is spread across two
> +	 * registers that are not synchronized. In order to read them
> +	 * atomically, ensure that the high 32 bits match before and after
> +	 * reading the low 32 bits.
> +	 */
> +	pci_read_config_dword(pdev, ras_des_offset +
> +		DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH, &hi);
> +	do {
> +		/* snapshot the high 32 bits */
> +		ss = hi;
> +
> +		pci_read_config_dword(
> +			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_DATA_REG_LOW,
> +			&lo);
> +		pci_read_config_dword(
> +			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH,
> +			&hi);
> +	} while (hi != ss);
> +
> +	/*
> +	 * The Group#1 event measures the amount of data processed in 16-byte
> +	 * units. Simplify the end-user interface by multiplying the counter
> +	 * at the point of read.
> +	 */
> +	if (event_id >= 0x20 && event_id <= 0x23)
> +		return (((u64)hi << 32) | lo) << 4;
> +	else
> +		return (((u64)hi << 32) | lo);
> +}
> +
> +static void dwc_pcie_pmu_event_update(struct perf_event *event)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +	u64 delta, prev, now;
> +
> +	do {
> +		prev = local64_read(&hwc->prev_count);
> +
> +		if (type == DWC_PCIE_LANE_EVENT)
> +			now = dwc_pcie_pmu_read_lane_event_counter(event);
> +		else if (type == DWC_PCIE_TIME_BASE_EVENT)
> +			now = dwc_pcie_pmu_read_time_based_counter(event);
> +
> +	} while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
> +
> +	if (type == DWC_PCIE_LANE_EVENT)
> +		delta = (now - prev) & DWC_PCIE_LANE_EVENT_MAX_PERIOD;
> +	else if (type == DWC_PCIE_TIME_BASE_EVENT)
> +		delta = (now - prev) & DWC_PCIE_TIME_BASED_EVENT_MAX_PERIOD;
> +
> +	local64_add(delta, &event->count);
> +}
> +
> +static int dwc_pcie_pmu_event_init(struct perf_event *event)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +	struct perf_event *sibling;
> +	u32 lane;
> +
> +	if (event->attr.type != event->pmu->type)
> +		return -ENOENT;
> +
> +	/* We don't support sampling */
> +	if (is_sampling_event(event))
> +		return -EINVAL;
> +
> +	/* We cannot support task bound events */
> +	if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK)
> +		return -EINVAL;
> +
> +	if (event->group_leader != event &&
> +	    !is_software_event(event->group_leader))
> +		return -EINVAL;
> +
> +	for_each_sibling_event(sibling, event->group_leader) {
> +		if (sibling->pmu != event->pmu && !is_software_event(sibling))
> +			return -EINVAL;
> +	}
> +
> +	if (type == DWC_PCIE_LANE_EVENT) {
> +		lane = DWC_PCIE_EVENT_LANE(event);
> +		if (lane < 0 || lane >= pcie_pmu->nr_lanes)
> +			return -EINVAL;
> +	}
> +
> +	event->cpu = pcie_pmu->on_cpu;
> +
> +	return 0;
> +}
> +
> +static void dwc_pcie_pmu_set_period(struct hw_perf_event *hwc)
> +{
> +	local64_set(&hwc->prev_count, 0);
> +}
> +
> +static void dwc_pcie_pmu_event_start(struct perf_event *event, int flags)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +
> +	hwc->state = 0;
> +	dwc_pcie_pmu_set_period(hwc);
> +
> +	if (type == DWC_PCIE_LANE_EVENT)
> +		dwc_pcie_pmu_lane_event_enable(pcie_pmu, true);
> +	else if (type == DWC_PCIE_TIME_BASE_EVENT)
> +		dwc_pcie_pmu_time_based_event_enable(pcie_pmu, true);
> +}
> +
> +static void dwc_pcie_pmu_event_stop(struct perf_event *event, int flags)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +	struct hw_perf_event *hwc = &event->hw;
> +
> +	if (event->hw.state & PERF_HES_STOPPED)
> +		return;
> +
> +	if (type == DWC_PCIE_LANE_EVENT)
> +		dwc_pcie_pmu_lane_event_enable(pcie_pmu, false);
> +	else if (type == DWC_PCIE_TIME_BASE_EVENT)
> +		dwc_pcie_pmu_time_based_event_enable(pcie_pmu, false);
> +
> +	dwc_pcie_pmu_event_update(event);
> +	hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
> +}
> +
> +static int dwc_pcie_pmu_event_add(struct perf_event *event, int flags)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	struct pci_dev *pdev = pcie_pmu->pdev;
> +	struct hw_perf_event *hwc = &event->hw;
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +	int event_id = DWC_PCIE_EVENT_ID(event);
> +	int lane = DWC_PCIE_EVENT_LANE(event);
> +	u16 ras_des_offset = pcie_pmu->ras_des_offset;
> +	u32 ctrl;
> +
> +	/* one counter for each type and it is in use */
> +	if (pcie_pmu->event[type])
> +		return -ENOSPC;
> +
> +	pcie_pmu->event[type] = event;
> +	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
> +
> +	if (type == DWC_PCIE_LANE_EVENT) {
> +		/* EVENT_COUNTER_DATA_REG needs clear manually */
> +		ctrl = FIELD_PREP(DWC_PCIE_CNT_EVENT_SEL, event_id) |
> +			FIELD_PREP(DWC_PCIE_CNT_LANE_SEL, lane) |
> +			FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF) |
> +			FIELD_PREP(DWC_PCIE_EVENT_CLEAR, DWC_PCIE_EVENT_PER_CLEAR);
> +		pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL,
> +				       ctrl);
> +	} else if (type == DWC_PCIE_TIME_BASE_EVENT) {
> +		/*
> +		 * TIME_BASED_ANAL_DATA_REG is a 64 bit register, we can safely
> +		 * use it with any manually controlled duration. And it is
> +		 * cleared when next measurement starts.
> +		 */
> +		ctrl = FIELD_PREP(DWC_PCIE_TIME_BASED_REPORT_SEL, event_id) |
> +			FIELD_PREP(DWC_PCIE_TIME_BASED_DURATION_SEL,
> +				   DWC_PCIE_DURATION_MANUAL_CTL) |
> +			DWC_PCIE_TIME_BASED_CNT_ENABLE;
> +		pci_write_config_dword(
> +			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL, ctrl);
> +	}
> +
> +	if (flags & PERF_EF_START)
> +		dwc_pcie_pmu_event_start(event, PERF_EF_RELOAD);
> +
> +	perf_event_update_userpage(event);
> +
> +	return 0;
> +}
> +
> +static void dwc_pcie_pmu_event_del(struct perf_event *event, int flags)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
> +	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
> +
> +	dwc_pcie_pmu_event_stop(event, flags | PERF_EF_UPDATE);
> +	perf_event_update_userpage(event);
> +	pcie_pmu->event[type] = NULL;
> +}
> +
> +static void dwc_pcie_pmu_remove_cpuhp_instance(void *hotplug_node)
> +{
> +	cpuhp_state_remove_instance_nocalls(dwc_pcie_pmu_hp_state, hotplug_node);
> +}
> +
> +/*
> + * Find the PMU of a PCI device.
> + * @pdev: The PCI device.
> + */
> +static struct dwc_pcie_pmu *dwc_pcie_find_dev_pmu(struct pci_dev *pdev)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu;
> +
> +	list_for_each_entry(pcie_pmu, &dwc_pcie_pmu_head, pmu_node)
> +		if (pcie_pmu->pdev == pdev)
> +			return pcie_pmu;
> +
> +	return NULL;
> +}
> +
> +static void dwc_pcie_pmu_unregister_pmu(void *data)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu = data;
> +
> +	if (!pcie_pmu->registered)
> +		return;
> +
> +	pcie_pmu->registered = false;
> +	list_del(&pcie_pmu->pmu_node);
> +	perf_pmu_unregister(&pcie_pmu->pmu);
> +}
> +
> +static int dwc_pcie_pmu_notifier(struct notifier_block *nb,
> +				     unsigned long action, void *data)
> +{
> +	struct device *dev = data;
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +	struct dwc_pcie_pmu *pcie_pmu;
> +
> +	/* Unregister the PMU when the device is going to be deleted. */
> +	if (action != BUS_NOTIFY_DEL_DEVICE)
> +		return NOTIFY_DONE;
> +
> +	pcie_pmu = dwc_pcie_find_dev_pmu(pdev);
> +	if (!pcie_pmu)
> +		return NOTIFY_DONE;
> +
> +	dwc_pcie_pmu_unregister_pmu(pcie_pmu);
> +
> +	return NOTIFY_OK;
> +}
> +
> +static struct notifier_block dwc_pcie_pmu_nb = {
> +	.notifier_call = dwc_pcie_pmu_notifier,
> +};
> +
> +static void dwc_pcie_pmu_unregister_nb(void *data)
> +{
> +	bus_unregister_notifier(&pci_bus_type, &dwc_pcie_pmu_nb);
> +}
> +
> +static int dwc_pcie_pmu_probe(struct platform_device *plat_dev)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct dwc_pcie_pmu *pcie_pmu;
> +	bool notify = false;
> +	char *name;
> +	u32 bdf;
> +	int ret;
> +
> +	/* Match the rootport with VSEC_RAS_DES_ID, and register a PMU for it */
> +	for_each_pci_dev(pdev) {
> +		u16 vsec;
> +		u32 val;
> +
> +		if (!(pci_is_pcie(pdev) &&
> +		      pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT))
> +			continue;
> +
> +		vsec = pci_find_vsec_capability(pdev, PCI_VENDOR_ID_ALIBABA,
> +						DWC_PCIE_VSEC_RAS_DES_ID);

We are searching for ALIBABA vendor only  for this capability.

Can we have a list of vendor ID's and we can check for all those vendors 
for this capability so that it will be easy to add new vendors in the list

something like this

struct vendor_ids {

int vendor_id;

};

struct vendor_ids dwc_ids[] = {

     {.vendor_id =PCI_VENDOR_ID_ALIBABA },

      {.vendor_id = XXX},

};

	for_each_pci_dev(pdev) {
		u16 vsec;
		u32 val;

		if (!(pci_is_pcie(pdev) &&
		      pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT))
			continue;
		for (int i = 0; i < num of elements of dwc_ids < i++) {
			---

		}
		
		---
	}

Thanks & Regards,
Krishna Chaitanya.

> +		if (!vsec)
> +			continue;
> +
> +		pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &val);
> +		if (PCI_VNDR_HEADER_REV(val) != 0x04)
> +			continue;
> +		pci_dbg(pdev,
> +			"Detected PCIe Vendor-Specific Extended Capability RAS DES\n");
> +
> +		bdf = PCI_DEVID(pdev->bus->number, pdev->devfn);
> +		name = devm_kasprintf(&plat_dev->dev, GFP_KERNEL, "dwc_rootport_%x",
> +				      bdf);
> +		if (!name) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		/* All checks passed, go go go */
> +		pcie_pmu = devm_kzalloc(&plat_dev->dev, sizeof(*pcie_pmu), GFP_KERNEL);
> +		if (!pcie_pmu) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		pcie_pmu->pdev = pdev;
> +		pcie_pmu->ras_des_offset = vsec;
> +		pcie_pmu->nr_lanes = pcie_get_width_cap(pdev);
> +		pcie_pmu->on_cpu = -1;
> +		pcie_pmu->pmu = (struct pmu){
> +			.module		= THIS_MODULE,
> +			.attr_groups	= dwc_pcie_attr_groups,
> +			.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
> +			.task_ctx_nr	= perf_invalid_context,
> +			.event_init	= dwc_pcie_pmu_event_init,
> +			.add		= dwc_pcie_pmu_event_add,
> +			.del		= dwc_pcie_pmu_event_del,
> +			.start		= dwc_pcie_pmu_event_start,
> +			.stop		= dwc_pcie_pmu_event_stop,
> +			.read		= dwc_pcie_pmu_event_update,
> +		};
> +
> +		/* Add this instance to the list used by the offline callback */
> +		ret = cpuhp_state_add_instance(dwc_pcie_pmu_hp_state,
> +					       &pcie_pmu->cpuhp_node);
> +		if (ret) {
> +			pci_err(pdev,
> +				"Error %d registering hotplug @%x\n", ret, bdf);
> +			goto out;
> +		}
> +
> +		/* Unwind when platform driver removes */
> +		ret = devm_add_action_or_reset(
> +			&plat_dev->dev, dwc_pcie_pmu_remove_cpuhp_instance,
> +			&pcie_pmu->cpuhp_node);
> +		if (ret)
> +			goto out;
> +
> +		ret = perf_pmu_register(&pcie_pmu->pmu, name, -1);
> +		if (ret) {
> +			pci_err(pdev,
> +				"Error %d registering PMU @%x\n", ret, bdf);
> +			goto out;
> +		}
> +
> +		/* Cache PMU to handle pci device hotplug */
> +		list_add(&pcie_pmu->pmu_node, &dwc_pcie_pmu_head);
> +		pcie_pmu->registered = true;
> +		notify = true;
> +
> +		ret = devm_add_action_or_reset(
> +			&plat_dev->dev, dwc_pcie_pmu_unregister_pmu, pcie_pmu);
> +		if (ret)
> +			goto out;
> +	}
> +
> +	if (notify && !bus_register_notifier(&pci_bus_type, &dwc_pcie_pmu_nb))
> +		return devm_add_action_or_reset(
> +			&plat_dev->dev, dwc_pcie_pmu_unregister_nb, NULL);
> +
> +	return 0;
> +
> +out:
> +	pci_dev_put(pdev);
> +
> +	return ret;
> +}
> +
> +static int dwc_pcie_pmu_online_cpu(unsigned int cpu, struct hlist_node *cpuhp_node)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu;
> +
> +	pcie_pmu = hlist_entry_safe(cpuhp_node, struct dwc_pcie_pmu, cpuhp_node);
> +	if (pcie_pmu->on_cpu == -1)
> +		pcie_pmu->on_cpu = cpumask_local_spread(
> +			0, dev_to_node(&pcie_pmu->pdev->dev));
> +
> +	return 0;
> +}
> +
> +static int dwc_pcie_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_node)
> +{
> +	struct dwc_pcie_pmu *pcie_pmu;
> +	struct pci_dev *pdev;
> +	int node;
> +	cpumask_t mask;
> +	unsigned int target;
> +
> +	pcie_pmu = hlist_entry_safe(cpuhp_node, struct dwc_pcie_pmu, cpuhp_node);
> +	/* Nothing to do if this CPU doesn't own the PMU */
> +	if (cpu != pcie_pmu->on_cpu)
> +		return 0;
> +
> +	pcie_pmu->on_cpu = -1;
> +	pdev = pcie_pmu->pdev;
> +	node = dev_to_node(&pdev->dev);
> +	if (cpumask_and(&mask, cpumask_of_node(node), cpu_online_mask) &&
> +	    cpumask_andnot(&mask, &mask, cpumask_of(cpu)))
> +		target = cpumask_any(&mask);
> +	else
> +		target = cpumask_any_but(cpu_online_mask, cpu);
> +
> +	if (target >= nr_cpu_ids) {
> +		pci_err(pdev, "There is no CPU to set\n");
> +		return 0;
> +	}
> +
> +	/* This PMU does NOT support interrupt, just migrate context. */
> +	perf_pmu_migrate_context(&pcie_pmu->pmu, cpu, target);
> +	pcie_pmu->on_cpu = target;
> +
> +	return 0;
> +}
> +
> +static struct platform_driver dwc_pcie_pmu_driver = {
> +	.probe = dwc_pcie_pmu_probe,
> +	.driver = {.name = "dwc_pcie_pmu",},
> +};
> +
> +static int __init dwc_pcie_pmu_init(void)
> +{
> +	int ret;
> +
> +	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
> +				      "perf/dwc_pcie_pmu:online",
> +				      dwc_pcie_pmu_online_cpu,
> +				      dwc_pcie_pmu_offline_cpu);
> +	if (ret < 0)
> +		return ret;
> +
> +	dwc_pcie_pmu_hp_state = ret;
> +
> +	ret = platform_driver_register(&dwc_pcie_pmu_driver);
> +	if (ret)
> +		goto platform_driver_register_err;
> +
> +	dwc_pcie_pmu_dev = platform_device_register_simple(
> +				"dwc_pcie_pmu", PLATFORM_DEVID_NONE, NULL, 0);
> +	if (IS_ERR(dwc_pcie_pmu_dev)) {
> +		ret = PTR_ERR(dwc_pcie_pmu_dev);
> +		goto platform_device_register_error;
> +	}
> +
> +	return 0;
> +
> +platform_device_register_error:
> +	platform_driver_unregister(&dwc_pcie_pmu_driver);
> +platform_driver_register_err:
> +	cpuhp_remove_multi_state(dwc_pcie_pmu_hp_state);
> +
> +	return ret;
> +}
> +
> +static void __exit dwc_pcie_pmu_exit(void)
> +{
> +	platform_device_unregister(dwc_pcie_pmu_dev);
> +	platform_driver_unregister(&dwc_pcie_pmu_driver);
> +	cpuhp_remove_multi_state(dwc_pcie_pmu_hp_state);
> +}
> +
> +module_init(dwc_pcie_pmu_init);
> +module_exit(dwc_pcie_pmu_exit);
> +
> +MODULE_DESCRIPTION("PMU driver for DesignWare Cores PCI Express Controller");
> +MODULE_AUTHOR("Shuai Xue <xueshuai@linux.alibaba.com>");
> +MODULE_LICENSE("GPL v2");
Shuai Xue Oct. 30, 2023, 10:29 a.m. UTC | #18
On 2023/10/30 14:28, Krishna Chaitanya Chundru wrote:
> 
...
>> +
>> +static int dwc_pcie_pmu_probe(struct platform_device *plat_dev)
>> +{
>> +    struct pci_dev *pdev = NULL;
>> +    struct dwc_pcie_pmu *pcie_pmu;
>> +    bool notify = false;
>> +    char *name;
>> +    u32 bdf;
>> +    int ret;
>> +
>> +    /* Match the rootport with VSEC_RAS_DES_ID, and register a PMU for it */
>> +    for_each_pci_dev(pdev) {
>> +        u16 vsec;
>> +        u32 val;
>> +
>> +        if (!(pci_is_pcie(pdev) &&
>> +              pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT))
>> +            continue;
>> +
>> +        vsec = pci_find_vsec_capability(pdev, PCI_VENDOR_ID_ALIBABA,
>> +                        DWC_PCIE_VSEC_RAS_DES_ID);
> 
> We are searching for ALIBABA vendor only  for this capability.
> 
> Can we have a list of vendor ID's and we can check for all those vendors for this capability so that it will be easy to add new vendors in the list
> 
> something like this
> 
> struct vendor_ids {
> 
> int vendor_id;
> 
> };
> 
> struct vendor_ids dwc_ids[] = {
> 
>     {.vendor_id =PCI_VENDOR_ID_ALIBABA },
> 
>      {.vendor_id = XXX},
> 
> };
> 
>     for_each_pci_dev(pdev) {
>         u16 vsec;
>         u32 val;
> 
>         if (!(pci_is_pcie(pdev) &&
>               pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT))
>             continue;
>         for (int i = 0; i < num of elements of dwc_ids < i++) {
>             ---
> 
>         }
>        
>         ---
>     }
> 
> Thanks & Regards,
> Krishna Chaitanya.
> 


Good idea, with vendor_ids, I think it will be easy to extend for other vendors in the future.

Thank you.

Best Regards,
Shuai
Jonathan Cameron Oct. 30, 2023, 7:22 p.m. UTC | #19
On Fri, 27 Oct 2023 20:25:16 +0800
Shuai Xue <xueshuai@linux.alibaba.com> wrote:

> On 2023/10/27 00:52, Robin Murphy wrote:
> > On 26/10/2023 2:44 pm, Jonathan Cameron wrote:  
> >> On Tue, 24 Oct 2023 17:29:34 +0800
> >> Shuai Xue <xueshuai@linux.alibaba.com> wrote:
> >>  
> >>> + Will, Jonathan, Bjorn and Yicong for probe and hotplug handing.
> >>>  
> ...
> >>>>>> +
> >>>>>> +    dwc_pcie_pmu_hp_state = ret;
> >>>>>> +
> >>>>>> +    ret = platform_driver_register(&dwc_pcie_pmu_driver);
> >>>>>> +    if (ret)
> >>>>>> +        goto platform_driver_register_err;
> >>>>>> +
> >>>>>> +    dwc_pcie_pmu_dev = platform_device_register_simple(
> >>>>>> +                "dwc_pcie_pmu", PLATFORM_DEVID_NONE, NULL, 0);
> >>>>>> +    if (IS_ERR(dwc_pcie_pmu_dev)) {
> >>>>>> +        ret = PTR_ERR(dwc_pcie_pmu_dev);
> >>>>>> +        goto platform_device_register_error;
> >>>>>> +    }  
> >>>>>
> >>>>> I'm a bit confused as to why you're having to create a platform device
> >>>>> for a PCI device -- is this because the main designware driver has already
> >>>>> bound to it? A comment here explaining why you need to do this would be
> >>>>> very helpful. In particular, is there any dependency on another driver
> >>>>> to make sure that e.g. config space accesses work properly? If so, we
> >>>>> probably need to enforce module load ordering or something like that.  
> >>>>
> >>>> AFAICS the platform device/driver serve no purpose other than being a hilariously roundabout way to run the for_each_pci_dev() loop in dwc_pcie_pmu_probe() upon module init, and to save explicitly freeing the PMU name/data. Furthermore the devres action for dwc_pcie_pmu_remove_cpuhp_instance() is apparently going for even more style points at module exit by not even relying on the corresponding .remove callback of the tenuous platform driver to undo what its .probe did, but (ab)using the device's devres list to avoid having to keep track of an explicit list of PMU instances at all.  
> >>>
> >>> You are right.  
> >>
> >> Also provides a (potential) parent for the PMU devices which is something
> >> we were trying to clean up for existing PMUs (which end up in the
> >> wrong directly in sysfs because they typically don't have parents).  
> > 
> > Surely the relevant PCI device would be an even more appropriate parent, though, since that's the true topology?
> >   
> 
> I see, I will add its parent.

Agreed - I hadn't it in my head that we didn't have a good mapping to a particular
PCIe device (based on some similarish hardware where the counters are shared across
multiple RPs) Here I guess it's fine to use the PCI device.

Jonathan

> 
> Thank you.
> Best Regards,
> Shuai
>
diff mbox series

Patch

diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 273d67ecf6d2..ec6e0d9194a1 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -217,6 +217,13 @@  config MARVELL_CN10K_DDR_PMU
 	  Enable perf support for Marvell DDR Performance monitoring
 	  event on CN10K platform.
 
+config DWC_PCIE_PMU
+	tristate "Synopsys DesignWare PCIe PMU"
+	depends on PCI
+	help
+	  Enable perf support for Synopsys DesignWare PCIe PMU Performance
+	  monitoring event on platform including the Alibaba Yitian 710.
+
 source "drivers/perf/arm_cspmu/Kconfig"
 
 source "drivers/perf/amlogic/Kconfig"
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index 16b3ec4db916..a06338e3401c 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -23,6 +23,7 @@  obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o
 obj-$(CONFIG_MARVELL_CN10K_DDR_PMU) += marvell_cn10k_ddr_pmu.o
 obj-$(CONFIG_APPLE_M1_CPU_PMU) += apple_m1_cpu_pmu.o
 obj-$(CONFIG_ALIBABA_UNCORE_DRW_PMU) += alibaba_uncore_drw_pmu.o
+obj-$(CONFIG_DWC_PCIE_PMU) += dwc_pcie_pmu.o
 obj-$(CONFIG_ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += arm_cspmu/
 obj-$(CONFIG_MESON_DDR_PMU) += amlogic/
 obj-$(CONFIG_CXL_PMU) += cxl_pmu.o
diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c
new file mode 100644
index 000000000000..ddb06d763b0c
--- /dev/null
+++ b/drivers/perf/dwc_pcie_pmu.c
@@ -0,0 +1,770 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Synopsys DesignWare PCIe PMU driver
+ *
+ * Copyright (C) 2021-2023 Alibaba Inc.
+ */
+
+#include <linux/bitfield.h>
+#include <linux/bitops.h>
+#include <linux/cpuhotplug.h>
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/perf_event.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/smp.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
+
+#define DWC_PCIE_VSEC_RAS_DES_ID		0x02
+#define DWC_PCIE_EVENT_CNT_CTL			0x8
+
+/*
+ * Event Counter Data Select includes two parts:
+ * - 27-24: Group number(4-bit: 0..0x7)
+ * - 23-16: Event number(8-bit: 0..0x13) within the Group
+ *
+ * Put them together as in TRM.
+ */
+#define DWC_PCIE_CNT_EVENT_SEL			GENMASK(27, 16)
+#define DWC_PCIE_CNT_LANE_SEL			GENMASK(11, 8)
+#define DWC_PCIE_CNT_STATUS			BIT(7)
+#define DWC_PCIE_CNT_ENABLE			GENMASK(4, 2)
+#define DWC_PCIE_PER_EVENT_OFF			0x1
+#define DWC_PCIE_PER_EVENT_ON			0x3
+#define DWC_PCIE_EVENT_CLEAR			GENMASK(1, 0)
+#define DWC_PCIE_EVENT_PER_CLEAR		0x1
+
+#define DWC_PCIE_EVENT_CNT_DATA			0xC
+
+#define DWC_PCIE_TIME_BASED_ANAL_CTL		0x10
+#define DWC_PCIE_TIME_BASED_REPORT_SEL		GENMASK(31, 24)
+#define DWC_PCIE_TIME_BASED_DURATION_SEL	GENMASK(15, 8)
+#define DWC_PCIE_DURATION_MANUAL_CTL		0x0
+#define DWC_PCIE_DURATION_1MS			0x1
+#define DWC_PCIE_DURATION_10MS			0x2
+#define DWC_PCIE_DURATION_100MS			0x3
+#define DWC_PCIE_DURATION_1S			0x4
+#define DWC_PCIE_DURATION_2S			0x5
+#define DWC_PCIE_DURATION_4S			0x6
+#define DWC_PCIE_DURATION_4US			0xFF
+#define DWC_PCIE_TIME_BASED_TIMER_START		BIT(0)
+#define DWC_PCIE_TIME_BASED_CNT_ENABLE		0x1
+
+#define DWC_PCIE_TIME_BASED_ANAL_DATA_REG_LOW	0x14
+#define DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH	0x18
+
+/* Event attributes */
+#define DWC_PCIE_CONFIG_EVENTID			GENMASK(15, 0)
+#define DWC_PCIE_CONFIG_TYPE			GENMASK(19, 16)
+#define DWC_PCIE_CONFIG_LANE			GENMASK(27, 20)
+
+#define DWC_PCIE_EVENT_ID(event)	FIELD_GET(DWC_PCIE_CONFIG_EVENTID, (event)->attr.config)
+#define DWC_PCIE_EVENT_TYPE(event)	FIELD_GET(DWC_PCIE_CONFIG_TYPE, (event)->attr.config)
+#define DWC_PCIE_EVENT_LANE(event)	FIELD_GET(DWC_PCIE_CONFIG_LANE, (event)->attr.config)
+
+enum dwc_pcie_event_type {
+	DWC_PCIE_TIME_BASE_EVENT,
+	DWC_PCIE_LANE_EVENT,
+	DWC_PCIE_EVENT_TYPE_MAX,
+};
+
+#define DWC_PCIE_LANE_EVENT_MAX_PERIOD		GENMASK_ULL(31, 0)
+#define DWC_PCIE_TIME_BASED_EVENT_MAX_PERIOD	GENMASK_ULL(63, 0)
+
+struct dwc_pcie_pmu {
+	struct pmu		pmu;
+	struct pci_dev		*pdev;		/* Root Port device */
+	u16			ras_des_offset;
+	u32			nr_lanes;
+
+	struct list_head	pmu_node;
+	struct hlist_node	cpuhp_node;
+	struct perf_event	*event[DWC_PCIE_EVENT_TYPE_MAX];
+	int			on_cpu;
+	bool			registered;
+};
+
+#define to_dwc_pcie_pmu(p) (container_of(p, struct dwc_pcie_pmu, pmu))
+
+static struct platform_device *dwc_pcie_pmu_dev;
+static int dwc_pcie_pmu_hp_state;
+static struct list_head dwc_pcie_pmu_head = LIST_HEAD_INIT(dwc_pcie_pmu_head);
+
+static ssize_t cpumask_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(dev_get_drvdata(dev));
+
+	return cpumap_print_to_pagebuf(true, buf, cpumask_of(pcie_pmu->on_cpu));
+}
+static DEVICE_ATTR_RO(cpumask);
+
+static struct attribute *dwc_pcie_pmu_cpumask_attrs[] = {
+	&dev_attr_cpumask.attr,
+	NULL
+};
+
+static struct attribute_group dwc_pcie_cpumask_attr_group = {
+	.attrs = dwc_pcie_pmu_cpumask_attrs,
+};
+
+struct dwc_pcie_format_attr {
+	struct device_attribute attr;
+	u64 field;
+	int config;
+};
+
+static ssize_t dwc_pcie_pmu_format_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct dwc_pcie_format_attr *fmt = container_of(attr, typeof(*fmt), attr);
+	int lo = __ffs(fmt->field), hi = __fls(fmt->field);
+
+	return sysfs_emit(buf, "config:%d-%d\n", lo, hi);
+}
+
+#define _dwc_pcie_format_attr(_name, _cfg, _fld)			    \
+	(&((struct dwc_pcie_format_attr[]) {{				    \
+		.attr = __ATTR(_name, 0444, dwc_pcie_pmu_format_show, NULL),\
+		.config = _cfg,						    \
+		.field = _fld,						    \
+	}})[0].attr.attr)
+
+#define dwc_pcie_format_attr(_name, _fld)	_dwc_pcie_format_attr(_name, 0, _fld)
+
+static struct attribute *dwc_pcie_format_attrs[] = {
+	dwc_pcie_format_attr(type, DWC_PCIE_CONFIG_TYPE),
+	dwc_pcie_format_attr(eventid, DWC_PCIE_CONFIG_EVENTID),
+	dwc_pcie_format_attr(lane, DWC_PCIE_CONFIG_LANE),
+	NULL,
+};
+
+static struct attribute_group dwc_pcie_format_attrs_group = {
+	.name = "format",
+	.attrs = dwc_pcie_format_attrs,
+};
+
+struct dwc_pcie_event_attr {
+	struct device_attribute attr;
+	enum dwc_pcie_event_type type;
+	u16 eventid;
+	u8 lane;
+};
+
+static ssize_t dwc_pcie_event_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct dwc_pcie_event_attr *eattr;
+
+	eattr = container_of(attr, typeof(*eattr), attr);
+
+	if (eattr->type == DWC_PCIE_LANE_EVENT)
+		return sysfs_emit(buf, "eventid=0x%x,type=0x%x,lane=?\n",
+				  eattr->eventid, eattr->type);
+	else if (eattr->type == DWC_PCIE_TIME_BASE_EVENT)
+		return sysfs_emit(buf, "eventid=0x%x,type=0x%x\n",
+				  eattr->eventid, eattr->type);
+
+	return 0;
+}
+
+#define DWC_PCIE_EVENT_ATTR(_name, _type, _eventid, _lane)		\
+	(&((struct dwc_pcie_event_attr[]) {{				\
+		.attr = __ATTR(_name, 0444, dwc_pcie_event_show, NULL),	\
+		.type = _type,						\
+		.eventid = _eventid,					\
+		.lane = _lane,						\
+	}})[0].attr.attr)
+
+#define DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(_name, _eventid)		\
+	DWC_PCIE_EVENT_ATTR(_name, DWC_PCIE_TIME_BASE_EVENT, _eventid, 0)
+#define DWC_PCIE_PMU_LANE_EVENT_ATTR(_name, _eventid)			\
+	DWC_PCIE_EVENT_ATTR(_name, DWC_PCIE_LANE_EVENT, _eventid, 0)
+
+static struct attribute *dwc_pcie_pmu_time_event_attrs[] = {
+	/* Group #0 */
+	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(one_cycle, 0x00),
+	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(TX_L0S, 0x01),
+	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(RX_L0S, 0x02),
+	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L0, 0x03),
+	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1, 0x04),
+	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1_1, 0x05),
+	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1_2, 0x06),
+	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(CFG_RCVRY, 0x07),
+	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(TX_RX_L0S, 0x08),
+	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1_AUX, 0x09),
+
+	/* Group #1 */
+	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Tx_PCIe_TLP_Data_Payload, 0x20),
+	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Rx_PCIe_TLP_Data_Payload, 0x21),
+	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Tx_CCIX_TLP_Data_Payload, 0x22),
+	DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Rx_CCIX_TLP_Data_Payload, 0x23),
+
+	/*
+	 * Leave it to the user to specify the lane ID to avoid generating
+	 * a list of hundreds of events.
+	 */
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_ack_dllp, 0x600),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_update_fc_dllp, 0x601),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_ack_dllp, 0x602),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_update_fc_dllp, 0x603),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_nulified_tlp, 0x604),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_nulified_tlp, 0x605),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_duplicate_tl, 0x606),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_memory_write, 0x700),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_memory_read, 0x701),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_configuration_write, 0x702),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_configuration_read, 0x703),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_io_write, 0x704),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_io_read, 0x705),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_completion_without_data, 0x706),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_completion_with_data, 0x707),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_message_tlp, 0x708),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_atomic, 0x709),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_tlp_with_prefix, 0x70A),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_memory_write, 0x70B),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_memory_read, 0x70C),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_io_write, 0x70F),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_io_read, 0x710),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_completion_without_data, 0x711),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_completion_with_data, 0x712),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_message_tlp, 0x713),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_atomic, 0x714),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_tlp_with_prefix, 0x715),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_ccix_tlp, 0x716),
+	DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_ccix_tlp, 0x717),
+	NULL
+};
+
+static const struct attribute_group dwc_pcie_event_attrs_group = {
+	.name = "events",
+	.attrs = dwc_pcie_pmu_time_event_attrs,
+};
+
+static const struct attribute_group *dwc_pcie_attr_groups[] = {
+	&dwc_pcie_event_attrs_group,
+	&dwc_pcie_format_attrs_group,
+	&dwc_pcie_cpumask_attr_group,
+	NULL
+};
+
+static void dwc_pcie_pmu_lane_event_enable(struct dwc_pcie_pmu *pcie_pmu,
+					   bool enable)
+{
+	struct pci_dev *pdev = pcie_pmu->pdev;
+	u16 ras_des_offset = pcie_pmu->ras_des_offset;
+	u32 val;
+
+	pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, &val);
+
+	/* Clear DWC_PCIE_CNT_ENABLE field first */
+	val &= ~DWC_PCIE_CNT_ENABLE;
+	if (enable)
+		val |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_ON);
+	else
+		val |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF);
+
+	pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, val);
+}
+
+static void dwc_pcie_pmu_time_based_event_enable(struct dwc_pcie_pmu *pcie_pmu,
+					  bool enable)
+{
+	struct pci_dev *pdev = pcie_pmu->pdev;
+	u16 ras_des_offset = pcie_pmu->ras_des_offset;
+	u32 val;
+
+	pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL,
+			      &val);
+
+	if (enable)
+		val |= DWC_PCIE_TIME_BASED_CNT_ENABLE;
+	else
+		val &= ~DWC_PCIE_TIME_BASED_CNT_ENABLE;
+
+	pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL,
+			       val);
+}
+
+static u64 dwc_pcie_pmu_read_lane_event_counter(struct perf_event *event)
+{
+	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
+	struct pci_dev *pdev = pcie_pmu->pdev;
+	u16 ras_des_offset = pcie_pmu->ras_des_offset;
+	u32 val;
+
+	pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_DATA, &val);
+
+	return val;
+}
+
+static u64 dwc_pcie_pmu_read_time_based_counter(struct perf_event *event)
+{
+	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
+	struct pci_dev *pdev = pcie_pmu->pdev;
+	int event_id = DWC_PCIE_EVENT_ID(event);
+	u16 ras_des_offset = pcie_pmu->ras_des_offset;
+	u32 lo, hi, ss;
+
+	/*
+	 * The 64-bit value of the data counter is spread across two
+	 * registers that are not synchronized. In order to read them
+	 * atomically, ensure that the high 32 bits match before and after
+	 * reading the low 32 bits.
+	 */
+	pci_read_config_dword(pdev, ras_des_offset +
+		DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH, &hi);
+	do {
+		/* snapshot the high 32 bits */
+		ss = hi;
+
+		pci_read_config_dword(
+			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_DATA_REG_LOW,
+			&lo);
+		pci_read_config_dword(
+			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH,
+			&hi);
+	} while (hi != ss);
+
+	/*
+	 * The Group#1 event measures the amount of data processed in 16-byte
+	 * units. Simplify the end-user interface by multiplying the counter
+	 * at the point of read.
+	 */
+	if (event_id >= 0x20 && event_id <= 0x23)
+		return (((u64)hi << 32) | lo) << 4;
+	else
+		return (((u64)hi << 32) | lo);
+}
+
+static void dwc_pcie_pmu_event_update(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
+	u64 delta, prev, now;
+
+	do {
+		prev = local64_read(&hwc->prev_count);
+
+		if (type == DWC_PCIE_LANE_EVENT)
+			now = dwc_pcie_pmu_read_lane_event_counter(event);
+		else if (type == DWC_PCIE_TIME_BASE_EVENT)
+			now = dwc_pcie_pmu_read_time_based_counter(event);
+
+	} while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
+
+	if (type == DWC_PCIE_LANE_EVENT)
+		delta = (now - prev) & DWC_PCIE_LANE_EVENT_MAX_PERIOD;
+	else if (type == DWC_PCIE_TIME_BASE_EVENT)
+		delta = (now - prev) & DWC_PCIE_TIME_BASED_EVENT_MAX_PERIOD;
+
+	local64_add(delta, &event->count);
+}
+
+static int dwc_pcie_pmu_event_init(struct perf_event *event)
+{
+	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
+	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
+	struct perf_event *sibling;
+	u32 lane;
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	/* We don't support sampling */
+	if (is_sampling_event(event))
+		return -EINVAL;
+
+	/* We cannot support task bound events */
+	if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK)
+		return -EINVAL;
+
+	if (event->group_leader != event &&
+	    !is_software_event(event->group_leader))
+		return -EINVAL;
+
+	for_each_sibling_event(sibling, event->group_leader) {
+		if (sibling->pmu != event->pmu && !is_software_event(sibling))
+			return -EINVAL;
+	}
+
+	if (type == DWC_PCIE_LANE_EVENT) {
+		lane = DWC_PCIE_EVENT_LANE(event);
+		if (lane < 0 || lane >= pcie_pmu->nr_lanes)
+			return -EINVAL;
+	}
+
+	event->cpu = pcie_pmu->on_cpu;
+
+	return 0;
+}
+
+static void dwc_pcie_pmu_set_period(struct hw_perf_event *hwc)
+{
+	local64_set(&hwc->prev_count, 0);
+}
+
+static void dwc_pcie_pmu_event_start(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
+	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
+
+	hwc->state = 0;
+	dwc_pcie_pmu_set_period(hwc);
+
+	if (type == DWC_PCIE_LANE_EVENT)
+		dwc_pcie_pmu_lane_event_enable(pcie_pmu, true);
+	else if (type == DWC_PCIE_TIME_BASE_EVENT)
+		dwc_pcie_pmu_time_based_event_enable(pcie_pmu, true);
+}
+
+static void dwc_pcie_pmu_event_stop(struct perf_event *event, int flags)
+{
+	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
+	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (event->hw.state & PERF_HES_STOPPED)
+		return;
+
+	if (type == DWC_PCIE_LANE_EVENT)
+		dwc_pcie_pmu_lane_event_enable(pcie_pmu, false);
+	else if (type == DWC_PCIE_TIME_BASE_EVENT)
+		dwc_pcie_pmu_time_based_event_enable(pcie_pmu, false);
+
+	dwc_pcie_pmu_event_update(event);
+	hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+}
+
+static int dwc_pcie_pmu_event_add(struct perf_event *event, int flags)
+{
+	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
+	struct pci_dev *pdev = pcie_pmu->pdev;
+	struct hw_perf_event *hwc = &event->hw;
+	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
+	int event_id = DWC_PCIE_EVENT_ID(event);
+	int lane = DWC_PCIE_EVENT_LANE(event);
+	u16 ras_des_offset = pcie_pmu->ras_des_offset;
+	u32 ctrl;
+
+	/* one counter for each type and it is in use */
+	if (pcie_pmu->event[type])
+		return -ENOSPC;
+
+	pcie_pmu->event[type] = event;
+	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
+	if (type == DWC_PCIE_LANE_EVENT) {
+		/* EVENT_COUNTER_DATA_REG needs clear manually */
+		ctrl = FIELD_PREP(DWC_PCIE_CNT_EVENT_SEL, event_id) |
+			FIELD_PREP(DWC_PCIE_CNT_LANE_SEL, lane) |
+			FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF) |
+			FIELD_PREP(DWC_PCIE_EVENT_CLEAR, DWC_PCIE_EVENT_PER_CLEAR);
+		pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL,
+				       ctrl);
+	} else if (type == DWC_PCIE_TIME_BASE_EVENT) {
+		/*
+		 * TIME_BASED_ANAL_DATA_REG is a 64 bit register, we can safely
+		 * use it with any manually controlled duration. And it is
+		 * cleared when next measurement starts.
+		 */
+		ctrl = FIELD_PREP(DWC_PCIE_TIME_BASED_REPORT_SEL, event_id) |
+			FIELD_PREP(DWC_PCIE_TIME_BASED_DURATION_SEL,
+				   DWC_PCIE_DURATION_MANUAL_CTL) |
+			DWC_PCIE_TIME_BASED_CNT_ENABLE;
+		pci_write_config_dword(
+			pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL, ctrl);
+	}
+
+	if (flags & PERF_EF_START)
+		dwc_pcie_pmu_event_start(event, PERF_EF_RELOAD);
+
+	perf_event_update_userpage(event);
+
+	return 0;
+}
+
+static void dwc_pcie_pmu_event_del(struct perf_event *event, int flags)
+{
+	struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
+	enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
+
+	dwc_pcie_pmu_event_stop(event, flags | PERF_EF_UPDATE);
+	perf_event_update_userpage(event);
+	pcie_pmu->event[type] = NULL;
+}
+
+static void dwc_pcie_pmu_remove_cpuhp_instance(void *hotplug_node)
+{
+	cpuhp_state_remove_instance_nocalls(dwc_pcie_pmu_hp_state, hotplug_node);
+}
+
+/*
+ * Find the PMU of a PCI device.
+ * @pdev: The PCI device.
+ */
+static struct dwc_pcie_pmu *dwc_pcie_find_dev_pmu(struct pci_dev *pdev)
+{
+	struct dwc_pcie_pmu *pcie_pmu;
+
+	list_for_each_entry(pcie_pmu, &dwc_pcie_pmu_head, pmu_node)
+		if (pcie_pmu->pdev == pdev)
+			return pcie_pmu;
+
+	return NULL;
+}
+
+static void dwc_pcie_pmu_unregister_pmu(void *data)
+{
+	struct dwc_pcie_pmu *pcie_pmu = data;
+
+	if (!pcie_pmu->registered)
+		return;
+
+	pcie_pmu->registered = false;
+	list_del(&pcie_pmu->pmu_node);
+	perf_pmu_unregister(&pcie_pmu->pmu);
+}
+
+static int dwc_pcie_pmu_notifier(struct notifier_block *nb,
+				     unsigned long action, void *data)
+{
+	struct device *dev = data;
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct dwc_pcie_pmu *pcie_pmu;
+
+	/* Unregister the PMU when the device is going to be deleted. */
+	if (action != BUS_NOTIFY_DEL_DEVICE)
+		return NOTIFY_DONE;
+
+	pcie_pmu = dwc_pcie_find_dev_pmu(pdev);
+	if (!pcie_pmu)
+		return NOTIFY_DONE;
+
+	dwc_pcie_pmu_unregister_pmu(pcie_pmu);
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block dwc_pcie_pmu_nb = {
+	.notifier_call = dwc_pcie_pmu_notifier,
+};
+
+static void dwc_pcie_pmu_unregister_nb(void *data)
+{
+	bus_unregister_notifier(&pci_bus_type, &dwc_pcie_pmu_nb);
+}
+
+static int dwc_pcie_pmu_probe(struct platform_device *plat_dev)
+{
+	struct pci_dev *pdev = NULL;
+	struct dwc_pcie_pmu *pcie_pmu;
+	bool notify = false;
+	char *name;
+	u32 bdf;
+	int ret;
+
+	/* Match the rootport with VSEC_RAS_DES_ID, and register a PMU for it */
+	for_each_pci_dev(pdev) {
+		u16 vsec;
+		u32 val;
+
+		if (!(pci_is_pcie(pdev) &&
+		      pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT))
+			continue;
+
+		vsec = pci_find_vsec_capability(pdev, PCI_VENDOR_ID_ALIBABA,
+						DWC_PCIE_VSEC_RAS_DES_ID);
+		if (!vsec)
+			continue;
+
+		pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &val);
+		if (PCI_VNDR_HEADER_REV(val) != 0x04)
+			continue;
+		pci_dbg(pdev,
+			"Detected PCIe Vendor-Specific Extended Capability RAS DES\n");
+
+		bdf = PCI_DEVID(pdev->bus->number, pdev->devfn);
+		name = devm_kasprintf(&plat_dev->dev, GFP_KERNEL, "dwc_rootport_%x",
+				      bdf);
+		if (!name) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		/* All checks passed, go go go */
+		pcie_pmu = devm_kzalloc(&plat_dev->dev, sizeof(*pcie_pmu), GFP_KERNEL);
+		if (!pcie_pmu) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		pcie_pmu->pdev = pdev;
+		pcie_pmu->ras_des_offset = vsec;
+		pcie_pmu->nr_lanes = pcie_get_width_cap(pdev);
+		pcie_pmu->on_cpu = -1;
+		pcie_pmu->pmu = (struct pmu){
+			.module		= THIS_MODULE,
+			.attr_groups	= dwc_pcie_attr_groups,
+			.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
+			.task_ctx_nr	= perf_invalid_context,
+			.event_init	= dwc_pcie_pmu_event_init,
+			.add		= dwc_pcie_pmu_event_add,
+			.del		= dwc_pcie_pmu_event_del,
+			.start		= dwc_pcie_pmu_event_start,
+			.stop		= dwc_pcie_pmu_event_stop,
+			.read		= dwc_pcie_pmu_event_update,
+		};
+
+		/* Add this instance to the list used by the offline callback */
+		ret = cpuhp_state_add_instance(dwc_pcie_pmu_hp_state,
+					       &pcie_pmu->cpuhp_node);
+		if (ret) {
+			pci_err(pdev,
+				"Error %d registering hotplug @%x\n", ret, bdf);
+			goto out;
+		}
+
+		/* Unwind when platform driver removes */
+		ret = devm_add_action_or_reset(
+			&plat_dev->dev, dwc_pcie_pmu_remove_cpuhp_instance,
+			&pcie_pmu->cpuhp_node);
+		if (ret)
+			goto out;
+
+		ret = perf_pmu_register(&pcie_pmu->pmu, name, -1);
+		if (ret) {
+			pci_err(pdev,
+				"Error %d registering PMU @%x\n", ret, bdf);
+			goto out;
+		}
+
+		/* Cache PMU to handle pci device hotplug */
+		list_add(&pcie_pmu->pmu_node, &dwc_pcie_pmu_head);
+		pcie_pmu->registered = true;
+		notify = true;
+
+		ret = devm_add_action_or_reset(
+			&plat_dev->dev, dwc_pcie_pmu_unregister_pmu, pcie_pmu);
+		if (ret)
+			goto out;
+	}
+
+	if (notify && !bus_register_notifier(&pci_bus_type, &dwc_pcie_pmu_nb))
+		return devm_add_action_or_reset(
+			&plat_dev->dev, dwc_pcie_pmu_unregister_nb, NULL);
+
+	return 0;
+
+out:
+	pci_dev_put(pdev);
+
+	return ret;
+}
+
+static int dwc_pcie_pmu_online_cpu(unsigned int cpu, struct hlist_node *cpuhp_node)
+{
+	struct dwc_pcie_pmu *pcie_pmu;
+
+	pcie_pmu = hlist_entry_safe(cpuhp_node, struct dwc_pcie_pmu, cpuhp_node);
+	if (pcie_pmu->on_cpu == -1)
+		pcie_pmu->on_cpu = cpumask_local_spread(
+			0, dev_to_node(&pcie_pmu->pdev->dev));
+
+	return 0;
+}
+
+static int dwc_pcie_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_node)
+{
+	struct dwc_pcie_pmu *pcie_pmu;
+	struct pci_dev *pdev;
+	int node;
+	cpumask_t mask;
+	unsigned int target;
+
+	pcie_pmu = hlist_entry_safe(cpuhp_node, struct dwc_pcie_pmu, cpuhp_node);
+	/* Nothing to do if this CPU doesn't own the PMU */
+	if (cpu != pcie_pmu->on_cpu)
+		return 0;
+
+	pcie_pmu->on_cpu = -1;
+	pdev = pcie_pmu->pdev;
+	node = dev_to_node(&pdev->dev);
+	if (cpumask_and(&mask, cpumask_of_node(node), cpu_online_mask) &&
+	    cpumask_andnot(&mask, &mask, cpumask_of(cpu)))
+		target = cpumask_any(&mask);
+	else
+		target = cpumask_any_but(cpu_online_mask, cpu);
+
+	if (target >= nr_cpu_ids) {
+		pci_err(pdev, "There is no CPU to set\n");
+		return 0;
+	}
+
+	/* This PMU does NOT support interrupt, just migrate context. */
+	perf_pmu_migrate_context(&pcie_pmu->pmu, cpu, target);
+	pcie_pmu->on_cpu = target;
+
+	return 0;
+}
+
+static struct platform_driver dwc_pcie_pmu_driver = {
+	.probe = dwc_pcie_pmu_probe,
+	.driver = {.name = "dwc_pcie_pmu",},
+};
+
+static int __init dwc_pcie_pmu_init(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
+				      "perf/dwc_pcie_pmu:online",
+				      dwc_pcie_pmu_online_cpu,
+				      dwc_pcie_pmu_offline_cpu);
+	if (ret < 0)
+		return ret;
+
+	dwc_pcie_pmu_hp_state = ret;
+
+	ret = platform_driver_register(&dwc_pcie_pmu_driver);
+	if (ret)
+		goto platform_driver_register_err;
+
+	dwc_pcie_pmu_dev = platform_device_register_simple(
+				"dwc_pcie_pmu", PLATFORM_DEVID_NONE, NULL, 0);
+	if (IS_ERR(dwc_pcie_pmu_dev)) {
+		ret = PTR_ERR(dwc_pcie_pmu_dev);
+		goto platform_device_register_error;
+	}
+
+	return 0;
+
+platform_device_register_error:
+	platform_driver_unregister(&dwc_pcie_pmu_driver);
+platform_driver_register_err:
+	cpuhp_remove_multi_state(dwc_pcie_pmu_hp_state);
+
+	return ret;
+}
+
+static void __exit dwc_pcie_pmu_exit(void)
+{
+	platform_device_unregister(dwc_pcie_pmu_dev);
+	platform_driver_unregister(&dwc_pcie_pmu_driver);
+	cpuhp_remove_multi_state(dwc_pcie_pmu_hp_state);
+}
+
+module_init(dwc_pcie_pmu_init);
+module_exit(dwc_pcie_pmu_exit);
+
+MODULE_DESCRIPTION("PMU driver for DesignWare Cores PCI Express Controller");
+MODULE_AUTHOR("Shuai Xue <xueshuai@linux.alibaba.com>");
+MODULE_LICENSE("GPL v2");