diff mbox series

[v5,13/16] cxl/pci: Add error handler for CXL PCIe Port RAS errors

Message ID 20250107143852.3692571-14-terry.bowman@amd.com
State New
Headers show
Series Enable CXL PCIe port protocol error handling and logging | expand

Commit Message

Bowman, Terry Jan. 7, 2025, 2:38 p.m. UTC
Introduce correctable and uncorrectable CXL PCIe Port Protocol Error
handlers.

The handlers will be called with a 'struct pci_dev' parameter
indicating the CXL Port device requiring handling. The CXL PCIe Port
device's underlying 'struct device' will match the port device in the
CXL topology.

Use the PCIe Port's device object to find the matching CXL Upstream Switch
Port, CXL Downstream Switch Port, or CXL Root Port in the CXL topology. The
matching CXL Port device should contain a cached reference to the RAS
register block. The cached RAS block will be used handling the error.

Invoke the existing __cxl_handle_ras() or __cxl_handle_cor_ras() using
a reference to the RAS registers as a parameter. These functions will use
the RAS register reference to indicate an error and clear the device's RAS
status.

Future patches will assign the error handlers and add trace logging.

Signed-off-by: Terry Bowman <terry.bowman@amd.com>
---
 drivers/cxl/core/pci.c | 63 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

Comments

Jonathan Cameron Jan. 14, 2025, 11:46 a.m. UTC | #1
On Tue, 7 Jan 2025 08:38:49 -0600
Terry Bowman <terry.bowman@amd.com> wrote:

> Introduce correctable and uncorrectable CXL PCIe Port Protocol Error
> handlers.
> 
> The handlers will be called with a 'struct pci_dev' parameter
> indicating the CXL Port device requiring handling. The CXL PCIe Port
> device's underlying 'struct device' will match the port device in the
> CXL topology.
> 
> Use the PCIe Port's device object to find the matching CXL Upstream Switch
> Port, CXL Downstream Switch Port, or CXL Root Port in the CXL topology. The
> matching CXL Port device should contain a cached reference to the RAS
> register block. The cached RAS block will be used handling the error.
> 
> Invoke the existing __cxl_handle_ras() or __cxl_handle_cor_ras() using
> a reference to the RAS registers as a parameter. These functions will use
> the RAS register reference to indicate an error and clear the device's RAS
> status.
> 
> Future patches will assign the error handlers and add trace logging.
> 
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> ---
>  drivers/cxl/core/pci.c | 63 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 63 insertions(+)
> 
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 8275b3dc3589..411834f7efe0 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -776,6 +776,69 @@ static void cxl_disable_rch_root_ints(struct cxl_dport *dport)
>  	writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND);
>  }
>  
> +static int match_uport(struct device *dev, const void *data)
> +{
> +	struct device *uport_dev = (struct device *)data;

It should be const and then no need to cast explicitly.


> +	struct cxl_port *port;
> +
> +	if (!is_cxl_port(dev))
> +		return 0;
> +
> +	port = to_cxl_port(dev);
> +
> +	return port->uport_dev == uport_dev;
> +}
> +
> +static void __iomem *cxl_pci_port_ras(struct pci_dev *pdev)
> +{
> +	struct cxl_port *port;
> +
> +	if (!pdev)
> +		return NULL;
> +
> +	if ((pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT) ||
> +	    (pci_pcie_type(pdev) == PCI_EXP_TYPE_DOWNSTREAM)) {
> +		struct cxl_dport *dport;
> +		void __iomem *ras_base;
> +
> +		port = find_cxl_port(&pdev->dev, &dport);
Maybe some __free magic on port as then can just
return dport ? dport->regs.ras : NULL;
> +		ras_base = dport ? dport->regs.ras : NULL;
> +		if (port)
> +			put_device(&port->dev);
> +		return ras_base;
> +	} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_UPSTREAM) {

	if (pci_pcie_type(pdev) == PCI_EXP_TYPE_UPSTREAM) {

or maybe just make it a switch statement?

> +		struct device *port_dev;
> +
> +		port_dev = bus_find_device(&cxl_bus_type, NULL, &pdev->dev,
> +					   match_uport);
Likewise on __free magic to automate the put.

> +		if (!port_dev)
> +			return NULL;
> +
> +		port = to_cxl_port(port_dev);
> +		if (!port)

why no put of the port_dev?

> +			return NULL;
> +
> +		put_device(port_dev);
> +		return port->uport_regs.ras;
> +	}
> +
> +	return NULL;
> +}
Bowman, Terry Jan. 14, 2025, 9:20 p.m. UTC | #2
On 1/14/2025 5:46 AM, Jonathan Cameron wrote:
> On Tue, 7 Jan 2025 08:38:49 -0600
> Terry Bowman <terry.bowman@amd.com> wrote:
>
>> Introduce correctable and uncorrectable CXL PCIe Port Protocol Error
>> handlers.
>>
>> The handlers will be called with a 'struct pci_dev' parameter
>> indicating the CXL Port device requiring handling. The CXL PCIe Port
>> device's underlying 'struct device' will match the port device in the
>> CXL topology.
>>
>> Use the PCIe Port's device object to find the matching CXL Upstream Switch
>> Port, CXL Downstream Switch Port, or CXL Root Port in the CXL topology. The
>> matching CXL Port device should contain a cached reference to the RAS
>> register block. The cached RAS block will be used handling the error.
>>
>> Invoke the existing __cxl_handle_ras() or __cxl_handle_cor_ras() using
>> a reference to the RAS registers as a parameter. These functions will use
>> the RAS register reference to indicate an error and clear the device's RAS
>> status.
>>
>> Future patches will assign the error handlers and add trace logging.
>>
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>> ---
>>  drivers/cxl/core/pci.c | 63 ++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 63 insertions(+)
>>
>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
>> index 8275b3dc3589..411834f7efe0 100644
>> --- a/drivers/cxl/core/pci.c
>> +++ b/drivers/cxl/core/pci.c
>> @@ -776,6 +776,69 @@ static void cxl_disable_rch_root_ints(struct cxl_dport *dport)
>>  	writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND);
>>  }
>>  
>> +static int match_uport(struct device *dev, const void *data)
>> +{
>> +	struct device *uport_dev = (struct device *)data;
> It should be const and then no need to cast explicitly.
>

Ok

>> +	struct cxl_port *port;
>> +
>> +	if (!is_cxl_port(dev))
>> +		return 0;
>> +
>> +	port = to_cxl_port(dev);
>> +
>> +	return port->uport_dev == uport_dev;
>> +}
>> +
>> +static void __iomem *cxl_pci_port_ras(struct pci_dev *pdev)
>> +{
>> +	struct cxl_port *port;
>> +
>> +	if (!pdev)
>> +		return NULL;
>> +
>> +	if ((pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT) ||
>> +	    (pci_pcie_type(pdev) == PCI_EXP_TYPE_DOWNSTREAM)) {
>> +		struct cxl_dport *dport;
>> +		void __iomem *ras_base;
>> +
>> +		port = find_cxl_port(&pdev->dev, &dport);
> Maybe some __free magic on port as then can just
> return dport ? dport->regs.ras : NULL;

Ok

>> +		ras_base = dport ? dport->regs.ras : NULL;
>> +		if (port)
>> +			put_device(&port->dev);
>> +		return ras_base;
>> +	} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_UPSTREAM) {
> 	if (pci_pcie_type(pdev) == PCI_EXP_TYPE_UPSTREAM) {
>
> or maybe just make it a switch statement?

I'll add the _free approach back.
>> +		struct device *port_dev;
>> +
>> +		port_dev = bus_find_device(&cxl_bus_type, NULL, &pdev->dev,
>> +					   match_uport);
> Likewise on __free magic to automate the put.
>
>> +		if (!port_dev)
>> +			return NULL;
>> +
>> +		port = to_cxl_port(port_dev);
>> +		if (!port)
> why no put of the port_dev?
I overlooked. Thanks

Regards,
Terry
>> +			return NULL;
>> +
>> +		put_device(port_dev);
>> +		return port->uport_regs.ras;
>> +	}
>> +
>> +	return NULL;
>> +}
Ira Weiny Jan. 14, 2025, 10:51 p.m. UTC | #3
Terry Bowman wrote:
> Introduce correctable and uncorrectable CXL PCIe Port Protocol Error
> handlers.
> 
> The handlers will be called with a 'struct pci_dev' parameter
> indicating the CXL Port device requiring handling. The CXL PCIe Port
> device's underlying 'struct device' will match the port device in the
> CXL topology.
> 
> Use the PCIe Port's device object to find the matching CXL Upstream Switch
> Port, CXL Downstream Switch Port, or CXL Root Port in the CXL topology. The
> matching CXL Port device should contain a cached reference to the RAS
> register block. The cached RAS block will be used handling the error.
> 
> Invoke the existing __cxl_handle_ras() or __cxl_handle_cor_ras() using
> a reference to the RAS registers as a parameter. These functions will use
> the RAS register reference to indicate an error and clear the device's RAS
> status.
> 
> Future patches will assign the error handlers and add trace logging.
> 
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> ---
>  drivers/cxl/core/pci.c | 63 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 63 insertions(+)
> 
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 8275b3dc3589..411834f7efe0 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -776,6 +776,69 @@ static void cxl_disable_rch_root_ints(struct cxl_dport *dport)
>  	writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND);
>  }
>  
> +static int match_uport(struct device *dev, const void *data)
> +{
> +	struct device *uport_dev = (struct device *)data;
> +	struct cxl_port *port;
> +
> +	if (!is_cxl_port(dev))
> +		return 0;
> +
> +	port = to_cxl_port(dev);
> +
> +	return port->uport_dev == uport_dev;
> +}
> +
> +static void __iomem *cxl_pci_port_ras(struct pci_dev *pdev)
> +{
> +	struct cxl_port *port;
> +
> +	if (!pdev)
> +		return NULL;
> +
> +	if ((pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT) ||
> +	    (pci_pcie_type(pdev) == PCI_EXP_TYPE_DOWNSTREAM)) {
> +		struct cxl_dport *dport;
> +		void __iomem *ras_base;
> +
> +		port = find_cxl_port(&pdev->dev, &dport);
> +		ras_base = dport ? dport->regs.ras : NULL;
> +		if (port)
> +			put_device(&port->dev);
> +		return ras_base;
> +	} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_UPSTREAM) {
> +		struct device *port_dev;
> +
> +		port_dev = bus_find_device(&cxl_bus_type, NULL, &pdev->dev,
> +					   match_uport);
> +		if (!port_dev)
> +			return NULL;
> +
> +		port = to_cxl_port(port_dev);
> +		if (!port)
> +			return NULL;
> +
> +		put_device(port_dev);

Is there any chance the cxl_port (and subsequently the mapping of the ras
registers) could go away between here and their use in
__cxl_handle_*_ras()?

Ira

> +		return port->uport_regs.ras;
> +	}
> +
> +	return NULL;
> +}
> +
> +static void cxl_port_cor_error_detected(struct pci_dev *pdev)
> +{
> +	void __iomem *ras_base = cxl_pci_port_ras(pdev);
> +
> +	__cxl_handle_cor_ras(&pdev->dev, ras_base);
> +}
> +
> +static bool cxl_port_error_detected(struct pci_dev *pdev)
> +{
> +	void __iomem *ras_base = cxl_pci_port_ras(pdev);
> +
> +	return __cxl_handle_ras(&pdev->dev, ras_base);
> +}
> +
>  void cxl_uport_init_ras_reporting(struct cxl_port *port)
>  {
>  	/* uport may have more than 1 downstream EP. Check if already mapped. */
> -- 
> 2.34.1
>
Bowman, Terry Jan. 14, 2025, 11:10 p.m. UTC | #4
On 1/14/2025 4:51 PM, Ira Weiny wrote:
> Terry Bowman wrote:
>> Introduce correctable and uncorrectable CXL PCIe Port Protocol Error
>> handlers.
>>
>> The handlers will be called with a 'struct pci_dev' parameter
>> indicating the CXL Port device requiring handling. The CXL PCIe Port
>> device's underlying 'struct device' will match the port device in the
>> CXL topology.
>>
>> Use the PCIe Port's device object to find the matching CXL Upstream Switch
>> Port, CXL Downstream Switch Port, or CXL Root Port in the CXL topology. The
>> matching CXL Port device should contain a cached reference to the RAS
>> register block. The cached RAS block will be used handling the error.
>>
>> Invoke the existing __cxl_handle_ras() or __cxl_handle_cor_ras() using
>> a reference to the RAS registers as a parameter. These functions will use
>> the RAS register reference to indicate an error and clear the device's RAS
>> status.
>>
>> Future patches will assign the error handlers and add trace logging.
>>
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>> ---
>>  drivers/cxl/core/pci.c | 63 ++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 63 insertions(+)
>>
>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
>> index 8275b3dc3589..411834f7efe0 100644
>> --- a/drivers/cxl/core/pci.c
>> +++ b/drivers/cxl/core/pci.c
>> @@ -776,6 +776,69 @@ static void cxl_disable_rch_root_ints(struct cxl_dport *dport)
>>  	writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND);
>>  }
>>  
>> +static int match_uport(struct device *dev, const void *data)
>> +{
>> +	struct device *uport_dev = (struct device *)data;
>> +	struct cxl_port *port;
>> +
>> +	if (!is_cxl_port(dev))
>> +		return 0;
>> +
>> +	port = to_cxl_port(dev);
>> +
>> +	return port->uport_dev == uport_dev;
>> +}
>> +
>> +static void __iomem *cxl_pci_port_ras(struct pci_dev *pdev)
>> +{
>> +	struct cxl_port *port;
>> +
>> +	if (!pdev)
>> +		return NULL;
>> +
>> +	if ((pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT) ||
>> +	    (pci_pcie_type(pdev) == PCI_EXP_TYPE_DOWNSTREAM)) {
>> +		struct cxl_dport *dport;
>> +		void __iomem *ras_base;
>> +
>> +		port = find_cxl_port(&pdev->dev, &dport);
>> +		ras_base = dport ? dport->regs.ras : NULL;
>> +		if (port)
>> +			put_device(&port->dev);
>> +		return ras_base;
>> +	} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_UPSTREAM) {
>> +		struct device *port_dev;
>> +
>> +		port_dev = bus_find_device(&cxl_bus_type, NULL, &pdev->dev,
>> +					   match_uport);
>> +		if (!port_dev)
>> +			return NULL;
>> +
>> +		port = to_cxl_port(port_dev);
>> +		if (!port)
>> +			return NULL;
>> +
>> +		put_device(port_dev);
> Is there any chance the cxl_port (and subsequently the mapping of the ras
> registers) could go away between here and their use in
> __cxl_handle_*_ras()?
>
> Ira

Yes. I believe that is possible.

Regards,
Terry

>> +		return port->uport_regs.ras;
>> +	}
>> +
>> +	return NULL;
>> +}
>> +
>> +static void cxl_port_cor_error_detected(struct pci_dev *pdev)
>> +{
>> +	void __iomem *ras_base = cxl_pci_port_ras(pdev);
>> +
>> +	__cxl_handle_cor_ras(&pdev->dev, ras_base);
>> +}
>> +
>> +static bool cxl_port_error_detected(struct pci_dev *pdev)
>> +{
>> +	void __iomem *ras_base = cxl_pci_port_ras(pdev);
>> +
>> +	return __cxl_handle_ras(&pdev->dev, ras_base);
>> +}
>> +
>>  void cxl_uport_init_ras_reporting(struct cxl_port *port)
>>  {
>>  	/* uport may have more than 1 downstream EP. Check if already mapped. */
>> -- 
>> 2.34.1
>>
>
Bowman, Terry Jan. 14, 2025, 11:42 p.m. UTC | #5
On 1/14/2025 4:51 PM, Ira Weiny wrote:
> Terry Bowman wrote:
>> Introduce correctable and uncorrectable CXL PCIe Port Protocol Error
>> handlers.
>>
>> The handlers will be called with a 'struct pci_dev' parameter
>> indicating the CXL Port device requiring handling. The CXL PCIe Port
>> device's underlying 'struct device' will match the port device in the
>> CXL topology.
>>
>> Use the PCIe Port's device object to find the matching CXL Upstream Switch
>> Port, CXL Downstream Switch Port, or CXL Root Port in the CXL topology. The
>> matching CXL Port device should contain a cached reference to the RAS
>> register block. The cached RAS block will be used handling the error.
>>
>> Invoke the existing __cxl_handle_ras() or __cxl_handle_cor_ras() using
>> a reference to the RAS registers as a parameter. These functions will use
>> the RAS register reference to indicate an error and clear the device's RAS
>> status.
>>
>> Future patches will assign the error handlers and add trace logging.
>>
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>> ---
>>  drivers/cxl/core/pci.c | 63 ++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 63 insertions(+)
>>
>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
>> index 8275b3dc3589..411834f7efe0 100644
>> --- a/drivers/cxl/core/pci.c
>> +++ b/drivers/cxl/core/pci.c
>> @@ -776,6 +776,69 @@ static void cxl_disable_rch_root_ints(struct cxl_dport *dport)
>>  	writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND);
>>  }
>>  
>> +static int match_uport(struct device *dev, const void *data)
>> +{
>> +	struct device *uport_dev = (struct device *)data;
>> +	struct cxl_port *port;
>> +
>> +	if (!is_cxl_port(dev))
>> +		return 0;
>> +
>> +	port = to_cxl_port(dev);
>> +
>> +	return port->uport_dev == uport_dev;
>> +}
>> +
>> +static void __iomem *cxl_pci_port_ras(struct pci_dev *pdev)
>> +{
>> +	struct cxl_port *port;
>> +
>> +	if (!pdev)
>> +		return NULL;
>> +
>> +	if ((pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT) ||
>> +	    (pci_pcie_type(pdev) == PCI_EXP_TYPE_DOWNSTREAM)) {
>> +		struct cxl_dport *dport;
>> +		void __iomem *ras_base;
>> +
>> +		port = find_cxl_port(&pdev->dev, &dport);
>> +		ras_base = dport ? dport->regs.ras : NULL;
>> +		if (port)
>> +			put_device(&port->dev);
>> +		return ras_base;
>> +	} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_UPSTREAM) {
>> +		struct device *port_dev;
>> +
>> +		port_dev = bus_find_device(&cxl_bus_type, NULL, &pdev->dev,
>> +					   match_uport);
>> +		if (!port_dev)
>> +			return NULL;
>> +
>> +		port = to_cxl_port(port_dev);
>> +		if (!port)
>> +			return NULL;
>> +
>> +		put_device(port_dev);
> Is there any chance the cxl_port (and subsequently the mapping of the ras
> registers) could go away between here and their use in
> __cxl_handle_*_ras()?
>
> Ira
Yes, this could happen.

>> +		return port->uport_regs.ras;
>> +	}
>> +
>> +	return NULL;
>> +}
>> +
>> +static void cxl_port_cor_error_detected(struct pci_dev *pdev)
>> +{
>> +	void __iomem *ras_base = cxl_pci_port_ras(pdev);
>> +
>> +	__cxl_handle_cor_ras(&pdev->dev, ras_base);
>> +}
>> +
>> +static bool cxl_port_error_detected(struct pci_dev *pdev)
>> +{
>> +	void __iomem *ras_base = cxl_pci_port_ras(pdev);
>> +
>> +	return __cxl_handle_ras(&pdev->dev, ras_base);
>> +}
>> +
>>  void cxl_uport_init_ras_reporting(struct cxl_port *port)
>>  {
>>  	/* uport may have more than 1 downstream EP. Check if already mapped. */
>> -- 
>> 2.34.1
>>
>
diff mbox series

Patch

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 8275b3dc3589..411834f7efe0 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -776,6 +776,69 @@  static void cxl_disable_rch_root_ints(struct cxl_dport *dport)
 	writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND);
 }
 
+static int match_uport(struct device *dev, const void *data)
+{
+	struct device *uport_dev = (struct device *)data;
+	struct cxl_port *port;
+
+	if (!is_cxl_port(dev))
+		return 0;
+
+	port = to_cxl_port(dev);
+
+	return port->uport_dev == uport_dev;
+}
+
+static void __iomem *cxl_pci_port_ras(struct pci_dev *pdev)
+{
+	struct cxl_port *port;
+
+	if (!pdev)
+		return NULL;
+
+	if ((pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT) ||
+	    (pci_pcie_type(pdev) == PCI_EXP_TYPE_DOWNSTREAM)) {
+		struct cxl_dport *dport;
+		void __iomem *ras_base;
+
+		port = find_cxl_port(&pdev->dev, &dport);
+		ras_base = dport ? dport->regs.ras : NULL;
+		if (port)
+			put_device(&port->dev);
+		return ras_base;
+	} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_UPSTREAM) {
+		struct device *port_dev;
+
+		port_dev = bus_find_device(&cxl_bus_type, NULL, &pdev->dev,
+					   match_uport);
+		if (!port_dev)
+			return NULL;
+
+		port = to_cxl_port(port_dev);
+		if (!port)
+			return NULL;
+
+		put_device(port_dev);
+		return port->uport_regs.ras;
+	}
+
+	return NULL;
+}
+
+static void cxl_port_cor_error_detected(struct pci_dev *pdev)
+{
+	void __iomem *ras_base = cxl_pci_port_ras(pdev);
+
+	__cxl_handle_cor_ras(&pdev->dev, ras_base);
+}
+
+static bool cxl_port_error_detected(struct pci_dev *pdev)
+{
+	void __iomem *ras_base = cxl_pci_port_ras(pdev);
+
+	return __cxl_handle_ras(&pdev->dev, ras_base);
+}
+
 void cxl_uport_init_ras_reporting(struct cxl_port *port)
 {
 	/* uport may have more than 1 downstream EP. Check if already mapped. */