diff mbox series

[v8,04/16] cxl/aer: AER service driver forwards CXL error to CXL driver

Message ID 20250327014717.2988633-5-terry.bowman@amd.com (mailing list archive)
State New
Delegated to: Bjorn Helgaas
Headers show
Series Enable CXL PCIe port protocol error handling and logging | expand

Commit Message

Bowman, Terry March 27, 2025, 1:47 a.m. UTC
The AER service driver includes a CXL-specific kfifo, intended to forward
CXL errors to the CXL driver. However, the forwarding functionality is
currently unimplemented. Update the AER driver to enable error forwarding
to the CXL driver.

Modify the AER service driver's handle_error_source(), which is called from
process_aer_err_devices(), to distinguish between PCIe and CXL errors.

Introduce is_cxl_error(), which builds on is_internal_error() so that it
checks both the 'struct aer_err_info::is_cxl' flag and the AER internal
error status.

If the error is a standard PCIe error then continue calling
pci_aer_handle_error() as done in the current AER driver.

If the error is a CXL-related error then forward it to the CXL driver for
handling using the kfifo mechanism.

Introduce a new function forward_cxl_error(), which constructs a CXL
protocol error context using cxl_create_prot_err_info(). This context is
then passed to the CXL driver via kfifo using a 'struct work_struct'.

Signed-off-by: Terry Bowman <terry.bowman@amd.com>
---
 drivers/pci/pcie/aer.c | 61 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 55 insertions(+), 6 deletions(-)

Comments

Bjorn Helgaas March 27, 2025, 5:13 p.m. UTC | #1
On Wed, Mar 26, 2025 at 08:47:05PM -0500, Terry Bowman wrote:
> The AER service driver includes a CXL-specific kfifo, intended to forward
> CXL errors to the CXL driver. However, the forwarding functionality is
> currently unimplemented. Update the AER driver to enable error forwarding
> to the CXL driver.
> 
> Modify the AER service driver's handle_error_source(), which is called from
> process_aer_err_devices(), to distinguish between PCIe and CXL errors.
> 
> Rename and update is_internal_error() to is_cxl_error(). Ensuring it
> checks both the 'struct aer_info::is_cxl' flag and the AER internal error
> masks.
> 
> If the error is a standard PCIe error then continue calling pcie_aer_handle_error()
> as done in the current AER driver.
> 
> If the error is a CXL-related error then forward it to the CXL driver for
> handling using the kfifo mechanism.
> 
> Introduce a new function forward_cxl_error(), which constructs a CXL
> protocol error context using cxl_create_prot_err_info(). This context is
> then passed to the CXL driver via kfifo using a 'struct work_struct'.

This only touches drivers/pci, so I would make the subject prefix be
"PCI/AER".

> +static void forward_cxl_error(struct pci_dev *_pdev, struct aer_err_info *info)
> +{
> +	int severity = info->severity;
> +	struct cxl_prot_err_work_data wd;
> +	struct cxl_prot_error_info *err_info = &wd.err_info;
> +	struct pci_dev *pdev __free(pci_dev_put) = pci_dev_get(_pdev);
> +
> +	if (!cxl_create_prot_err_info) {
> +		pci_err(pdev, "Failed. CXL-AER interface not initialized.");
> +		return;
> +	}
> +
> +	if (cxl_create_prot_err_info(pdev, severity, err_info)) {
> +		pci_err(pdev, "Failed to create CXL protocol error information");
> +		return;
> +	}
> +
> +	struct device *cxl_dev __free(put_device) = get_device(err_info->dev);
> +
> +	if (!kfifo_put(&cxl_prot_err_fifo, wd)) {
> +		pr_err_ratelimited("CXL kfifo overflow\n");

Needs a dev identifier here to anchor the message to something.

> +		return;
> +	}
> +
> +	schedule_work(cxl_prot_err_work);
> +}
diff mbox series

Patch

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 46123b70f496..d1df751cfe4b 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -1010,6 +1010,14 @@  static bool is_internal_error(struct aer_err_info *info)
 	return info->status & PCI_ERR_UNC_INTN;
 }
 
+static bool is_cxl_error(struct aer_err_info *info)
+{
+	if (!info || !info->is_cxl)
+		return false;
+
+	return is_internal_error(info);
+}
+
 static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
 {
 	struct aer_err_info *info = (struct aer_err_info *)data;
@@ -1062,13 +1070,17 @@  static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
 	return *handles_cxl;
 }
 
-static bool handles_cxl_errors(struct pci_dev *rcec)
+static bool handles_cxl_errors(struct pci_dev *dev)
 {
 	bool handles_cxl = false;
 
-	if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC &&
-	    pcie_aer_is_native(rcec))
-		pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl);
+	if (!pcie_aer_is_native(dev))
+		return false;
+
+	if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC)
+		pcie_walk_rcec(dev, handles_cxl_error_iter, &handles_cxl);
+	else
+		handles_cxl = pcie_is_cxl(dev);
 
 	return handles_cxl;
 }
@@ -1082,10 +1094,44 @@  static void cxl_rch_enable_rcec(struct pci_dev *rcec)
 	pci_info(rcec, "CXL: Internal errors unmasked");
 }
 
+static void forward_cxl_error(struct pci_dev *_pdev, struct aer_err_info *info)
+{
+	int severity = info->severity;
+	struct cxl_prot_err_work_data wd;
+	struct cxl_prot_error_info *err_info = &wd.err_info;
+	struct pci_dev *pdev __free(pci_dev_put) = pci_dev_get(_pdev);
+
+	if (!cxl_create_prot_err_info) {
+		pci_err(pdev, "Failed. CXL-AER interface not initialized.");
+		return;
+	}
+
+	if (cxl_create_prot_err_info(pdev, severity, err_info)) {
+		pci_err(pdev, "Failed to create CXL protocol error information");
+		return;
+	}
+
+	struct device *cxl_dev __free(put_device) = get_device(err_info->dev);
+
+	if (!kfifo_put(&cxl_prot_err_fifo, wd)) {
+		pr_err_ratelimited("CXL kfifo overflow\n");
+		return;
+	}
+
+	schedule_work(cxl_prot_err_work);
+}
+
 #else
 static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { }
 static inline void cxl_rch_handle_error(struct pci_dev *dev,
 					struct aer_err_info *info) { }
+static inline void forward_cxl_error(struct pci_dev *dev,
+				    struct aer_err_info *info) { }
+static inline bool handles_cxl_errors(struct pci_dev *dev)
+{
+	return false;
+}
+static bool is_cxl_error(struct aer_err_info *info) { return 0; };
 #endif
 
 /**
@@ -1123,8 +1169,11 @@  static void pci_aer_handle_error(struct pci_dev *dev, struct aer_err_info *info)
 
 static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
 {
-	cxl_rch_handle_error(dev, info);
-	pci_aer_handle_error(dev, info);
+	if (is_cxl_error(info))
+		forward_cxl_error(dev, info);
+	else
+		pci_aer_handle_error(dev, info);
+
 	pci_dev_put(dev);
 }