Message ID | 20230607221651.2454764-27-terry.bowman@amd.com |
---|---|
State | Superseded |
Headers | show |
Series | cxl/pci: Add support for RCH RAS error handling | expand |
On Wed, Jun 07, 2023 at 05:16:51PM -0500, Terry Bowman wrote: > From: Robert Richter <rrichter@amd.com> > > AER corrected and uncorrectable internal errors (CIE/UIE) are masked > in their corresponding mask registers per default once in power-up > state. [1][2] Enable internal errors for RCECs to receive CXL > downstream port errors of Restricted CXL Hosts (RCHs). > > [1] CXL 3.0 Spec, 12.2.1.1 - RCH Downstream Port Detected Errors > [2] PCIe Base Spec r6.0, 7.8.4.3 Uncorrectable Error Mask Register, > 7.8.4.6 Correctable Error Mask Register > > Co-developed-by: Terry Bowman <terry.bowman@amd.com> > Signed-off-by: Terry Bowman <terry.bowman@amd.com> > Signed-off-by: Robert Richter <rrichter@amd.com> Acked-by: Bjorn Helgaas <bhelgaas@google.com> > --- > drivers/pci/pcie/aer.c | 57 ++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 57 insertions(+) > > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c > index c354ca5e8f2b..4f9203e27c62 100644 > --- a/drivers/pci/pcie/aer.c > +++ b/drivers/pci/pcie/aer.c > @@ -948,6 +948,30 @@ static bool find_source_device(struct pci_dev *parent, > > #ifdef CONFIG_PCIEAER_CXL > > +/** > + * pci_aer_unmask_internal_errors - unmask internal errors > + * @dev: pointer to the pcie_dev data structure > + * > + * Unmasks internal errors in the Uncorrectable and Correctable Error > + * Mask registers. > + * > + * Note: AER must be enabled and supported by the device which must be > + * checked in advance, e.g. with pcie_aer_is_native(). 
> + */ > +static void pci_aer_unmask_internal_errors(struct pci_dev *dev) > +{ > + int aer = dev->aer_cap; > + u32 mask; > + > + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &mask); > + mask &= ~PCI_ERR_UNC_INTN; > + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, mask); > + > + pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, &mask); > + mask &= ~PCI_ERR_COR_INTERNAL; > + pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, mask); > +} > + > static bool is_cxl_mem_dev(struct pci_dev *dev) > { > /* > @@ -1027,7 +1051,39 @@ static void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info) > pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info); > } > > +static int handles_cxl_error_iter(struct pci_dev *dev, void *data) > +{ > + int *handles_cxl = data; > + > + if (!*handles_cxl) > + *handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev); > + > + /* Non-zero terminates iteration */ > + return *handles_cxl; > +} > + > +static bool handles_cxl_errors(struct pci_dev *rcec) > +{ > + int handles_cxl = 0; > + > + if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC && > + pcie_aer_is_native(rcec)) > + pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl); > + > + return !!handles_cxl; > +} > + > +static void cxl_rch_enable_rcec(struct pci_dev *rcec) > +{ > + if (!handles_cxl_errors(rcec)) > + return; > + > + pci_aer_unmask_internal_errors(rcec); > + pci_info(rcec, "CXL: Internal errors unmasked"); > +} > + > #else > +static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { } > static inline void cxl_rch_handle_error(struct pci_dev *dev, > struct aer_err_info *info) { } > #endif > @@ -1428,6 +1484,7 @@ static int aer_probe(struct pcie_device *dev) > return status; > } > > + cxl_rch_enable_rcec(port); > aer_enable_rootport(rpc); > pci_info(port, "enabled with IRQ %d\n", dev->irq); > return 0; > -- > 2.34.1 >
Terry Bowman wrote: > From: Robert Richter <rrichter@amd.com> > > AER corrected and uncorrectable internal errors (CIE/UIE) are masked > in their corresponding mask registers per default once in power-up > state. [1][2] Enable internal errors for RCECs to receive CXL > downstream port errors of Restricted CXL Hosts (RCHs). > > [1] CXL 3.0 Spec, 12.2.1.1 - RCH Downstream Port Detected Errors > [2] PCIe Base Spec r6.0, 7.8.4.3 Uncorrectable Error Mask Register, > 7.8.4.6 Correctable Error Mask Register > > Co-developed-by: Terry Bowman <terry.bowman@amd.com> > Signed-off-by: Terry Bowman <terry.bowman@amd.com> > Signed-off-by: Robert Richter <rrichter@amd.com> > --- > drivers/pci/pcie/aer.c | 57 ++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 57 insertions(+) > > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c > index c354ca5e8f2b..4f9203e27c62 100644 > --- a/drivers/pci/pcie/aer.c > +++ b/drivers/pci/pcie/aer.c > @@ -948,6 +948,30 @@ static bool find_source_device(struct pci_dev *parent, > > #ifdef CONFIG_PCIEAER_CXL > > +/** > + * pci_aer_unmask_internal_errors - unmask internal errors > + * @dev: pointer to the pcie_dev data structure > + * > + * Unmasks internal errors in the Uncorrectable and Correctable Error > + * Mask registers. > + * > + * Note: AER must be enabled and supported by the device which must be > + * checked in advance, e.g. with pcie_aer_is_native(). 
> + */ > +static void pci_aer_unmask_internal_errors(struct pci_dev *dev) > +{ > + int aer = dev->aer_cap; > + u32 mask; > + > + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &mask); > + mask &= ~PCI_ERR_UNC_INTN; > + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, mask); > + > + pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, &mask); > + mask &= ~PCI_ERR_COR_INTERNAL; > + pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, mask); > +} > + > static bool is_cxl_mem_dev(struct pci_dev *dev) > { > /* > @@ -1027,7 +1051,39 @@ static void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info) > pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info); > } > > +static int handles_cxl_error_iter(struct pci_dev *dev, void *data) > +{ > + int *handles_cxl = data; > + > + if (!*handles_cxl) > + *handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev); > + > + /* Non-zero terminates iteration */ > + return *handles_cxl; > +} > + > +static bool handles_cxl_errors(struct pci_dev *rcec) > +{ > + int handles_cxl = 0; > + > + if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC && > + pcie_aer_is_native(rcec)) > + pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl); > + > + return !!handles_cxl; > +} > + > +static void cxl_rch_enable_rcec(struct pci_dev *rcec) > +{ > + if (!handles_cxl_errors(rcec)) > + return; > + > + pci_aer_unmask_internal_errors(rcec); > + pci_info(rcec, "CXL: Internal errors unmasked"); > +} > + > #else > +static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { } > static inline void cxl_rch_handle_error(struct pci_dev *dev, > struct aer_err_info *info) { } > #endif > @@ -1428,6 +1484,7 @@ static int aer_probe(struct pcie_device *dev) > return status; > } > > + cxl_rch_enable_rcec(port); Similar to the last patch, I wonder if it is sufficient to call this cxl_enable_rcec() in anticipation of VH support and the fact that this only depends on RCiEPs not necessarily anything RCH specific like RCRB shenanigans. 
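Shouldn't there also be a corresponding cxl_disable_rcec() call in aer_remove(), so that the internal-error unmasking done in aer_probe() is undone when the AER driver is removed?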
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index c354ca5e8f2b..4f9203e27c62 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -948,6 +948,30 @@ static bool find_source_device(struct pci_dev *parent, #ifdef CONFIG_PCIEAER_CXL +/** + * pci_aer_unmask_internal_errors - unmask internal errors + * @dev: pointer to the pcie_dev data structure + * + * Unmasks internal errors in the Uncorrectable and Correctable Error + * Mask registers. + * + * Note: AER must be enabled and supported by the device which must be + * checked in advance, e.g. with pcie_aer_is_native(). + */ +static void pci_aer_unmask_internal_errors(struct pci_dev *dev) +{ + int aer = dev->aer_cap; + u32 mask; + + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &mask); + mask &= ~PCI_ERR_UNC_INTN; + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, mask); + + pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, &mask); + mask &= ~PCI_ERR_COR_INTERNAL; + pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, mask); +} + static bool is_cxl_mem_dev(struct pci_dev *dev) { /* @@ -1027,7 +1051,39 @@ static void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info) pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info); } +static int handles_cxl_error_iter(struct pci_dev *dev, void *data) +{ + int *handles_cxl = data; + + if (!*handles_cxl) + *handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev); + + /* Non-zero terminates iteration */ + return *handles_cxl; +} + +static bool handles_cxl_errors(struct pci_dev *rcec) +{ + int handles_cxl = 0; + + if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC && + pcie_aer_is_native(rcec)) + pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl); + + return !!handles_cxl; +} + +static void cxl_rch_enable_rcec(struct pci_dev *rcec) +{ + if (!handles_cxl_errors(rcec)) + return; + + pci_aer_unmask_internal_errors(rcec); + pci_info(rcec, "CXL: Internal errors unmasked"); +} + #else +static inline void 
cxl_rch_enable_rcec(struct pci_dev *dev) { } static inline void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info) { } #endif @@ -1428,6 +1484,7 @@ static int aer_probe(struct pcie_device *dev) return status; } + cxl_rch_enable_rcec(port); aer_enable_rootport(rpc); pci_info(port, "enabled with IRQ %d\n", dev->irq); return 0;