Message ID | 20230323213808.398039-6-terry.bowman@amd.com |
---|---|
State | Superseded |
Headers | show |
Series | cxl/pci: Add support for RCH RAS error handling | expand |
Hi Terry, Thank you for the patch! Yet something to improve: [auto build test ERROR on pci/next] [also build test ERROR on pci/for-linus efi/next cxl/next linus/master v6.3-rc3 next-20230323] [cannot apply to cxl/pending] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Terry-Bowman/cxl-pci-Add-RCH-downstream-port-AER-and-RAS-register-discovery/20230324-054044 base: https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git next patch link: https://lore.kernel.org/r/20230323213808.398039-6-terry.bowman%40amd.com patch subject: [PATCH v2 5/5] cxl/pci: Add RCH downstream port error logging config: i386-randconfig-a013 (https://download.01.org/0day-ci/archive/20230324/202303241305.7Xdy6342-lkp@intel.com/config) compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1) reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # https://github.com/intel-lab-lkp/linux/commit/c40ca148e9cff1a1c32cd4c5c9b252bf0cf201b6 git remote add linux-review https://github.com/intel-lab-lkp/linux git fetch --no-tags linux-review Terry-Bowman/cxl-pci-Add-RCH-downstream-port-AER-and-RAS-register-discovery/20230324-054044 git checkout c40ca148e9cff1a1c32cd4c5c9b252bf0cf201b6 # save the config file mkdir build_dir && cp config build_dir/.config COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 olddefconfig COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 SHELL=/bin/bash drivers/cxl/ If you fix the issue, kindly add following tag where applicable | Reported-by: kernel test robot <lkp@intel.com> | Link: https://lore.kernel.org/oe-kbuild-all/202303241305.7Xdy6342-lkp@intel.com/ All errors (new ones prefixed by >>): >> drivers/cxl/mem.c:51:31: error: no member named 'rcec' in 'struct pci_dev' struct pci_dev *rcec = pdev->rcec; ~~~~ ^ >> drivers/cxl/mem.c:63:14: error: no member named 'aer_cap' in 'struct pci_dev' aer = rcec->aer_cap; ~~~~ ^ 2 errors generated. vim +51 drivers/cxl/mem.c 48 49 static int rcec_enable_aer_ints(struct pci_dev *pdev) 50 { > 51 struct pci_dev *rcec = pdev->rcec; 52 int aer, rc; 53 u32 mask; 54 55 if (!rcec) 56 return -ENODEV; 57 58 /* 59 * Internal errors are masked by default, unmask RCEC's here 60 * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h) 61 * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h) 62 */ > 63 aer = rcec->aer_cap; 64 rc = pci_read_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, &mask); 65 if (rc) 66 return rc; 67 mask &= ~PCI_ERR_UNC_INTN; 68 rc = pci_write_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, mask); 69 if (rc) 70 return rc; 71 72 rc = pci_read_config_dword(rcec, aer + PCI_ERR_COR_MASK, &mask); 73 if (rc) 74 return rc; 75 mask &= ~PCI_ERR_COR_INTERNAL; 76 rc = pci_write_config_dword(rcec, aer + PCI_ERR_COR_MASK, mask); 77 78 return rc; 79 } 80
Hi Terry, Thank you for the patch! Yet something to improve: [auto build test ERROR on pci/next] [also build test ERROR on pci/for-linus efi/next cxl/next linus/master v6.3-rc3 next-20230323] [cannot apply to cxl/pending] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Terry-Bowman/cxl-pci-Add-RCH-downstream-port-AER-and-RAS-register-discovery/20230324-054044 base: https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git next patch link: https://lore.kernel.org/r/20230323213808.398039-6-terry.bowman%40amd.com patch subject: [PATCH v2 5/5] cxl/pci: Add RCH downstream port error logging config: riscv-randconfig-r024-20230322 (https://download.01.org/0day-ci/archive/20230324/202303241313.B0maYlFU-lkp@intel.com/config) compiler: riscv64-linux-gcc (GCC) 12.1.0 reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # https://github.com/intel-lab-lkp/linux/commit/c40ca148e9cff1a1c32cd4c5c9b252bf0cf201b6 git remote add linux-review https://github.com/intel-lab-lkp/linux git fetch --no-tags linux-review Terry-Bowman/cxl-pci-Add-RCH-downstream-port-AER-and-RAS-register-discovery/20230324-054044 git checkout c40ca148e9cff1a1c32cd4c5c9b252bf0cf201b6 # save the config file mkdir build_dir && cp config build_dir/.config COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=riscv olddefconfig COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=riscv SHELL=/bin/bash drivers/cxl/ If you fix the issue, kindly add following tag where applicable | Reported-by: kernel test robot <lkp@intel.com> | Link: https://lore.kernel.org/oe-kbuild-all/202303241313.B0maYlFU-lkp@intel.com/ All errors (new ones prefixed by >>): drivers/cxl/mem.c: In function 'rcec_enable_aer_ints': >> drivers/cxl/mem.c:63:21: error: 'struct pci_dev' has no member named 'aer_cap'; did you mean 'ats_cap'? 63 | aer = rcec->aer_cap; | ^~~~~~~ | ats_cap vim +63 drivers/cxl/mem.c 48 49 static int rcec_enable_aer_ints(struct pci_dev *pdev) 50 { 51 struct pci_dev *rcec = pdev->rcec; 52 int aer, rc; 53 u32 mask; 54 55 if (!rcec) 56 return -ENODEV; 57 58 /* 59 * Internal errors are masked by default, unmask RCEC's here 60 * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h) 61 * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h) 62 */ > 63 aer = rcec->aer_cap; 64 rc = pci_read_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, &mask); 65 if (rc) 66 return rc; 67 mask &= ~PCI_ERR_UNC_INTN; 68 rc = pci_write_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, mask); 69 if (rc) 70 return rc; 71 72 rc = pci_read_config_dword(rcec, aer + PCI_ERR_COR_MASK, &mask); 73 if (rc) 74 return rc; 75 mask &= ~PCI_ERR_COR_INTERNAL; 76 rc = pci_write_config_dword(rcec, aer + PCI_ERR_COR_MASK, mask); 77 78 return rc; 79 } 80
Hi Terry, Thank you for the patch! Yet something to improve: [auto build test ERROR on pci/next] [also build test ERROR on pci/for-linus efi/next cxl/next linus/master v6.3-rc3 next-20230323] [cannot apply to cxl/pending] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Terry-Bowman/cxl-pci-Add-RCH-downstream-port-AER-and-RAS-register-discovery/20230324-054044 base: https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git next patch link: https://lore.kernel.org/r/20230323213808.398039-6-terry.bowman%40amd.com patch subject: [PATCH v2 5/5] cxl/pci: Add RCH downstream port error logging config: i386-randconfig-a001 (https://download.01.org/0day-ci/archive/20230324/202303241458.BV292BDH-lkp@intel.com/config) compiler: gcc-11 (Debian 11.3.0-8) 11.3.0 reproduce (this is a W=1 build): # https://github.com/intel-lab-lkp/linux/commit/c40ca148e9cff1a1c32cd4c5c9b252bf0cf201b6 git remote add linux-review https://github.com/intel-lab-lkp/linux git fetch --no-tags linux-review Terry-Bowman/cxl-pci-Add-RCH-downstream-port-AER-and-RAS-register-discovery/20230324-054044 git checkout c40ca148e9cff1a1c32cd4c5c9b252bf0cf201b6 # save the config file mkdir build_dir && cp config build_dir/.config make W=1 O=build_dir ARCH=i386 olddefconfig make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash drivers/ If you fix the issue, kindly add following tag where applicable | Reported-by: kernel test robot <lkp@intel.com> | Link: https://lore.kernel.org/oe-kbuild-all/202303241458.BV292BDH-lkp@intel.com/ All errors (new ones prefixed by >>): drivers/cxl/mem.c: In function 'rcec_enable_aer_ints': >> drivers/cxl/mem.c:51:36: error: 'struct pci_dev' has no member named 'rcec' 51 | struct pci_dev *rcec = pdev->rcec; | ^~ >> drivers/cxl/mem.c:63:21: error: 'struct pci_dev' has no member named 'aer_cap'; did you mean 'ats_cap'? 63 | aer = rcec->aer_cap; | ^~~~~~~ | ats_cap vim +51 drivers/cxl/mem.c 48 49 static int rcec_enable_aer_ints(struct pci_dev *pdev) 50 { > 51 struct pci_dev *rcec = pdev->rcec; 52 int aer, rc; 53 u32 mask; 54 55 if (!rcec) 56 return -ENODEV; 57 58 /* 59 * Internal errors are masked by default, unmask RCEC's here 60 * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h) 61 * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h) 62 */ > 63 aer = rcec->aer_cap; 64 rc = pci_read_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, &mask); 65 if (rc) 66 return rc; 67 mask &= ~PCI_ERR_UNC_INTN; 68 rc = pci_write_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, mask); 69 if (rc) 70 return rc; 71 72 rc = pci_read_config_dword(rcec, aer + PCI_ERR_COR_MASK, &mask); 73 if (rc) 74 return rc; 75 mask &= ~PCI_ERR_COR_INTERNAL; 76 rc = pci_write_config_dword(rcec, aer + PCI_ERR_COR_MASK, mask); 77 78 return rc; 79 } 80
I added a comment below. On 3/24/23 01:30, kernel test robot wrote: > Hi Terry, > > Thank you for the patch! Yet something to improve: > > [auto build test ERROR on pci/next] > [also build test ERROR on pci/for-linus efi/next cxl/next linus/master v6.3-rc3 next-20230323] > [cannot apply to cxl/pending] > [If your patch is applied to the wrong git tree, kindly drop us a note. > And when submitting patch, we suggest to use '--base' as documented in > https://git-scm.com/docs/git-format-patch#_base_tree_information] > > url: https://github.com/intel-lab-lkp/linux/commits/Terry-Bowman/cxl-pci-Add-RCH-downstream-port-AER-and-RAS-register-discovery/20230324-054044 > base: https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git next > patch link: https://lore.kernel.org/r/20230323213808.398039-6-terry.bowman%40amd.com > patch subject: [PATCH v2 5/5] cxl/pci: Add RCH downstream port error logging > config: i386-randconfig-a001 (https://download.01.org/0day-ci/archive/20230324/202303241458.BV292BDH-lkp@intel.com/config) > compiler: gcc-11 (Debian 11.3.0-8) 11.3.0 > reproduce (this is a W=1 build): > # https://github.com/intel-lab-lkp/linux/commit/c40ca148e9cff1a1c32cd4c5c9b252bf0cf201b6 > git remote add linux-review https://github.com/intel-lab-lkp/linux > git fetch --no-tags linux-review Terry-Bowman/cxl-pci-Add-RCH-downstream-port-AER-and-RAS-register-discovery/20230324-054044 > git checkout c40ca148e9cff1a1c32cd4c5c9b252bf0cf201b6 > # save the config file > mkdir build_dir && cp config build_dir/.config > make W=1 O=build_dir ARCH=i386 olddefconfig > make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash drivers/ > > If you fix the issue, kindly add following tag where applicable > | Reported-by: kernel test robot <lkp@intel.com> > | Link: https://lore.kernel.org/oe-kbuild-all/202303241458.BV292BDH-lkp@intel.com/ > > All errors (new ones prefixed by >>): > > drivers/cxl/mem.c: In function 'rcec_enable_aer_ints': >>> drivers/cxl/mem.c:51:36: error: 'struct pci_dev' has no member named 'rcec' > 51 | struct pci_dev *rcec = pdev->rcec; > | ^~ >>> drivers/cxl/mem.c:63:21: error: 'struct pci_dev' has no member named 'aer_cap'; did you mean 'ats_cap'? > 63 | aer = rcec->aer_cap; > | ^~~~~~~ > | ats_cap > > > vim +51 drivers/cxl/mem.c > > 48 > 49 static int rcec_enable_aer_ints(struct pci_dev *pdev) > 50 { > > 51 struct pci_dev *rcec = pdev->rcec; > 52 int aer, rc; > 53 u32 mask; > 54 > 55 if (!rcec) > 56 return -ENODEV; > 57 > 58 /* > 59 * Internal errors are masked by default, unmask RCEC's here > 60 * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h) > 61 * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h) > 62 */ > > 63 aer = rcec->aer_cap; > 64 rc = pci_read_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, &mask); > 65 if (rc) > 66 return rc; > 67 mask &= ~PCI_ERR_UNC_INTN; > 68 rc = pci_write_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, mask); > 69 if (rc) > 70 return rc; > 71 > 72 rc = pci_read_config_dword(rcec, aer + PCI_ERR_COR_MASK, &mask); > 73 if (rc) > 74 return rc; > 75 mask &= ~PCI_ERR_COR_INTERNAL; > 76 rc = pci_write_config_dword(rcec, aer + PCI_ERR_COR_MASK, mask); > 77 > 78 return rc; > 79 } > 80 > I will add #ifdef checks for CONFIG_PCIEPORTBUS and CONFIG_PCIEAER around the related code. Regards, Terry
On 3/23/23 2:38 PM, Terry Bowman wrote: > RCH downstream port error logging is missing in the current CXL driver. The > missing AER and RAS error logging is needed for communicating driver error > details to userspace. Update the driver to include PCIe AER and CXL RAS > error logging. > > Add RCH downstream port error handling into the existing RCiEP handler. > The downstream port error handler is added to the RCiEP error handler > because the downstream port is implemented in a RCRB, is not PCI > enumerable, and as a result is not directly accessible to the PCI AER > root port driver. The AER root port driver calls the RCiEP handler for > handling RCD errors and RCH downstream port protocol errors. > > Update the cxl_mem driver to map the RCH RAS and AER register discovered > earlier. The RAS and AER registers will be used in the RCH error handlers. > > Disable RCH downstream port's root port cmd interrupts. Enable RCEC AER > CIE/UIE reporting because they are disabled by default.[1] > > Update existing RCiEP correctable and uncorrectable handlers to also call > the RCH handler. The RCH handler will read the downstream port AER > registers, check for error severity, and if an error exists will log > using an existing kernel AER trace routine. The RCH handler will also > reuse the existing RAS logging routine to log downstream port RAS > errors if they exist. > > [1] CXL 3.0 Spec, 12.2.1.1 - RCH Downstream Port Detected Errors > > Co-developed-by: Robert Richter <rrichter@amd.com> > Signed-off-by: Robert Richter <rrichter@amd.com> > Signed-off-by: Terry Bowman <terry.bowman@amd.com> > --- > drivers/cxl/core/pci.c | 126 +++++++++++++++++++++++++++++++++---- > drivers/cxl/core/regs.c | 1 + > drivers/cxl/cxl.h | 13 ++++ > drivers/cxl/mem.c | 134 ++++++++++++++++++++++++++++++++++++++++ > 4 files changed, 262 insertions(+), 12 deletions(-) > > diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c > index 7328a2552411..6e36471969ba 100644 > --- a/drivers/cxl/core/pci.c > +++ b/drivers/cxl/core/pci.c > @@ -5,6 +5,7 @@ > #include <linux/delay.h> > #include <linux/pci.h> > #include <linux/pci-doe.h> > +#include <linux/aer.h> > #include <cxlpci.h> > #include <cxlmem.h> > #include <cxl.h> > @@ -605,32 +606,88 @@ void read_cdat_data(struct cxl_port *port) > } > EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL); > > -void cxl_cor_error_detected(struct pci_dev *pdev) > +/* Get AER severity. Return false if there is no error. */ > +static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs, > + int *severity) > +{ > + if (aer_regs->uncor_status & ~aer_regs->uncor_mask) { > + if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV) > + *severity = AER_FATAL; > + else > + *severity = AER_NONFATAL; > + return true; > + } > + > + if (aer_regs->cor_status & ~aer_regs->cor_mask) { > + *severity = AER_CORRECTABLE; > + return true; > + } > + > + return false; > +} > + > +/* > + * Copy the AER capability registers to a buffer. This is necessary > + * because RCRB AER capability is MMIO mapped. Clear the status > + * after copying. > + * > + * @aer_base: base address of AER capability block in RCRB > + * @aer_regs: destination for copying AER capability > + */ > +static bool cxl_rch_get_aer_info(void __iomem *aer_base, > + struct aer_capability_regs *aer_regs) > +{ > + int read_cnt = PCI_AER_CAPABILITY_LENGTH / sizeof(u32); > + u32 *aer_regs_buf = (u32 *)aer_regs; > + int n; > + > + if (!aer_base) > + return false; > + > + for (n = 0; n < read_cnt; n++) > + aer_regs_buf[n] = readl(aer_base + n * sizeof(u32)); > + > + writel(aer_regs->uncor_status, aer_base + PCI_ERR_UNCOR_STATUS); > + writel(aer_regs->cor_status, aer_base + PCI_ERR_COR_STATUS); > + > + return true; > +} > + > +static void __cxl_log_correctable_ras(struct cxl_dev_state *cxlds, > + void __iomem *ras_base) > { > - struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); > void __iomem *addr; > u32 status; > > - if (!cxlds->regs.ras) > + if (!ras_base) > return; > > - addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_STATUS_OFFSET; > + addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; > status = readl(addr); > if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { > writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); > trace_cxl_aer_correctable_error(cxlds->cxlmd, status); > } > } > -EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL); > + > +static void cxl_log_correctable_ras_endpoint(struct cxl_dev_state *cxlds) > +{ > + return __cxl_log_correctable_ras(cxlds, cxlds->regs.ras); > +} > + > +static void cxl_log_correctable_ras_dport(struct cxl_dev_state *cxlds) > +{ > + return __cxl_log_correctable_ras(cxlds, cxlds->regs.dport_ras); > +} > > /* CXL spec rev3.0 8.2.4.16.1 */ > -static void header_log_copy(struct cxl_dev_state *cxlds, u32 *log) > +static void header_log_copy(void __iomem *ras_base, u32 *log) > { > void __iomem *addr; > u32 *log_addr; > int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32); > > - addr = cxlds->regs.ras + CXL_RAS_HEADER_LOG_OFFSET; > + addr = ras_base + CXL_RAS_HEADER_LOG_OFFSET; > log_addr = log; > > for (i = 0; i < log_u32_size; i++) { > @@ -644,17 +701,18 @@ static void header_log_copy(struct cxl_dev_state *cxlds, u32 *log) > * Log the state of the RAS status registers and prepare them to log the > * next error status. Return 1 if reset needed. > */ > -static bool cxl_report_and_clear(struct cxl_dev_state *cxlds) > +static bool __cxl_report_and_clear(struct cxl_dev_state *cxlds, > + void __iomem *ras_base) > { > u32 hl[CXL_HEADERLOG_SIZE_U32]; > void __iomem *addr; > u32 status; > u32 fe; > > - if (!cxlds->regs.ras) > + if (!ras_base) > return false; > > - addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET; > + addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET; > status = readl(addr); > if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK)) > return false; > @@ -662,7 +720,7 @@ static bool cxl_report_and_clear(struct cxl_dev_state *cxlds) > /* If multiple errors, log header points to first error from ctrl reg */ > if (hweight32(status) > 1) { > void __iomem *rcc_addr = > - cxlds->regs.ras + CXL_RAS_CAP_CONTROL_OFFSET; > + ras_base + CXL_RAS_CAP_CONTROL_OFFSET; > > fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, > readl(rcc_addr))); > @@ -670,13 +728,54 @@ static bool cxl_report_and_clear(struct cxl_dev_state *cxlds) > fe = status; > } > > - header_log_copy(cxlds, hl); > + header_log_copy(ras_base, hl); > trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, hl); > writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); > > return true; > } > > +static bool cxl_report_and_clear(struct cxl_dev_state *cxlds) > +{ > + return __cxl_report_and_clear(cxlds, cxlds->regs.ras); > +} > + > +static bool cxl_report_and_clear_dport(struct cxl_dev_state *cxlds) > +{ > + return __cxl_report_and_clear(cxlds, cxlds->regs.dport_ras); > +} > + > +static void cxl_rch_log_error(struct cxl_dev_state *cxlds) > +{ > + struct pci_dev *pdev = to_pci_dev(cxlds->dev); > + struct aer_capability_regs aer_regs; > + int severity; > + > + if (!cxl_rch_get_aer_info(cxlds->regs.aer, &aer_regs)) > + return; > + > + if (!cxl_rch_get_aer_severity(&aer_regs, &severity)) > + return; > + > + cper_print_aer(pdev, severity, &aer_regs); > + > + if (severity == AER_CORRECTABLE) > + cxl_log_correctable_ras_dport(cxlds); > + else > + cxl_report_and_clear_dport(cxlds); > +} > + > +void cxl_cor_error_detected(struct pci_dev *pdev) > +{ > + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); > + > + if (cxlds->rcd) > + cxl_rch_log_error(cxlds); > + > + cxl_log_correctable_ras_endpoint(cxlds); > +} > +EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL); > + > pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, > pci_channel_state_t state) > { > @@ -685,6 +784,9 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, > struct device *dev = &cxlmd->dev; > bool ue; > > + if (cxlds->rcd) > + cxl_rch_log_error(cxlds); > + > /* > * A frozen channel indicates an impending reset which is fatal to > * CXL.mem operation, and will likely crash the system. On the off > diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c > index 108a349d8101..7130f35891da 100644 > --- a/drivers/cxl/core/regs.c > +++ b/drivers/cxl/core/regs.c > @@ -198,6 +198,7 @@ void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, > > return ret_val; > } > +EXPORT_SYMBOL_NS_GPL(devm_cxl_iomap_block, CXL); > > int cxl_map_component_regs(struct device *dev, struct cxl_component_regs *regs, > struct cxl_register_map *map, unsigned long map_mask) > diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h > index 9fd7df48ce99..7036e34354bc 100644 > --- a/drivers/cxl/cxl.h > +++ b/drivers/cxl/cxl.h > @@ -66,6 +66,8 @@ > #define CXL_DECODER_MIN_GRANULARITY 256 > #define CXL_DECODER_MAX_ENCODED_IG 6 > > +#define PCI_AER_CAPABILITY_LENGTH 56 > + > static inline int cxl_hdm_decoder_count(u32 cap_hdr) > { > int val = FIELD_GET(CXL_HDM_DECODER_COUNT_MASK, cap_hdr); > @@ -209,6 +211,15 @@ struct cxl_regs { > struct_group_tagged(cxl_device_regs, device_regs, > void __iomem *status, *mbox, *memdev; > ); > + > + /* > + * Pointer to RCH cxl_dport AER. (only for RCH/RCD mode) > + * @dport_aer: CXL 2.0 12.2.11 RCH Downstream Port-detected Errors > + */ > + struct_group_tagged(cxl_rch_regs, rch_regs, > + void __iomem *aer; > + void __iomem *dport_ras; > + ); > }; > > struct cxl_reg_map { > @@ -249,6 +260,8 @@ struct cxl_register_map { > }; > }; > > +void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, > + resource_size_t length); > void cxl_probe_component_regs(struct device *dev, void __iomem *base, > struct cxl_component_reg_map *map); > void cxl_probe_device_regs(struct device *dev, void __iomem *base, > diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c > index 12e8e8ebaac0..e217c44ed749 100644 > --- a/drivers/cxl/mem.c > +++ b/drivers/cxl/mem.c > @@ -4,6 +4,7 @@ > #include <linux/device.h> > #include <linux/module.h> > #include <linux/pci.h> > +#include <linux/aer.h> > > #include "cxlmem.h" > #include "cxlpci.h" > @@ -45,6 +46,132 @@ static int cxl_mem_dpa_show(struct seq_file *file, void *data) > return 0; > } > > +static int rcec_enable_aer_ints(struct pci_dev *pdev) > +{ > + struct pci_dev *rcec = pdev->rcec; > + int aer, rc; > + u32 mask; > + > + if (!rcec) > + return -ENODEV; > + > + /* > + * Internal errors are masked by default, unmask RCEC's here > + * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h) > + * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h) > + */ > + aer = rcec->aer_cap; > + rc = pci_read_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, &mask); > + if (rc) > + return rc; > + mask &= ~PCI_ERR_UNC_INTN; > + rc = pci_write_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, mask); > + if (rc) > + return rc; > + > + rc = pci_read_config_dword(rcec, aer + PCI_ERR_COR_MASK, &mask); > + if (rc) > + return rc; > + mask &= ~PCI_ERR_COR_INTERNAL; > + rc = pci_write_config_dword(rcec, aer + PCI_ERR_COR_MASK, mask); > + > + return rc; > +} > + > +static void disable_aer(void *_pdev) > +{ > + struct pci_dev *pdev = (struct pci_dev *)_pdev; > + > + pci_disable_pcie_error_reporting(pdev); > + > + /* > + * Keep the RCEC's internal AER enabled. There > + * could be other RCiEPs using this RCEC. > + */ > +} > + > +static void rch_disable_root_ints(void __iomem *aer_base) > +{ > + u32 aer_cmd_mask, aer_cmd; > + > + /* > + * Disable RCH root port command interrupts. > + * CXL3.0 12.2.1.1 - RCH Downstream Port-detected Errors > + */ > + aer_cmd_mask = (PCI_ERR_ROOT_CMD_COR_EN | > + PCI_ERR_ROOT_CMD_NONFATAL_EN | > + PCI_ERR_ROOT_CMD_FATAL_EN); > + aer_cmd = readl(aer_base + PCI_ERR_ROOT_COMMAND); > + aer_cmd &= ~aer_cmd_mask; > + writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND); > +} > + > +static int cxl_ras_setup_interrupts(struct cxl_dev_state *cxlds) > +{ > + struct pci_dev *pdev = to_pci_dev(cxlds->dev); > + int rc; > + > + if (cxlds->rcd) { > + rch_disable_root_ints(cxlds->regs.aer); > + > + rc = rcec_enable_aer_ints(pdev); > + if (rc) > + return rc; > + } > + > + rc = pci_enable_pcie_error_reporting(pdev); Hi Terry, not sure if you saw this thread [1], but with the new changes upstream [2] to the PCIe subsystem, Bjorn says we no longer need to call pci_enable_pcie_error_report() by the driver. [1]: https://lore.kernel.org/linux-cxl/c2e244bd-a94b-8de2-e43c-7ff8a756cbc7@intel.com/T/#mef401fb0580ebb4c4bc2a164f87e12b60cf76693 [2]: commit f26e58bf6f54 ("PCI/AER: Enable error reporting when AER is native") DJ > + if (rc) > + return rc; > + > + return devm_add_action_or_reset(&pdev->dev, disable_aer, pdev); > +} > + > +static int cxl_rch_map_ras(struct cxl_dev_state *cxlds, > + struct cxl_dport *parent_dport) > +{ > + struct device *dev = parent_dport->dport; > + resource_size_t aer_phys, ras_phys; > + void __iomem *aer, *dport_ras; > + > + if (!parent_dport->rch) > + return 0; > + > + if (!parent_dport->aer_cap || !parent_dport->ras_cap || > + parent_dport->component_reg_phys == CXL_RESOURCE_NONE) > + return -ENODEV; > + > + aer_phys = parent_dport->aer_cap + parent_dport->rcrb; > + aer = devm_cxl_iomap_block(dev, aer_phys, > + PCI_AER_CAPABILITY_LENGTH); > + > + if (!aer) > + return -ENOMEM; > + > + ras_phys = parent_dport->ras_cap + parent_dport->component_reg_phys; > + dport_ras = devm_cxl_iomap_block(dev, ras_phys, > + CXL_RAS_CAPABILITY_LENGTH); > + > + if (!dport_ras) > + return -ENOMEM; > + > + cxlds->regs.aer = aer; > + cxlds->regs.dport_ras = dport_ras; > + > + return 0; > +} > + > +static int cxl_setup_ras(struct cxl_dev_state *cxlds, > + struct cxl_dport *parent_dport) > +{ > + int rc; > + > + rc = cxl_rch_map_ras(cxlds, parent_dport); > + if (rc) > + return rc; > + > + return cxl_ras_setup_interrupts(cxlds); > +} > + > static void cxl_rcrb_setup(struct cxl_dev_state *cxlds, > struct cxl_dport *parent_dport) > { > @@ -93,6 +220,13 @@ static int devm_cxl_add_endpoint(struct device *host, struct cxl_memdev *cxlmd, > > cxl_rcrb_setup(cxlds, parent_dport); > > + rc = cxl_setup_ras(cxlds, parent_dport); > + /* Continue with RAS setup errors */ > + if (rc) > + dev_warn(&cxlmd->dev, "CXL RAS setup failed: %d\n", rc); > + else > + dev_info(&cxlmd->dev, "CXL error handling enabled\n"); > + > endpoint = devm_cxl_add_port(host, &cxlmd->dev, cxlds->component_reg_phys, > parent_dport); > if (IS_ERR(endpoint))
Hi dave, On 3/27/23 18:21, Dave Jiang wrote: > > > On 3/23/23 2:38 PM, Terry Bowman wrote: >> RCH downstream port error logging is missing in the current CXL driver. The >> missing AER and RAS error logging is needed for communicating driver error >> details to userspace. Update the driver to include PCIe AER and CXL RAS >> error logging. >> >> Add RCH downstream port error handling into the existing RCiEP handler. >> The downstream port error handler is added to the RCiEP error handler >> because the downstream port is implemented in a RCRB, is not PCI >> enumerable, and as a result is not directly accessible to the PCI AER >> root port driver. The AER root port driver calls the RCiEP handler for >> handling RCD errors and RCH downstream port protocol errors. >> >> Update the cxl_mem driver to map the RCH RAS and AER register discovered >> earlier. The RAS and AER registers will be used in the RCH error handlers. >> >> Disable RCH downstream port's root port cmd interrupts. Enable RCEC AER >> CIE/UIE reporting because they are disabled by default.[1] >> >> Update existing RCiEP correctable and uncorrectable handlers to also call >> the RCH handler. The RCH handler will read the downstream port AER >> registers, check for error severity, and if an error exists will log >> using an existing kernel AER trace routine. The RCH handler will also >> reuse the existing RAS logging routine to log downstream port RAS >> errors if they exist. >> >> [1] CXL 3.0 Spec, 12.2.1.1 - RCH Downstream Port Detected Errors >> >> Co-developed-by: Robert Richter <rrichter@amd.com> >> Signed-off-by: Robert Richter <rrichter@amd.com> >> Signed-off-by: Terry Bowman <terry.bowman@amd.com> >> --- >> drivers/cxl/core/pci.c | 126 +++++++++++++++++++++++++++++++++---- >> drivers/cxl/core/regs.c | 1 + >> drivers/cxl/cxl.h | 13 ++++ >> drivers/cxl/mem.c | 134 ++++++++++++++++++++++++++++++++++++++++ >> 4 files changed, 262 insertions(+), 12 deletions(-) >> >> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c >> index 7328a2552411..6e36471969ba 100644 >> --- a/drivers/cxl/core/pci.c >> +++ b/drivers/cxl/core/pci.c >> @@ -5,6 +5,7 @@ >> #include <linux/delay.h> >> #include <linux/pci.h> >> #include <linux/pci-doe.h> >> +#include <linux/aer.h> >> #include <cxlpci.h> >> #include <cxlmem.h> >> #include <cxl.h> >> @@ -605,32 +606,88 @@ void read_cdat_data(struct cxl_port *port) >> } >> EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL); >> -void cxl_cor_error_detected(struct pci_dev *pdev) >> +/* Get AER severity. Return false if there is no error. */ >> +static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs, >> + int *severity) >> +{ >> + if (aer_regs->uncor_status & ~aer_regs->uncor_mask) { >> + if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV) >> + *severity = AER_FATAL; >> + else >> + *severity = AER_NONFATAL; >> + return true; >> + } >> + >> + if (aer_regs->cor_status & ~aer_regs->cor_mask) { >> + *severity = AER_CORRECTABLE; >> + return true; >> + } >> + >> + return false; >> +} >> + >> +/* >> + * Copy the AER capability registers to a buffer. This is necessary >> + * because RCRB AER capability is MMIO mapped. Clear the status >> + * after copying. >> + * >> + * @aer_base: base address of AER capability block in RCRB >> + * @aer_regs: destination for copying AER capability >> + */ >> +static bool cxl_rch_get_aer_info(void __iomem *aer_base, >> + struct aer_capability_regs *aer_regs) >> +{ >> + int read_cnt = PCI_AER_CAPABILITY_LENGTH / sizeof(u32); >> + u32 *aer_regs_buf = (u32 *)aer_regs; >> + int n; >> + >> + if (!aer_base) >> + return false; >> + >> + for (n = 0; n < read_cnt; n++) >> + aer_regs_buf[n] = readl(aer_base + n * sizeof(u32)); >> + >> + writel(aer_regs->uncor_status, aer_base + PCI_ERR_UNCOR_STATUS); >> + writel(aer_regs->cor_status, aer_base + PCI_ERR_COR_STATUS); >> + >> + return true; >> +} >> + >> +static void __cxl_log_correctable_ras(struct cxl_dev_state *cxlds, >> + void __iomem *ras_base) >> { >> - struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); >> void __iomem *addr; >> u32 status; >> - if (!cxlds->regs.ras) >> + if (!ras_base) >> return; >> - addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_STATUS_OFFSET; >> + addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; >> status = readl(addr); >> if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { >> writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); >> trace_cxl_aer_correctable_error(cxlds->cxlmd, status); >> } >> } >> -EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL); >> + >> +static void cxl_log_correctable_ras_endpoint(struct cxl_dev_state *cxlds) >> +{ >> + return __cxl_log_correctable_ras(cxlds, cxlds->regs.ras); >> +} >> + >> +static void cxl_log_correctable_ras_dport(struct cxl_dev_state *cxlds) >> +{ >> + return __cxl_log_correctable_ras(cxlds, cxlds->regs.dport_ras); >> +} >> /* CXL spec rev3.0 8.2.4.16.1 */ >> -static void header_log_copy(struct cxl_dev_state *cxlds, u32 *log) >> +static void header_log_copy(void __iomem *ras_base, u32 *log) >> { >> void __iomem *addr; >> u32 *log_addr; >> int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32); >> - addr = cxlds->regs.ras + CXL_RAS_HEADER_LOG_OFFSET; >> + addr = ras_base + CXL_RAS_HEADER_LOG_OFFSET; >> log_addr = log; >> for (i = 0; i < log_u32_size; i++) { >> @@ -644,17 +701,18 @@ static void header_log_copy(struct cxl_dev_state *cxlds, u32 *log) >> * Log the state of the RAS status registers and prepare them to log the >> * next error status. Return 1 if reset needed. >> */ >> -static bool cxl_report_and_clear(struct cxl_dev_state *cxlds) >> +static bool __cxl_report_and_clear(struct cxl_dev_state *cxlds, >> + void __iomem *ras_base) >> { >> u32 hl[CXL_HEADERLOG_SIZE_U32]; >> void __iomem *addr; >> u32 status; >> u32 fe; >> - if (!cxlds->regs.ras) >> + if (!ras_base) >> return false; >> - addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET; >> + addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET; >> status = readl(addr); >> if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK)) >> return false; >> @@ -662,7 +720,7 @@ static bool cxl_report_and_clear(struct cxl_dev_state *cxlds) >> /* If multiple errors, log header points to first error from ctrl reg */ >> if (hweight32(status) > 1) { >> void __iomem *rcc_addr = >> - cxlds->regs.ras + CXL_RAS_CAP_CONTROL_OFFSET; >> + ras_base + CXL_RAS_CAP_CONTROL_OFFSET; >> fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, >> readl(rcc_addr))); >> @@ -670,13 +728,54 @@ static bool cxl_report_and_clear(struct cxl_dev_state *cxlds) >> fe = status; >> } >> - header_log_copy(cxlds, hl); >> + header_log_copy(ras_base, hl); >> trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, hl); >> writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); >> return true; >> } >> +static bool cxl_report_and_clear(struct cxl_dev_state *cxlds) >> +{ >> + return __cxl_report_and_clear(cxlds, cxlds->regs.ras); >> +} >> + >> +static bool cxl_report_and_clear_dport(struct cxl_dev_state *cxlds) >> +{ >> + return __cxl_report_and_clear(cxlds, cxlds->regs.dport_ras); >> +} >> + >> +static void cxl_rch_log_error(struct cxl_dev_state *cxlds) >> +{ >> + struct pci_dev *pdev = to_pci_dev(cxlds->dev); >> + struct aer_capability_regs aer_regs; >> + int severity; >> + >> + if (!cxl_rch_get_aer_info(cxlds->regs.aer, &aer_regs)) >> + return; >> + >> + if (!cxl_rch_get_aer_severity(&aer_regs, &severity)) >> + return; >> + >> + cper_print_aer(pdev, severity, &aer_regs); >> + >> + if (severity == AER_CORRECTABLE) >> + cxl_log_correctable_ras_dport(cxlds); >> + else >> + cxl_report_and_clear_dport(cxlds); >> +} >> + >> +void cxl_cor_error_detected(struct pci_dev *pdev) >> +{ >> + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); >> + >> + if (cxlds->rcd) >> + cxl_rch_log_error(cxlds); >> + >> + cxl_log_correctable_ras_endpoint(cxlds); >> +} >> +EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL); >> + >> pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, >> pci_channel_state_t state) >> { >> @@ -685,6 +784,9 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, >> struct device *dev = &cxlmd->dev; >> bool ue; >> + if (cxlds->rcd) >> + cxl_rch_log_error(cxlds); >> + >> /* >> * A frozen channel indicates an impending reset which is fatal to >> * CXL.mem operation, and will likely crash the system. On the off >> diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c >> index 108a349d8101..7130f35891da 100644 >> --- a/drivers/cxl/core/regs.c >> +++ b/drivers/cxl/core/regs.c >> @@ -198,6 +198,7 @@ void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, >> return ret_val; >> } >> +EXPORT_SYMBOL_NS_GPL(devm_cxl_iomap_block, CXL); >> int cxl_map_component_regs(struct device *dev, struct cxl_component_regs *regs, >> struct cxl_register_map *map, unsigned long map_mask) >> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h >> index 9fd7df48ce99..7036e34354bc 100644 >> --- a/drivers/cxl/cxl.h >> +++ b/drivers/cxl/cxl.h >> @@ -66,6 +66,8 @@ >> #define CXL_DECODER_MIN_GRANULARITY 256 >> #define CXL_DECODER_MAX_ENCODED_IG 6 >> +#define PCI_AER_CAPABILITY_LENGTH 56 >> + >> static inline int cxl_hdm_decoder_count(u32 cap_hdr) >> { >> int val = FIELD_GET(CXL_HDM_DECODER_COUNT_MASK, cap_hdr); >> @@ -209,6 +211,15 @@ struct cxl_regs { >> struct_group_tagged(cxl_device_regs, device_regs, >> void __iomem *status, *mbox, *memdev; >> ); >> + >> + /* >> + * Pointer to RCH cxl_dport AER. (only for RCH/RCD mode) >> + * @dport_aer: CXL 2.0 12.2.11 RCH Downstream Port-detected Errors >> + */ >> + struct_group_tagged(cxl_rch_regs, rch_regs, >> + void __iomem *aer; >> + void __iomem *dport_ras; >> + ); >> }; >> struct cxl_reg_map { >> @@ -249,6 +260,8 @@ struct cxl_register_map { >> }; >> }; >> +void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, >> + resource_size_t length); >> void cxl_probe_component_regs(struct device *dev, void __iomem *base, >> struct cxl_component_reg_map *map); >> void cxl_probe_device_regs(struct device *dev, void __iomem *base, >> diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c >> index 12e8e8ebaac0..e217c44ed749 100644 >> --- a/drivers/cxl/mem.c >> +++ b/drivers/cxl/mem.c >> @@ -4,6 +4,7 @@ >> #include <linux/device.h> >> #include <linux/module.h> >> #include <linux/pci.h> >> +#include <linux/aer.h> >> #include "cxlmem.h" >> #include "cxlpci.h" >> @@ -45,6 +46,132 @@ static int cxl_mem_dpa_show(struct seq_file *file, void *data) >> return 0; >> } >> +static int rcec_enable_aer_ints(struct pci_dev *pdev) >> +{ >> + struct pci_dev *rcec = pdev->rcec; >> + int aer, rc; >> + u32 mask; >> + >> + if (!rcec) >> + return -ENODEV; >> + >> + /* >> + * Internal errors are masked by default, unmask RCEC's here >> + * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h) >> + * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h) >> + */ >> + aer = rcec->aer_cap; >> + rc = pci_read_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, &mask); >> + if (rc) >> + return rc; >> + mask &= ~PCI_ERR_UNC_INTN; >> + rc = pci_write_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, mask); >> + if (rc) >> + return rc; >> + >> + rc = pci_read_config_dword(rcec, aer + PCI_ERR_COR_MASK, &mask); >> + if (rc) >> + return rc; >> + mask &= ~PCI_ERR_COR_INTERNAL; >> + rc = pci_write_config_dword(rcec, aer + PCI_ERR_COR_MASK, mask); >> + >> + return rc; >> +} >> + >> +static void disable_aer(void *_pdev) >> +{ >> + struct pci_dev *pdev = (struct pci_dev *)_pdev; >> + >> + pci_disable_pcie_error_reporting(pdev); >> + >> + /* >> + * Keep the RCEC's internal AER enabled. There >> + * could be other RCiEPs using this RCEC. >> + */ >> +} >> + >> +static void rch_disable_root_ints(void __iomem *aer_base) >> +{ >> + u32 aer_cmd_mask, aer_cmd; >> + >> + /* >> + * Disable RCH root port command interrupts. >> + * CXL3.0 12.2.1.1 - RCH Downstream Port-detected Errors >> + */ >> + aer_cmd_mask = (PCI_ERR_ROOT_CMD_COR_EN | >> + PCI_ERR_ROOT_CMD_NONFATAL_EN | >> + PCI_ERR_ROOT_CMD_FATAL_EN); >> + aer_cmd = readl(aer_base + PCI_ERR_ROOT_COMMAND); >> + aer_cmd &= ~aer_cmd_mask; >> + writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND); >> +} >> + >> +static int cxl_ras_setup_interrupts(struct cxl_dev_state *cxlds) >> +{ >> + struct pci_dev *pdev = to_pci_dev(cxlds->dev); >> + int rc; >> + >> + if (cxlds->rcd) { >> + rch_disable_root_ints(cxlds->regs.aer); >> + >> + rc = rcec_enable_aer_ints(pdev); >> + if (rc) >> + return rc; >> + } >> + >> + rc = pci_enable_pcie_error_reporting(pdev); > > Hi Terry, not sure if you saw this thread [1], but with the new changes upstream [2] to the PCIe subsystem, Bjorn says we no longer need to call pci_enable_pcie_error_report() by the driver. > > [1]: https://lore.kernel.org/linux-cxl/c2e244bd-a94b-8de2-e43c-7ff8a756cbc7@intel.com/T/#mef401fb0580ebb4c4bc2a164f87e12b60cf76693 > [2]: commit f26e58bf6f54 ("PCI/AER: Enable error reporting when AER is native") > > DJ > Ok. I'll remove the call to pci_enable_pcie_error_reporting(). This will help simplify since it allows removing disable_aer() and call to devm_add_action_or_reset() as well. Regards, Terry
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 7328a2552411..6e36471969ba 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -5,6 +5,7 @@ #include <linux/delay.h> #include <linux/pci.h> #include <linux/pci-doe.h> +#include <linux/aer.h> #include <cxlpci.h> #include <cxlmem.h> #include <cxl.h> @@ -605,32 +606,88 @@ void read_cdat_data(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL); -void cxl_cor_error_detected(struct pci_dev *pdev) +/* Get AER severity. Return false if there is no error. */ +static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs, + int *severity) +{ + if (aer_regs->uncor_status & ~aer_regs->uncor_mask) { + if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV) + *severity = AER_FATAL; + else + *severity = AER_NONFATAL; + return true; + } + + if (aer_regs->cor_status & ~aer_regs->cor_mask) { + *severity = AER_CORRECTABLE; + return true; + } + + return false; +} + +/* + * Copy the AER capability registers to a buffer. This is necessary + * because RCRB AER capability is MMIO mapped. Clear the status + * after copying. + * + * @aer_base: base address of AER capability block in RCRB + * @aer_regs: destination for copying AER capability + */ +static bool cxl_rch_get_aer_info(void __iomem *aer_base, + struct aer_capability_regs *aer_regs) +{ + int read_cnt = PCI_AER_CAPABILITY_LENGTH / sizeof(u32); + u32 *aer_regs_buf = (u32 *)aer_regs; + int n; + + if (!aer_base) + return false; + + for (n = 0; n < read_cnt; n++) + aer_regs_buf[n] = readl(aer_base + n * sizeof(u32)); + + writel(aer_regs->uncor_status, aer_base + PCI_ERR_UNCOR_STATUS); + writel(aer_regs->cor_status, aer_base + PCI_ERR_COR_STATUS); + + return true; +} + +static void __cxl_log_correctable_ras(struct cxl_dev_state *cxlds, + void __iomem *ras_base) { - struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); void __iomem *addr; u32 status; - if (!cxlds->regs.ras) + if (!ras_base) return; - addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_STATUS_OFFSET; + addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; status = readl(addr); if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); trace_cxl_aer_correctable_error(cxlds->cxlmd, status); } } -EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL); + +static void cxl_log_correctable_ras_endpoint(struct cxl_dev_state *cxlds) +{ + return __cxl_log_correctable_ras(cxlds, cxlds->regs.ras); +} + +static void cxl_log_correctable_ras_dport(struct cxl_dev_state *cxlds) +{ + return __cxl_log_correctable_ras(cxlds, cxlds->regs.dport_ras); +} /* CXL spec rev3.0 8.2.4.16.1 */ -static void header_log_copy(struct cxl_dev_state *cxlds, u32 *log) +static void header_log_copy(void __iomem *ras_base, u32 *log) { void __iomem *addr; u32 *log_addr; int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32); - addr = cxlds->regs.ras + CXL_RAS_HEADER_LOG_OFFSET; + addr = ras_base + CXL_RAS_HEADER_LOG_OFFSET; log_addr = log; for (i = 0; i < log_u32_size; i++) { @@ -644,17 +701,18 @@ static void header_log_copy(struct cxl_dev_state *cxlds, u32 *log) * Log the state of the RAS status registers and prepare them to log the * next error status. Return 1 if reset needed. */ -static bool cxl_report_and_clear(struct cxl_dev_state *cxlds) +static bool __cxl_report_and_clear(struct cxl_dev_state *cxlds, + void __iomem *ras_base) { u32 hl[CXL_HEADERLOG_SIZE_U32]; void __iomem *addr; u32 status; u32 fe; - if (!cxlds->regs.ras) + if (!ras_base) return false; - addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET; + addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET; status = readl(addr); if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK)) return false; @@ -662,7 +720,7 @@ static bool cxl_report_and_clear(struct cxl_dev_state *cxlds) /* If multiple errors, log header points to first error from ctrl reg */ if (hweight32(status) > 1) { void __iomem *rcc_addr = - cxlds->regs.ras + CXL_RAS_CAP_CONTROL_OFFSET; + ras_base + CXL_RAS_CAP_CONTROL_OFFSET; fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, readl(rcc_addr))); @@ -670,13 +728,54 @@ static bool cxl_report_and_clear(struct cxl_dev_state *cxlds) fe = status; } - header_log_copy(cxlds, hl); + header_log_copy(ras_base, hl); trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, hl); writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); return true; } +static bool cxl_report_and_clear(struct cxl_dev_state *cxlds) +{ + return __cxl_report_and_clear(cxlds, cxlds->regs.ras); +} + +static bool cxl_report_and_clear_dport(struct cxl_dev_state *cxlds) +{ + return __cxl_report_and_clear(cxlds, cxlds->regs.dport_ras); +} + +static void cxl_rch_log_error(struct cxl_dev_state *cxlds) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + struct aer_capability_regs aer_regs; + int severity; + + if (!cxl_rch_get_aer_info(cxlds->regs.aer, &aer_regs)) + return; + + if (!cxl_rch_get_aer_severity(&aer_regs, &severity)) + return; + + cper_print_aer(pdev, severity, &aer_regs); + + if (severity == AER_CORRECTABLE) + cxl_log_correctable_ras_dport(cxlds); + else + cxl_report_and_clear_dport(cxlds); +} + +void cxl_cor_error_detected(struct pci_dev *pdev) +{ + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); + + if (cxlds->rcd) + cxl_rch_log_error(cxlds); + + cxl_log_correctable_ras_endpoint(cxlds); +} +EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL); + pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, pci_channel_state_t state) { @@ -685,6 +784,9 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, struct device *dev = &cxlmd->dev; bool ue; + if (cxlds->rcd) + cxl_rch_log_error(cxlds); + /* * A frozen channel indicates an impending reset which is fatal to * CXL.mem operation, and will likely crash the system. On the off diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index 108a349d8101..7130f35891da 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -198,6 +198,7 @@ void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, return ret_val; } +EXPORT_SYMBOL_NS_GPL(devm_cxl_iomap_block, CXL); int cxl_map_component_regs(struct device *dev, struct cxl_component_regs *regs, struct cxl_register_map *map, unsigned long map_mask) diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 9fd7df48ce99..7036e34354bc 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -66,6 +66,8 @@ #define CXL_DECODER_MIN_GRANULARITY 256 #define CXL_DECODER_MAX_ENCODED_IG 6 +#define PCI_AER_CAPABILITY_LENGTH 56 + static inline int cxl_hdm_decoder_count(u32 cap_hdr) { int val = FIELD_GET(CXL_HDM_DECODER_COUNT_MASK, cap_hdr); @@ -209,6 +211,15 @@ struct cxl_regs { struct_group_tagged(cxl_device_regs, device_regs, void __iomem *status, *mbox, *memdev; ); + + /* + * Pointer to RCH cxl_dport AER. (only for RCH/RCD mode) + * @dport_aer: CXL 2.0 12.2.11 RCH Downstream Port-detected Errors + */ + struct_group_tagged(cxl_rch_regs, rch_regs, + void __iomem *aer; + void __iomem *dport_ras; + ); }; struct cxl_reg_map { @@ -249,6 +260,8 @@ struct cxl_register_map { }; }; +void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, + resource_size_t length); void cxl_probe_component_regs(struct device *dev, void __iomem *base, struct cxl_component_reg_map *map); void cxl_probe_device_regs(struct device *dev, void __iomem *base, diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 12e8e8ebaac0..e217c44ed749 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -4,6 +4,7 @@ #include <linux/device.h> #include <linux/module.h> #include <linux/pci.h> +#include <linux/aer.h> #include "cxlmem.h" #include "cxlpci.h" @@ -45,6 +46,132 @@ static int cxl_mem_dpa_show(struct seq_file *file, void *data) return 0; } +static int rcec_enable_aer_ints(struct pci_dev *pdev) +{ + struct pci_dev *rcec = pdev->rcec; + int aer, rc; + u32 mask; + + if (!rcec) + return -ENODEV; + + /* + * Internal errors are masked by default, unmask RCEC's here + * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h) + * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h) + */ + aer = rcec->aer_cap; + rc = pci_read_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, &mask); + if (rc) + return rc; + mask &= ~PCI_ERR_UNC_INTN; + rc = pci_write_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, mask); + if (rc) + return rc; + + rc = pci_read_config_dword(rcec, aer + PCI_ERR_COR_MASK, &mask); + if (rc) + return rc; + mask &= ~PCI_ERR_COR_INTERNAL; + rc = pci_write_config_dword(rcec, aer + PCI_ERR_COR_MASK, mask); + + return rc; +} + +static void disable_aer(void *_pdev) +{ + struct pci_dev *pdev = (struct pci_dev *)_pdev; + + pci_disable_pcie_error_reporting(pdev); + + /* + * Keep the RCEC's internal AER enabled. There + * could be other RCiEPs using this RCEC. + */ +} + +static void rch_disable_root_ints(void __iomem *aer_base) +{ + u32 aer_cmd_mask, aer_cmd; + + /* + * Disable RCH root port command interrupts. + * CXL3.0 12.2.1.1 - RCH Downstream Port-detected Errors + */ + aer_cmd_mask = (PCI_ERR_ROOT_CMD_COR_EN | + PCI_ERR_ROOT_CMD_NONFATAL_EN | + PCI_ERR_ROOT_CMD_FATAL_EN); + aer_cmd = readl(aer_base + PCI_ERR_ROOT_COMMAND); + aer_cmd &= ~aer_cmd_mask; + writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND); +} + +static int cxl_ras_setup_interrupts(struct cxl_dev_state *cxlds) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + int rc; + + if (cxlds->rcd) { + rch_disable_root_ints(cxlds->regs.aer); + + rc = rcec_enable_aer_ints(pdev); + if (rc) + return rc; + } + + rc = pci_enable_pcie_error_reporting(pdev); + if (rc) + return rc; + + return devm_add_action_or_reset(&pdev->dev, disable_aer, pdev); +} + +static int cxl_rch_map_ras(struct cxl_dev_state *cxlds, + struct cxl_dport *parent_dport) +{ + struct device *dev = parent_dport->dport; + resource_size_t aer_phys, ras_phys; + void __iomem *aer, *dport_ras; + + if (!parent_dport->rch) + return 0; + + if (!parent_dport->aer_cap || !parent_dport->ras_cap || + parent_dport->component_reg_phys == CXL_RESOURCE_NONE) + return -ENODEV; + + aer_phys = parent_dport->aer_cap + parent_dport->rcrb; + aer = devm_cxl_iomap_block(dev, aer_phys, + PCI_AER_CAPABILITY_LENGTH); + + if (!aer) + return -ENOMEM; + + ras_phys = parent_dport->ras_cap + parent_dport->component_reg_phys; + dport_ras = devm_cxl_iomap_block(dev, ras_phys, + CXL_RAS_CAPABILITY_LENGTH); + + if (!dport_ras) + return -ENOMEM; + + cxlds->regs.aer = aer; + cxlds->regs.dport_ras = dport_ras; + + return 0; +} + +static int cxl_setup_ras(struct cxl_dev_state *cxlds, + struct cxl_dport *parent_dport) +{ + int rc; + + rc = cxl_rch_map_ras(cxlds, parent_dport); + if (rc) + return rc; + + return cxl_ras_setup_interrupts(cxlds); +} + static void cxl_rcrb_setup(struct cxl_dev_state *cxlds, struct cxl_dport *parent_dport) { @@ -93,6 +220,13 @@ static int devm_cxl_add_endpoint(struct device *host, struct cxl_memdev *cxlmd, cxl_rcrb_setup(cxlds, parent_dport); + rc = cxl_setup_ras(cxlds, parent_dport); + /* Continue with RAS setup errors */ + if (rc) + dev_warn(&cxlmd->dev, "CXL RAS setup failed: %d\n", rc); + else + dev_info(&cxlmd->dev, "CXL error handling enabled\n"); + endpoint = devm_cxl_add_port(host, &cxlmd->dev, cxlds->component_reg_phys, parent_dport); if (IS_ERR(endpoint))