diff mbox series

[v2,5/5] cxl/pci: Add RCH downstream port error logging

Message ID 20230323213808.398039-6-terry.bowman@amd.com
State Superseded
Headers show
Series cxl/pci: Add support for RCH RAS error handling | expand

Commit Message

Bowman, Terry March 23, 2023, 9:38 p.m. UTC
RCH downstream port error logging is missing in the current CXL driver. The
missing AER and RAS error logging is needed for communicating driver error
details to userspace. Update the driver to include PCIe AER and CXL RAS
error logging.

Add RCH downstream port error handling into the existing RCiEP handler.
The downstream port error handler is added to the RCiEP error handler
because the downstream port is implemented in a RCRB, is not PCI
enumerable, and as a result is not directly accessible to the PCI AER
root port driver. The AER root port driver calls the RCiEP handler for
handling RCD errors and RCH downstream port protocol errors.

Update the cxl_mem driver to map the RCH RAS and AER register discovered
earlier. The RAS and AER registers will be used in the RCH error handlers.

Disable RCH downstream port's root port cmd interrupts. Enable RCEC AER
CIE/UIE reporting because they are disabled by default.[1]

Update existing RCiEP correctable and uncorrectable handlers to also call
the RCH handler. The RCH handler will read the downstream port AER
registers, check for error severity, and if an error exists will log
using an existing kernel AER trace routine. The RCH handler will also
reuse the existing RAS logging routine to log downstream port RAS
errors if they exist.

[1] CXL 3.0 Spec, 12.2.1.1 - RCH Downstream Port Detected Errors

Co-developed-by: Robert Richter <rrichter@amd.com>
Signed-off-by: Robert Richter <rrichter@amd.com>
Signed-off-by: Terry Bowman <terry.bowman@amd.com>
---
 drivers/cxl/core/pci.c  | 126 +++++++++++++++++++++++++++++++++----
 drivers/cxl/core/regs.c |   1 +
 drivers/cxl/cxl.h       |  13 ++++
 drivers/cxl/mem.c       | 134 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 262 insertions(+), 12 deletions(-)

Comments

kernel test robot March 24, 2023, 5:39 a.m. UTC | #1
Hi Terry,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on pci/next]
[also build test ERROR on pci/for-linus efi/next cxl/next linus/master v6.3-rc3 next-20230323]
[cannot apply to cxl/pending]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Terry-Bowman/cxl-pci-Add-RCH-downstream-port-AER-and-RAS-register-discovery/20230324-054044
base:   https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git next
patch link:    https://lore.kernel.org/r/20230323213808.398039-6-terry.bowman%40amd.com
patch subject: [PATCH v2 5/5] cxl/pci: Add RCH downstream port error logging
config: i386-randconfig-a013 (https://download.01.org/0day-ci/archive/20230324/202303241305.7Xdy6342-lkp@intel.com/config)
compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/c40ca148e9cff1a1c32cd4c5c9b252bf0cf201b6
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Terry-Bowman/cxl-pci-Add-RCH-downstream-port-AER-and-RAS-register-discovery/20230324-054044
        git checkout c40ca148e9cff1a1c32cd4c5c9b252bf0cf201b6
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 SHELL=/bin/bash drivers/cxl/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202303241305.7Xdy6342-lkp@intel.com/

All errors (new ones prefixed by >>):

>> drivers/cxl/mem.c:51:31: error: no member named 'rcec' in 'struct pci_dev'
           struct pci_dev *rcec = pdev->rcec;
                                  ~~~~  ^
>> drivers/cxl/mem.c:63:14: error: no member named 'aer_cap' in 'struct pci_dev'
           aer = rcec->aer_cap;
                 ~~~~  ^
   2 errors generated.


vim +51 drivers/cxl/mem.c

    48	
    49	static int rcec_enable_aer_ints(struct pci_dev *pdev)
    50	{
  > 51		struct pci_dev *rcec = pdev->rcec;
    52		int aer, rc;
    53		u32 mask;
    54	
    55		if (!rcec)
    56			return -ENODEV;
    57	
    58		/*
    59		 * Internal errors are masked by default, unmask RCEC's here
    60		 * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h)
    61		 * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h)
    62		 */
  > 63		aer = rcec->aer_cap;
    64		rc = pci_read_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, &mask);
    65		if (rc)
    66			return rc;
    67		mask &= ~PCI_ERR_UNC_INTN;
    68		rc = pci_write_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, mask);
    69		if (rc)
    70			return rc;
    71	
    72		rc = pci_read_config_dword(rcec, aer + PCI_ERR_COR_MASK, &mask);
    73		if (rc)
    74			return rc;
    75		mask &= ~PCI_ERR_COR_INTERNAL;
    76		rc = pci_write_config_dword(rcec, aer + PCI_ERR_COR_MASK, mask);
    77	
    78		return rc;
    79	}
    80
kernel test robot March 24, 2023, 6:09 a.m. UTC | #2
Hi Terry,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on pci/next]
[also build test ERROR on pci/for-linus efi/next cxl/next linus/master v6.3-rc3 next-20230323]
[cannot apply to cxl/pending]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Terry-Bowman/cxl-pci-Add-RCH-downstream-port-AER-and-RAS-register-discovery/20230324-054044
base:   https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git next
patch link:    https://lore.kernel.org/r/20230323213808.398039-6-terry.bowman%40amd.com
patch subject: [PATCH v2 5/5] cxl/pci: Add RCH downstream port error logging
config: riscv-randconfig-r024-20230322 (https://download.01.org/0day-ci/archive/20230324/202303241313.B0maYlFU-lkp@intel.com/config)
compiler: riscv64-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/c40ca148e9cff1a1c32cd4c5c9b252bf0cf201b6
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Terry-Bowman/cxl-pci-Add-RCH-downstream-port-AER-and-RAS-register-discovery/20230324-054044
        git checkout c40ca148e9cff1a1c32cd4c5c9b252bf0cf201b6
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=riscv olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=riscv SHELL=/bin/bash drivers/cxl/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202303241313.B0maYlFU-lkp@intel.com/

All errors (new ones prefixed by >>):

   drivers/cxl/mem.c: In function 'rcec_enable_aer_ints':
>> drivers/cxl/mem.c:63:21: error: 'struct pci_dev' has no member named 'aer_cap'; did you mean 'ats_cap'?
      63 |         aer = rcec->aer_cap;
         |                     ^~~~~~~
         |                     ats_cap


vim +63 drivers/cxl/mem.c

    48	
    49	static int rcec_enable_aer_ints(struct pci_dev *pdev)
    50	{
    51		struct pci_dev *rcec = pdev->rcec;
    52		int aer, rc;
    53		u32 mask;
    54	
    55		if (!rcec)
    56			return -ENODEV;
    57	
    58		/*
    59		 * Internal errors are masked by default, unmask RCEC's here
    60		 * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h)
    61		 * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h)
    62		 */
  > 63		aer = rcec->aer_cap;
    64		rc = pci_read_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, &mask);
    65		if (rc)
    66			return rc;
    67		mask &= ~PCI_ERR_UNC_INTN;
    68		rc = pci_write_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, mask);
    69		if (rc)
    70			return rc;
    71	
    72		rc = pci_read_config_dword(rcec, aer + PCI_ERR_COR_MASK, &mask);
    73		if (rc)
    74			return rc;
    75		mask &= ~PCI_ERR_COR_INTERNAL;
    76		rc = pci_write_config_dword(rcec, aer + PCI_ERR_COR_MASK, mask);
    77	
    78		return rc;
    79	}
    80
kernel test robot March 24, 2023, 6:30 a.m. UTC | #3
Hi Terry,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on pci/next]
[also build test ERROR on pci/for-linus efi/next cxl/next linus/master v6.3-rc3 next-20230323]
[cannot apply to cxl/pending]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Terry-Bowman/cxl-pci-Add-RCH-downstream-port-AER-and-RAS-register-discovery/20230324-054044
base:   https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git next
patch link:    https://lore.kernel.org/r/20230323213808.398039-6-terry.bowman%40amd.com
patch subject: [PATCH v2 5/5] cxl/pci: Add RCH downstream port error logging
config: i386-randconfig-a001 (https://download.01.org/0day-ci/archive/20230324/202303241458.BV292BDH-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/c40ca148e9cff1a1c32cd4c5c9b252bf0cf201b6
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Terry-Bowman/cxl-pci-Add-RCH-downstream-port-AER-and-RAS-register-discovery/20230324-054044
        git checkout c40ca148e9cff1a1c32cd4c5c9b252bf0cf201b6
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=i386 olddefconfig
        make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash drivers/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202303241458.BV292BDH-lkp@intel.com/

All errors (new ones prefixed by >>):

   drivers/cxl/mem.c: In function 'rcec_enable_aer_ints':
>> drivers/cxl/mem.c:51:36: error: 'struct pci_dev' has no member named 'rcec'
      51 |         struct pci_dev *rcec = pdev->rcec;
         |                                    ^~
>> drivers/cxl/mem.c:63:21: error: 'struct pci_dev' has no member named 'aer_cap'; did you mean 'ats_cap'?
      63 |         aer = rcec->aer_cap;
         |                     ^~~~~~~
         |                     ats_cap


vim +51 drivers/cxl/mem.c

    48	
    49	static int rcec_enable_aer_ints(struct pci_dev *pdev)
    50	{
  > 51		struct pci_dev *rcec = pdev->rcec;
    52		int aer, rc;
    53		u32 mask;
    54	
    55		if (!rcec)
    56			return -ENODEV;
    57	
    58		/*
    59		 * Internal errors are masked by default, unmask RCEC's here
    60		 * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h)
    61		 * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h)
    62		 */
  > 63		aer = rcec->aer_cap;
    64		rc = pci_read_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, &mask);
    65		if (rc)
    66			return rc;
    67		mask &= ~PCI_ERR_UNC_INTN;
    68		rc = pci_write_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, mask);
    69		if (rc)
    70			return rc;
    71	
    72		rc = pci_read_config_dword(rcec, aer + PCI_ERR_COR_MASK, &mask);
    73		if (rc)
    74			return rc;
    75		mask &= ~PCI_ERR_COR_INTERNAL;
    76		rc = pci_write_config_dword(rcec, aer + PCI_ERR_COR_MASK, mask);
    77	
    78		return rc;
    79	}
    80
Bowman, Terry March 24, 2023, 5:41 p.m. UTC | #4
I added a comment below.

On 3/24/23 01:30, kernel test robot wrote:
> Hi Terry,
> 
> Thank you for the patch! Yet something to improve:
> 
> [auto build test ERROR on pci/next]
> [also build test ERROR on pci/for-linus efi/next cxl/next linus/master v6.3-rc3 next-20230323]
> [cannot apply to cxl/pending]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Terry-Bowman/cxl-pci-Add-RCH-downstream-port-AER-and-RAS-register-discovery/20230324-054044
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git next
> patch link:    https://lore.kernel.org/r/20230323213808.398039-6-terry.bowman%40amd.com
> patch subject: [PATCH v2 5/5] cxl/pci: Add RCH downstream port error logging
> config: i386-randconfig-a001 (https://download.01.org/0day-ci/archive/20230324/202303241458.BV292BDH-lkp@intel.com/config)
> compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
> reproduce (this is a W=1 build):
>         # https://github.com/intel-lab-lkp/linux/commit/c40ca148e9cff1a1c32cd4c5c9b252bf0cf201b6
>         git remote add linux-review https://github.com/intel-lab-lkp/linux
>         git fetch --no-tags linux-review Terry-Bowman/cxl-pci-Add-RCH-downstream-port-AER-and-RAS-register-discovery/20230324-054044
>         git checkout c40ca148e9cff1a1c32cd4c5c9b252bf0cf201b6
>         # save the config file
>         mkdir build_dir && cp config build_dir/.config
>         make W=1 O=build_dir ARCH=i386 olddefconfig
>         make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash drivers/
> 
> If you fix the issue, kindly add following tag where applicable
> | Reported-by: kernel test robot <lkp@intel.com>
> | Link: https://lore.kernel.org/oe-kbuild-all/202303241458.BV292BDH-lkp@intel.com/
> 
> All errors (new ones prefixed by >>):
> 
>    drivers/cxl/mem.c: In function 'rcec_enable_aer_ints':
>>> drivers/cxl/mem.c:51:36: error: 'struct pci_dev' has no member named 'rcec'
>       51 |         struct pci_dev *rcec = pdev->rcec;
>          |                                    ^~
>>> drivers/cxl/mem.c:63:21: error: 'struct pci_dev' has no member named 'aer_cap'; did you mean 'ats_cap'?
>       63 |         aer = rcec->aer_cap;
>          |                     ^~~~~~~
>          |                     ats_cap
> 
> 
> vim +51 drivers/cxl/mem.c
> 
>     48	
>     49	static int rcec_enable_aer_ints(struct pci_dev *pdev)
>     50	{
>   > 51		struct pci_dev *rcec = pdev->rcec;
>     52		int aer, rc;
>     53		u32 mask;
>     54	
>     55		if (!rcec)
>     56			return -ENODEV;
>     57	
>     58		/*
>     59		 * Internal errors are masked by default, unmask RCEC's here
>     60		 * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h)
>     61		 * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h)
>     62		 */
>   > 63		aer = rcec->aer_cap;
>     64		rc = pci_read_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, &mask);
>     65		if (rc)
>     66			return rc;
>     67		mask &= ~PCI_ERR_UNC_INTN;
>     68		rc = pci_write_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, mask);
>     69		if (rc)
>     70			return rc;
>     71	
>     72		rc = pci_read_config_dword(rcec, aer + PCI_ERR_COR_MASK, &mask);
>     73		if (rc)
>     74			return rc;
>     75		mask &= ~PCI_ERR_COR_INTERNAL;
>     76		rc = pci_write_config_dword(rcec, aer + PCI_ERR_COR_MASK, mask);
>     77	
>     78		return rc;
>     79	}
>     80	
> 

I will add #ifdef checks for CONFIG_PCIEPORTBUS and CONFIG_PCIEAER
around the related code.

Regards,
Terry
Dave Jiang March 27, 2023, 11:21 p.m. UTC | #5
On 3/23/23 2:38 PM, Terry Bowman wrote:
> RCH downstream port error logging is missing in the current CXL driver. The
> missing AER and RAS error logging is needed for communicating driver error
> details to userspace. Update the driver to include PCIe AER and CXL RAS
> error logging.
> 
> Add RCH downstream port error handling into the existing RCiEP handler.
> The downstream port error handler is added to the RCiEP error handler
> because the downstream port is implemented in a RCRB, is not PCI
> enumerable, and as a result is not directly accessible to the PCI AER
> root port driver. The AER root port driver calls the RCiEP handler for
> handling RCD errors and RCH downstream port protocol errors.
> 
> Update the cxl_mem driver to map the RCH RAS and AER register discovered
> earlier. The RAS and AER registers will be used in the RCH error handlers.
> 
> Disable RCH downstream port's root port cmd interrupts. Enable RCEC AER
> CIE/UIE reporting because they are disabled by default.[1]
> 
> Update existing RCiEP correctable and uncorrectable handlers to also call
> the RCH handler. The RCH handler will read the downstream port AER
> registers, check for error severity, and if an error exists will log
> using an existing kernel AER trace routine. The RCH handler will also
> reuse the existing RAS logging routine to log downstream port RAS
> errors if they exist.
> 
> [1] CXL 3.0 Spec, 12.2.1.1 - RCH Downstream Port Detected Errors
> 
> Co-developed-by: Robert Richter <rrichter@amd.com>
> Signed-off-by: Robert Richter <rrichter@amd.com>
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> ---
>   drivers/cxl/core/pci.c  | 126 +++++++++++++++++++++++++++++++++----
>   drivers/cxl/core/regs.c |   1 +
>   drivers/cxl/cxl.h       |  13 ++++
>   drivers/cxl/mem.c       | 134 ++++++++++++++++++++++++++++++++++++++++
>   4 files changed, 262 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 7328a2552411..6e36471969ba 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -5,6 +5,7 @@
>   #include <linux/delay.h>
>   #include <linux/pci.h>
>   #include <linux/pci-doe.h>
> +#include <linux/aer.h>
>   #include <cxlpci.h>
>   #include <cxlmem.h>
>   #include <cxl.h>
> @@ -605,32 +606,88 @@ void read_cdat_data(struct cxl_port *port)
>   }
>   EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
>   
> -void cxl_cor_error_detected(struct pci_dev *pdev)
> +/* Get AER severity. Return false if there is no error. */
> +static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs,
> +				     int *severity)
> +{
> +	if (aer_regs->uncor_status & ~aer_regs->uncor_mask) {
> +		if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV)
> +			*severity = AER_FATAL;
> +		else
> +			*severity = AER_NONFATAL;
> +		return true;
> +	}
> +
> +	if (aer_regs->cor_status & ~aer_regs->cor_mask) {
> +		*severity = AER_CORRECTABLE;
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
> +/*
> + * Copy the AER capability registers to a buffer. This is necessary
> + * because RCRB AER capability is MMIO mapped. Clear the status
> + * after copying.
> + *
> + * @aer_base: base address of AER capability block in RCRB
> + * @aer_regs: destination for copying AER capability
> + */
> +static bool cxl_rch_get_aer_info(void __iomem *aer_base,
> +				 struct aer_capability_regs *aer_regs)
> +{
> +	int read_cnt = PCI_AER_CAPABILITY_LENGTH / sizeof(u32);
> +	u32 *aer_regs_buf = (u32 *)aer_regs;
> +	int n;
> +
> +	if (!aer_base)
> +		return false;
> +
> +	for (n = 0; n < read_cnt; n++)
> +		aer_regs_buf[n] = readl(aer_base + n * sizeof(u32));
> +
> +	writel(aer_regs->uncor_status, aer_base + PCI_ERR_UNCOR_STATUS);
> +	writel(aer_regs->cor_status, aer_base + PCI_ERR_COR_STATUS);
> +
> +	return true;
> +}
> +
> +static void __cxl_log_correctable_ras(struct cxl_dev_state *cxlds,
> +				      void __iomem *ras_base)
>   {
> -	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
>   	void __iomem *addr;
>   	u32 status;
>   
> -	if (!cxlds->regs.ras)
> +	if (!ras_base)
>   		return;
>   
> -	addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
> +	addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
>   	status = readl(addr);
>   	if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
>   		writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
>   		trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
>   	}
>   }
> -EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL);
> +
> +static void cxl_log_correctable_ras_endpoint(struct cxl_dev_state *cxlds)
> +{
> +	return __cxl_log_correctable_ras(cxlds, cxlds->regs.ras);
> +}
> +
> +static void cxl_log_correctable_ras_dport(struct cxl_dev_state *cxlds)
> +{
> +	return __cxl_log_correctable_ras(cxlds, cxlds->regs.dport_ras);
> +}
>   
>   /* CXL spec rev3.0 8.2.4.16.1 */
> -static void header_log_copy(struct cxl_dev_state *cxlds, u32 *log)
> +static void header_log_copy(void __iomem *ras_base, u32 *log)
>   {
>   	void __iomem *addr;
>   	u32 *log_addr;
>   	int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32);
>   
> -	addr = cxlds->regs.ras + CXL_RAS_HEADER_LOG_OFFSET;
> +	addr = ras_base + CXL_RAS_HEADER_LOG_OFFSET;
>   	log_addr = log;
>   
>   	for (i = 0; i < log_u32_size; i++) {
> @@ -644,17 +701,18 @@ static void header_log_copy(struct cxl_dev_state *cxlds, u32 *log)
>    * Log the state of the RAS status registers and prepare them to log the
>    * next error status. Return 1 if reset needed.
>    */
> -static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
> +static bool __cxl_report_and_clear(struct cxl_dev_state *cxlds,
> +				  void __iomem *ras_base)
>   {
>   	u32 hl[CXL_HEADERLOG_SIZE_U32];
>   	void __iomem *addr;
>   	u32 status;
>   	u32 fe;
>   
> -	if (!cxlds->regs.ras)
> +	if (!ras_base)
>   		return false;
>   
> -	addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
> +	addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
>   	status = readl(addr);
>   	if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
>   		return false;
> @@ -662,7 +720,7 @@ static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
>   	/* If multiple errors, log header points to first error from ctrl reg */
>   	if (hweight32(status) > 1) {
>   		void __iomem *rcc_addr =
> -			cxlds->regs.ras + CXL_RAS_CAP_CONTROL_OFFSET;
> +			ras_base + CXL_RAS_CAP_CONTROL_OFFSET;
>   
>   		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
>   				   readl(rcc_addr)));
> @@ -670,13 +728,54 @@ static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
>   		fe = status;
>   	}
>   
> -	header_log_copy(cxlds, hl);
> +	header_log_copy(ras_base, hl);
>   	trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, hl);
>   	writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
>   
>   	return true;
>   }
>   
> +static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
> +{
> +	return __cxl_report_and_clear(cxlds, cxlds->regs.ras);
> +}
> +
> +static bool cxl_report_and_clear_dport(struct cxl_dev_state *cxlds)
> +{
> +	return __cxl_report_and_clear(cxlds, cxlds->regs.dport_ras);
> +}
> +
> +static void cxl_rch_log_error(struct cxl_dev_state *cxlds)
> +{
> +	struct pci_dev *pdev = to_pci_dev(cxlds->dev);
> +	struct aer_capability_regs aer_regs;
> +	int severity;
> +
> +	if (!cxl_rch_get_aer_info(cxlds->regs.aer, &aer_regs))
> +		return;
> +
> +	if (!cxl_rch_get_aer_severity(&aer_regs, &severity))
> +		return;
> +
> +	cper_print_aer(pdev, severity, &aer_regs);
> +
> +	if (severity == AER_CORRECTABLE)
> +		cxl_log_correctable_ras_dport(cxlds);
> +	else
> +		cxl_report_and_clear_dport(cxlds);
> +}
> +
> +void cxl_cor_error_detected(struct pci_dev *pdev)
> +{
> +	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> +
> +	if (cxlds->rcd)
> +		cxl_rch_log_error(cxlds);
> +
> +	cxl_log_correctable_ras_endpoint(cxlds);
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL);
> +
>   pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>   				    pci_channel_state_t state)
>   {
> @@ -685,6 +784,9 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>   	struct device *dev = &cxlmd->dev;
>   	bool ue;
>   
> +	if (cxlds->rcd)
> +		cxl_rch_log_error(cxlds);
> +
>   	/*
>   	 * A frozen channel indicates an impending reset which is fatal to
>   	 * CXL.mem operation, and will likely crash the system. On the off
> diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c
> index 108a349d8101..7130f35891da 100644
> --- a/drivers/cxl/core/regs.c
> +++ b/drivers/cxl/core/regs.c
> @@ -198,6 +198,7 @@ void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr,
>   
>   	return ret_val;
>   }
> +EXPORT_SYMBOL_NS_GPL(devm_cxl_iomap_block, CXL);
>   
>   int cxl_map_component_regs(struct device *dev, struct cxl_component_regs *regs,
>   			   struct cxl_register_map *map, unsigned long map_mask)
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 9fd7df48ce99..7036e34354bc 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -66,6 +66,8 @@
>   #define CXL_DECODER_MIN_GRANULARITY 256
>   #define CXL_DECODER_MAX_ENCODED_IG 6
>   
> +#define PCI_AER_CAPABILITY_LENGTH 56
> +
>   static inline int cxl_hdm_decoder_count(u32 cap_hdr)
>   {
>   	int val = FIELD_GET(CXL_HDM_DECODER_COUNT_MASK, cap_hdr);
> @@ -209,6 +211,15 @@ struct cxl_regs {
>   	struct_group_tagged(cxl_device_regs, device_regs,
>   		void __iomem *status, *mbox, *memdev;
>   	);
> +
> +	/*
> +	 * Pointer to RCH cxl_dport AER. (only for RCH/RCD mode)
> +	 * @dport_aer: CXL 2.0 12.2.11 RCH Downstream Port-detected Errors
> +	 */
> +	struct_group_tagged(cxl_rch_regs, rch_regs,
> +		void __iomem *aer;
> +		void __iomem *dport_ras;
> +	);
>   };
>   
>   struct cxl_reg_map {
> @@ -249,6 +260,8 @@ struct cxl_register_map {
>   	};
>   };
>   
> +void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr,
> +				   resource_size_t length);
>   void cxl_probe_component_regs(struct device *dev, void __iomem *base,
>   			      struct cxl_component_reg_map *map);
>   void cxl_probe_device_regs(struct device *dev, void __iomem *base,
> diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
> index 12e8e8ebaac0..e217c44ed749 100644
> --- a/drivers/cxl/mem.c
> +++ b/drivers/cxl/mem.c
> @@ -4,6 +4,7 @@
>   #include <linux/device.h>
>   #include <linux/module.h>
>   #include <linux/pci.h>
> +#include <linux/aer.h>
>   
>   #include "cxlmem.h"
>   #include "cxlpci.h"
> @@ -45,6 +46,132 @@ static int cxl_mem_dpa_show(struct seq_file *file, void *data)
>   	return 0;
>   }
>   
> +static int rcec_enable_aer_ints(struct pci_dev *pdev)
> +{
> +	struct pci_dev *rcec = pdev->rcec;
> +	int aer, rc;
> +	u32 mask;
> +
> +	if (!rcec)
> +		return -ENODEV;
> +
> +	/*
> +	 * Internal errors are masked by default, unmask RCEC's here
> +	 * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h)
> +	 * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h)
> +	 */
> +	aer = rcec->aer_cap;
> +	rc = pci_read_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, &mask);
> +	if (rc)
> +		return rc;
> +	mask &= ~PCI_ERR_UNC_INTN;
> +	rc = pci_write_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, mask);
> +	if (rc)
> +		return rc;
> +
> +	rc = pci_read_config_dword(rcec, aer + PCI_ERR_COR_MASK, &mask);
> +	if (rc)
> +		return rc;
> +	mask &= ~PCI_ERR_COR_INTERNAL;
> +	rc = pci_write_config_dword(rcec, aer + PCI_ERR_COR_MASK, mask);
> +
> +	return rc;
> +}
> +
> +static void disable_aer(void *_pdev)
> +{
> +	struct pci_dev *pdev = (struct pci_dev *)_pdev;
> +
> +	pci_disable_pcie_error_reporting(pdev);
> +
> +	/*
> +	 * Keep the RCEC's internal AER enabled. There
> +	 * could be other RCiEPs using this RCEC.
> +	 */
> +}
> +
> +static void rch_disable_root_ints(void __iomem *aer_base)
> +{
> +	u32 aer_cmd_mask, aer_cmd;
> +
> +	/*
> +	 * Disable RCH root port command interrupts.
> +	 * CXL3.0 12.2.1.1 - RCH Downstream Port-detected Errors
> +	 */
> +	aer_cmd_mask = (PCI_ERR_ROOT_CMD_COR_EN |
> +			PCI_ERR_ROOT_CMD_NONFATAL_EN |
> +			PCI_ERR_ROOT_CMD_FATAL_EN);
> +	aer_cmd = readl(aer_base + PCI_ERR_ROOT_COMMAND);
> +	aer_cmd &= ~aer_cmd_mask;
> +	writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND);
> +}
> +
> +static int cxl_ras_setup_interrupts(struct cxl_dev_state *cxlds)
> +{
> +	struct pci_dev *pdev = to_pci_dev(cxlds->dev);
> +	int rc;
> +
> +	if (cxlds->rcd) {
> +		rch_disable_root_ints(cxlds->regs.aer);
> +
> +		rc = rcec_enable_aer_ints(pdev);
> +		if (rc)
> +			return rc;
> +	}
> +
> +	rc = pci_enable_pcie_error_reporting(pdev);

Hi Terry, not sure if you saw this thread [1], but with the new changes 
upstream [2] to the PCIe subsystem, Bjorn says we no longer need to call 
pci_enable_pcie_error_report() by the driver.

[1]: 
https://lore.kernel.org/linux-cxl/c2e244bd-a94b-8de2-e43c-7ff8a756cbc7@intel.com/T/#mef401fb0580ebb4c4bc2a164f87e12b60cf76693
[2]: commit f26e58bf6f54 ("PCI/AER: Enable error reporting when AER is 
native")

DJ

> +	if (rc)
> +		return rc;
> +
> +	return devm_add_action_or_reset(&pdev->dev, disable_aer, pdev);
> +}
> +
> +static int cxl_rch_map_ras(struct cxl_dev_state *cxlds,
> +			   struct cxl_dport *parent_dport)
> +{
> +	struct device *dev = parent_dport->dport;
> +	resource_size_t aer_phys, ras_phys;
> +	void __iomem *aer, *dport_ras;
> +
> +	if (!parent_dport->rch)
> +		return 0;
> +
> +	if (!parent_dport->aer_cap || !parent_dport->ras_cap ||
> +	    parent_dport->component_reg_phys == CXL_RESOURCE_NONE)
> +		return -ENODEV;
> +
> +	aer_phys = parent_dport->aer_cap + parent_dport->rcrb;
> +	aer = devm_cxl_iomap_block(dev, aer_phys,
> +				   PCI_AER_CAPABILITY_LENGTH);
> +
> +	if (!aer)
> +		return -ENOMEM;
> +
> +	ras_phys = parent_dport->ras_cap + parent_dport->component_reg_phys;
> +	dport_ras = devm_cxl_iomap_block(dev, ras_phys,
> +					 CXL_RAS_CAPABILITY_LENGTH);
> +
> +	if (!dport_ras)
> +		return -ENOMEM;
> +
> +	cxlds->regs.aer = aer;
> +	cxlds->regs.dport_ras = dport_ras;
> +
> +	return 0;
> +}
> +
> +static int cxl_setup_ras(struct cxl_dev_state *cxlds,
> +			 struct cxl_dport *parent_dport)
> +{
> +	int rc;
> +
> +	rc = cxl_rch_map_ras(cxlds, parent_dport);
> +	if (rc)
> +		return rc;
> +
> +	return cxl_ras_setup_interrupts(cxlds);
> +}
> +
>   static void cxl_rcrb_setup(struct cxl_dev_state *cxlds,
>   			   struct cxl_dport *parent_dport)
>   {
> @@ -93,6 +220,13 @@ static int devm_cxl_add_endpoint(struct device *host, struct cxl_memdev *cxlmd,
>   
>   	cxl_rcrb_setup(cxlds, parent_dport);
>   
> +	rc = cxl_setup_ras(cxlds, parent_dport);
> +	/* Continue with RAS setup errors */
> +	if (rc)
> +		dev_warn(&cxlmd->dev, "CXL RAS setup failed: %d\n", rc);
> +	else
> +		dev_info(&cxlmd->dev, "CXL error handling enabled\n");
> +
>   	endpoint = devm_cxl_add_port(host, &cxlmd->dev, cxlds->component_reg_phys,
>   				     parent_dport);
>   	if (IS_ERR(endpoint))
Bowman, Terry March 28, 2023, 1:53 p.m. UTC | #6
Hi dave,


On 3/27/23 18:21, Dave Jiang wrote:
> 
> 
> On 3/23/23 2:38 PM, Terry Bowman wrote:
>> RCH downstream port error logging is missing in the current CXL driver. The
>> missing AER and RAS error logging is needed for communicating driver error
>> details to userspace. Update the driver to include PCIe AER and CXL RAS
>> error logging.
>>
>> Add RCH downstream port error handling into the existing RCiEP handler.
>> The downstream port error handler is added to the RCiEP error handler
>> because the downstream port is implemented in a RCRB, is not PCI
>> enumerable, and as a result is not directly accessible to the PCI AER
>> root port driver. The AER root port driver calls the RCiEP handler for
>> handling RCD errors and RCH downstream port protocol errors.
>>
>> Update the cxl_mem driver to map the RCH RAS and AER register discovered
>> earlier. The RAS and AER registers will be used in the RCH error handlers.
>>
>> Disable RCH downstream port's root port cmd interrupts. Enable RCEC AER
>> CIE/UIE reporting because they are disabled by default.[1]
>>
>> Update existing RCiEP correctable and uncorrectable handlers to also call
>> the RCH handler. The RCH handler will read the downstream port AER
>> registers, check for error severity, and if an error exists will log
>> using an existing kernel AER trace routine. The RCH handler will also
>> reuse the existing RAS logging routine to log downstream port RAS
>> errors if they exist.
>>
>> [1] CXL 3.0 Spec, 12.2.1.1 - RCH Downstream Port Detected Errors
>>
>> Co-developed-by: Robert Richter <rrichter@amd.com>
>> Signed-off-by: Robert Richter <rrichter@amd.com>
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>> ---
>>   drivers/cxl/core/pci.c  | 126 +++++++++++++++++++++++++++++++++----
>>   drivers/cxl/core/regs.c |   1 +
>>   drivers/cxl/cxl.h       |  13 ++++
>>   drivers/cxl/mem.c       | 134 ++++++++++++++++++++++++++++++++++++++++
>>   4 files changed, 262 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
>> index 7328a2552411..6e36471969ba 100644
>> --- a/drivers/cxl/core/pci.c
>> +++ b/drivers/cxl/core/pci.c
>> @@ -5,6 +5,7 @@
>>   #include <linux/delay.h>
>>   #include <linux/pci.h>
>>   #include <linux/pci-doe.h>
>> +#include <linux/aer.h>
>>   #include <cxlpci.h>
>>   #include <cxlmem.h>
>>   #include <cxl.h>
>> @@ -605,32 +606,88 @@ void read_cdat_data(struct cxl_port *port)
>>   }
>>   EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
>>   -void cxl_cor_error_detected(struct pci_dev *pdev)
>> +/* Get AER severity. Return false if there is no error. */
>> +static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs,
>> +                     int *severity)
>> +{
>> +    if (aer_regs->uncor_status & ~aer_regs->uncor_mask) {
>> +        if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV)
>> +            *severity = AER_FATAL;
>> +        else
>> +            *severity = AER_NONFATAL;
>> +        return true;
>> +    }
>> +
>> +    if (aer_regs->cor_status & ~aer_regs->cor_mask) {
>> +        *severity = AER_CORRECTABLE;
>> +        return true;
>> +    }
>> +
>> +    return false;
>> +}
>> +
>> +/*
>> + * Copy the AER capability registers to a buffer. This is necessary
>> + * because RCRB AER capability is MMIO mapped. Clear the status
>> + * after copying.
>> + *
>> + * @aer_base: base address of AER capability block in RCRB
>> + * @aer_regs: destination for copying AER capability
>> + */
>> +static bool cxl_rch_get_aer_info(void __iomem *aer_base,
>> +                 struct aer_capability_regs *aer_regs)
>> +{
>> +    int read_cnt = PCI_AER_CAPABILITY_LENGTH / sizeof(u32);
>> +    u32 *aer_regs_buf = (u32 *)aer_regs;
>> +    int n;
>> +
>> +    if (!aer_base)
>> +        return false;
>> +
>> +    for (n = 0; n < read_cnt; n++)
>> +        aer_regs_buf[n] = readl(aer_base + n * sizeof(u32));
>> +
>> +    writel(aer_regs->uncor_status, aer_base + PCI_ERR_UNCOR_STATUS);
>> +    writel(aer_regs->cor_status, aer_base + PCI_ERR_COR_STATUS);
>> +
>> +    return true;
>> +}
>> +
>> +static void __cxl_log_correctable_ras(struct cxl_dev_state *cxlds,
>> +                      void __iomem *ras_base)
>>   {
>> -    struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
>>       void __iomem *addr;
>>       u32 status;
>>   -    if (!cxlds->regs.ras)
>> +    if (!ras_base)
>>           return;
>>   -    addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
>> +    addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
>>       status = readl(addr);
>>       if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
>>           writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
>>           trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
>>       }
>>   }
>> -EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL);
>> +
>> +static void cxl_log_correctable_ras_endpoint(struct cxl_dev_state *cxlds)
>> +{
>> +    return __cxl_log_correctable_ras(cxlds, cxlds->regs.ras);
>> +}
>> +
>> +static void cxl_log_correctable_ras_dport(struct cxl_dev_state *cxlds)
>> +{
>> +    return __cxl_log_correctable_ras(cxlds, cxlds->regs.dport_ras);
>> +}
>>     /* CXL spec rev3.0 8.2.4.16.1 */
>> -static void header_log_copy(struct cxl_dev_state *cxlds, u32 *log)
>> +static void header_log_copy(void __iomem *ras_base, u32 *log)
>>   {
>>       void __iomem *addr;
>>       u32 *log_addr;
>>       int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32);
>>   -    addr = cxlds->regs.ras + CXL_RAS_HEADER_LOG_OFFSET;
>> +    addr = ras_base + CXL_RAS_HEADER_LOG_OFFSET;
>>       log_addr = log;
>>         for (i = 0; i < log_u32_size; i++) {
>> @@ -644,17 +701,18 @@ static void header_log_copy(struct cxl_dev_state *cxlds, u32 *log)
>>    * Log the state of the RAS status registers and prepare them to log the
>>    * next error status. Return 1 if reset needed.
>>    */
>> -static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
>> +static bool __cxl_report_and_clear(struct cxl_dev_state *cxlds,
>> +                  void __iomem *ras_base)
>>   {
>>       u32 hl[CXL_HEADERLOG_SIZE_U32];
>>       void __iomem *addr;
>>       u32 status;
>>       u32 fe;
>>   -    if (!cxlds->regs.ras)
>> +    if (!ras_base)
>>           return false;
>>   -    addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
>> +    addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
>>       status = readl(addr);
>>       if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
>>           return false;
>> @@ -662,7 +720,7 @@ static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
>>       /* If multiple errors, log header points to first error from ctrl reg */
>>       if (hweight32(status) > 1) {
>>           void __iomem *rcc_addr =
>> -            cxlds->regs.ras + CXL_RAS_CAP_CONTROL_OFFSET;
>> +            ras_base + CXL_RAS_CAP_CONTROL_OFFSET;
>>             fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
>>                      readl(rcc_addr)));
>> @@ -670,13 +728,54 @@ static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
>>           fe = status;
>>       }
>>   -    header_log_copy(cxlds, hl);
>> +    header_log_copy(ras_base, hl);
>>       trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, hl);
>>       writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
>>         return true;
>>   }
>>   +static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
>> +{
>> +    return __cxl_report_and_clear(cxlds, cxlds->regs.ras);
>> +}
>> +
>> +static bool cxl_report_and_clear_dport(struct cxl_dev_state *cxlds)
>> +{
>> +    return __cxl_report_and_clear(cxlds, cxlds->regs.dport_ras);
>> +}
>> +
>> +static void cxl_rch_log_error(struct cxl_dev_state *cxlds)
>> +{
>> +    struct pci_dev *pdev = to_pci_dev(cxlds->dev);
>> +    struct aer_capability_regs aer_regs;
>> +    int severity;
>> +
>> +    if (!cxl_rch_get_aer_info(cxlds->regs.aer, &aer_regs))
>> +        return;
>> +
>> +    if (!cxl_rch_get_aer_severity(&aer_regs, &severity))
>> +        return;
>> +
>> +    cper_print_aer(pdev, severity, &aer_regs);
>> +
>> +    if (severity == AER_CORRECTABLE)
>> +        cxl_log_correctable_ras_dport(cxlds);
>> +    else
>> +        cxl_report_and_clear_dport(cxlds);
>> +}
>> +
>> +void cxl_cor_error_detected(struct pci_dev *pdev)
>> +{
>> +    struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
>> +
>> +    if (cxlds->rcd)
>> +        cxl_rch_log_error(cxlds);
>> +
>> +    cxl_log_correctable_ras_endpoint(cxlds);
>> +}
>> +EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL);
>> +
>>   pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>>                       pci_channel_state_t state)
>>   {
>> @@ -685,6 +784,9 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>>       struct device *dev = &cxlmd->dev;
>>       bool ue;
>>   +    if (cxlds->rcd)
>> +        cxl_rch_log_error(cxlds);
>> +
>>       /*
>>        * A frozen channel indicates an impending reset which is fatal to
>>        * CXL.mem operation, and will likely crash the system. On the off
>> diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c
>> index 108a349d8101..7130f35891da 100644
>> --- a/drivers/cxl/core/regs.c
>> +++ b/drivers/cxl/core/regs.c
>> @@ -198,6 +198,7 @@ void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr,
>>         return ret_val;
>>   }
>> +EXPORT_SYMBOL_NS_GPL(devm_cxl_iomap_block, CXL);
>>     int cxl_map_component_regs(struct device *dev, struct cxl_component_regs *regs,
>>                  struct cxl_register_map *map, unsigned long map_mask)
>> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
>> index 9fd7df48ce99..7036e34354bc 100644
>> --- a/drivers/cxl/cxl.h
>> +++ b/drivers/cxl/cxl.h
>> @@ -66,6 +66,8 @@
>>   #define CXL_DECODER_MIN_GRANULARITY 256
>>   #define CXL_DECODER_MAX_ENCODED_IG 6
>>   +#define PCI_AER_CAPABILITY_LENGTH 56
>> +
>>   static inline int cxl_hdm_decoder_count(u32 cap_hdr)
>>   {
>>       int val = FIELD_GET(CXL_HDM_DECODER_COUNT_MASK, cap_hdr);
>> @@ -209,6 +211,15 @@ struct cxl_regs {
>>       struct_group_tagged(cxl_device_regs, device_regs,
>>           void __iomem *status, *mbox, *memdev;
>>       );
>> +
>> +    /*
>> +     * Pointer to RCH cxl_dport AER. (only for RCH/RCD mode)
>> +     * @dport_aer: CXL 2.0 12.2.11 RCH Downstream Port-detected Errors
>> +     */
>> +    struct_group_tagged(cxl_rch_regs, rch_regs,
>> +        void __iomem *aer;
>> +        void __iomem *dport_ras;
>> +    );
>>   };
>>     struct cxl_reg_map {
>> @@ -249,6 +260,8 @@ struct cxl_register_map {
>>       };
>>   };
>>   +void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr,
>> +                   resource_size_t length);
>>   void cxl_probe_component_regs(struct device *dev, void __iomem *base,
>>                     struct cxl_component_reg_map *map);
>>   void cxl_probe_device_regs(struct device *dev, void __iomem *base,
>> diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
>> index 12e8e8ebaac0..e217c44ed749 100644
>> --- a/drivers/cxl/mem.c
>> +++ b/drivers/cxl/mem.c
>> @@ -4,6 +4,7 @@
>>   #include <linux/device.h>
>>   #include <linux/module.h>
>>   #include <linux/pci.h>
>> +#include <linux/aer.h>
>>     #include "cxlmem.h"
>>   #include "cxlpci.h"
>> @@ -45,6 +46,132 @@ static int cxl_mem_dpa_show(struct seq_file *file, void *data)
>>       return 0;
>>   }
>>   +static int rcec_enable_aer_ints(struct pci_dev *pdev)
>> +{
>> +    struct pci_dev *rcec = pdev->rcec;
>> +    int aer, rc;
>> +    u32 mask;
>> +
>> +    if (!rcec)
>> +        return -ENODEV;
>> +
>> +    /*
>> +     * Internal errors are masked by default, unmask RCEC's here
>> +     * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h)
>> +     * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h)
>> +     */
>> +    aer = rcec->aer_cap;
>> +    rc = pci_read_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, &mask);
>> +    if (rc)
>> +        return rc;
>> +    mask &= ~PCI_ERR_UNC_INTN;
>> +    rc = pci_write_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, mask);
>> +    if (rc)
>> +        return rc;
>> +
>> +    rc = pci_read_config_dword(rcec, aer + PCI_ERR_COR_MASK, &mask);
>> +    if (rc)
>> +        return rc;
>> +    mask &= ~PCI_ERR_COR_INTERNAL;
>> +    rc = pci_write_config_dword(rcec, aer + PCI_ERR_COR_MASK, mask);
>> +
>> +    return rc;
>> +}
>> +
>> +static void disable_aer(void *_pdev)
>> +{
>> +    struct pci_dev *pdev = (struct pci_dev *)_pdev;
>> +
>> +    pci_disable_pcie_error_reporting(pdev);
>> +
>> +    /*
>> +     * Keep the RCEC's internal AER enabled. There
>> +     * could be other RCiEPs using this RCEC.
>> +     */
>> +}
>> +
>> +static void rch_disable_root_ints(void __iomem *aer_base)
>> +{
>> +    u32 aer_cmd_mask, aer_cmd;
>> +
>> +    /*
>> +     * Disable RCH root port command interrupts.
>> +     * CXL3.0 12.2.1.1 - RCH Downstream Port-detected Errors
>> +     */
>> +    aer_cmd_mask = (PCI_ERR_ROOT_CMD_COR_EN |
>> +            PCI_ERR_ROOT_CMD_NONFATAL_EN |
>> +            PCI_ERR_ROOT_CMD_FATAL_EN);
>> +    aer_cmd = readl(aer_base + PCI_ERR_ROOT_COMMAND);
>> +    aer_cmd &= ~aer_cmd_mask;
>> +    writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND);
>> +}
>> +
>> +static int cxl_ras_setup_interrupts(struct cxl_dev_state *cxlds)
>> +{
>> +    struct pci_dev *pdev = to_pci_dev(cxlds->dev);
>> +    int rc;
>> +
>> +    if (cxlds->rcd) {
>> +        rch_disable_root_ints(cxlds->regs.aer);
>> +
>> +        rc = rcec_enable_aer_ints(pdev);
>> +        if (rc)
>> +            return rc;
>> +    }
>> +
>> +    rc = pci_enable_pcie_error_reporting(pdev);
> 
> Hi Terry, not sure if you saw this thread [1], but with the new changes upstream [2] to the PCIe subsystem, Bjorn says we no longer need to call pci_enable_pcie_error_report() by the driver.
> 
> [1]: https://lore.kernel.org/linux-cxl/c2e244bd-a94b-8de2-e43c-7ff8a756cbc7@intel.com/T/#mef401fb0580ebb4c4bc2a164f87e12b60cf76693
> [2]: commit f26e58bf6f54 ("PCI/AER: Enable error reporting when AER is native")
> 
> DJ
> 

Ok. I'll remove the call to pci_enable_pcie_error_reporting(). This 
will help simplify since it allows removing disable_aer() and call 
to devm_add_action_or_reset() as well.

Regards,
Terry
diff mbox series

Patch

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 7328a2552411..6e36471969ba 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -5,6 +5,7 @@ 
 #include <linux/delay.h>
 #include <linux/pci.h>
 #include <linux/pci-doe.h>
+#include <linux/aer.h>
 #include <cxlpci.h>
 #include <cxlmem.h>
 #include <cxl.h>
@@ -605,32 +606,88 @@  void read_cdat_data(struct cxl_port *port)
 }
 EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
 
-void cxl_cor_error_detected(struct pci_dev *pdev)
+/* Get AER severity. Return false if there is no error. */
+static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs,
+				     int *severity)
+{
+	if (aer_regs->uncor_status & ~aer_regs->uncor_mask) {
+		if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV)
+			*severity = AER_FATAL;
+		else
+			*severity = AER_NONFATAL;
+		return true;
+	}
+
+	if (aer_regs->cor_status & ~aer_regs->cor_mask) {
+		*severity = AER_CORRECTABLE;
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Copy the AER capability registers to a buffer. This is necessary
+ * because RCRB AER capability is MMIO mapped. Clear the status
+ * after copying.
+ *
+ * @aer_base: base address of AER capability block in RCRB
+ * @aer_regs: destination for copying AER capability
+ */
+static bool cxl_rch_get_aer_info(void __iomem *aer_base,
+				 struct aer_capability_regs *aer_regs)
+{
+	int read_cnt = PCI_AER_CAPABILITY_LENGTH / sizeof(u32);
+	u32 *aer_regs_buf = (u32 *)aer_regs;
+	int n;
+
+	if (!aer_base)
+		return false;
+
+	for (n = 0; n < read_cnt; n++)
+		aer_regs_buf[n] = readl(aer_base + n * sizeof(u32));
+
+	writel(aer_regs->uncor_status, aer_base + PCI_ERR_UNCOR_STATUS);
+	writel(aer_regs->cor_status, aer_base + PCI_ERR_COR_STATUS);
+
+	return true;
+}
+
+static void __cxl_log_correctable_ras(struct cxl_dev_state *cxlds,
+				      void __iomem *ras_base)
 {
-	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
 	void __iomem *addr;
 	u32 status;
 
-	if (!cxlds->regs.ras)
+	if (!ras_base)
 		return;
 
-	addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
+	addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
 	status = readl(addr);
 	if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
 		writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
 		trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
 	}
 }
-EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL);
+
+static void cxl_log_correctable_ras_endpoint(struct cxl_dev_state *cxlds)
+{
+	return __cxl_log_correctable_ras(cxlds, cxlds->regs.ras);
+}
+
+static void cxl_log_correctable_ras_dport(struct cxl_dev_state *cxlds)
+{
+	return __cxl_log_correctable_ras(cxlds, cxlds->regs.dport_ras);
+}
 
 /* CXL spec rev3.0 8.2.4.16.1 */
-static void header_log_copy(struct cxl_dev_state *cxlds, u32 *log)
+static void header_log_copy(void __iomem *ras_base, u32 *log)
 {
 	void __iomem *addr;
 	u32 *log_addr;
 	int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32);
 
-	addr = cxlds->regs.ras + CXL_RAS_HEADER_LOG_OFFSET;
+	addr = ras_base + CXL_RAS_HEADER_LOG_OFFSET;
 	log_addr = log;
 
 	for (i = 0; i < log_u32_size; i++) {
@@ -644,17 +701,18 @@  static void header_log_copy(struct cxl_dev_state *cxlds, u32 *log)
  * Log the state of the RAS status registers and prepare them to log the
  * next error status. Return 1 if reset needed.
  */
-static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
+static bool __cxl_report_and_clear(struct cxl_dev_state *cxlds,
+				  void __iomem *ras_base)
 {
 	u32 hl[CXL_HEADERLOG_SIZE_U32];
 	void __iomem *addr;
 	u32 status;
 	u32 fe;
 
-	if (!cxlds->regs.ras)
+	if (!ras_base)
 		return false;
 
-	addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
+	addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
 	status = readl(addr);
 	if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
 		return false;
@@ -662,7 +720,7 @@  static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
 	/* If multiple errors, log header points to first error from ctrl reg */
 	if (hweight32(status) > 1) {
 		void __iomem *rcc_addr =
-			cxlds->regs.ras + CXL_RAS_CAP_CONTROL_OFFSET;
+			ras_base + CXL_RAS_CAP_CONTROL_OFFSET;
 
 		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
 				   readl(rcc_addr)));
@@ -670,13 +728,54 @@  static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
 		fe = status;
 	}
 
-	header_log_copy(cxlds, hl);
+	header_log_copy(ras_base, hl);
 	trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, hl);
 	writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
 
 	return true;
 }
 
+static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
+{
+	return __cxl_report_and_clear(cxlds, cxlds->regs.ras);
+}
+
+static bool cxl_report_and_clear_dport(struct cxl_dev_state *cxlds)
+{
+	return __cxl_report_and_clear(cxlds, cxlds->regs.dport_ras);
+}
+
+static void cxl_rch_log_error(struct cxl_dev_state *cxlds)
+{
+	struct pci_dev *pdev = to_pci_dev(cxlds->dev);
+	struct aer_capability_regs aer_regs;
+	int severity;
+
+	if (!cxl_rch_get_aer_info(cxlds->regs.aer, &aer_regs))
+		return;
+
+	if (!cxl_rch_get_aer_severity(&aer_regs, &severity))
+		return;
+
+	cper_print_aer(pdev, severity, &aer_regs);
+
+	if (severity == AER_CORRECTABLE)
+		cxl_log_correctable_ras_dport(cxlds);
+	else
+		cxl_report_and_clear_dport(cxlds);
+}
+
+void cxl_cor_error_detected(struct pci_dev *pdev)
+{
+	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+
+	if (cxlds->rcd)
+		cxl_rch_log_error(cxlds);
+
+	cxl_log_correctable_ras_endpoint(cxlds);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL);
+
 pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
 				    pci_channel_state_t state)
 {
@@ -685,6 +784,9 @@  pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
 	struct device *dev = &cxlmd->dev;
 	bool ue;
 
+	if (cxlds->rcd)
+		cxl_rch_log_error(cxlds);
+
 	/*
 	 * A frozen channel indicates an impending reset which is fatal to
 	 * CXL.mem operation, and will likely crash the system. On the off
diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c
index 108a349d8101..7130f35891da 100644
--- a/drivers/cxl/core/regs.c
+++ b/drivers/cxl/core/regs.c
@@ -198,6 +198,7 @@  void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr,
 
 	return ret_val;
 }
+EXPORT_SYMBOL_NS_GPL(devm_cxl_iomap_block, CXL);
 
 int cxl_map_component_regs(struct device *dev, struct cxl_component_regs *regs,
 			   struct cxl_register_map *map, unsigned long map_mask)
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 9fd7df48ce99..7036e34354bc 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -66,6 +66,8 @@ 
 #define CXL_DECODER_MIN_GRANULARITY 256
 #define CXL_DECODER_MAX_ENCODED_IG 6
 
+#define PCI_AER_CAPABILITY_LENGTH 56
+
 static inline int cxl_hdm_decoder_count(u32 cap_hdr)
 {
 	int val = FIELD_GET(CXL_HDM_DECODER_COUNT_MASK, cap_hdr);
@@ -209,6 +211,15 @@  struct cxl_regs {
 	struct_group_tagged(cxl_device_regs, device_regs,
 		void __iomem *status, *mbox, *memdev;
 	);
+
+	/*
+	 * Pointer to RCH cxl_dport AER. (only for RCH/RCD mode)
+	 * @dport_aer: CXL 2.0 12.2.11 RCH Downstream Port-detected Errors
+	 */
+	struct_group_tagged(cxl_rch_regs, rch_regs,
+		void __iomem *aer;
+		void __iomem *dport_ras;
+	);
 };
 
 struct cxl_reg_map {
@@ -249,6 +260,8 @@  struct cxl_register_map {
 	};
 };
 
+void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr,
+				   resource_size_t length);
 void cxl_probe_component_regs(struct device *dev, void __iomem *base,
 			      struct cxl_component_reg_map *map);
 void cxl_probe_device_regs(struct device *dev, void __iomem *base,
diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
index 12e8e8ebaac0..e217c44ed749 100644
--- a/drivers/cxl/mem.c
+++ b/drivers/cxl/mem.c
@@ -4,6 +4,7 @@ 
 #include <linux/device.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/aer.h>
 
 #include "cxlmem.h"
 #include "cxlpci.h"
@@ -45,6 +46,132 @@  static int cxl_mem_dpa_show(struct seq_file *file, void *data)
 	return 0;
 }
 
+static int rcec_enable_aer_ints(struct pci_dev *pdev)
+{
+	struct pci_dev *rcec = pdev->rcec;
+	int aer, rc;
+	u32 mask;
+
+	if (!rcec)
+		return -ENODEV;
+
+	/*
+	 * Internal errors are masked by default, unmask RCEC's here
+	 * PCI6.0 7.8.4.3 Uncorrectable Error Mask Register (Offset 08h)
+	 * PCI6.0 7.8.4.6 Correctable Error Mask Register (Offset 14h)
+	 */
+	aer = rcec->aer_cap;
+	rc = pci_read_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, &mask);
+	if (rc)
+		return rc;
+	mask &= ~PCI_ERR_UNC_INTN;
+	rc = pci_write_config_dword(rcec, aer + PCI_ERR_UNCOR_MASK, mask);
+	if (rc)
+		return rc;
+
+	rc = pci_read_config_dword(rcec, aer + PCI_ERR_COR_MASK, &mask);
+	if (rc)
+		return rc;
+	mask &= ~PCI_ERR_COR_INTERNAL;
+	rc = pci_write_config_dword(rcec, aer + PCI_ERR_COR_MASK, mask);
+
+	return rc;
+}
+
+static void disable_aer(void *_pdev)
+{
+	struct pci_dev *pdev = (struct pci_dev *)_pdev;
+
+	pci_disable_pcie_error_reporting(pdev);
+
+	/*
+	 * Keep the RCEC's internal AER enabled. There
+	 * could be other RCiEPs using this RCEC.
+	 */
+}
+
+static void rch_disable_root_ints(void __iomem *aer_base)
+{
+	u32 aer_cmd_mask, aer_cmd;
+
+	/*
+	 * Disable RCH root port command interrupts.
+	 * CXL3.0 12.2.1.1 - RCH Downstream Port-detected Errors
+	 */
+	aer_cmd_mask = (PCI_ERR_ROOT_CMD_COR_EN |
+			PCI_ERR_ROOT_CMD_NONFATAL_EN |
+			PCI_ERR_ROOT_CMD_FATAL_EN);
+	aer_cmd = readl(aer_base + PCI_ERR_ROOT_COMMAND);
+	aer_cmd &= ~aer_cmd_mask;
+	writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND);
+}
+
+static int cxl_ras_setup_interrupts(struct cxl_dev_state *cxlds)
+{
+	struct pci_dev *pdev = to_pci_dev(cxlds->dev);
+	int rc;
+
+	if (cxlds->rcd) {
+		rch_disable_root_ints(cxlds->regs.aer);
+
+		rc = rcec_enable_aer_ints(pdev);
+		if (rc)
+			return rc;
+	}
+
+	rc = pci_enable_pcie_error_reporting(pdev);
+	if (rc)
+		return rc;
+
+	return devm_add_action_or_reset(&pdev->dev, disable_aer, pdev);
+}
+
+static int cxl_rch_map_ras(struct cxl_dev_state *cxlds,
+			   struct cxl_dport *parent_dport)
+{
+	struct device *dev = parent_dport->dport;
+	resource_size_t aer_phys, ras_phys;
+	void __iomem *aer, *dport_ras;
+
+	if (!parent_dport->rch)
+		return 0;
+
+	if (!parent_dport->aer_cap || !parent_dport->ras_cap ||
+	    parent_dport->component_reg_phys == CXL_RESOURCE_NONE)
+		return -ENODEV;
+
+	aer_phys = parent_dport->aer_cap + parent_dport->rcrb;
+	aer = devm_cxl_iomap_block(dev, aer_phys,
+				   PCI_AER_CAPABILITY_LENGTH);
+
+	if (!aer)
+		return -ENOMEM;
+
+	ras_phys = parent_dport->ras_cap + parent_dport->component_reg_phys;
+	dport_ras = devm_cxl_iomap_block(dev, ras_phys,
+					 CXL_RAS_CAPABILITY_LENGTH);
+
+	if (!dport_ras)
+		return -ENOMEM;
+
+	cxlds->regs.aer = aer;
+	cxlds->regs.dport_ras = dport_ras;
+
+	return 0;
+}
+
+static int cxl_setup_ras(struct cxl_dev_state *cxlds,
+			 struct cxl_dport *parent_dport)
+{
+	int rc;
+
+	rc = cxl_rch_map_ras(cxlds, parent_dport);
+	if (rc)
+		return rc;
+
+	return cxl_ras_setup_interrupts(cxlds);
+}
+
 static void cxl_rcrb_setup(struct cxl_dev_state *cxlds,
 			   struct cxl_dport *parent_dport)
 {
@@ -93,6 +220,13 @@  static int devm_cxl_add_endpoint(struct device *host, struct cxl_memdev *cxlmd,
 
 	cxl_rcrb_setup(cxlds, parent_dport);
 
+	rc = cxl_setup_ras(cxlds, parent_dport);
+	/* Continue with RAS setup errors */
+	if (rc)
+		dev_warn(&cxlmd->dev, "CXL RAS setup failed: %d\n", rc);
+	else
+		dev_info(&cxlmd->dev, "CXL error handling enabled\n");
+
 	endpoint = devm_cxl_add_port(host, &cxlmd->dev, cxlds->component_reg_phys,
 				     parent_dport);
 	if (IS_ERR(endpoint))