diff mbox series

[RFC,4/6] PCI/AER: Extend RCH RAS error handling to support VH topology case

Message ID 20240313083602.239201-5-ming4.li@intel.com
State New
Headers show
Series Add support for root port RAS error handling | expand

Commit Message

Li, Ming4 March 13, 2024, 8:36 a.m. UTC
When RCEC captures CXL.cachemem protocol errors detected by CXL root
port, the recommendation from CXL r3.1 9.18.1.5 is :

	"Probe all CXL Downstream Ports and determine whether they have logged an
	error in the CXL.io or CXL.cachemem status registers."

The flow is similar to RCH RAS error handling, so reuse it to support
the above case.

Signed-off-by: Li Ming <ming4.li@intel.com>
---
 drivers/pci/pcie/aer.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

Comments

Dan Williams March 15, 2024, 2:30 a.m. UTC | #1
Li Ming wrote:
> When RCEC captures CXL.cachemem protocol errors detected by CXL root
> port, the recommendation from CXL r3.1 9.18.1.5 is :
> 
> 	"Probe all CXL Downstream Ports and determine whether they have logged an
> 	error in the CXL.io or CXL.cachemem status registers."
> 
> The flow is similar with RCH RAS error handling, so reuse it to support
> above case.
> 
> Signed-off-by: Li Ming <ming4.li@intel.com>
> ---
>  drivers/pci/pcie/aer.c | 20 ++++++++++++--------
>  1 file changed, 12 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 364c74e47273..79bfa5fb78f4 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -996,11 +996,15 @@ static bool is_internal_error(struct aer_err_info *info)
>  	return info->status & PCI_ERR_UNC_INTN;
>  }
>  
> -static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
> +static int cxl_handle_error_iter(struct pci_dev *dev, void *data)
>  {
>  	struct aer_err_info *info = (struct aer_err_info *)data;
>  	const struct pci_error_handlers *err_handler;
>  
> +	/* Skip the RCiEP devices not associating with RCEC */
> +	if ((pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) &&
> +	    !dev->rcec)
> +		return 0;
>  	if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev))
>  		return 0;

is_cxl_mem_dev(dev) will always be false in the VH case, so how does
this change help the VH case?
Li, Ming4 March 15, 2024, 3:43 a.m. UTC | #2
On 3/15/2024 10:30 AM, Dan Williams wrote:
> Li Ming wrote:
>> When RCEC captures CXL.cachemem protocol errors detected by CXL root
>> port, the recommendation from CXL r3.1 9.18.1.5 is :
>>
>> 	"Probe all CXL Downstream Ports and determine whether they have logged an
>> 	error in the CXL.io or CXL.cachemem status registers."
>>
>> The flow is similar with RCH RAS error handling, so reuse it to support
>> above case.
>>
>> Signed-off-by: Li Ming <ming4.li@intel.com>
>> ---
>>  drivers/pci/pcie/aer.c | 20 ++++++++++++--------
>>  1 file changed, 12 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>> index 364c74e47273..79bfa5fb78f4 100644
>> --- a/drivers/pci/pcie/aer.c
>> +++ b/drivers/pci/pcie/aer.c
>> @@ -996,11 +996,15 @@ static bool is_internal_error(struct aer_err_info *info)
>>  	return info->status & PCI_ERR_UNC_INTN;
>>  }
>>  
>> -static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
>> +static int cxl_handle_error_iter(struct pci_dev *dev, void *data)
>>  {
>>  	struct aer_err_info *info = (struct aer_err_info *)data;
>>  	const struct pci_error_handlers *err_handler;
>>  
>> +	/* Skip the RCiEP devices not associating with RCEC */
>> +	if ((pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) &&
>> +	    !dev->rcec)
>> +		return 0;
>>  	if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev))
>>  		return 0;
> 
> is_cxl_mem_dev(dev) will always be false in the VH case, so how does
> this change help the VH case?

Hi Dan,

I think it won't be false if the CXL memory device is an endpoint.
pcie_walk_rcec_all() will walk all pci_dev in RCEC associated bus ranges. So these two checks can help us to filter:
1. CXL memory device is an RCiEP associated with RCEC in the RCH case
2. CXL memory device is not an RCiEP, so it should be an endpoint in the VH case.
Dan Williams March 15, 2024, 4:05 a.m. UTC | #3
Li, Ming wrote:
> On 3/15/2024 10:30 AM, Dan Williams wrote:
> > Li Ming wrote:
> >> When RCEC captures CXL.cachemem protocol errors detected by CXL root
> >> port, the recommendation from CXL r3.1 9.18.1.5 is :
> >>
> >> 	"Probe all CXL Downstream Ports and determine whether they have logged an
> >> 	error in the CXL.io or CXL.cachemem status registers."
> >>
> >> The flow is similar with RCH RAS error handling, so reuse it to support
> >> above case.
> >>
> >> Signed-off-by: Li Ming <ming4.li@intel.com>
> >> ---
> >>  drivers/pci/pcie/aer.c | 20 ++++++++++++--------
> >>  1 file changed, 12 insertions(+), 8 deletions(-)
> >>
> >> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> >> index 364c74e47273..79bfa5fb78f4 100644
> >> --- a/drivers/pci/pcie/aer.c
> >> +++ b/drivers/pci/pcie/aer.c
> >> @@ -996,11 +996,15 @@ static bool is_internal_error(struct aer_err_info *info)
> >>  	return info->status & PCI_ERR_UNC_INTN;
> >>  }
> >>  
> >> -static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
> >> +static int cxl_handle_error_iter(struct pci_dev *dev, void *data)
> >>  {
> >>  	struct aer_err_info *info = (struct aer_err_info *)data;
> >>  	const struct pci_error_handlers *err_handler;
> >>  
> >> +	/* Skip the RCiEP devices not associating with RCEC */
> >> +	if ((pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) &&
> >> +	    !dev->rcec)
> >> +		return 0;
> >>  	if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev))
> >>  		return 0;
> > 
> > is_cxl_mem_dev(dev) will always be false in the VH case, so how does
> > this change help the VH case?
> 
> Hi Dan,
> 
> I think it won't be false if the CXL memory device is an endpoint.
> pcie_walk_rcec_all() will walk all pci_dev in RCEC assocaited bus ranges. So these two checkings can help us to filter:
> 1. CXL memory device is an RCiEP associated with RCEC in the RCH case
> 2. CXL memory device is not an RCiEP, so it should be an endpoint in the VH case.

It will be an endpoint, but I thought cxl_handle_error_iter() is only
called for RCIEPs and RPs that share a bus range with the RCEC. The
endpoint in the VH case is downstream of the RP.

I had been assuming that pci_walk_bus() limits itself to buses within
the Root Complex; however, it descends the entire bus hierarchy, so this
implementation will walk the entire topology on all root ports
associated with the RCEC looking for any CXL device. That feels wrong.

I would expect that this limits itself to only finding root ports and
then only proceeding if that root port has a directly attached CXL
device.

Note, when you send a v2 of this RFC be sure to copy linux-pci for these
core changes to PCI error handling.
Li, Ming4 March 15, 2024, 5:08 a.m. UTC | #4
On 3/15/2024 12:05 PM, Dan Williams wrote:
> Li, Ming wrote:
>> On 3/15/2024 10:30 AM, Dan Williams wrote:
>>> Li Ming wrote:
>>>> When RCEC captures CXL.cachemem protocol errors detected by CXL root
>>>> port, the recommendation from CXL r3.1 9.18.1.5 is :
>>>>
>>>> 	"Probe all CXL Downstream Ports and determine whether they have logged an
>>>> 	error in the CXL.io or CXL.cachemem status registers."
>>>>
>>>> The flow is similar with RCH RAS error handling, so reuse it to support
>>>> above case.
>>>>
>>>> Signed-off-by: Li Ming <ming4.li@intel.com>
>>>> ---
>>>>  drivers/pci/pcie/aer.c | 20 ++++++++++++--------
>>>>  1 file changed, 12 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>>>> index 364c74e47273..79bfa5fb78f4 100644
>>>> --- a/drivers/pci/pcie/aer.c
>>>> +++ b/drivers/pci/pcie/aer.c
>>>> @@ -996,11 +996,15 @@ static bool is_internal_error(struct aer_err_info *info)
>>>>  	return info->status & PCI_ERR_UNC_INTN;
>>>>  }
>>>>  
>>>> -static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
>>>> +static int cxl_handle_error_iter(struct pci_dev *dev, void *data)
>>>>  {
>>>>  	struct aer_err_info *info = (struct aer_err_info *)data;
>>>>  	const struct pci_error_handlers *err_handler;
>>>>  
>>>> +	/* Skip the RCiEP devices not associating with RCEC */
>>>> +	if ((pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) &&
>>>> +	    !dev->rcec)
>>>> +		return 0;
>>>>  	if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev))
>>>>  		return 0;
>>>
>>> is_cxl_mem_dev(dev) will always be false in the VH case, so how does
>>> this change help the VH case?
>>
>> Hi Dan,
>>
>> I think it won't be false if the CXL memory device is an endpoint.
>> pcie_walk_rcec_all() will walk all pci_dev in RCEC assocaited bus ranges. So these two checkings can help us to filter:
>> 1. CXL memory device is an RCiEP associated with RCEC in the RCH case
>> 2. CXL memory device is not an RCiEP, so it should be an endpoint in the VH case.
> 
> It will be an endpoint, but I though cxl_handle_error_iter() is only
> called for RCIEPs and RPs that are share a bus range with the RCEC. The
> endpoint in the VH case is downstream of the RP.
> 
> I had been assuming that pci_walk_bus() limits itself to buses within
> the Root Complex however it descends the entire bus hierarchy so this
> implementation will walk the entire topology on all root ports
> associated with the RCEC looking for any CXL device. That feels wrong.
> 
> I would expect that this limits it self to only finding root ports and
> then only proceeding if that root port has a directly attached CXL
> device.
> 
Got it, will change it in v2, thank you.

> Note, when you send a v2 of this RFC be sure to copy linux-pci for these
> core changes to PCI error handling.
Sure, I made a mistake here.
Terry Bowman March 25, 2024, 7:14 p.m. UTC | #5
Hi Li,

I added comments below

On 3/13/24 03:36, Li Ming wrote:
> When RCEC captures CXL.cachemem protocol errors detected by CXL root
> port, the recommendation from CXL r3.1 9.18.1.5 is :
> 
> 	"Probe all CXL Downstream Ports and determine whether they have logged an
> 	error in the CXL.io or CXL.cachemem status registers."
> 
> The flow is similar with RCH RAS error handling, so reuse it to support
> above case.
> 
> Signed-off-by: Li Ming <ming4.li@intel.com>
> ---
>  drivers/pci/pcie/aer.c | 20 ++++++++++++--------
>  1 file changed, 12 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 364c74e47273..79bfa5fb78f4 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -996,11 +996,15 @@ static bool is_internal_error(struct aer_err_info *info)
>  	return info->status & PCI_ERR_UNC_INTN;
>  }
>  
> -static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
> +static int cxl_handle_error_iter(struct pci_dev *dev, void *data)
>  {
>  	struct aer_err_info *info = (struct aer_err_info *)data;
>  	const struct pci_error_handlers *err_handler;
>  
> +	/* Skip the RCiEP devices not associating with RCEC */
> +	if ((pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) &&
> +	    !dev->rcec)
> +		return 0;
>  	if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev))
>  		return 0;
>  
> @@ -1025,16 +1029,16 @@ static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
>  	return 0;
>  }
>  
> -static void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info)
> +static void cxl_handle_error(struct pci_dev *dev, struct aer_err_info *info)
>  {
>  	/*
>  	 * Internal errors of an RCEC indicate an AER error in an
> -	 * RCH's downstream port. Check and handle them in the CXL.mem
> -	 * device driver.
> +	 * RCH's downstream port or a CXL root port. Check and handle
> +	 * them in the CXL.mem device driver.
>  	 */

"Internal errors of an RCEC indicate an AER error in an RCH's downstream port or a CXL root port."

Might be more correct to restate as:

"AER internal errors are used by root ports and RCECs to indicate AER in downstream CXL ports (RCH, USP, DSP) or devices"

Regards,
Terry
diff mbox series

Patch

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 364c74e47273..79bfa5fb78f4 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -996,11 +996,15 @@  static bool is_internal_error(struct aer_err_info *info)
 	return info->status & PCI_ERR_UNC_INTN;
 }
 
-static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
+static int cxl_handle_error_iter(struct pci_dev *dev, void *data)
 {
 	struct aer_err_info *info = (struct aer_err_info *)data;
 	const struct pci_error_handlers *err_handler;
 
+	/* Skip the RCiEP devices not associating with RCEC */
+	if ((pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) &&
+	    !dev->rcec)
+		return 0;
 	if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev))
 		return 0;
 
@@ -1025,16 +1029,16 @@  static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data)
 	return 0;
 }
 
-static void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info)
+static void cxl_handle_error(struct pci_dev *dev, struct aer_err_info *info)
 {
 	/*
 	 * Internal errors of an RCEC indicate an AER error in an
-	 * RCH's downstream port. Check and handle them in the CXL.mem
-	 * device driver.
+	 * RCH's downstream port or a CXL root port. Check and handle
+	 * them in the CXL.mem device driver.
 	 */
 	if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC &&
 	    is_internal_error(info))
-		pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info);
+		pcie_walk_rcec_all(dev, cxl_handle_error_iter, info);
 }
 
 static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
@@ -1080,8 +1084,8 @@  static void cxl_enable_rcec(struct pci_dev *rcec)
 
 #else
 static inline void cxl_enable_rcec(struct pci_dev *dev) { }
-static inline void cxl_rch_handle_error(struct pci_dev *dev,
-					struct aer_err_info *info) { }
+static inline void cxl_handle_error(struct pci_dev *dev,
+				    struct aer_err_info *info) { }
 #endif
 
 /**
@@ -1119,7 +1123,7 @@  static void pci_aer_handle_error(struct pci_dev *dev, struct aer_err_info *info)
 
 static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
 {
-	cxl_rch_handle_error(dev, info);
+	cxl_handle_error(dev, info);
 	pci_aer_handle_error(dev, info);
 	pci_dev_put(dev);
 }