diff mbox series

[RFC,3/6] PCI/AER: Enable RCEC to report internal error for CXL root port

Message ID 20240313083602.239201-4-ming4.li@intel.com
State New
Headers show
Series Add support for root port RAS error handling | expand

Commit Message

Li, Ming4 March 13, 2024, 8:35 a.m. UTC
Per CXL r3.1 section 12.2.2, CXL.cachemem protocol errors detected by a CXL
root port can be logged in the RCEC's AER Extended Capability as
PCI_ERR_UNC_INTN or PCI_ERR_COR_INTERNAL. Unmask these errors for that
case.

Signed-off-by: Li Ming <ming4.li@intel.com>
---
 drivers/pci/pcie/aer.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

Comments

Terry Bowman March 25, 2024, 7:42 p.m. UTC | #1
Hi Li, 

I added comments below.

On 3/13/24 03:35, Li Ming wrote:
> Per CXl r3.1 section 12.2.2, CXL.cachemem protocol erros detected by CXL
> root port could be logged in RCEC AER Extended Capability as
> PCI_ERR_UNC_INTN or PCI_ERR_COR_INTERNAL. Unmask these errors for that
> case.
> 
> Signed-off-by: Li Ming <ming4.li@intel.com>
> ---
>  drivers/pci/pcie/aer.c | 24 +++++++++++++++++-------
>  1 file changed, 17 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 42a3bd35a3e1..364c74e47273 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -985,7 +985,7 @@ static bool cxl_error_is_native(struct pci_dev *dev)
>  {
>  	struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
>  
> -	return (pcie_ports_native || host->native_aer);
> +	return (pcie_ports_native || host->native_aer) && host->is_cxl;
>  }
>  
>  static bool is_internal_error(struct aer_err_info *info)
> @@ -1041,8 +1041,13 @@ static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
>  {
>  	bool *handles_cxl = data;
>  
> -	if (!*handles_cxl)
> -		*handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev);
> +	if (!*handles_cxl && cxl_error_is_native(dev)) {
> +		if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END &&
> +		    dev->rcec && is_cxl_mem_dev(dev))
> +			*handles_cxl = true;
> +		if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT)
> +			*handles_cxl = true;
> +	}
I understand a root port can be found under an RCEC. It's possible. But, does the downstream 
root port forward AER to the upstream RCEC? My understanding is AER is handled and processed
at the first root port/RCEC upstream from the device/RCH/USP/DSP.
 
Regards,
Terry

>  
>  	/* Non-zero terminates iteration */
>  	return *handles_cxl;
> @@ -1054,13 +1059,18 @@ static bool handles_cxl_errors(struct pci_dev *rcec)
>  
>  	if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC &&
>  	    pcie_aer_is_native(rcec))
> -		pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl);
> +		pcie_walk_rcec_all(rcec, handles_cxl_error_iter, &handles_cxl);
>  
>  	return handles_cxl;
>  }
>  
> -static void cxl_rch_enable_rcec(struct pci_dev *rcec)
> +static void cxl_enable_rcec(struct pci_dev *rcec)
>  {
> +	/*
> +	 * Enable RCEC's internal error report for two cases:
> +	 * 1. RCiEP detected CXL.cachemem protocol errors
> +	 * 2. CXL root port detected CXL.cachemem protocol errors.
> +	 */
>  	if (!handles_cxl_errors(rcec))
>  		return;
>  
> @@ -1069,7 +1079,7 @@ static void cxl_rch_enable_rcec(struct pci_dev *rcec)
>  }
>  
>  #else
> -static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { }
> +static inline void cxl_enable_rcec(struct pci_dev *dev) { }
>  static inline void cxl_rch_handle_error(struct pci_dev *dev,
>  					struct aer_err_info *info) { }
>  #endif
> @@ -1494,7 +1504,7 @@ static int aer_probe(struct pcie_device *dev)
>  		return status;
>  	}
>  
> -	cxl_rch_enable_rcec(port);
> +	cxl_enable_rcec(port);
>  	aer_enable_rootport(rpc);
>  	pci_info(port, "enabled with IRQ %d\n", dev->irq);
>  	return 0;
Li, Ming4 April 16, 2024, 7:27 a.m. UTC | #2
On 3/26/2024 3:42 AM, Terry Bowman wrote:
> Hi Li, 
> 
> I added comments below.
> 
> On 3/13/24 03:35, Li Ming wrote:
>> Per CXl r3.1 section 12.2.2, CXL.cachemem protocol erros detected by CXL
>> root port could be logged in RCEC AER Extended Capability as
>> PCI_ERR_UNC_INTN or PCI_ERR_COR_INTERNAL. Unmask these errors for that
>> case.
>>
>> Signed-off-by: Li Ming <ming4.li@intel.com>
>> ---
>>  drivers/pci/pcie/aer.c | 24 +++++++++++++++++-------
>>  1 file changed, 17 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>> index 42a3bd35a3e1..364c74e47273 100644
>> --- a/drivers/pci/pcie/aer.c
>> +++ b/drivers/pci/pcie/aer.c
>> @@ -985,7 +985,7 @@ static bool cxl_error_is_native(struct pci_dev *dev)
>>  {
>>  	struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
>>  
>> -	return (pcie_ports_native || host->native_aer);
>> +	return (pcie_ports_native || host->native_aer) && host->is_cxl;
>>  }
>>  
>>  static bool is_internal_error(struct aer_err_info *info)
>> @@ -1041,8 +1041,13 @@ static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
>>  {
>>  	bool *handles_cxl = data;
>>  
>> -	if (!*handles_cxl)
>> -		*handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev);
>> +	if (!*handles_cxl && cxl_error_is_native(dev)) {
>> +		if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END &&
>> +		    dev->rcec && is_cxl_mem_dev(dev))
>> +			*handles_cxl = true;
>> +		if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT)
>> +			*handles_cxl = true;
>> +	}
> I understand a root port can be found under an RCEC. It's possible. But, does the downstream 
> root port forward AER to the upstream RCEC? My understanding is AER is handled and processed
> at the first root port/RCEC upstream from the device/RCH/USP/DSP.
>  
> Regards,
> Terry
> 

CXL r3.1 section 12.2.2 mentions this:

"If the CXL.cachemem protocol errors detected by a CXL root port are logged as
CIEs or UIEs in an RCEC’s AER Extended Capability, it is recommended that the System
Firmware populate an RDPAS record (see Section 9.18.1.5) to establish the association
between the RCEC and the root port."

I think this means it is possible for a CXL root port to forward its AER errors to an RCEC.

Thanks
Ming

>>  
>>  	/* Non-zero terminates iteration */
>>  	return *handles_cxl;
>> @@ -1054,13 +1059,18 @@ static bool handles_cxl_errors(struct pci_dev *rcec)
>>  
>>  	if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC &&
>>  	    pcie_aer_is_native(rcec))
>> -		pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl);
>> +		pcie_walk_rcec_all(rcec, handles_cxl_error_iter, &handles_cxl);
>>  
>>  	return handles_cxl;
>>  }
>>  
>> -static void cxl_rch_enable_rcec(struct pci_dev *rcec)
>> +static void cxl_enable_rcec(struct pci_dev *rcec)
>>  {
>> +	/*
>> +	 * Enable RCEC's internal error report for two cases:
>> +	 * 1. RCiEP detected CXL.cachemem protocol errors
>> +	 * 2. CXL root port detected CXL.cachemem protocol errors.
>> +	 */
>>  	if (!handles_cxl_errors(rcec))
>>  		return;
>>  
>> @@ -1069,7 +1079,7 @@ static void cxl_rch_enable_rcec(struct pci_dev *rcec)
>>  }
>>  
>>  #else
>> -static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { }
>> +static inline void cxl_enable_rcec(struct pci_dev *dev) { }
>>  static inline void cxl_rch_handle_error(struct pci_dev *dev,
>>  					struct aer_err_info *info) { }
>>  #endif
>> @@ -1494,7 +1504,7 @@ static int aer_probe(struct pcie_device *dev)
>>  		return status;
>>  	}
>>  
>> -	cxl_rch_enable_rcec(port);
>> +	cxl_enable_rcec(port);
>>  	aer_enable_rootport(rpc);
>>  	pci_info(port, "enabled with IRQ %d\n", dev->irq);
>>  	return 0;
Terry Bowman April 16, 2024, 2:46 p.m. UTC | #3
Hi Ming,

On 4/16/24 02:27, Li, Ming wrote:
> On 3/26/2024 3:42 AM, Terry Bowman wrote:
>> Hi Li, 
>>
>> I added comments below.
>>
>> On 3/13/24 03:35, Li Ming wrote:
>>> Per CXl r3.1 section 12.2.2, CXL.cachemem protocol erros detected by CXL
>>> root port could be logged in RCEC AER Extended Capability as
>>> PCI_ERR_UNC_INTN or PCI_ERR_COR_INTERNAL. Unmask these errors for that
>>> case.
>>>
>>> Signed-off-by: Li Ming <ming4.li@intel.com>
>>> ---
>>>  drivers/pci/pcie/aer.c | 24 +++++++++++++++++-------
>>>  1 file changed, 17 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>>> index 42a3bd35a3e1..364c74e47273 100644
>>> --- a/drivers/pci/pcie/aer.c
>>> +++ b/drivers/pci/pcie/aer.c
>>> @@ -985,7 +985,7 @@ static bool cxl_error_is_native(struct pci_dev *dev)
>>>  {
>>>  	struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
>>>  
>>> -	return (pcie_ports_native || host->native_aer);
>>> +	return (pcie_ports_native || host->native_aer) && host->is_cxl;
>>>  }
>>>  
>>>  static bool is_internal_error(struct aer_err_info *info)
>>> @@ -1041,8 +1041,13 @@ static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
>>>  {
>>>  	bool *handles_cxl = data;
>>>  
>>> -	if (!*handles_cxl)
>>> -		*handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev);
>>> +	if (!*handles_cxl && cxl_error_is_native(dev)) {
>>> +		if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END &&
>>> +		    dev->rcec && is_cxl_mem_dev(dev))
>>> +			*handles_cxl = true;
>>> +		if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT)
>>> +			*handles_cxl = true;
>>> +	}
>> I understand a root port can be found under an RCEC. It's possible. But, does the downstream 
>> root port forward AER to the upstream RCEC? My understanding is AER is handled and processed
>> at the first root port/RCEC upstream from the device/RCH/USP/DSP.
>>  
>> Regards,
>> Terry
>>
> 
> CXL r3.1 section 12.2.2 mentions this:
> 
> "If the CXL.cachemem protocol errors detected by a CXL root port are logged as
> CIEs or UIEs in an RCEC’s AER Extended Capability, it is recommended that the System
> Firmware populate an RDPAS record (see Section 9.18.1.5) to establish the association
> between the RCEC and the root port."
> 
> I think it means that CXL root port is possible to forward its AER to RCEC.
> 
> Thanks
> Ming
> 

Thanks for pointing to spec details. 

In testing here, we used root port as agent to consume root port CXL protocol errors.
The logic to handle the root port errors requires little to no AER driver changes.
This results in a root port consuming VH protocol errors and RCEC consuming RCD 
protocol errors. The RCEC and root port both use the PCIe port bus driver's AER service
driver in separate instances for RCEC-RCD and root-port-VH.

The driver support is much simpler if RCEC does not handle VH protocol errors. Is there 
a reason to forward root port VH mode protocol errors to an RCEC rather than consume 
in the root port's AER driver and forward to CXL error handler? 

Regards,
Terry

>>>  
>>>  	/* Non-zero terminates iteration */
>>>  	return *handles_cxl;
>>> @@ -1054,13 +1059,18 @@ static bool handles_cxl_errors(struct pci_dev *rcec)
>>>  
>>>  	if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC &&
>>>  	    pcie_aer_is_native(rcec))
>>> -		pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl);
>>> +		pcie_walk_rcec_all(rcec, handles_cxl_error_iter, &handles_cxl);
>>>  
>>>  	return handles_cxl;
>>>  }
>>>  
>>> -static void cxl_rch_enable_rcec(struct pci_dev *rcec)
>>> +static void cxl_enable_rcec(struct pci_dev *rcec)
>>>  {
>>> +	/*
>>> +	 * Enable RCEC's internal error report for two cases:
>>> +	 * 1. RCiEP detected CXL.cachemem protocol errors
>>> +	 * 2. CXL root port detected CXL.cachemem protocol errors.
>>> +	 */
>>>  	if (!handles_cxl_errors(rcec))
>>>  		return;
>>>  
>>> @@ -1069,7 +1079,7 @@ static void cxl_rch_enable_rcec(struct pci_dev *rcec)
>>>  }
>>>  
>>>  #else
>>> -static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { }
>>> +static inline void cxl_enable_rcec(struct pci_dev *dev) { }
>>>  static inline void cxl_rch_handle_error(struct pci_dev *dev,
>>>  					struct aer_err_info *info) { }
>>>  #endif
>>> @@ -1494,7 +1504,7 @@ static int aer_probe(struct pcie_device *dev)
>>>  		return status;
>>>  	}
>>>  
>>> -	cxl_rch_enable_rcec(port);
>>> +	cxl_enable_rcec(port);
>>>  	aer_enable_rootport(rpc);
>>>  	pci_info(port, "enabled with IRQ %d\n", dev->irq);
>>>  	return 0;
>
Li, Ming4 April 18, 2024, 5:53 a.m. UTC | #4
On 4/16/2024 10:46 PM, Terry Bowman wrote:
> Hi Ming,
> 
> On 4/16/24 02:27, Li, Ming wrote:
>> On 3/26/2024 3:42 AM, Terry Bowman wrote:
>>> Hi Li, 
>>>
>>> I added comments below.
>>>
>>> On 3/13/24 03:35, Li Ming wrote:
>>>> Per CXl r3.1 section 12.2.2, CXL.cachemem protocol erros detected by CXL
>>>> root port could be logged in RCEC AER Extended Capability as
>>>> PCI_ERR_UNC_INTN or PCI_ERR_COR_INTERNAL. Unmask these errors for that
>>>> case.
>>>>
>>>> Signed-off-by: Li Ming <ming4.li@intel.com>
>>>> ---
>>>>  drivers/pci/pcie/aer.c | 24 +++++++++++++++++-------
>>>>  1 file changed, 17 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>>>> index 42a3bd35a3e1..364c74e47273 100644
>>>> --- a/drivers/pci/pcie/aer.c
>>>> +++ b/drivers/pci/pcie/aer.c
>>>> @@ -985,7 +985,7 @@ static bool cxl_error_is_native(struct pci_dev *dev)
>>>>  {
>>>>  	struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
>>>>  
>>>> -	return (pcie_ports_native || host->native_aer);
>>>> +	return (pcie_ports_native || host->native_aer) && host->is_cxl;
>>>>  }
>>>>  
>>>>  static bool is_internal_error(struct aer_err_info *info)
>>>> @@ -1041,8 +1041,13 @@ static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
>>>>  {
>>>>  	bool *handles_cxl = data;
>>>>  
>>>> -	if (!*handles_cxl)
>>>> -		*handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev);
>>>> +	if (!*handles_cxl && cxl_error_is_native(dev)) {
>>>> +		if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END &&
>>>> +		    dev->rcec && is_cxl_mem_dev(dev))
>>>> +			*handles_cxl = true;
>>>> +		if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT)
>>>> +			*handles_cxl = true;
>>>> +	}
>>> I understand a root port can be found under an RCEC. It's possible. But, does the downstream 
>>> root port forward AER to the upstream RCEC? My understanding is AER is handled and processed
>>> at the first root port/RCEC upstream from the device/RCH/USP/DSP.
>>>  
>>> Regards,
>>> Terry
>>>
>>
>> CXL r3.1 section 12.2.2 mentions this:
>>
>> "If the CXL.cachemem protocol errors detected by a CXL root port are logged as
>> CIEs or UIEs in an RCEC’s AER Extended Capability, it is recommended that the System
>> Firmware populate an RDPAS record (see Section 9.18.1.5) to establish the association
>> between the RCEC and the root port."
>>
>> I think it means that CXL root port is possible to forward its AER to RCEC.
>>
>> Thanks
>> Ming
>>
> 
> Thanks for pointing to spec details. 
> 
> In testing here, we used root port as agent to consume root port CXL protocol errors.
> The logic to handle the root port errors requires little to no AER driver changes.
> This results in a root port consuming VH protocol errors and RCEC consuming RCD 
> protocol errors. The RCEC and root port both use the PCIe port bus driver's AER service
> driver in separate instances for RCEC-RCD and root-port-VH.
> 
> The driver support is much simpler if RCEC does not handle VH protocol errors. Is there 
> a reason to forward root port VH mode protocol errors to an RCEC rather than consume 
> in the root port's AER driver and forward to CXL error handler? 
> 
> Regards,
> Terry

I agree that it is simpler if only the root port handles VH protocol errors, but I think software has no way to choose whether VH protocol errors are reported to the RCEC or to the root port; that depends on the platform implementation. So I think we should support both cases.


Thanks
Ming

> 
>>>>  
>>>>  	/* Non-zero terminates iteration */
>>>>  	return *handles_cxl;
>>>> @@ -1054,13 +1059,18 @@ static bool handles_cxl_errors(struct pci_dev *rcec)
>>>>  
>>>>  	if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC &&
>>>>  	    pcie_aer_is_native(rcec))
>>>> -		pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl);
>>>> +		pcie_walk_rcec_all(rcec, handles_cxl_error_iter, &handles_cxl);
>>>>  
>>>>  	return handles_cxl;
>>>>  }
>>>>  
>>>> -static void cxl_rch_enable_rcec(struct pci_dev *rcec)
>>>> +static void cxl_enable_rcec(struct pci_dev *rcec)
>>>>  {
>>>> +	/*
>>>> +	 * Enable RCEC's internal error report for two cases:
>>>> +	 * 1. RCiEP detected CXL.cachemem protocol errors
>>>> +	 * 2. CXL root port detected CXL.cachemem protocol errors.
>>>> +	 */
>>>>  	if (!handles_cxl_errors(rcec))
>>>>  		return;
>>>>  
>>>> @@ -1069,7 +1079,7 @@ static void cxl_rch_enable_rcec(struct pci_dev *rcec)
>>>>  }
>>>>  
>>>>  #else
>>>> -static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { }
>>>> +static inline void cxl_enable_rcec(struct pci_dev *dev) { }
>>>>  static inline void cxl_rch_handle_error(struct pci_dev *dev,
>>>>  					struct aer_err_info *info) { }
>>>>  #endif
>>>> @@ -1494,7 +1504,7 @@ static int aer_probe(struct pcie_device *dev)
>>>>  		return status;
>>>>  	}
>>>>  
>>>> -	cxl_rch_enable_rcec(port);
>>>> +	cxl_enable_rcec(port);
>>>>  	aer_enable_rootport(rpc);
>>>>  	pci_info(port, "enabled with IRQ %d\n", dev->irq);
>>>>  	return 0;
>>
Dan Williams April 18, 2024, 2:57 p.m. UTC | #5
Li, Ming wrote:
> On 4/16/2024 10:46 PM, Terry Bowman wrote:
> > The driver support is much simpler if RCEC does not handle VH protocol errors. Is there 
> > a reason to forward root port VH mode protocol errors to an RCEC rather than consume 
> > in the root port's AER driver and forward to CXL error handler? 
> > 
> I agree that is simpler if only root port handle VH protocol errors,
> but I think that software has no chance to choose if VH protocol
> errors reported to RCEC or root port, it depends on platform
> implementation. So I think we should support both cases.

The question is whether the CXL spec RDPAS behavior causes any problems
for platforms that follow PCIe rather than CXL reporting flows for
root-port errors. I.e. does it cause problems if Linux starts scanning
root ports on RCEC notifications?

I do think the lookup needs to change to be based on CXL host-bridge
detection and not CXL-type-3 endpoint detection, but otherwise it looks
like CXL spec wants to invalidate PCIe spec expectations.
Li, Ming4 April 22, 2024, 2:06 a.m. UTC | #6
On 4/18/2024 10:57 PM, Dan Williams wrote:
> Li, Ming wrote:
>> On 4/16/2024 10:46 PM, Terry Bowman wrote:
>>> The driver support is much simpler if RCEC does not handle VH protocol errors. Is there 
>>> a reason to forward root port VH mode protocol errors to an RCEC rather than consume 
>>> in the root port's AER driver and forward to CXL error handler? 
>>>
>> I agree that is simpler if only root port handle VH protocol errors,
>> but I think that software has no chance to choose if VH protocol
>> errors reported to RCEC or root port, it depends on platform
>> implementation. So I think we should support both cases.
> 
> The question is whether the CXL spec RDPAS behavior causes any problems
> for platforms that follow PCIe rather than CXL reporting flows for
> root-port errors. I.e. does it cause problems if Linux starts scanning
> root ports on RCEC notifications?
> 
> I do think the lookup needs to change to be based on CXL host-bridge
> detection and not CXL-type-3 endpoint detection, but otherwise it looks
> like CXL spec wants to invalidate PCIe spec expectations.

Hi Dan, if my understanding is correct, the CXL host-bridge detection you mentioned means iterating over all root ports in the RCEC's associated bus range (for the case where the RCEC reports VH protocol errors), and the CXL-type-3 detection means iterating over all CXL type-3 endpoints in the RCEC's associated bus range. Is that right?
Dan Williams April 22, 2024, 11:01 p.m. UTC | #7
Li, Ming wrote:
> On 4/18/2024 10:57 PM, Dan Williams wrote:
> > Li, Ming wrote:
> >> On 4/16/2024 10:46 PM, Terry Bowman wrote:
> >>> The driver support is much simpler if RCEC does not handle VH protocol errors. Is there 
> >>> a reason to forward root port VH mode protocol errors to an RCEC rather than consume 
> >>> in the root port's AER driver and forward to CXL error handler? 
> >>>
> >> I agree that is simpler if only root port handle VH protocol errors,
> >> but I think that software has no chance to choose if VH protocol
> >> errors reported to RCEC or root port, it depends on platform
> >> implementation. So I think we should support both cases.
> > 
> > The question is whether the CXL spec RDPAS behavior causes any problems
> > for platforms that follow PCIe rather than CXL reporting flows for
> > root-port errors. I.e. does it cause problems if Linux starts scanning
> > root ports on RCEC notifications?
> > 
> > I do think the lookup needs to change to be based on CXL host-bridge
> > detection and not CXL-type-3 endpoint detection, but otherwise it looks
> > like CXL spec wants to invalidate PCIe spec expectations.
> 
> Hi Dan, if my understanding is correct, the CXL host-bridge detection
> you mentioned is that iterating all root ports under RCEC associated
> bus range for RCEC reported VH protocol errors case, and the
> CXL-type-3 detection is that iterating all CXL-type-3 endpoint under
> RCEC associated bus range. is it right?

I think this error checking needs to be tightly scoped to only scan for
CXL.cachemem errors and not CXL.io or typical PCIe errors. That way we
are not technically running afoul of the PCIe expectations that *PCIe*
root-port errors are only reported by their local AER block and not an
RCEC.

So the scanning should be limited to just the root-ports that have
negotiated a CXL.cachemem link.
diff mbox series

Patch

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 42a3bd35a3e1..364c74e47273 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -985,7 +985,7 @@  static bool cxl_error_is_native(struct pci_dev *dev)
 {
 	struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
 
-	return (pcie_ports_native || host->native_aer);
+	return (pcie_ports_native || host->native_aer) && host->is_cxl;
 }
 
 static bool is_internal_error(struct aer_err_info *info)
@@ -1041,8 +1041,13 @@  static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
 {
 	bool *handles_cxl = data;
 
-	if (!*handles_cxl)
-		*handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev);
+	if (!*handles_cxl && cxl_error_is_native(dev)) {
+		if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END &&
+		    dev->rcec && is_cxl_mem_dev(dev))
+			*handles_cxl = true;
+		if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT)
+			*handles_cxl = true;
+	}
 
 	/* Non-zero terminates iteration */
 	return *handles_cxl;
@@ -1054,13 +1059,18 @@  static bool handles_cxl_errors(struct pci_dev *rcec)
 
 	if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC &&
 	    pcie_aer_is_native(rcec))
-		pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl);
+		pcie_walk_rcec_all(rcec, handles_cxl_error_iter, &handles_cxl);
 
 	return handles_cxl;
 }
 
-static void cxl_rch_enable_rcec(struct pci_dev *rcec)
+static void cxl_enable_rcec(struct pci_dev *rcec)
 {
+	/*
+	 * Enable RCEC's internal error report for two cases:
+	 * 1. RCiEP detected CXL.cachemem protocol errors
+	 * 2. CXL root port detected CXL.cachemem protocol errors.
+	 */
 	if (!handles_cxl_errors(rcec))
 		return;
 
@@ -1069,7 +1079,7 @@  static void cxl_rch_enable_rcec(struct pci_dev *rcec)
 }
 
 #else
-static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { }
+static inline void cxl_enable_rcec(struct pci_dev *dev) { }
 static inline void cxl_rch_handle_error(struct pci_dev *dev,
 					struct aer_err_info *info) { }
 #endif
@@ -1494,7 +1504,7 @@  static int aer_probe(struct pcie_device *dev)
 		return status;
 	}
 
-	cxl_rch_enable_rcec(port);
+	cxl_enable_rcec(port);
 	aer_enable_rootport(rpc);
 	pci_info(port, "enabled with IRQ %d\n", dev->irq);
 	return 0;