diff mbox series

[v5,16/16] PCI/AER: Enable internal errors for CXL Upstream and Downstream Switch Ports

Message ID 20250107143852.3692571-17-terry.bowman@amd.com
State New
Headers show
Series Enable CXL PCIe port protocol error handling and logging | expand

Commit Message

Bowman, Terry Jan. 7, 2025, 2:38 p.m. UTC
The AER service driver enables PCIe Uncorrectable Internal Errors (UIE) and
Correctable Internal errors (CIE) for CXL Root Ports. The UIE and CIE are
used in reporting CXL Protocol Errors. The same UIE/CIE enablement is
needed for CXL Upstream Switch Ports and CXL Downstream Switch Ports
inorder to notify the associated Root Port and OS.[1]

Export the AER service driver's pci_aer_unmask_internal_errors() function
to CXL namespace.

Remove the function's dependency on the CONFIG_PCIEAER_CXL kernel config
because it is now an exported function.

Call pci_aer_unmask_internal_errors() during RAS initialization in:
cxl_uport_init_ras_reporting() and cxl_dport_init_ras_reporting().

[1] PCIe Base Spec r6.2-1.0, 6.2.3.2.2 Masking Individual Errors

Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/cxl/core/pci.c | 2 ++
 drivers/pci/pcie/aer.c | 5 +++--
 include/linux/aer.h    | 1 +
 3 files changed, 6 insertions(+), 2 deletions(-)

Comments

Ira Weiny Jan. 14, 2025, 11:26 p.m. UTC | #1
Terry Bowman wrote:
> The AER service driver enables PCIe Uncorrectable Internal Errors (UIE) and
> Correctable Internal errors (CIE) for CXL Root Ports. The UIE and CIE are
> used in reporting CXL Protocol Errors. The same UIE/CIE enablement is
> needed for CXL Upstream Switch Ports and CXL Downstream Switch Ports
> inorder to notify the associated Root Port and OS.[1]
> 
> Export the AER service driver's pci_aer_unmask_internal_errors() function
> to CXL namespace.
> 
> Remove the function's dependency on the CONFIG_PCIEAER_CXL kernel config
> because it is now an exported function.

This seems wrong to me.  As of this patch CXL_PCI requires PCIEAER_CXL for
the AER code to handle the errors which were just enabled.

To keep PCIEAER_CXL optional pci_aer_unmask_internal_errors() should be
stubbed out in aer.h if !CONFIG_PCIEAER_CXL.

Ira

> 
> Call pci_aer_unmask_internal_errors() during RAS initialization in:
> cxl_uport_init_ras_reporting() and cxl_dport_init_ras_reporting().
> 
> [1] PCIe Base Spec r6.2-1.0, 6.2.3.2.2 Masking Individual Errors
> 
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> ---
>  drivers/cxl/core/pci.c | 2 ++
>  drivers/pci/pcie/aer.c | 5 +++--
>  include/linux/aer.h    | 1 +
>  3 files changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 9c162120f0fe..c62329cd9a87 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -895,6 +895,7 @@ void cxl_uport_init_ras_reporting(struct cxl_port *port)
>  
>  	cxl_assign_port_error_handlers(pdev);
>  	devm_add_action_or_reset(&port->dev, cxl_clear_port_error_handlers, pdev);
> +	pci_aer_unmask_internal_errors(pdev);
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_uport_init_ras_reporting, "CXL");
>  
> @@ -935,6 +936,7 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport)
>  	}
>  	cxl_assign_port_error_handlers(pdev);
>  	devm_add_action_or_reset(&port->dev, cxl_clear_port_error_handlers, pdev);
> +	pci_aer_unmask_internal_errors(pdev);
>  	put_device(&port->dev);
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL");
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 68e957459008..e6aaa3bd84f0 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -950,7 +950,6 @@ static bool is_internal_error(struct aer_err_info *info)
>  	return info->status & PCI_ERR_UNC_INTN;
>  }
>  
> -#ifdef CONFIG_PCIEAER_CXL
>  /**
>   * pci_aer_unmask_internal_errors - unmask internal errors
>   * @dev: pointer to the pcie_dev data structure
> @@ -961,7 +960,7 @@ static bool is_internal_error(struct aer_err_info *info)
>   * Note: AER must be enabled and supported by the device which must be
>   * checked in advance, e.g. with pcie_aer_is_native().
>   */
> -static void pci_aer_unmask_internal_errors(struct pci_dev *dev)
> +void pci_aer_unmask_internal_errors(struct pci_dev *dev)
>  {
>  	int aer = dev->aer_cap;
>  	u32 mask;
> @@ -974,7 +973,9 @@ static void pci_aer_unmask_internal_errors(struct pci_dev *dev)
>  	mask &= ~PCI_ERR_COR_INTERNAL;
>  	pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, mask);
>  }
> +EXPORT_SYMBOL_NS_GPL(pci_aer_unmask_internal_errors, "CXL");
>  
> +#ifdef CONFIG_PCIEAER_CXL
>  static bool is_cxl_mem_dev(struct pci_dev *dev)
>  {
>  	/*
> diff --git a/include/linux/aer.h b/include/linux/aer.h
> index 4b97f38f3fcf..093293f9f12b 100644
> --- a/include/linux/aer.h
> +++ b/include/linux/aer.h
> @@ -55,5 +55,6 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
>  int cper_severity_to_aer(int cper_severity);
>  void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
>  		       int severity, struct aer_capability_regs *aer_regs);
> +void pci_aer_unmask_internal_errors(struct pci_dev *dev);
>  #endif //_AER_H_
>  
> -- 
> 2.34.1
>
Bowman, Terry Jan. 14, 2025, 11:34 p.m. UTC | #2
On 1/14/2025 5:26 PM, Ira Weiny wrote:
> Terry Bowman wrote:
>> The AER service driver enables PCIe Uncorrectable Internal Errors (UIE) and
>> Correctable Internal errors (CIE) for CXL Root Ports. The UIE and CIE are
>> used in reporting CXL Protocol Errors. The same UIE/CIE enablement is
>> needed for CXL Upstream Switch Ports and CXL Downstream Switch Ports
>> inorder to notify the associated Root Port and OS.[1]
>>
>> Export the AER service driver's pci_aer_unmask_internal_errors() function
>> to CXL namespace.
>>
>> Remove the function's dependency on the CONFIG_PCIEAER_CXL kernel config
>> because it is now an exported function.
> This seems wrong to me.  As of this patch CXL_PCI requires PCIEAER_CXL for
> the AER code to handle the errors which were just enabled.
>
> To keep PCIEAER_CXL optional pci_aer_unmask_internal_errors() should be
> stubbed out in aer.h if !CONFIG_PCIEAER_CXL.
>
> Ira

Bjorn (I believe in v1 or v2) directed me to remove pci_aer_unmask_internal_errors() dependency on PCIEAER_CXL because it is now exported. He wants the behavior for other users (and subsystems) to be consistent with/without the PCIEAER_CXL setting.

Regards,
Terry

>> Call pci_aer_unmask_internal_errors() during RAS initialization in:
>> cxl_uport_init_ras_reporting() and cxl_dport_init_ras_reporting().
>>
>> [1] PCIe Base Spec r6.2-1.0, 6.2.3.2.2 Masking Individual Errors
>>
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
>> ---
>>  drivers/cxl/core/pci.c | 2 ++
>>  drivers/pci/pcie/aer.c | 5 +++--
>>  include/linux/aer.h    | 1 +
>>  3 files changed, 6 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
>> index 9c162120f0fe..c62329cd9a87 100644
>> --- a/drivers/cxl/core/pci.c
>> +++ b/drivers/cxl/core/pci.c
>> @@ -895,6 +895,7 @@ void cxl_uport_init_ras_reporting(struct cxl_port *port)
>>  
>>  	cxl_assign_port_error_handlers(pdev);
>>  	devm_add_action_or_reset(&port->dev, cxl_clear_port_error_handlers, pdev);
>> +	pci_aer_unmask_internal_errors(pdev);
>>  }
>>  EXPORT_SYMBOL_NS_GPL(cxl_uport_init_ras_reporting, "CXL");
>>  
>> @@ -935,6 +936,7 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport)
>>  	}
>>  	cxl_assign_port_error_handlers(pdev);
>>  	devm_add_action_or_reset(&port->dev, cxl_clear_port_error_handlers, pdev);
>> +	pci_aer_unmask_internal_errors(pdev);
>>  	put_device(&port->dev);
>>  }
>>  EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL");
>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>> index 68e957459008..e6aaa3bd84f0 100644
>> --- a/drivers/pci/pcie/aer.c
>> +++ b/drivers/pci/pcie/aer.c
>> @@ -950,7 +950,6 @@ static bool is_internal_error(struct aer_err_info *info)
>>  	return info->status & PCI_ERR_UNC_INTN;
>>  }
>>  
>> -#ifdef CONFIG_PCIEAER_CXL
>>  /**
>>   * pci_aer_unmask_internal_errors - unmask internal errors
>>   * @dev: pointer to the pcie_dev data structure
>> @@ -961,7 +960,7 @@ static bool is_internal_error(struct aer_err_info *info)
>>   * Note: AER must be enabled and supported by the device which must be
>>   * checked in advance, e.g. with pcie_aer_is_native().
>>   */
>> -static void pci_aer_unmask_internal_errors(struct pci_dev *dev)
>> +void pci_aer_unmask_internal_errors(struct pci_dev *dev)
>>  {
>>  	int aer = dev->aer_cap;
>>  	u32 mask;
>> @@ -974,7 +973,9 @@ static void pci_aer_unmask_internal_errors(struct pci_dev *dev)
>>  	mask &= ~PCI_ERR_COR_INTERNAL;
>>  	pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, mask);
>>  }
>> +EXPORT_SYMBOL_NS_GPL(pci_aer_unmask_internal_errors, "CXL");
>>  
>> +#ifdef CONFIG_PCIEAER_CXL
>>  static bool is_cxl_mem_dev(struct pci_dev *dev)
>>  {
>>  	/*
>> diff --git a/include/linux/aer.h b/include/linux/aer.h
>> index 4b97f38f3fcf..093293f9f12b 100644
>> --- a/include/linux/aer.h
>> +++ b/include/linux/aer.h
>> @@ -55,5 +55,6 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
>>  int cper_severity_to_aer(int cper_severity);
>>  void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
>>  		       int severity, struct aer_capability_regs *aer_regs);
>> +void pci_aer_unmask_internal_errors(struct pci_dev *dev);
>>  #endif //_AER_H_
>>  
>> -- 
>> 2.34.1
>>
>
Ira Weiny Jan. 14, 2025, 11:45 p.m. UTC | #3
Bowman, Terry wrote:
> 
> 
> 
> On 1/14/2025 5:26 PM, Ira Weiny wrote:
> > Terry Bowman wrote:
> >> The AER service driver enables PCIe Uncorrectable Internal Errors (UIE) and
> >> Correctable Internal errors (CIE) for CXL Root Ports. The UIE and CIE are
> >> used in reporting CXL Protocol Errors. The same UIE/CIE enablement is
> >> needed for CXL Upstream Switch Ports and CXL Downstream Switch Ports
> >> inorder to notify the associated Root Port and OS.[1]
> >>
> >> Export the AER service driver's pci_aer_unmask_internal_errors() function
> >> to CXL namespace.
> >>
> >> Remove the function's dependency on the CONFIG_PCIEAER_CXL kernel config
> >> because it is now an exported function.
> > This seems wrong to me.  As of this patch CXL_PCI requires PCIEAER_CXL for
> > the AER code to handle the errors which were just enabled.
> >
> > To keep PCIEAER_CXL optional pci_aer_unmask_internal_errors() should be
> > stubbed out in aer.h if !CONFIG_PCIEAER_CXL.
> >
> > Ira
> 
> Bjorn (I believe in v1 or v2) directed me to remove
> pci_aer_unmask_internal_errors() dependency on PCIEAER_CXL because it is
> now exported. He wants the behavior for other users (and subsystems) to
> be consistent with/without the PCIEAER_CXL setting.
> 

I see...  If PCIEAER_CXL is not enabled why even set the cxl error
handlers and enable these?

I guess this is just adding some code which eventually calls
handles_cxl_errors() which returns false in the !PCIEAER_CXL case?

Ira

> Regards,
> Terry
> 
> >> Call pci_aer_unmask_internal_errors() during RAS initialization in:
> >> cxl_uport_init_ras_reporting() and cxl_dport_init_ras_reporting().
> >>
> >> [1] PCIe Base Spec r6.2-1.0, 6.2.3.2.2 Masking Individual Errors
> >>
> >> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> >> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> >> ---
> >>  drivers/cxl/core/pci.c | 2 ++
> >>  drivers/pci/pcie/aer.c | 5 +++--
> >>  include/linux/aer.h    | 1 +
> >>  3 files changed, 6 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> >> index 9c162120f0fe..c62329cd9a87 100644
> >> --- a/drivers/cxl/core/pci.c
> >> +++ b/drivers/cxl/core/pci.c
> >> @@ -895,6 +895,7 @@ void cxl_uport_init_ras_reporting(struct cxl_port *port)
> >>  
> >>  	cxl_assign_port_error_handlers(pdev);
> >>  	devm_add_action_or_reset(&port->dev, cxl_clear_port_error_handlers, pdev);
> >> +	pci_aer_unmask_internal_errors(pdev);
> >>  }
> >>  EXPORT_SYMBOL_NS_GPL(cxl_uport_init_ras_reporting, "CXL");
> >>  
> >> @@ -935,6 +936,7 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport)
> >>  	}
> >>  	cxl_assign_port_error_handlers(pdev);
> >>  	devm_add_action_or_reset(&port->dev, cxl_clear_port_error_handlers, pdev);
> >> +	pci_aer_unmask_internal_errors(pdev);
> >>  	put_device(&port->dev);
> >>  }
> >>  EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL");
> >> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> >> index 68e957459008..e6aaa3bd84f0 100644
> >> --- a/drivers/pci/pcie/aer.c
> >> +++ b/drivers/pci/pcie/aer.c
> >> @@ -950,7 +950,6 @@ static bool is_internal_error(struct aer_err_info *info)
> >>  	return info->status & PCI_ERR_UNC_INTN;
> >>  }
> >>  
> >> -#ifdef CONFIG_PCIEAER_CXL
> >>  /**
> >>   * pci_aer_unmask_internal_errors - unmask internal errors
> >>   * @dev: pointer to the pcie_dev data structure
> >> @@ -961,7 +960,7 @@ static bool is_internal_error(struct aer_err_info *info)
> >>   * Note: AER must be enabled and supported by the device which must be
> >>   * checked in advance, e.g. with pcie_aer_is_native().
> >>   */
> >> -static void pci_aer_unmask_internal_errors(struct pci_dev *dev)
> >> +void pci_aer_unmask_internal_errors(struct pci_dev *dev)
> >>  {
> >>  	int aer = dev->aer_cap;
> >>  	u32 mask;
> >> @@ -974,7 +973,9 @@ static void pci_aer_unmask_internal_errors(struct pci_dev *dev)
> >>  	mask &= ~PCI_ERR_COR_INTERNAL;
> >>  	pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, mask);
> >>  }
> >> +EXPORT_SYMBOL_NS_GPL(pci_aer_unmask_internal_errors, "CXL");
> >>  
> >> +#ifdef CONFIG_PCIEAER_CXL
> >>  static bool is_cxl_mem_dev(struct pci_dev *dev)
> >>  {
> >>  	/*
> >> diff --git a/include/linux/aer.h b/include/linux/aer.h
> >> index 4b97f38f3fcf..093293f9f12b 100644
> >> --- a/include/linux/aer.h
> >> +++ b/include/linux/aer.h
> >> @@ -55,5 +55,6 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
> >>  int cper_severity_to_aer(int cper_severity);
> >>  void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
> >>  		       int severity, struct aer_capability_regs *aer_regs);
> >> +void pci_aer_unmask_internal_errors(struct pci_dev *dev);
> >>  #endif //_AER_H_
> >>  
> >> -- 
> >> 2.34.1
> >>
> >
>
Bowman, Terry Jan. 15, 2025, 12:09 a.m. UTC | #4
On 1/14/2025 5:45 PM, Ira Weiny wrote:
> Bowman, Terry wrote:
>>
>>
>> On 1/14/2025 5:26 PM, Ira Weiny wrote:
>>> Terry Bowman wrote:
>>>> The AER service driver enables PCIe Uncorrectable Internal Errors (UIE) and
>>>> Correctable Internal errors (CIE) for CXL Root Ports. The UIE and CIE are
>>>> used in reporting CXL Protocol Errors. The same UIE/CIE enablement is
>>>> needed for CXL Upstream Switch Ports and CXL Downstream Switch Ports
>>>> inorder to notify the associated Root Port and OS.[1]
>>>>
>>>> Export the AER service driver's pci_aer_unmask_internal_errors() function
>>>> to CXL namespace.
>>>>
>>>> Remove the function's dependency on the CONFIG_PCIEAER_CXL kernel config
>>>> because it is now an exported function.
>>> This seems wrong to me.  As of this patch CXL_PCI requires PCIEAER_CXL for
>>> the AER code to handle the errors which were just enabled.
>>>
>>> To keep PCIEAER_CXL optional pci_aer_unmask_internal_errors() should be
>>> stubbed out in aer.h if !CONFIG_PCIEAER_CXL.
>>>
>>> Ira
>> Bjorn (I believe in v1 or v2) directed me to remove
>> pci_aer_unmask_internal_errors() dependency on PCIEAER_CXL because it is
>> now exported. He wants the behavior for other users (and subsystems) to
>> be consistent with/without the PCIEAER_CXL setting.
>>
> I see...  If PCIEAER_CXL is not enabled why even set the cxl error
> handlers and enable these?
>
> I guess this is just adding some code which eventually calls
> handles_cxl_errors() which returns false in the !PCIEAER_CXL case?
>
> Ira
cxl_dport_init_ras_reporting() and cxl_uport_init_ras_reporting()assign the error handlers and are within #ifdef PCIEAER_CXL. I need to add the empty stubs to the
#else block. Correct. handles_cxl_errors() returns false in the !PCIEAER_CXL case. Terry
>> Regards,
>> Terry
>>
>>>> Call pci_aer_unmask_internal_errors() during RAS initialization in:
>>>> cxl_uport_init_ras_reporting() and cxl_dport_init_ras_reporting().
>>>>
>>>> [1] PCIe Base Spec r6.2-1.0, 6.2.3.2.2 Masking Individual Errors
>>>>
>>>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>>>> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
>>>> ---
>>>>  drivers/cxl/core/pci.c | 2 ++
>>>>  drivers/pci/pcie/aer.c | 5 +++--
>>>>  include/linux/aer.h    | 1 +
>>>>  3 files changed, 6 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
>>>> index 9c162120f0fe..c62329cd9a87 100644
>>>> --- a/drivers/cxl/core/pci.c
>>>> +++ b/drivers/cxl/core/pci.c
>>>> @@ -895,6 +895,7 @@ void cxl_uport_init_ras_reporting(struct cxl_port *port)
>>>>  
>>>>  	cxl_assign_port_error_handlers(pdev);
>>>>  	devm_add_action_or_reset(&port->dev, cxl_clear_port_error_handlers, pdev);
>>>> +	pci_aer_unmask_internal_errors(pdev);
>>>>  }
>>>>  EXPORT_SYMBOL_NS_GPL(cxl_uport_init_ras_reporting, "CXL");
>>>>  
>>>> @@ -935,6 +936,7 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport)
>>>>  	}
>>>>  	cxl_assign_port_error_handlers(pdev);
>>>>  	devm_add_action_or_reset(&port->dev, cxl_clear_port_error_handlers, pdev);
>>>> +	pci_aer_unmask_internal_errors(pdev);
>>>>  	put_device(&port->dev);
>>>>  }
>>>>  EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL");
>>>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>>>> index 68e957459008..e6aaa3bd84f0 100644
>>>> --- a/drivers/pci/pcie/aer.c
>>>> +++ b/drivers/pci/pcie/aer.c
>>>> @@ -950,7 +950,6 @@ static bool is_internal_error(struct aer_err_info *info)
>>>>  	return info->status & PCI_ERR_UNC_INTN;
>>>>  }
>>>>  
>>>> -#ifdef CONFIG_PCIEAER_CXL
>>>>  /**
>>>>   * pci_aer_unmask_internal_errors - unmask internal errors
>>>>   * @dev: pointer to the pcie_dev data structure
>>>> @@ -961,7 +960,7 @@ static bool is_internal_error(struct aer_err_info *info)
>>>>   * Note: AER must be enabled and supported by the device which must be
>>>>   * checked in advance, e.g. with pcie_aer_is_native().
>>>>   */
>>>> -static void pci_aer_unmask_internal_errors(struct pci_dev *dev)
>>>> +void pci_aer_unmask_internal_errors(struct pci_dev *dev)
>>>>  {
>>>>  	int aer = dev->aer_cap;
>>>>  	u32 mask;
>>>> @@ -974,7 +973,9 @@ static void pci_aer_unmask_internal_errors(struct pci_dev *dev)
>>>>  	mask &= ~PCI_ERR_COR_INTERNAL;
>>>>  	pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, mask);
>>>>  }
>>>> +EXPORT_SYMBOL_NS_GPL(pci_aer_unmask_internal_errors, "CXL");
>>>>  
>>>> +#ifdef CONFIG_PCIEAER_CXL
>>>>  static bool is_cxl_mem_dev(struct pci_dev *dev)
>>>>  {
>>>>  	/*
>>>> diff --git a/include/linux/aer.h b/include/linux/aer.h
>>>> index 4b97f38f3fcf..093293f9f12b 100644
>>>> --- a/include/linux/aer.h
>>>> +++ b/include/linux/aer.h
>>>> @@ -55,5 +55,6 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
>>>>  int cper_severity_to_aer(int cper_severity);
>>>>  void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
>>>>  		       int severity, struct aer_capability_regs *aer_regs);
>>>> +void pci_aer_unmask_internal_errors(struct pci_dev *dev);
>>>>  #endif //_AER_H_
>>>>  
>>>> -- 
>>>> 2.34.1
>>>>
>
Bowman, Terry Jan. 15, 2025, 12:20 a.m. UTC | #5
On 1/14/2025 5:45 PM, Ira Weiny wrote:
> Bowman, Terry wrote:
>>
>>
>> On 1/14/2025 5:26 PM, Ira Weiny wrote:
>>> Terry Bowman wrote:
>>>> The AER service driver enables PCIe Uncorrectable Internal Errors (UIE) and
>>>> Correctable Internal errors (CIE) for CXL Root Ports. The UIE and CIE are
>>>> used in reporting CXL Protocol Errors. The same UIE/CIE enablement is
>>>> needed for CXL Upstream Switch Ports and CXL Downstream Switch Ports
>>>> inorder to notify the associated Root Port and OS.[1]
>>>>
>>>> Export the AER service driver's pci_aer_unmask_internal_errors() function
>>>> to CXL namespace.
>>>>
>>>> Remove the function's dependency on the CONFIG_PCIEAER_CXL kernel config
>>>> because it is now an exported function.
>>> This seems wrong to me.  As of this patch CXL_PCI requires PCIEAER_CXL for
>>> the AER code to handle the errors which were just enabled.
>>>
>>> To keep PCIEAER_CXL optional pci_aer_unmask_internal_errors() should be
>>> stubbed out in aer.h if !CONFIG_PCIEAER_CXL.
>>>
>>> Ira
>> Bjorn (I believe in v1 or v2) directed me to remove
>> pci_aer_unmask_internal_errors() dependency on PCIEAER_CXL because it is
>> now exported. He wants the behavior for other users (and subsystems) to
>> be consistent with/without the PCIEAER_CXL setting.
>>
> I see...  If PCIEAER_CXL is not enabled why even set the cxl error
> handlers and enable these?
>
> I guess this is just adding some code which eventually calls
> handles_cxl_errors() which returns false in the !PCIEAER_CXL case?
>
> Ira

Re-sending because I somehow sent from Outlook earlier.

cxl_dport_init_ras_reporting() and cxl_uport_init_ras_reporting() assign the errorĀ 
handlers and are within #ifdef PCIEAER_CXL. The stubs are in cxl.h.

Correct. handles_cxl_errors() returns false in the !PCIEAER_CXL case.

Terry
>>>> Call pci_aer_unmask_internal_errors() during RAS initialization in:
>>>> cxl_uport_init_ras_reporting() and cxl_dport_init_ras_reporting().
>>>>
>>>> [1] PCIe Base Spec r6.2-1.0, 6.2.3.2.2 Masking Individual Errors
>>>>
>>>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>>>> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
>>>> ---
>>>>  drivers/cxl/core/pci.c | 2 ++
>>>>  drivers/pci/pcie/aer.c | 5 +++--
>>>>  include/linux/aer.h    | 1 +
>>>>  3 files changed, 6 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
>>>> index 9c162120f0fe..c62329cd9a87 100644
>>>> --- a/drivers/cxl/core/pci.c
>>>> +++ b/drivers/cxl/core/pci.c
>>>> @@ -895,6 +895,7 @@ void cxl_uport_init_ras_reporting(struct cxl_port *port)
>>>>  
>>>>  	cxl_assign_port_error_handlers(pdev);
>>>>  	devm_add_action_or_reset(&port->dev, cxl_clear_port_error_handlers, pdev);
>>>> +	pci_aer_unmask_internal_errors(pdev);
>>>>  }
>>>>  EXPORT_SYMBOL_NS_GPL(cxl_uport_init_ras_reporting, "CXL");
>>>>  
>>>> @@ -935,6 +936,7 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport)
>>>>  	}
>>>>  	cxl_assign_port_error_handlers(pdev);
>>>>  	devm_add_action_or_reset(&port->dev, cxl_clear_port_error_handlers, pdev);
>>>> +	pci_aer_unmask_internal_errors(pdev);
>>>>  	put_device(&port->dev);
>>>>  }
>>>>  EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL");
>>>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>>>> index 68e957459008..e6aaa3bd84f0 100644
>>>> --- a/drivers/pci/pcie/aer.c
>>>> +++ b/drivers/pci/pcie/aer.c
>>>> @@ -950,7 +950,6 @@ static bool is_internal_error(struct aer_err_info *info)
>>>>  	return info->status & PCI_ERR_UNC_INTN;
>>>>  }
>>>>  
>>>> -#ifdef CONFIG_PCIEAER_CXL
>>>>  /**
>>>>   * pci_aer_unmask_internal_errors - unmask internal errors
>>>>   * @dev: pointer to the pcie_dev data structure
>>>> @@ -961,7 +960,7 @@ static bool is_internal_error(struct aer_err_info *info)
>>>>   * Note: AER must be enabled and supported by the device which must be
>>>>   * checked in advance, e.g. with pcie_aer_is_native().
>>>>   */
>>>> -static void pci_aer_unmask_internal_errors(struct pci_dev *dev)
>>>> +void pci_aer_unmask_internal_errors(struct pci_dev *dev)
>>>>  {
>>>>  	int aer = dev->aer_cap;
>>>>  	u32 mask;
>>>> @@ -974,7 +973,9 @@ static void pci_aer_unmask_internal_errors(struct pci_dev *dev)
>>>>  	mask &= ~PCI_ERR_COR_INTERNAL;
>>>>  	pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, mask);
>>>>  }
>>>> +EXPORT_SYMBOL_NS_GPL(pci_aer_unmask_internal_errors, "CXL");
>>>>  
>>>> +#ifdef CONFIG_PCIEAER_CXL
>>>>  static bool is_cxl_mem_dev(struct pci_dev *dev)
>>>>  {
>>>>  	/*
>>>> diff --git a/include/linux/aer.h b/include/linux/aer.h
>>>> index 4b97f38f3fcf..093293f9f12b 100644
>>>> --- a/include/linux/aer.h
>>>> +++ b/include/linux/aer.h
>>>> @@ -55,5 +55,6 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
>>>>  int cper_severity_to_aer(int cper_severity);
>>>>  void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
>>>>  		       int severity, struct aer_capability_regs *aer_regs);
>>>> +void pci_aer_unmask_internal_errors(struct pci_dev *dev);
>>>>  #endif //_AER_H_
>>>>  
>>>> -- 
>>>> 2.34.1
>>>>
>
Ira Weiny Jan. 16, 2025, 9:42 p.m. UTC | #6
Bowman, Terry wrote:
> 
> 
> 
> On 1/14/2025 5:45 PM, Ira Weiny wrote:
> > Bowman, Terry wrote:
> >>
> >>
> >> On 1/14/2025 5:26 PM, Ira Weiny wrote:
> >>> Terry Bowman wrote:
> >>>> The AER service driver enables PCIe Uncorrectable Internal Errors (UIE) and
> >>>> Correctable Internal errors (CIE) for CXL Root Ports. The UIE and CIE are
> >>>> used in reporting CXL Protocol Errors. The same UIE/CIE enablement is
> >>>> needed for CXL Upstream Switch Ports and CXL Downstream Switch Ports
> >>>> inorder to notify the associated Root Port and OS.[1]
> >>>>
> >>>> Export the AER service driver's pci_aer_unmask_internal_errors() function
> >>>> to CXL namespace.
> >>>>
> >>>> Remove the function's dependency on the CONFIG_PCIEAER_CXL kernel config
> >>>> because it is now an exported function.
> >>> This seems wrong to me.  As of this patch CXL_PCI requires PCIEAER_CXL for
> >>> the AER code to handle the errors which were just enabled.
> >>>
> >>> To keep PCIEAER_CXL optional pci_aer_unmask_internal_errors() should be
> >>> stubbed out in aer.h if !CONFIG_PCIEAER_CXL.
> >>>
> >>> Ira
> >> Bjorn (I believe in v1 or v2) directed me to remove
> >> pci_aer_unmask_internal_errors() dependency on PCIEAER_CXL because it is
> >> now exported. He wants the behavior for other users (and subsystems) to
> >> be consistent with/without the PCIEAER_CXL setting.
> >>
> > I see...  If PCIEAER_CXL is not enabled why even set the cxl error
> > handlers and enable these?
> >
> > I guess this is just adding some code which eventually calls
> > handles_cxl_errors() which returns false in the !PCIEAER_CXL case?
> >
> > Ira
> 
> Re-sending because I somehow sent from Outlook earlier.
> 
> cxl_dport_init_ras_reporting() and cxl_uport_init_ras_reporting() assign the errorĀ 
> handlers and are within #ifdef PCIEAER_CXL. The stubs are in cxl.h.
> 
> Correct. handles_cxl_errors() returns false in the !PCIEAER_CXL case.
> 

That is a bit convoluted...  But I'm not sure how to get the cross Kconfig
dependencies set to eliminate the set up.  :-/

So I guess it is fine.

Ira
diff mbox series

Patch

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 9c162120f0fe..c62329cd9a87 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -895,6 +895,7 @@  void cxl_uport_init_ras_reporting(struct cxl_port *port)
 
 	cxl_assign_port_error_handlers(pdev);
 	devm_add_action_or_reset(&port->dev, cxl_clear_port_error_handlers, pdev);
+	pci_aer_unmask_internal_errors(pdev);
 }
 EXPORT_SYMBOL_NS_GPL(cxl_uport_init_ras_reporting, "CXL");
 
@@ -935,6 +936,7 @@  void cxl_dport_init_ras_reporting(struct cxl_dport *dport)
 	}
 	cxl_assign_port_error_handlers(pdev);
 	devm_add_action_or_reset(&port->dev, cxl_clear_port_error_handlers, pdev);
+	pci_aer_unmask_internal_errors(pdev);
 	put_device(&port->dev);
 }
 EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL");
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 68e957459008..e6aaa3bd84f0 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -950,7 +950,6 @@  static bool is_internal_error(struct aer_err_info *info)
 	return info->status & PCI_ERR_UNC_INTN;
 }
 
-#ifdef CONFIG_PCIEAER_CXL
 /**
  * pci_aer_unmask_internal_errors - unmask internal errors
  * @dev: pointer to the pcie_dev data structure
@@ -961,7 +960,7 @@  static bool is_internal_error(struct aer_err_info *info)
  * Note: AER must be enabled and supported by the device which must be
  * checked in advance, e.g. with pcie_aer_is_native().
  */
-static void pci_aer_unmask_internal_errors(struct pci_dev *dev)
+void pci_aer_unmask_internal_errors(struct pci_dev *dev)
 {
 	int aer = dev->aer_cap;
 	u32 mask;
@@ -974,7 +973,9 @@  static void pci_aer_unmask_internal_errors(struct pci_dev *dev)
 	mask &= ~PCI_ERR_COR_INTERNAL;
 	pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, mask);
 }
+EXPORT_SYMBOL_NS_GPL(pci_aer_unmask_internal_errors, "CXL");
 
+#ifdef CONFIG_PCIEAER_CXL
 static bool is_cxl_mem_dev(struct pci_dev *dev)
 {
 	/*
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 4b97f38f3fcf..093293f9f12b 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -55,5 +55,6 @@  void pci_print_aer(struct pci_dev *dev, int aer_severity,
 int cper_severity_to_aer(int cper_severity);
 void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
 		       int severity, struct aer_capability_regs *aer_regs);
+void pci_aer_unmask_internal_errors(struct pci_dev *dev);
 #endif //_AER_H_