diff mbox series

[v7,04/17] PCI/AER: Modify AER driver logging to report CXL or PCIe bus error type

Message ID 20250211192444.2292833-5-terry.bowman@amd.com (mailing list archive)
State Handled Elsewhere
Headers show
Series Enable CXL PCIe port protocol error handling and logging | expand

Commit Message

Bowman, Terry Feb. 11, 2025, 7:24 p.m. UTC
The AER driver and aer_event tracing currently log 'PCIe Bus Type'
for all errors.

Update the driver and aer_event tracing to log 'CXL Bus Type' for CXL
device errors.

Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Fan Ni <fan.ni@samsung.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Gregory Price <gourry@gourry.net>
---
 drivers/pci/pcie/aer.c  | 14 ++++++++------
 include/ras/ras_event.h |  9 ++++++---
 2 files changed, 14 insertions(+), 9 deletions(-)

Comments

Bjorn Helgaas Feb. 11, 2025, 8:28 p.m. UTC | #1
On Tue, Feb 11, 2025 at 01:24:31PM -0600, Terry Bowman wrote:
> The AER driver and aer_event tracing currently log 'PCIe Bus Type'
> for all errors.
> 
> Update the driver and aer_event tracing to log 'CXL Bus Type' for CXL
> device errors.
> 
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> Reviewed-by: Fan Ni <fan.ni@samsung.com>
> Reviewed-by: Ira Weiny <ira.weiny@intel.com>
> Reviewed-by: Gregory Price <gourry@gourry.net>

Acked-by: Bjorn Helgaas <bhelgaas@google.com>

> ---
>  drivers/pci/pcie/aer.c  | 14 ++++++++------
>  include/ras/ras_event.h |  9 ++++++---
>  2 files changed, 14 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 6e8de77d0fc4..f99a1c6fb274 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -694,13 +694,14 @@ static void __aer_print_error(struct pci_dev *dev,
>  
>  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
>  {
> +	const char *bus_type = pcie_is_cxl(dev) ? "CXL"  : "PCIe";
>  	int layer, agent;
>  	int id = pci_dev_id(dev);
>  	const char *level;
>  
>  	if (!info->status) {
> -		pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
> -			aer_error_severity_string[info->severity]);
> +		pci_err(dev, "%s Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
> +			bus_type, aer_error_severity_string[info->severity]);
>  		goto out;
>  	}
>  
> @@ -709,8 +710,8 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
>  
>  	level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR;
>  
> -	pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
> -		   aer_error_severity_string[info->severity],
> +	pci_printk(level, dev, "%s Bus Error: severity=%s, type=%s, (%s)\n",
> +		   bus_type, aer_error_severity_string[info->severity],
>  		   aer_error_layer[layer], aer_agent_string[agent]);
>  
>  	pci_printk(level, dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
> @@ -725,7 +726,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
>  	if (info->id && info->error_dev_num > 1 && info->id == id)
>  		pci_err(dev, "  Error of this Agent is reported first\n");
>  
> -	trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
> +	trace_aer_event(dev_name(&dev->dev), bus_type, (info->status & ~info->mask),
>  			info->severity, info->tlp_header_valid, &info->tlp);
>  }
>  
> @@ -759,6 +760,7 @@ EXPORT_SYMBOL_GPL(cper_severity_to_aer);
>  void pci_print_aer(struct pci_dev *dev, int aer_severity,
>  		   struct aer_capability_regs *aer)
>  {
> +	const char *bus_type = pcie_is_cxl(dev) ? "CXL"  : "PCIe";
>  	int layer, agent, tlp_header_valid = 0;
>  	u32 status, mask;
>  	struct aer_err_info info;
> @@ -793,7 +795,7 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
>  	if (tlp_header_valid)
>  		pcie_print_tlp_log(dev, &aer->header_log, dev_fmt("  "));
>  
> -	trace_aer_event(dev_name(&dev->dev), (status & ~mask),
> +	trace_aer_event(dev_name(&dev->dev), bus_type, (status & ~mask),
>  			aer_severity, tlp_header_valid, &aer->header_log);
>  }
>  EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL");
> diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
> index e5f7ee0864e7..1bf8e7050ba8 100644
> --- a/include/ras/ras_event.h
> +++ b/include/ras/ras_event.h
> @@ -297,15 +297,17 @@ TRACE_EVENT(non_standard_event,
>  
>  TRACE_EVENT(aer_event,
>  	TP_PROTO(const char *dev_name,
> +		 const char *bus_type,
>  		 const u32 status,
>  		 const u8 severity,
>  		 const u8 tlp_header_valid,
>  		 struct pcie_tlp_log *tlp),
>  
> -	TP_ARGS(dev_name, status, severity, tlp_header_valid, tlp),
> +	TP_ARGS(dev_name, bus_type, status, severity, tlp_header_valid, tlp),
>  
>  	TP_STRUCT__entry(
>  		__string(	dev_name,	dev_name	)
> +		__string(	bus_type,	bus_type	)
>  		__field(	u32,		status		)
>  		__field(	u8,		severity	)
>  		__field(	u8, 		tlp_header_valid)
> @@ -314,6 +316,7 @@ TRACE_EVENT(aer_event,
>  
>  	TP_fast_assign(
>  		__assign_str(dev_name);
> +		__assign_str(bus_type);
>  		__entry->status		= status;
>  		__entry->severity	= severity;
>  		__entry->tlp_header_valid = tlp_header_valid;
> @@ -325,8 +328,8 @@ TRACE_EVENT(aer_event,
>  		}
>  	),
>  
> -	TP_printk("%s PCIe Bus Error: severity=%s, %s, TLP Header=%s\n",
> -		__get_str(dev_name),
> +	TP_printk("%s %s Bus Error: severity=%s, %s, TLP Header=%s\n",
> +		__get_str(dev_name), __get_str(bus_type),
>  		__entry->severity == AER_CORRECTABLE ? "Corrected" :
>  			__entry->severity == AER_FATAL ?
>  			"Fatal" : "Uncorrected, non-fatal",
> -- 
> 2.34.1
>
Dan Williams Feb. 11, 2025, 11:47 p.m. UTC | #2
Terry Bowman wrote:
> The AER driver and aer_event tracing currently log 'PCIe Bus Type'
> for all errors.
> 
> Update the driver and aer_event tracing to log 'CXL Bus Type' for CXL
> device errors.
> 
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> Reviewed-by: Fan Ni <fan.ni@samsung.com>
> Reviewed-by: Ira Weiny <ira.weiny@intel.com>
> Reviewed-by: Gregory Price <gourry@gourry.net>

> ---
>  drivers/pci/pcie/aer.c  | 14 ++++++++------
>  include/ras/ras_event.h |  9 ++++++---
>  2 files changed, 14 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 6e8de77d0fc4..f99a1c6fb274 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -694,13 +694,14 @@ static void __aer_print_error(struct pci_dev *dev,
>  
>  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
>  {
> +	const char *bus_type = pcie_is_cxl(dev) ? "CXL"  : "PCIe";

I was expecting that this would be more than just a CXL link check
because CXL AER events are only a subset of the events that can trigger
an AER interrupt on a CXL device.

Also, with CXL protocol errors the TLP log is sourced from CXL RAS
registers and is distinct from the PCIe ->tlp in 'struct aer_err_info'.

All that to say that I think this patch probably wants to decorate the
bus type in 'struct aer_err_info' and then use that rather than just the ->is_cxl
flag of the device.

In the interest of moving the patch set along perhaps just do something
like this for now and circle back to make it more sophisticated later:

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 01e51db8d285..eed098c134a6 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -533,6 +533,7 @@ static inline bool pci_dev_test_and_set_removed(struct pci_dev *dev)
 struct aer_err_info {
        struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
        int error_dev_num;
+       bool is_cxl;
 
        unsigned int id:16;
 
@@ -549,6 +550,11 @@ struct aer_err_info {
        struct pcie_tlp_log tlp;        /* TLP Header */
 };
 
+static inline const char *aer_err_bus(struct aer_err_info *info)
+{
+       return info->is_cxl ? "CXL" : "PCIe";
+}
+
 int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
 
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 508474e17183..405f15c878ff 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -1211,6 +1211,7 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
        /* Must reset in this function */
        info->status = 0;
        info->tlp_header_valid = 0;
+       info->is_cxl = dev->is_cxl;
 
        /* The device might not support AER */
        if (!aer)

Other than that, this patch looks good to me:

Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Bowman, Terry Feb. 12, 2025, 7:15 p.m. UTC | #3
On 2/11/2025 5:47 PM, Dan Williams wrote:
> Terry Bowman wrote:
>> The AER driver and aer_event tracing currently log 'PCIe Bus Type'
>> for all errors.
>>
>> Update the driver and aer_event tracing to log 'CXL Bus Type' for CXL
>> device errors.
>>
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
>> Reviewed-by: Dave Jiang <dave.jiang@intel.com>
>> Reviewed-by: Fan Ni <fan.ni@samsung.com>
>> Reviewed-by: Ira Weiny <ira.weiny@intel.com>
>> Reviewed-by: Gregory Price <gourry@gourry.net>
>> ---
>>  drivers/pci/pcie/aer.c  | 14 ++++++++------
>>  include/ras/ras_event.h |  9 ++++++---
>>  2 files changed, 14 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>> index 6e8de77d0fc4..f99a1c6fb274 100644
>> --- a/drivers/pci/pcie/aer.c
>> +++ b/drivers/pci/pcie/aer.c
>> @@ -694,13 +694,14 @@ static void __aer_print_error(struct pci_dev *dev,
>>  
>>  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
>>  {
>> +	const char *bus_type = pcie_is_cxl(dev) ? "CXL"  : "PCIe";
> I was expecting that this would be more than just a CXL link check
> because CXL AER events are only a subset of the events that can trigger
> an AER interrupt on a CXL device.
>
> Also, with CXL protocol errors the TLP log is sourced from CXL RAS
> registers and is distinct from the PCIe ->tlp in 'struct aer_err_info'.
>
> All that to say that I think this patch probably wants to decorate the
> bus type in 'struct aer_err_info' and then use that rather than just the ->is_cxl
> flag of the device.
>
> In the interest of moving the patch set along perhaps just do something
> like this for now and circle back to make it more sophisticated later:
>
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 01e51db8d285..eed098c134a6 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -533,6 +533,7 @@ static inline bool pci_dev_test_and_set_removed(struct pci_dev *dev)
>  struct aer_err_info {
>         struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
>         int error_dev_num;
> +       bool is_cxl;
>  
>         unsigned int id:16;
>  
> @@ -549,6 +550,11 @@ struct aer_err_info {
>         struct pcie_tlp_log tlp;        /* TLP Header */
>  };
>  
> +static inline const char *aer_err_bus(struct aer_err_info *info)
> +{
> +       return info->is_cxl ? "CXL" : "PCIe";
> +}
> +
>  int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
>  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
>  
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 508474e17183..405f15c878ff 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -1211,6 +1211,7 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
>         /* Must reset in this function */
>         info->status = 0;
>         info->tlp_header_valid = 0;
> +       info->is_cxl = dev->is_cxl;
>  
>         /* The device might not support AER */
>         if (!aer)
>
> Other than that, this patch looks good to me:
>
> Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Ok. I can add is_cxl to 'struct aer_err_info'. Shall I set it by reading the
alternate protocol link state?

Terry
Dan Williams Feb. 12, 2025, 7:57 p.m. UTC | #4
Bowman, Terry wrote:
[..]
> > Reviewed-by: Dan Williams <dan.j.williams@intel.com>
> Ok. I can add is_cxl to 'struct aer_err_info'. Shall I set it by reading the
> alternate protocol link state?

I am thinking no because dev->is_cxl at least indicates that a CXL link
was up at some point, and racing CXL link down is not something the
error core can reasonably mitigate.

In the end I think that it should be something like:

   info->is_cxl = dev->is_cxl && is_internal_error()

...on the expectation that a CXL device is unlikely to multiplex
internal errors across CXL protocol error events and device-specific
internal events. Even if a device *did* multiplex those I think it is
reasonable for the kernel to treat a device-specific UCE the same as a
CXL protocol UCE and panic the system.
Bowman, Terry Feb. 12, 2025, 9:08 p.m. UTC | #5
On 2/12/2025 1:57 PM, Dan Williams wrote:
> Bowman, Terry wrote:
> [..]
>>> Reviewed-by: Dan Williams <dan.j.williams@intel.com>
>> Ok. I can add is_cxl to 'struct aer_err_info'. Shall I set it by reading the
>> alternate protocol link state?
> I am thinking no because dev->is_cxl at least indicates that a CXL link
> was up at some point, and racing CXL link down is not something the
> error core can reasonably mitigate.
>
> In the end I think that it should be something like:
>
>    info->is_cxl = dev->is_cxl && is_internal_error()
>
> ...on the expectation that a CXL device is unlikely to multiplex
> internal errors across CXL protocol error events and device-specific
> internal events. Even if a device *did* multiplex those I think it is
> reasonable for the kernel to treat a device-specific UCE the same as a
> CXL protocol UCE and panic the system.
Ok.

I found in using is_internal_error() (v5) a USP with fatal UCE will not have AER status
populated in aer_info structure, only the severity field is populated (see
aer_get_device_error_info()). The aer_info is not populated because concern reading
the USP's AER (config space) when the upstream link state is invalid. Calling
is_internal_error() in this case will return false because the uncorrectable internal error (UIE) bit is 0 and proceed to treat as a PCIe error.
How do you want to proceed to handle the UCE protocol error in this case?

Terry
Lukas Wunner Feb. 12, 2025, 9:17 p.m. UTC | #6
On Wed, Feb 12, 2025 at 03:08:10PM -0600, Bowman, Terry wrote:
> I found in using is_internal_error() (v5) a USP with fatal UCE will not
> have AER status populated in aer_info structure, only the severity field
> is populated (see aer_get_device_error_info()). The aer_info is not
> populated because concern reading the USP's AER (config space) when
> the upstream link state is invalid. Calling is_internal_error() in this
> case will return false because the uncorrectable internal error (UIE) bit
> is 0 and proceed to treat as a PCIe error.
> How do you want to proceed to handle the UCE protocol error in this case?

Shuai Xue is proposing a solution in this pending patch:

https://lore.kernel.org/r/20250207093500.70885-5-xueshuai@linux.alibaba.com/
diff mbox series

Patch

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 6e8de77d0fc4..f99a1c6fb274 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -694,13 +694,14 @@  static void __aer_print_error(struct pci_dev *dev,
 
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
 {
+	const char *bus_type = pcie_is_cxl(dev) ? "CXL"  : "PCIe";
 	int layer, agent;
 	int id = pci_dev_id(dev);
 	const char *level;
 
 	if (!info->status) {
-		pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
-			aer_error_severity_string[info->severity]);
+		pci_err(dev, "%s Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
+			bus_type, aer_error_severity_string[info->severity]);
 		goto out;
 	}
 
@@ -709,8 +710,8 @@  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
 
 	level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR;
 
-	pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
-		   aer_error_severity_string[info->severity],
+	pci_printk(level, dev, "%s Bus Error: severity=%s, type=%s, (%s)\n",
+		   bus_type, aer_error_severity_string[info->severity],
 		   aer_error_layer[layer], aer_agent_string[agent]);
 
 	pci_printk(level, dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
@@ -725,7 +726,7 @@  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
 	if (info->id && info->error_dev_num > 1 && info->id == id)
 		pci_err(dev, "  Error of this Agent is reported first\n");
 
-	trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
+	trace_aer_event(dev_name(&dev->dev), bus_type, (info->status & ~info->mask),
 			info->severity, info->tlp_header_valid, &info->tlp);
 }
 
@@ -759,6 +760,7 @@  EXPORT_SYMBOL_GPL(cper_severity_to_aer);
 void pci_print_aer(struct pci_dev *dev, int aer_severity,
 		   struct aer_capability_regs *aer)
 {
+	const char *bus_type = pcie_is_cxl(dev) ? "CXL"  : "PCIe";
 	int layer, agent, tlp_header_valid = 0;
 	u32 status, mask;
 	struct aer_err_info info;
@@ -793,7 +795,7 @@  void pci_print_aer(struct pci_dev *dev, int aer_severity,
 	if (tlp_header_valid)
 		pcie_print_tlp_log(dev, &aer->header_log, dev_fmt("  "));
 
-	trace_aer_event(dev_name(&dev->dev), (status & ~mask),
+	trace_aer_event(dev_name(&dev->dev), bus_type, (status & ~mask),
 			aer_severity, tlp_header_valid, &aer->header_log);
 }
 EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL");
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index e5f7ee0864e7..1bf8e7050ba8 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -297,15 +297,17 @@  TRACE_EVENT(non_standard_event,
 
 TRACE_EVENT(aer_event,
 	TP_PROTO(const char *dev_name,
+		 const char *bus_type,
 		 const u32 status,
 		 const u8 severity,
 		 const u8 tlp_header_valid,
 		 struct pcie_tlp_log *tlp),
 
-	TP_ARGS(dev_name, status, severity, tlp_header_valid, tlp),
+	TP_ARGS(dev_name, bus_type, status, severity, tlp_header_valid, tlp),
 
 	TP_STRUCT__entry(
 		__string(	dev_name,	dev_name	)
+		__string(	bus_type,	bus_type	)
 		__field(	u32,		status		)
 		__field(	u8,		severity	)
 		__field(	u8, 		tlp_header_valid)
@@ -314,6 +316,7 @@  TRACE_EVENT(aer_event,
 
 	TP_fast_assign(
 		__assign_str(dev_name);
+		__assign_str(bus_type);
 		__entry->status		= status;
 		__entry->severity	= severity;
 		__entry->tlp_header_valid = tlp_header_valid;
@@ -325,8 +328,8 @@  TRACE_EVENT(aer_event,
 		}
 	),
 
-	TP_printk("%s PCIe Bus Error: severity=%s, %s, TLP Header=%s\n",
-		__get_str(dev_name),
+	TP_printk("%s %s Bus Error: severity=%s, %s, TLP Header=%s\n",
+		__get_str(dev_name), __get_str(bus_type),
 		__entry->severity == AER_CORRECTABLE ? "Corrected" :
 			__entry->severity == AER_FATAL ?
 			"Fatal" : "Uncorrected, non-fatal",