Message ID | 20250211192444.2292833-5-terry.bowman@amd.com (mailing list archive) |
---|---|
State | Handled Elsewhere |
Headers | show |
Series | Enable CXL PCIe port protocol error handling and logging | expand |
On Tue, Feb 11, 2025 at 01:24:31PM -0600, Terry Bowman wrote: > The AER driver and aer_event tracing currently log 'PCIe Bus Type' > for all errors. > > Update the driver and aer_event tracing to log 'CXL Bus Type' for CXL > device errors. > > Signed-off-by: Terry Bowman <terry.bowman@amd.com> > Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> > Reviewed-by: Dave Jiang <dave.jiang@intel.com> > Reviewed-by: Fan Ni <fan.ni@samsung.com> > Reviewed-by: Ira Weiny <ira.weiny@intel.com> > Reviewed-by: Gregory Price <gourry@gourry.net> Acked-by: Bjorn Helgaas <bhelgaas@google.com> > --- > drivers/pci/pcie/aer.c | 14 ++++++++------ > include/ras/ras_event.h | 9 ++++++--- > 2 files changed, 14 insertions(+), 9 deletions(-) > > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c > index 6e8de77d0fc4..f99a1c6fb274 100644 > --- a/drivers/pci/pcie/aer.c > +++ b/drivers/pci/pcie/aer.c > @@ -694,13 +694,14 @@ static void __aer_print_error(struct pci_dev *dev, > > void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) > { > + const char *bus_type = pcie_is_cxl(dev) ? "CXL" : "PCIe"; > int layer, agent; > int id = pci_dev_id(dev); > const char *level; > > if (!info->status) { > - pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n", > - aer_error_severity_string[info->severity]); > + pci_err(dev, "%s Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n", > + bus_type, aer_error_severity_string[info->severity]); > goto out; > } > > @@ -709,8 +710,8 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) > > level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR; > > - pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n", > - aer_error_severity_string[info->severity], > + pci_printk(level, dev, "%s Bus Error: severity=%s, type=%s, (%s)\n", > + bus_type, aer_error_severity_string[info->severity], > aer_error_layer[layer], aer_agent_string[agent]); > > pci_printk(level, dev, " device [%04x:%04x] error status/mask=%08x/%08x\n", > @@ -725,7 +726,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) > if (info->id && info->error_dev_num > 1 && info->id == id) > pci_err(dev, " Error of this Agent is reported first\n"); > > - trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask), > + trace_aer_event(dev_name(&dev->dev), bus_type, (info->status & ~info->mask), > info->severity, info->tlp_header_valid, &info->tlp); > } > > @@ -759,6 +760,7 @@ EXPORT_SYMBOL_GPL(cper_severity_to_aer); > void pci_print_aer(struct pci_dev *dev, int aer_severity, > struct aer_capability_regs *aer) > { > + const char *bus_type = pcie_is_cxl(dev) ? "CXL" : "PCIe"; > int layer, agent, tlp_header_valid = 0; > u32 status, mask; > struct aer_err_info info; > @@ -793,7 +795,7 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, > if (tlp_header_valid) > pcie_print_tlp_log(dev, &aer->header_log, dev_fmt(" ")); > > - trace_aer_event(dev_name(&dev->dev), (status & ~mask), > + trace_aer_event(dev_name(&dev->dev), bus_type, (status & ~mask), > aer_severity, tlp_header_valid, &aer->header_log); > } > EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL"); > diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h > index e5f7ee0864e7..1bf8e7050ba8 100644 > --- a/include/ras/ras_event.h > +++ b/include/ras/ras_event.h > @@ -297,15 +297,17 @@ TRACE_EVENT(non_standard_event, > > TRACE_EVENT(aer_event, > TP_PROTO(const char *dev_name, > + const char *bus_type, > const u32 status, > const u8 severity, > const u8 tlp_header_valid, > struct pcie_tlp_log *tlp), > > - TP_ARGS(dev_name, status, severity, tlp_header_valid, tlp), > + TP_ARGS(dev_name, bus_type, status, severity, tlp_header_valid, tlp), > > TP_STRUCT__entry( > __string( dev_name, dev_name ) > + __string( bus_type, bus_type ) > __field( u32, status ) > __field( u8, severity ) > __field( u8, tlp_header_valid) > @@ -314,6 +316,7 @@ TRACE_EVENT(aer_event, > > TP_fast_assign( > __assign_str(dev_name); > + __assign_str(bus_type); > __entry->status = status; > __entry->severity = severity; > __entry->tlp_header_valid = tlp_header_valid; > @@ -325,8 +328,8 @@ TRACE_EVENT(aer_event, > } > ), > > - TP_printk("%s PCIe Bus Error: severity=%s, %s, TLP Header=%s\n", > - __get_str(dev_name), > + TP_printk("%s %s Bus Error: severity=%s, %s, TLP Header=%s\n", > + __get_str(dev_name), __get_str(bus_type), > __entry->severity == AER_CORRECTABLE ? "Corrected" : > __entry->severity == AER_FATAL ? > "Fatal" : "Uncorrected, non-fatal", > -- > 2.34.1 >
Terry Bowman wrote: > The AER driver and aer_event tracing currently log 'PCIe Bus Type' > for all errors. > > Update the driver and aer_event tracing to log 'CXL Bus Type' for CXL > device errors. > > Signed-off-by: Terry Bowman <terry.bowman@amd.com> > Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> > Reviewed-by: Dave Jiang <dave.jiang@intel.com> > Reviewed-by: Fan Ni <fan.ni@samsung.com> > Reviewed-by: Ira Weiny <ira.weiny@intel.com> > Reviewed-by: Gregory Price <gourry@gourry.net> > --- > drivers/pci/pcie/aer.c | 14 ++++++++------ > include/ras/ras_event.h | 9 ++++++--- > 2 files changed, 14 insertions(+), 9 deletions(-) > > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c > index 6e8de77d0fc4..f99a1c6fb274 100644 > --- a/drivers/pci/pcie/aer.c > +++ b/drivers/pci/pcie/aer.c > @@ -694,13 +694,14 @@ static void __aer_print_error(struct pci_dev *dev, > > void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) > { > + const char *bus_type = pcie_is_cxl(dev) ? "CXL" : "PCIe"; I was expecting that this would be more than just a CXL link check because CXL AER events are only a subset of the events that can trigger an AER interrupt on a CXL device. Also, with CXL protocol errors the TLP log is sourced from CXL RAS registers and is distinct from the PCIe ->tlp in 'struct aer_err_info'. All that to say that I think this patch probably wants to decorate the bus type in 'struct aer_err_info' and then use that rather than just the ->is_cxl flag of the device. In the interest of moving the patch set along perhaps just do something like this for now and circle back to make it more sophisticated later: diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 01e51db8d285..eed098c134a6 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -533,6 +533,7 @@ static inline bool pci_dev_test_and_set_removed(struct pci_dev *dev) struct aer_err_info { struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES]; int error_dev_num; + bool is_cxl; unsigned int id:16; @@ -549,6 +550,11 @@ struct aer_err_info { struct pcie_tlp_log tlp; /* TLP Header */ }; +static inline const char *aer_err_bus(struct aer_err_info *info) +{ + return info->is_cxl ? "CXL" : "PCIe"; +} + int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info); void aer_print_error(struct pci_dev *dev, struct aer_err_info *info); diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 508474e17183..405f15c878ff 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1211,6 +1211,7 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) /* Must reset in this function */ info->status = 0; info->tlp_header_valid = 0; + info->is_cxl = dev->is_cxl; /* The device might not support AER */ if (!aer) Other than that, this patch looks good to me: Reviewed-by: Dan Williams <dan.j.williams@intel.com>
On 2/11/2025 5:47 PM, Dan Williams wrote: > Terry Bowman wrote: >> The AER driver and aer_event tracing currently log 'PCIe Bus Type' >> for all errors. >> >> Update the driver and aer_event tracing to log 'CXL Bus Type' for CXL >> device errors. >> >> Signed-off-by: Terry Bowman <terry.bowman@amd.com> >> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> >> Reviewed-by: Dave Jiang <dave.jiang@intel.com> >> Reviewed-by: Fan Ni <fan.ni@samsung.com> >> Reviewed-by: Ira Weiny <ira.weiny@intel.com> >> Reviewed-by: Gregory Price <gourry@gourry.net> >> --- >> drivers/pci/pcie/aer.c | 14 ++++++++------ >> include/ras/ras_event.h | 9 ++++++--- >> 2 files changed, 14 insertions(+), 9 deletions(-) >> >> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c >> index 6e8de77d0fc4..f99a1c6fb274 100644 >> --- a/drivers/pci/pcie/aer.c >> +++ b/drivers/pci/pcie/aer.c >> @@ -694,13 +694,14 @@ static void __aer_print_error(struct pci_dev *dev, >> >> void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) >> { >> + const char *bus_type = pcie_is_cxl(dev) ? "CXL" : "PCIe"; > I was expecting that this would be more than just a CXL link check > because CXL AER events are only a subset of the events that can trigger > an AER interrupt on a CXL device. > > Also, with CXL protocol errors the TLP log is sourced from CXL RAS > registers and is distinct from the PCIe ->tlp in 'struct aer_err_info'. > > All that to say that I think this patch probably wants to decorate the > bus type in 'struct aer_err_info' and then use that rather than just the ->is_cxl > flag of the device. > > In the interest of moving the patch set along perhaps just do something > like this for now and circle back to make it more sophisticated later: > > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h > index 01e51db8d285..eed098c134a6 100644 > --- a/drivers/pci/pci.h > +++ b/drivers/pci/pci.h > @@ -533,6 +533,7 @@ static inline bool pci_dev_test_and_set_removed(struct pci_dev *dev) > struct aer_err_info { > struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES]; > int error_dev_num; > + bool is_cxl; > > unsigned int id:16; > > @@ -549,6 +550,11 @@ struct aer_err_info { > struct pcie_tlp_log tlp; /* TLP Header */ > }; > > +static inline const char *aer_err_bus(struct aer_err_info *info) > +{ > + return info->is_cxl ? "CXL" : "PCIe"; > +} > + > int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info); > void aer_print_error(struct pci_dev *dev, struct aer_err_info *info); > > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c > index 508474e17183..405f15c878ff 100644 > --- a/drivers/pci/pcie/aer.c > +++ b/drivers/pci/pcie/aer.c > @@ -1211,6 +1211,7 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) > /* Must reset in this function */ > info->status = 0; > info->tlp_header_valid = 0; > + info->is_cxl = dev->is_cxl; > > /* The device might not support AER */ > if (!aer) > > Other than that, this patch looks good to me: > > Reviewed-by: Dan Williams <dan.j.williams@intel.com> Ok. I can add is_cxl to 'struct aer_err_info'. Shall I set it by reading the alternate protocol link state? Terry
Bowman, Terry wrote: [..] > > Reviewed-by: Dan Williams <dan.j.williams@intel.com> > Ok. I can add is_cxl to 'struct aer_err_info'. Shall I set it by reading the > alternate protocol link state? I am thinking no because dev->is_cxl at least indicates that a CXL link was up at some point, and racing CXL link down is not something the error core can reasonably mitigate. In the end I think that it should be something like: info->is_cxl = dev->is_cxl && is_internal_error() ...on the expectation that a CXL device is unlikely to multiplex internal errors across CXL protocol error events and device-specific internal events. Even if a device *did* multiplex those I think it is reasonable for the kernel to treat a device-specific UCE the same as a CXL protocol UCE and panic the system.
On 2/12/2025 1:57 PM, Dan Williams wrote: > Bowman, Terry wrote: > [..] >>> Reviewed-by: Dan Williams <dan.j.williams@intel.com> >> Ok. I can add is_cxl to 'struct aer_err_info'. Shall I set it by reading the >> alternate protocol link state? > I am thinking no because dev->is_cxl at least indicates that a CXL link > was up at some point, and racing CXL link down is not something the > error core can reasonably mitigate. > > In the end I think that it should be something like: > > info->is_cxl = dev->is_cxl && is_internal_error() > > ...on the expectation that a CXL device is unlikely to multiplex > internal errors across CXL protocol error events and device-specific > internal events. Even if a device *did* multiplex those I think it is > reasonable for the kernel to treat a device-specific UCE the same as a > CXL protocol UCE and panic the system. Ok. I found in using is_internal_error() (v5) a USP with fatal UCE will not have AER status populated in aer_info structure, only the severity field is populated (see aer_get_device_error_info()). The aer_info is not populated because concern reading the USP's AER (config space) when the upstream link state is invalid. Calling is_internal_error() in this case will return false because the uncorrectable internal error (UIE) bit is 0 and proceed to treat as a PCIe error. How do you want to proceed to handle the UCE protocol error in this case? Terry
On Wed, Feb 12, 2025 at 03:08:10PM -0600, Bowman, Terry wrote: > I found in using is_internal_error() (v5) a USP with fatal UCE will not > have AER status populated in aer_info structure, only the severity field > is populated (see aer_get_device_error_info()). The aer_info is not > populated because concern reading the USP's AER (config space) when > the upstream link state is invalid. Calling is_internal_error() in this > case will return false because the uncorrectable internal error (UIE) bit > is 0 and proceed to treat as a PCIe error. > How do you want to proceed to handle the UCE protocol error in this case? Shuai Xue is proposing a solution in this pending patch: https://lore.kernel.org/r/20250207093500.70885-5-xueshuai@linux.alibaba.com/
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 6e8de77d0fc4..f99a1c6fb274 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -694,13 +694,14 @@ static void __aer_print_error(struct pci_dev *dev, void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) { + const char *bus_type = pcie_is_cxl(dev) ? "CXL" : "PCIe"; int layer, agent; int id = pci_dev_id(dev); const char *level; if (!info->status) { - pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n", - aer_error_severity_string[info->severity]); + pci_err(dev, "%s Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n", + bus_type, aer_error_severity_string[info->severity]); goto out; } @@ -709,8 +710,8 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR; - pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n", - aer_error_severity_string[info->severity], + pci_printk(level, dev, "%s Bus Error: severity=%s, type=%s, (%s)\n", + bus_type, aer_error_severity_string[info->severity], aer_error_layer[layer], aer_agent_string[agent]); pci_printk(level, dev, " device [%04x:%04x] error status/mask=%08x/%08x\n", @@ -725,7 +726,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) if (info->id && info->error_dev_num > 1 && info->id == id) pci_err(dev, " Error of this Agent is reported first\n"); - trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask), + trace_aer_event(dev_name(&dev->dev), bus_type, (info->status & ~info->mask), info->severity, info->tlp_header_valid, &info->tlp); } @@ -759,6 +760,7 @@ EXPORT_SYMBOL_GPL(cper_severity_to_aer); void pci_print_aer(struct pci_dev *dev, int aer_severity, struct aer_capability_regs *aer) { + const char *bus_type = pcie_is_cxl(dev) ? "CXL" : "PCIe"; int layer, agent, tlp_header_valid = 0; u32 status, mask; struct aer_err_info info; @@ -793,7 +795,7 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, if (tlp_header_valid) pcie_print_tlp_log(dev, &aer->header_log, dev_fmt(" ")); - trace_aer_event(dev_name(&dev->dev), (status & ~mask), + trace_aer_event(dev_name(&dev->dev), bus_type, (status & ~mask), aer_severity, tlp_header_valid, &aer->header_log); } EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL"); diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index e5f7ee0864e7..1bf8e7050ba8 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -297,15 +297,17 @@ TRACE_EVENT(non_standard_event, TRACE_EVENT(aer_event, TP_PROTO(const char *dev_name, + const char *bus_type, const u32 status, const u8 severity, const u8 tlp_header_valid, struct pcie_tlp_log *tlp), - TP_ARGS(dev_name, status, severity, tlp_header_valid, tlp), + TP_ARGS(dev_name, bus_type, status, severity, tlp_header_valid, tlp), TP_STRUCT__entry( __string( dev_name, dev_name ) + __string( bus_type, bus_type ) __field( u32, status ) __field( u8, severity ) __field( u8, tlp_header_valid) @@ -314,6 +316,7 @@ TRACE_EVENT(aer_event, TP_fast_assign( __assign_str(dev_name); + __assign_str(bus_type); __entry->status = status; __entry->severity = severity; __entry->tlp_header_valid = tlp_header_valid; @@ -325,8 +328,8 @@ TRACE_EVENT(aer_event, } ), - TP_printk("%s PCIe Bus Error: severity=%s, %s, TLP Header=%s\n", - __get_str(dev_name), + TP_printk("%s %s Bus Error: severity=%s, %s, TLP Header=%s\n", + __get_str(dev_name), __get_str(bus_type), __entry->severity == AER_CORRECTABLE ? "Corrected" : __entry->severity == AER_FATAL ? "Fatal" : "Uncorrected, non-fatal",