diff mbox series

[v2,4/4] RAS: Trace more information in aer_event

Message ID 20240125062802.50819-5-qingshun.wang@linux.intel.com (mailing list archive)
State New
Delegated to: Bjorn Helgaas
Headers show
Series PCI/AER: Handle Advisory Non-Fatal properly | expand

Commit Message

Wang, Qingshun Jan. 25, 2024, 6:28 a.m. UTC
Add the following fields to aer_event so that Advisory Non-Fatal and
other errors can be better understood by external observers:

  - cor_status		(Correctable Error Status)
  - cor_mask		(Correctable Error Mask)
  - uncor_status	(Uncorrectable Error Status)
  - uncor_severity	(Uncorrectable Error Severity)
  - uncor_mask		(Uncorrectable Error Mask)
  - aer_cap_ctrl	(AER Capabilities and Control)
  - link_status		(Link Status)
  - device_status	(Device Status)
  - device_control_2	(Device Control 2)

In addition to the raw register values, the values of the following
fields are extracted and logged for better observability:

  - "First Error Pointer" and "Completion Timeout Prefix/Header Log
    Capable" from "AER Capabilities and Control"
  - "Completion Timeout Value" and "Completion Timeout Disable"
    from "Device Control 2"

Signed-off-by: "Wang, Qingshun" <qingshun.wang@linux.intel.com>
---
 drivers/pci/pcie/aer.c        | 17 +++++++++++--
 include/ras/ras_event.h       | 48 ++++++++++++++++++++++++++++++++---
 include/uapi/linux/pci_regs.h |  1 +
 3 files changed, 60 insertions(+), 6 deletions(-)
diff mbox series

Patch

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index eec3406f727a..2f5639f6c40f 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -757,6 +757,7 @@  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
 	int layer, agent;
 	int id = pci_dev_id(dev);
 	const char *level;
+	struct aer_capability_regs aer_caps;
 
 	if (info->severity == AER_CORRECTABLE) {
 		status = info->cor_status;
@@ -793,8 +794,18 @@  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
 	if (info->id && info->error_dev_num > 1 && info->id == id)
 		pci_err(dev, "  Error of this Agent is reported first\n");
 
+	aer_caps = (struct aer_capability_regs) {
+	  .cor_status = info->cor_status,
+	  .cor_mask = info->cor_mask,
+	  .uncor_status = info->uncor_status,
+	  .uncor_severity = info->uncor_severity,
+	  .uncor_mask = info->uncor_mask,
+	  .cap_control = info->aer_cap_ctrl
+	};
 	trace_aer_event(dev_name(&dev->dev), (status & ~mask),
-			info->severity, info->tlp_header_valid, &info->tlp);
+			info->severity, info->tlp_header_valid, &info->tlp,
+			&aer_caps, info->link_status,
+			info->device_status, info->device_control_2);
 }
 
 static void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
@@ -870,7 +881,9 @@  void pci_print_aer(struct pci_dev *dev, int aer_severity,
 		__print_tlp_header(dev, &aer->header_log);
 
 	trace_aer_event(dev_name(&dev->dev), (status & ~mask),
-			aer_severity, tlp_header_valid, &aer->header_log);
+			aer_severity, tlp_header_valid, &aer->header_log,
+			aer, info.link_status,
+			info.device_status, info.device_control_2);
 }
 EXPORT_SYMBOL_NS_GPL(pci_print_aer, CXL);
 
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index cbd3ddd7c33d..a94997073d90 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -300,9 +300,14 @@  TRACE_EVENT(aer_event,
 		 const u32 status,
 		 const u8 severity,
 		 const u8 tlp_header_valid,
-		 struct aer_header_log_regs *tlp),
+		 struct aer_header_log_regs *tlp,
+		 struct aer_capability_regs *aer_caps,
+		 const u16 link_status,
+		 const u16 device_status,
+		 const u16 device_control_2),
 
-	TP_ARGS(dev_name, status, severity, tlp_header_valid, tlp),
+	TP_ARGS(dev_name, status, severity, tlp_header_valid, tlp,
+		aer_caps, link_status, device_status, device_control_2),
 
 	TP_STRUCT__entry(
 		__string(	dev_name,	dev_name	)
@@ -310,6 +315,10 @@  TRACE_EVENT(aer_event,
 		__field(	u8,		severity	)
 		__field(	u8, 		tlp_header_valid)
 		__array(	u32, 		tlp_header, 4	)
+		__field_struct(struct aer_capability_regs, aer_caps)
+		__field(	u16,		link_status	)
+		__field(	u16,		device_status	)
+		__field(	u16,		device_control_2)
 	),
 
 	TP_fast_assign(
@@ -317,6 +326,10 @@  TRACE_EVENT(aer_event,
 		__entry->status		= status;
 		__entry->severity	= severity;
 		__entry->tlp_header_valid = tlp_header_valid;
+		__entry->aer_caps	= *aer_caps;
+		__entry->link_status	= link_status;
+		__entry->device_status	= device_status;
+		__entry->device_control_2 = device_control_2;
 		if (tlp_header_valid) {
 			__entry->tlp_header[0] = tlp->dw0;
 			__entry->tlp_header[1] = tlp->dw1;
@@ -325,7 +338,20 @@  TRACE_EVENT(aer_event,
 		}
 	),
 
-	TP_printk("%s PCIe Bus Error: severity=%s, %s, TLP Header=%s\n",
+	TP_printk("%s PCIe Bus Error: severity=%s, %s, TLP Header=%s, "
+		  "Correctable Error Status=0x%08x, "
+		  "Correctable Error Mask=0x%08x, "
+		  "Uncorrectable Error Status=0x%08x, "
+		  "Uncorrectable Error Severity=0x%08x, "
+		  "Uncorrectable Error Mask=0x%08x, "
+		  "AER Capability and Control=0x%08x, "
+		  "First Error Pointer=0x%x, "
+		  "Completion Timeout Prefix/Header Log Capable=%s, "
+		  "Link Status=0x%04x, "
+		  "Device Status=0x%04x, "
+		  "Device Control 2=0x%04x, "
+		  "Completion Timeout Value=0x%x, "
+		  "Completion Timeout Disable=%s\n",
 		__get_str(dev_name),
 		__entry->severity == AER_CORRECTABLE ? "Corrected" :
 			__entry->severity == AER_FATAL ?
@@ -335,7 +361,21 @@  TRACE_EVENT(aer_event,
 		__print_flags(__entry->status, "|", aer_uncorrectable_errors),
 		__entry->tlp_header_valid ?
 			__print_array(__entry->tlp_header, 4, 4) :
-			"Not available")
+			"Not available",
+		__entry->aer_caps.cor_status,
+		__entry->aer_caps.cor_mask,
+		__entry->aer_caps.uncor_status,
+		__entry->aer_caps.uncor_severity,
+		__entry->aer_caps.uncor_mask,
+		__entry->aer_caps.cap_control,
+		PCI_ERR_CAP_FEP(__entry->aer_caps.cap_control),
+		__entry->aer_caps.cap_control & PCI_ERR_CAP_CTO_LOGC ? "True" : "False",
+		__entry->link_status,
+		__entry->device_status,
+		__entry->device_control_2,
+		__entry->device_control_2 & PCI_EXP_DEVCTL2_COMP_TIMEOUT,
+		__entry->device_control_2 & PCI_EXP_DEVCTL2_COMP_TMOUT_DIS ?
+					    "True" : "False")
 );
 
 /*
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index a39193213ff2..54160ed2a8c9 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -787,6 +787,7 @@ 
 #define  PCI_ERR_CAP_ECRC_GENE	0x00000040	/* ECRC Generation Enable */
 #define  PCI_ERR_CAP_ECRC_CHKC	0x00000080	/* ECRC Check Capable */
 #define  PCI_ERR_CAP_ECRC_CHKE	0x00000100	/* ECRC Check Enable */
+#define  PCI_ERR_CAP_CTO_LOGC	0x00001000	/* Completion Timeout Prefix/Header Log Capable */
 #define PCI_ERR_HEADER_LOG	0x1c	/* Header Log Register (16 bytes) */
 #define PCI_ERR_ROOT_COMMAND	0x2c	/* Root Error Command */
 #define  PCI_ERR_ROOT_CMD_COR_EN	0x00000001 /* Correctable Err Reporting Enable */