diff mbox series

[RESEND,2/4] PCI/AER: Add Correctable Errors rate limiting

Message ID 68ef082c855b4e1d094dcfc9a861f43488b64922.1736341506.git.karolina.stolarek@oracle.com (mailing list archive)
State New
Delegated to: Bjorn Helgaas
Headers show
Series Rate limit reporting of Correctable Errors | expand

Commit Message

Karolina Stolarek Jan. 8, 2025, 1:55 p.m. UTC
When Link integrity is compromised, we may see excessive logging of
Correctable Errors. Errors of this kind are handled by the hardware,
so the messages are purely informational. It should suffice to report
such an error once in a while and to state how many messages were
suppressed in the meantime.

Add a ratelimit_state to control the number of printed Correctable
Errors per Root Port and check it each time a Correctable Error is
to be reported.

Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
---
 drivers/pci/pcie/aer.c | 44 ++++++++++++++++++++++++++++--------------
 include/linux/pci.h    |  1 +
 2 files changed, 31 insertions(+), 14 deletions(-)
diff mbox series

Patch

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index b13690fd172f..5c34cc2b5bf3 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -40,6 +40,8 @@ 
 #define AER_MAX_TYPEOF_COR_ERRS		16	/* as per PCI_ERR_COR_STATUS */
 #define AER_MAX_TYPEOF_UNCOR_ERRS	27	/* as per PCI_ERR_UNCOR_STATUS*/
 
+#define AER_COR_ERR_INTERVAL		(2 * HZ)
+
 struct aer_err_source {
 	u32 status;			/* PCI_ERR_ROOT_STATUS */
 	u32 id;				/* PCI_ERR_ROOT_ERR_SRC */
@@ -375,6 +377,9 @@  void pci_aer_init(struct pci_dev *dev)
 
 	dev->aer_stats = kzalloc(sizeof(struct aer_stats), GFP_KERNEL);
 
+	/* Allow Root Port to report a Correctable Error message every 2 seconds */
+	ratelimit_state_init(&dev->cor_rs, AER_COR_ERR_INTERVAL, 1);
+
 	/*
 	 * We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER,
 	 * PCI_ERR_COR_MASK, and PCI_ERR_CAP.  Root and Root Complex Event
@@ -766,11 +771,13 @@  void pci_print_aer(struct pci_dev *dev, int aer_severity,
 	u32 status, mask;
 	const char *level;
 	struct aer_err_info info;
+	bool no_ratelimit = true;
 
 	if (aer_severity == AER_CORRECTABLE) {
 		status = aer->cor_status;
 		mask = aer->cor_mask;
 		level = KERN_WARNING;
+		no_ratelimit = __ratelimit(&dev->cor_rs);
 	} else {
 		status = aer->uncor_status;
 		mask = aer->uncor_mask;
@@ -787,17 +794,20 @@  void pci_print_aer(struct pci_dev *dev, int aer_severity,
 	info.mask = mask;
 	info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
 
-	pci_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
-	__aer_print_error(dev, &info, level);
-	pci_printk(level, dev, "aer_layer=%s, aer_agent=%s\n",
-		   aer_error_layer[layer], aer_agent_string[agent]);
+	if (no_ratelimit) {
+		pci_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n",
+			   status, mask);
+		__aer_print_error(dev, &info, level);
+		pci_printk(level, dev, "aer_layer=%s, aer_agent=%s\n",
+			   aer_error_layer[layer], aer_agent_string[agent]);
 
-	if (aer_severity != AER_CORRECTABLE)
-		pci_printk(level, dev, "aer_uncor_severity: 0x%08x\n",
-			   aer->uncor_severity);
+		if (aer_severity != AER_CORRECTABLE)
+			pci_printk(level, dev, "aer_uncor_severity: 0x%08x\n",
+				   aer->uncor_severity);
 
-	if (tlp_header_valid)
-		__print_tlp_header(dev, &aer->header_log);
+		if (tlp_header_valid)
+			__print_tlp_header(dev, &aer->header_log);
+	}
 
 	trace_aer_event(dev_name(&dev->dev), (status & ~mask),
 			aer_severity, tlp_header_valid, &aer->header_log);
@@ -1256,13 +1266,14 @@  int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
 }
 
 static inline void aer_process_err_devices(struct aer_err_info *e_info,
-					   const char *level)
+					   const char *level,
+					   bool no_ratelimit)
 {
 	int i;
 
 	/* Report all before handle them, not to lost records by reset etc. */
 	for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
-		if (aer_get_device_error_info(e_info->dev[i], e_info))
+		if (aer_get_device_error_info(e_info->dev[i], e_info) && no_ratelimit)
 			aer_print_error(e_info->dev[i], e_info, level);
 	}
 	for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
@@ -1282,6 +1293,7 @@  static void aer_isr_one_error(struct aer_rpc *rpc,
 	struct pci_dev *pdev = rpc->rpd;
 	struct aer_err_info e_info;
 	const char *level;
+	bool no_ratelimit = true;
 
 	pci_rootport_aer_stats_incr(pdev, e_src);
 
@@ -1298,10 +1310,14 @@  static void aer_isr_one_error(struct aer_rpc *rpc,
 			e_info.multi_error_valid = 1;
 		else
 			e_info.multi_error_valid = 0;
-		aer_print_port_info(pdev, &e_info, level);
+
+		no_ratelimit = __ratelimit(&pdev->cor_rs);
+
+		if (no_ratelimit)
+			aer_print_port_info(pdev, &e_info, level);
 
 		if (find_source_device(pdev, &e_info))
-			aer_process_err_devices(&e_info, level);
+			aer_process_err_devices(&e_info, level, no_ratelimit);
 	}
 
 	if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) {
@@ -1321,7 +1337,7 @@  static void aer_isr_one_error(struct aer_rpc *rpc,
 		aer_print_port_info(pdev, &e_info, level);
 
 		if (find_source_device(pdev, &e_info))
-			aer_process_err_devices(&e_info, level);
+			aer_process_err_devices(&e_info, level, no_ratelimit);
 	}
 }
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index b5eb8bda655d..a736547396ca 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -347,6 +347,7 @@  struct pci_dev {
 #ifdef CONFIG_PCIEAER
 	u16		aer_cap;	/* AER capability offset */
 	struct aer_stats *aer_stats;	/* AER stats for this device */
+	struct ratelimit_state cor_rs;	/* Correctable Errors Ratelimit */
 #endif
 #ifdef CONFIG_PCIEPORTBUS
 	struct rcec_ea	*rcec_ea;	/* RCEC cached endpoint association */