diff mbox series

[RFC,3/4] PCI/AER: Increase the rate limit interval after threshold

Message ID 8e44f971e4e2abf89f610688a396485d8999c569.1734005191.git.karolina.stolarek@oracle.com (mailing list archive)
State New
Delegated to: Bjorn Helgaas
Headers show
Series Rate limit PCIe Correctable Errors | expand

Commit Message

Karolina Stolarek Dec. 12, 2024, 2:27 p.m. UTC
In extreme circumstances, the default rate limit interval might
not be enough and a longer one is needed. To avoid spamming the
logs, raise the interval to 30 seconds for a specific Root Port
once more than 1000 Correctable Errors have been counted for it.

Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
---
 drivers/pci/pcie/aer.c | 22 +++++++++++++++++++++-
 include/linux/pci.h    |  1 +
 2 files changed, 22 insertions(+), 1 deletion(-)
diff mbox series

Patch

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 5c34cc2b5bf3..98bf8bbadc07 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -40,7 +40,9 @@ 
 #define AER_MAX_TYPEOF_COR_ERRS		16	/* as per PCI_ERR_COR_STATUS */
 #define AER_MAX_TYPEOF_UNCOR_ERRS	27	/* as per PCI_ERR_UNCOR_STATUS*/
 
+#define AER_COR_ERR_THRESHOLD		1000	/* total COR errs before long interval */
 #define AER_COR_ERR_INTERVAL		(2 * HZ)
+#define AER_COR_ERR_LONG_INTERVAL	(30 * HZ)	/* cor_rs interval past threshold */
 
 struct aer_err_source {
 	u32 status;			/* PCI_ERR_ROOT_STATUS */
@@ -670,6 +672,24 @@  static void pci_rootport_aer_stats_incr(struct pci_dev *pdev,
 	}
 }
 
+static bool report_aer_cor_err(struct pci_dev *pdev)
+{
+	struct ratelimit_state *rs = &pdev->cor_rs;
+	struct aer_stats *aer_stats = pdev->aer_stats;
+	unsigned int total_cor_errs = aer_stats->rootport_total_cor_errs;
+
+	/* A significant number of errors reported, increase the rate limit */
+	if (total_cor_errs > AER_COR_ERR_THRESHOLD && !pdev->cor_err_throttled) {
+		pci_warn(pdev,
+			 "Over %d Correctable Errors reported, increasing the rate limit",
+			 AER_COR_ERR_THRESHOLD);
+		rs->interval = AER_COR_ERR_LONG_INTERVAL;
+		pdev->cor_err_throttled = 1;
+	}
+
+	return __ratelimit(&pdev->cor_rs);
+}
+
 static void __print_tlp_header(struct pci_dev *dev, struct pcie_tlp_log *t)
 {
 	pci_err(dev, "  TLP Header: %08x %08x %08x %08x\n",
@@ -1311,7 +1331,7 @@  static void aer_isr_one_error(struct aer_rpc *rpc,
 		else
 			e_info.multi_error_valid = 0;
 
-		no_ratelimit = __ratelimit(&pdev->cor_rs);
+		no_ratelimit = report_aer_cor_err(pdev);
 
 		if (no_ratelimit)
 			aer_print_port_info(pdev, &e_info, level);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 3dfa2aac31b4..b01bfb339e4e 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -348,6 +348,7 @@  struct pci_dev {
 	u16		aer_cap;	/* AER capability offset */
 	struct aer_stats *aer_stats;	/* AER stats for this device */
 	struct ratelimit_state cor_rs;	/* Correctable Errors Ratelimit */
+	unsigned int cor_err_throttled:1;
 #endif
 #ifdef CONFIG_PCIEPORTBUS
 	struct rcec_ea	*rcec_ea;	/* RCEC cached endpoint association */