diff mbox series

[PATCHv3,pci-next,2/2] PCI/AER: Rate limit the reporting of the correctable errors

Message ID 20230606035442.2886343-2-grundler@chromium.org (mailing list archive)
State Changes Requested
Delegated to: Bjorn Helgaas
Headers show
Series [PATCHv3,pci-next,1/2] PCI/AER: correctable error message as KERN_INFO | expand

Commit Message

Grant Grundler June 6, 2023, 3:54 a.m. UTC
From: Rajat Khandelwal <rajat.khandelwal@linux.intel.com>

There are many instances where correctable errors tend to inundate
the message buffer. We observe such instances during thunderbolt PCIe
tunneling.

It's true that they are mitigated by the hardware and are non-fatal
but we shouldn't be spamming the logs with such correctable errors as it
confuses other kernel developers less familiar with PCI errors, support
staff, and users who happen to look at the logs, hence rate limit them.

A typical example log inside an HP TBT4 dock:
[54912.661142] pcieport 0000:00:07.0: AER: Multiple Corrected error received: 0000:2b:00.0
[54912.661194] igc 0000:2b:00.0: PCIe Bus Error: severity=Corrected, type=Data Link Layer, (Transmitter ID)
[54912.661203] igc 0000:2b:00.0:   device [8086:5502] error status/mask=00001100/00002000
[54912.661211] igc 0000:2b:00.0:    [ 8] Rollover
[54912.661219] igc 0000:2b:00.0:    [12] Timeout
[54982.838760] pcieport 0000:00:07.0: AER: Corrected error received: 0000:2b:00.0
[54982.838798] igc 0000:2b:00.0: PCIe Bus Error: severity=Corrected, type=Data Link Layer, (Transmitter ID)
[54982.838808] igc 0000:2b:00.0:   device [8086:5502] error status/mask=00001000/00002000
[54982.838817] igc 0000:2b:00.0:    [12] Timeout

This gets repeated continuously, thus inundating the buffer.

Signed-off-by: Rajat Khandelwal <rajat.khandelwal@linux.intel.com>
Signed-off-by: Grant Grundler <grundler@chromium.org>
---
 drivers/pci/pcie/aer.c | 80 +++++++++++++++++++++++++++---------------
 1 file changed, 51 insertions(+), 29 deletions(-)

Comments

kernel test robot June 6, 2023, 7:21 a.m. UTC | #1
Hi Grant,

kernel test robot noticed the following build errors:

[auto build test ERROR on pci/next]
[also build test ERROR on pci/for-linus linus/master v6.4-rc5 next-20230606]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Grant-Grundler/PCI-AER-Rate-limit-the-reporting-of-the-correctable-errors/20230606-115515
base:   https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git next
patch link:    https://lore.kernel.org/r/20230606035442.2886343-2-grundler%40chromium.org
patch subject: [PATCHv3 pci-next 2/2] PCI/AER: Rate limit the reporting of the correctable errors
config: x86_64-rhel-8.3 (https://download.01.org/0day-ci/archive/20230606/202306061553.YXoBaXAM-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build):
        git remote add pci https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git
        git fetch pci next
        git checkout pci/next
        b4 shazam https://lore.kernel.org/r/20230606035442.2886343-2-grundler@chromium.org
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=x86_64 olddefconfig
        make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash drivers/pci/pcie/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202306061553.YXoBaXAM-lkp@intel.com/

All error/warnings (new ones prefixed by >>):

   drivers/pci/pcie/aer.c: In function '__aer_print_error':
>> drivers/pci/pcie/aer.c:717:44: error: expected ';' before ':' token
     717 |                 pci_err(dev, "%s", aer_msg):
         |                                            ^
   In file included from include/linux/device.h:15,
                    from include/linux/pci.h:37,
                    from drivers/pci/pcie/aer.c:20:
   drivers/pci/pcie/aer.c: In function 'aer_print_error':
>> drivers/pci/pcie/aer.c:15:21: warning: too many arguments for format [-Wformat-extra-args]
      15 | #define pr_fmt(fmt) "AER: " fmt
         |                     ^~~~~~~
   include/linux/dev_printk.h:110:30: note: in definition of macro 'dev_printk_index_wrap'
     110 |                 _p_func(dev, fmt, ##__VA_ARGS__);                       \
         |                              ^~~
   drivers/pci/pcie/aer.c:16:17: note: in expansion of macro 'pr_fmt'
      16 | #define dev_fmt pr_fmt
         |                 ^~~~~~
   include/linux/dev_printk.h:144:56: note: in expansion of macro 'dev_fmt'
     144 |         dev_printk_index_wrap(_dev_err, KERN_ERR, dev, dev_fmt(fmt), ##__VA_ARGS__)
         |                                                        ^~~~~~~
   include/linux/pci.h:2600:41: note: in expansion of macro 'dev_err'
    2600 | #define pci_err(pdev, fmt, arg...)      dev_err(&(pdev)->dev, fmt, ##arg)
         |                                         ^~~~~~~
   drivers/pci/pcie/aer.c:743:17: note: in expansion of macro 'pci_err'
     743 |                 pci_err(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
         |                 ^~~~~~~
   drivers/pci/pcie/aer.c: At top level:
   drivers/pci/pcie/aer.c:626:13: warning: 'pci_dev_aer_stats_incr' defined but not used [-Wunused-function]
     626 | static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
         |             ^~~~~~~~~~~~~~~~~~~~~~


vim +717 drivers/pci/pcie/aer.c

   684	
   685	static void __aer_print_error(struct pci_dev *dev,
   686				      struct aer_err_info *info)
   687	{
   688		const char **strings;
   689		char aer_msg[512];
   690		unsigned long status = info->status & ~info->mask;
   691		int i;
   692	
   693		memset(aer_msg, 0, sizeof(*aer_msg));
   694		snprintf(aer_msg, sizeof(*aer_msg), "aer_status: 0x%08x, aer_mask: 0x%08x\n",
   695				info->status, info->mask);
   696	
   697		strings = (info->severity == AER_CORRECTABLE) ?
   698			aer_correctable_error_string : aer_uncorrectable_error_string;
   699	
   700		for_each_set_bit(i, &status, 32) {
   701			const char *errmsg = strings[i];
   702			char bitmsg[64];
   703			memset(bitmsg, 0, sizeof(*bitmsg));
   704	
   705			if (!errmsg)
   706				errmsg = "Unknown Error Bit";
   707	
   708			snprintf(bitmsg, sizeof(*bitmsg), "   [%2d] %-22s%s\n", i, errmsg,
   709				    info->first_error == i ? " (First)" : "");
   710	
   711			strlcat(aer_msg, bitmsg, sizeof(*aer_msg));
   712		}
   713	
   714		if (info->severity == AER_CORRECTABLE)
   715			pci_info_ratelimited(dev, "%s", aer_msg);
   716		else
 > 717			pci_err(dev, "%s", aer_msg):
   718	
   719		pci_dev_aer_stats_incr(dev, info);
   720	}
   721
kernel test robot June 6, 2023, 9:49 a.m. UTC | #2
Hi Grant,

kernel test robot noticed the following build errors:

[auto build test ERROR on pci/next]
[also build test ERROR on pci/for-linus linus/master v6.4-rc5 next-20230606]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Grant-Grundler/PCI-AER-Rate-limit-the-reporting-of-the-correctable-errors/20230606-115515
base:   https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git next
patch link:    https://lore.kernel.org/r/20230606035442.2886343-2-grundler%40chromium.org
patch subject: [PATCHv3 pci-next 2/2] PCI/AER: Rate limit the reporting of the correctable errors
config: x86_64-randconfig-x051-20230606 (https://download.01.org/0day-ci/archive/20230606/202306061710.tDjm3jHD-lkp@intel.com/config)
compiler: clang version 15.0.7 (https://github.com/llvm/llvm-project.git 8dfdcc7b7bf66834a761bd8de445840ef68e4d1a)
reproduce (this is a W=1 build):
        mkdir -p ~/bin
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        git remote add pci https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git
        git fetch pci next
        git checkout pci/next
        b4 shazam https://lore.kernel.org/r/20230606035442.2886343-2-grundler@chromium.org
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang ~/bin/make.cross W=1 O=build_dir ARCH=x86_64 olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang ~/bin/make.cross W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash drivers/pci/pcie/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202306061710.tDjm3jHD-lkp@intel.com/

All error/warnings (new ones prefixed by >>):

>> drivers/pci/pcie/aer.c:717:30: error: expected ';' after expression
                   pci_err(dev, "%s", aer_msg):
                                              ^
                                              ;
>> drivers/pci/pcie/aer.c:746:28: warning: data argument not used by format string [-Wformat-extra-args]
                           aer_error_layer[layer], aer_agent_string[agent],
                           ~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/pci.h:2600:65: note: expanded from macro 'pci_err'
   #define pci_err(pdev, fmt, arg...)      dev_err(&(pdev)->dev, fmt, ##arg)
                                           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~
   include/linux/dev_printk.h:144:65: note: expanded from macro 'dev_err'
           dev_printk_index_wrap(_dev_err, KERN_ERR, dev, dev_fmt(fmt), ##__VA_ARGS__)
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~
   include/linux/dev_printk.h:110:23: note: expanded from macro 'dev_printk_index_wrap'
                   _p_func(dev, fmt, ##__VA_ARGS__);                       \
                                ~~~    ^
   1 warning and 1 error generated.


vim +717 drivers/pci/pcie/aer.c

   684	
   685	static void __aer_print_error(struct pci_dev *dev,
   686				      struct aer_err_info *info)
   687	{
   688		const char **strings;
   689		char aer_msg[512];
   690		unsigned long status = info->status & ~info->mask;
   691		int i;
   692	
   693		memset(aer_msg, 0, sizeof(*aer_msg));
   694		snprintf(aer_msg, sizeof(*aer_msg), "aer_status: 0x%08x, aer_mask: 0x%08x\n",
   695				info->status, info->mask);
   696	
   697		strings = (info->severity == AER_CORRECTABLE) ?
   698			aer_correctable_error_string : aer_uncorrectable_error_string;
   699	
   700		for_each_set_bit(i, &status, 32) {
   701			const char *errmsg = strings[i];
   702			char bitmsg[64];
   703			memset(bitmsg, 0, sizeof(*bitmsg));
   704	
   705			if (!errmsg)
   706				errmsg = "Unknown Error Bit";
   707	
   708			snprintf(bitmsg, sizeof(*bitmsg), "   [%2d] %-22s%s\n", i, errmsg,
   709				    info->first_error == i ? " (First)" : "");
   710	
   711			strlcat(aer_msg, bitmsg, sizeof(*aer_msg));
   712		}
   713	
   714		if (info->severity == AER_CORRECTABLE)
   715			pci_info_ratelimited(dev, "%s", aer_msg);
   716		else
 > 717			pci_err(dev, "%s", aer_msg):
   718	
   719		pci_dev_aer_stats_incr(dev, info);
   720	}
   721	
   722	void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
   723	{
   724		int layer, agent;
   725		int id = ((dev->bus->number << 8) | dev->devfn);
   726	
   727		if (!info->status) {
   728			pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
   729				aer_error_severity_string[info->severity]);
   730			goto out;
   731		}
   732	
   733		layer = AER_GET_LAYER_ERROR(info->severity, info->status);
   734		agent = AER_GET_AGENT(info->severity, info->status);
   735	
   736		if (info->severity == AER_CORRECTABLE) {
   737			pci_info_ratelimited(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n"
   738					"  device [%04x:%04x] error status/mask=%08x/%08x\n",
   739					     aer_error_severity_string[info->severity],
   740					     aer_error_layer[layer], aer_agent_string[agent],
   741					     dev->vendor, dev->device, info->status, info->mask);
   742		} else {
   743			pci_err(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
   744				"  device [%04x:%04x] error status/mask=%08x/%08x\n",
   745				aer_error_severity_string[info->severity],
 > 746				aer_error_layer[layer], aer_agent_string[agent],
   747				dev->vendor, dev->device, info->status, info->mask);
   748		}
   749	
   750		__aer_print_error(dev, info);
   751	
   752		if (info->tlp_header_valid)
   753			__print_tlp_header(dev, &info->tlp);
   754	
   755	out:
   756		if (info->id && info->error_dev_num > 1 && info->id == id)
   757			pci_err(dev, "  Error of this Agent is reported first\n");
   758	
   759		trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
   760				info->severity, info->tlp_header_valid, &info->tlp);
   761	}
   762
diff mbox series

Patch

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index d7bfc6070ddb..830f5a1261c9 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -686,26 +686,36 @@  static void __aer_print_error(struct pci_dev *dev,
 			      struct aer_err_info *info)
 {
 	const char **strings;
+	char aer_msg[512];
 	unsigned long status = info->status & ~info->mask;
-	const char *level, *errmsg;
 	int i;
 
-	if (info->severity == AER_CORRECTABLE) {
-		strings = aer_correctable_error_string;
-		level = KERN_INFO;
-	} else {
-		strings = aer_uncorrectable_error_string;
-		level = KERN_ERR;
-	}
+	memset(aer_msg, 0, sizeof(*aer_msg));
+	snprintf(aer_msg, sizeof(*aer_msg), "aer_status: 0x%08x, aer_mask: 0x%08x\n",
+			info->status, info->mask);
+
+	strings = (info->severity == AER_CORRECTABLE) ?
+		aer_correctable_error_string : aer_uncorrectable_error_string;
 
 	for_each_set_bit(i, &status, 32) {
-		errmsg = strings[i];
+		const char *errmsg = strings[i];
+		char bitmsg[64];
+		memset(bitmsg, 0, sizeof(*bitmsg));
+
 		if (!errmsg)
 			errmsg = "Unknown Error Bit";
 
-		pci_printk(level, dev, "   [%2d] %-22s%s\n", i, errmsg,
-				info->first_error == i ? " (First)" : "");
+		snprintf(bitmsg, sizeof(*bitmsg), "   [%2d] %-22s%s\n", i, errmsg,
+			    info->first_error == i ? " (First)" : "");
+
+		strlcat(aer_msg, bitmsg, sizeof(*aer_msg));
 	}
+
+	if (info->severity == AER_CORRECTABLE)
+		pci_info_ratelimited(dev, "%s", aer_msg);
+	else
+		pci_err(dev, "%s", aer_msg):
+
 	pci_dev_aer_stats_incr(dev, info);
 }
 
@@ -713,7 +723,6 @@  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
 {
 	int layer, agent;
 	int id = ((dev->bus->number << 8) | dev->devfn);
-	const char *level;
 
 	if (!info->status) {
 		pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
@@ -724,14 +733,19 @@  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
 	layer = AER_GET_LAYER_ERROR(info->severity, info->status);
 	agent = AER_GET_AGENT(info->severity, info->status);
 
-	level = (info->severity == AER_CORRECTABLE) ? KERN_INFO : KERN_ERR;
-
-	pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
-		   aer_error_severity_string[info->severity],
-		   aer_error_layer[layer], aer_agent_string[agent]);
-
-	pci_printk(level, dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
-		   dev->vendor, dev->device, info->status, info->mask);
+	if (info->severity == AER_CORRECTABLE) {
+		pci_info_ratelimited(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n"
+				"  device [%04x:%04x] error status/mask=%08x/%08x\n",
+				     aer_error_severity_string[info->severity],
+				     aer_error_layer[layer], aer_agent_string[agent],
+				     dev->vendor, dev->device, info->status, info->mask);
+	} else {
+		pci_err(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
+			"  device [%04x:%04x] error status/mask=%08x/%08x\n",
+			aer_error_severity_string[info->severity],
+			aer_error_layer[layer], aer_agent_string[agent],
+			dev->vendor, dev->device, info->status, info->mask);
+	}
 
 	__aer_print_error(dev, info);
 
@@ -751,11 +765,19 @@  static void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
 	u8 bus = info->id >> 8;
 	u8 devfn = info->id & 0xff;
 
-	pci_info(dev, "%s%s error received: %04x:%02x:%02x.%d\n",
-		 info->multi_error_valid ? "Multiple " : "",
-		 aer_error_severity_string[info->severity],
-		 pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn),
-		 PCI_FUNC(devfn));
+	if (info->severity == AER_CORRECTABLE)
+		pci_info_ratelimited(dev, "%s%s error received: %04x:%02x:%02x.%d\n",
+				     info->multi_error_valid ? "Multiple " : "",
+				     aer_error_severity_string[info->severity],
+				     pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn),
+				     PCI_FUNC(devfn));
+	else
+		pci_info(dev, "%s%s error received: %04x:%02x:%02x.%d\n",
+			 info->multi_error_valid ? "Multiple " : "",
+			 aer_error_severity_string[info->severity],
+			 pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn),
+			 PCI_FUNC(devfn));
+
 }
 
 #ifdef CONFIG_ACPI_APEI_PCIEAER
@@ -798,7 +820,7 @@  void cper_print_aer(struct pci_dev *dev, int aer_severity,
 	info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
 
 	if (aer_severity == AER_CORRECTABLE)
-		pci_info(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
+		pci_info_ratelimited(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
 	else
 		pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
 
@@ -808,9 +830,9 @@  void cper_print_aer(struct pci_dev *dev, int aer_severity,
 		pci_info(dev, "aer_layer=%s, aer_agent=%s\n",
 			aer_error_layer[layer], aer_agent_string[agent]);
 	} else {
-		pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
-			aer_error_layer[layer], aer_agent_string[agent]);
-		pci_err(dev, "aer_uncor_severity: 0x%08x\n",
+		pci_err(dev, "aer_layer=%s, aer_agent=%s,"
+			" aer_uncor_severity=0x%08x\n",
+			aer_error_layer[layer], aer_agent_string[agent],
 			aer->uncor_severity);
 	}