diff mbox series

[PATCHv2,pci-next,1/2] PCI/AER: correctable error message as KERN_INFO

Message ID 20230317175109.3859943-1-grundler@chromium.org (mailing list archive)
State Superseded
Delegated to: Bjorn Helgaas
Headers show
Series [PATCHv2,pci-next,1/2] PCI/AER: correctable error message as KERN_INFO | expand

Commit Message

Grant Grundler March 17, 2023, 5:51 p.m. UTC
Since correctable errors have been corrected (and counted), the dmesg output
should not be reported as a warning, but rather as "informational".

Otherwise, using a certain well known vendor's PCIe parts in a USB4 docking
station, the dmesg buffer can be spammed with correctable errors, 717 bytes
per instance, potentially many MB per day.

Given the "WARN" priority, these messages have already confused the typical
user that stumbles across them, support staff (triaging feedback reports),
and more than a few linux kernel devs. Changing to INFO will hide these
messages from most audiences.

Signed-off-by: Grant Grundler <grundler@chromium.org>
---
 drivers/pci/pcie/aer.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

Comments

Kuppuswamy Sathyanarayanan March 17, 2023, 6:50 p.m. UTC | #1
On 3/17/23 10:51 AM, Grant Grundler wrote:
> Since correctable errors have been corrected (and counted), the dmesg output
> should not be reported as a warning, but rather as "informational".
> 
> Otherwise, using a certain well known vendor's PCIe parts in a USB4 docking
> station, the dmesg buffer can be spammed with correctable errors, 717 bytes
> per instance, potentially many MB per day.

Why don't you investigate why you are getting so many correctable errors?
Isn't solving the problem preferable to hiding the logs?

> 
> Given the "WARN" priority, these messages have already confused the typical
> user that stumbles across them, support staff (triaging feedback reports),
> and more than a few linux kernel devs. Changing to INFO will hide these
> messages from most audiences.
> 
> Signed-off-by: Grant Grundler <grundler@chromium.org>
> ---
>  drivers/pci/pcie/aer.c | 29 +++++++++++++++++++----------
>  1 file changed, 19 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index f6c24ded134c..cb6b96233967 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -687,23 +687,29 @@ static void __aer_print_error(struct pci_dev *dev,
>  {
>  	const char **strings;
>  	unsigned long status = info->status & ~info->mask;
> -	const char *level, *errmsg;
>  	int i;
>  
>  	if (info->severity == AER_CORRECTABLE) {
>  		strings = aer_correctable_error_string;
> -		level = KERN_WARNING;
> +		pci_info(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n",
> +			info->status, info->mask);
>  	} else {
>  		strings = aer_uncorrectable_error_string;
> -		level = KERN_ERR;
> +		pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n",
> +			info->status, info->mask);
>  	}
>  
>  	for_each_set_bit(i, &status, 32) {
> -		errmsg = strings[i];
> +		const char *errmsg = strings[i];
> +
>  		if (!errmsg)
>  			errmsg = "Unknown Error Bit";
>  
> -		pci_printk(level, dev, "   [%2d] %-22s%s\n", i, errmsg,
> +		if (info->severity == AER_CORRECTABLE)
> +			pci_info(dev, "   [%2d] %-22s%s\n", i, errmsg,
> +				info->first_error == i ? " (First)" : "");
> +		else
> +			pci_err(dev, "   [%2d] %-22s%s\n", i, errmsg,
>  				info->first_error == i ? " (First)" : "");
>  	}
>  	pci_dev_aer_stats_incr(dev, info);
> @@ -724,7 +730,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
>  	layer = AER_GET_LAYER_ERROR(info->severity, info->status);
>  	agent = AER_GET_AGENT(info->severity, info->status);
>  
> -	level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR;
> +	level = (info->severity == AER_CORRECTABLE) ? KERN_INFO : KERN_ERR;
>  
>  	pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
>  		   aer_error_severity_string[info->severity],
> @@ -797,14 +803,17 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
>  	info.mask = mask;
>  	info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
>  
> -	pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
>  	__aer_print_error(dev, &info);
> -	pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
> -		aer_error_layer[layer], aer_agent_string[agent]);
>  
> -	if (aer_severity != AER_CORRECTABLE)
> +	if (aer_severity == AER_CORRECTABLE) {
> +		pci_info(dev, "aer_layer=%s, aer_agent=%s\n",
> +			aer_error_layer[layer], aer_agent_string[agent]);
> +	} else {
> +		pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
> +			aer_error_layer[layer], aer_agent_string[agent]);
>  		pci_err(dev, "aer_uncor_severity: 0x%08x\n",
>  			aer->uncor_severity);
> +	}
>  
>  	if (tlp_header_valid)
>  		__print_tlp_header(dev, &aer->header_log);
Bjorn Helgaas March 17, 2023, 7:30 p.m. UTC | #2
On Fri, Mar 17, 2023 at 11:50:22AM -0700, Sathyanarayanan Kuppuswamy wrote:
> On 3/17/23 10:51 AM, Grant Grundler wrote:
> > Since correctable errors have been corrected (and counted), the dmesg output
> > should not be reported as a warning, but rather as "informational".
> > 
> > Otherwise, using a certain well known vendor's PCIe parts in a USB4 docking
> > station, the dmesg buffer can be spammed with correctable errors, 717 bytes
> > per instance, potentially many MB per day.
> 
> Why don't you investigate why you are getting so many correctable errors?
> Isn't solving the problem preferable to hiding the logs?

I hope there's some effort to find the cause of the errors, too.  But
I do think KERN_INFO is a reasonable level for errors that have
already been corrected.  KERN_ERR seems a little bit too severe to me.

Does changing to KERN_INFO keep the messages out of the dmesg log?  I
don't think it does, because *most* kernel messages are at KERN_INFO.
This may be just a commit log clarification.

I would like to know *which* devices are involved.  Is there some
reason for weasel-wording this?  Knowing which devices are involved
helps in triaging issue reports.  If there are any public reports on
mailing lists, etc, we could also cite those here to help users find
this solution.

> > Given the "WARN" priority, these messages have already confused the typical
> > user that stumbles across them, support staff (triaging feedback reports),
> > and more than a few linux kernel devs. Changing to INFO will hide these
> > messages from most audiences.
> > 
> > Signed-off-by: Grant Grundler <grundler@chromium.org>
> > ---
> >  drivers/pci/pcie/aer.c | 29 +++++++++++++++++++----------
> >  1 file changed, 19 insertions(+), 10 deletions(-)
> > 
> > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> > index f6c24ded134c..cb6b96233967 100644
> > --- a/drivers/pci/pcie/aer.c
> > +++ b/drivers/pci/pcie/aer.c
> > @@ -687,23 +687,29 @@ static void __aer_print_error(struct pci_dev *dev,
> >  {
> >  	const char **strings;
> >  	unsigned long status = info->status & ~info->mask;
> > -	const char *level, *errmsg;
> >  	int i;
> >  
> >  	if (info->severity == AER_CORRECTABLE) {
> >  		strings = aer_correctable_error_string;
> > -		level = KERN_WARNING;
> > +		pci_info(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n",
> > +			info->status, info->mask);
> >  	} else {
> >  		strings = aer_uncorrectable_error_string;
> > -		level = KERN_ERR;
> > +		pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n",
> > +			info->status, info->mask);
> >  	}
> >  
> >  	for_each_set_bit(i, &status, 32) {
> > -		errmsg = strings[i];
> > +		const char *errmsg = strings[i];
> > +
> >  		if (!errmsg)
> >  			errmsg = "Unknown Error Bit";
> >  
> > -		pci_printk(level, dev, "   [%2d] %-22s%s\n", i, errmsg,
> > +		if (info->severity == AER_CORRECTABLE)
> > +			pci_info(dev, "   [%2d] %-22s%s\n", i, errmsg,
> > +				info->first_error == i ? " (First)" : "");
> > +		else
> > +			pci_err(dev, "   [%2d] %-22s%s\n", i, errmsg,
> >  				info->first_error == i ? " (First)" : "");

The - 5 lines, + 11 lines diff and repetition of the printk strings
doesn't seem like an improvement compared to the -1, +1 in the v1
patch:

  @@ -692,7 +692,7 @@ static void __aer_print_error(struct pci_dev *dev,

          if (info->severity == AER_CORRECTABLE) {
                  strings = aer_correctable_error_string;
  -               level = KERN_WARNING;
  +               level = KERN_INFO;
          } else {

But maybe there's a reason?

> >  	}
> >  	pci_dev_aer_stats_incr(dev, info);
> > @@ -724,7 +730,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
> >  	layer = AER_GET_LAYER_ERROR(info->severity, info->status);
> >  	agent = AER_GET_AGENT(info->severity, info->status);
> >  
> > -	level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR;
> > +	level = (info->severity == AER_CORRECTABLE) ? KERN_INFO : KERN_ERR;
> >  
> >  	pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
> >  		   aer_error_severity_string[info->severity],
> > @@ -797,14 +803,17 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
> >  	info.mask = mask;
> >  	info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
> >  
> > -	pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
> >  	__aer_print_error(dev, &info);
> > -	pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
> > -		aer_error_layer[layer], aer_agent_string[agent]);
> >  
> > -	if (aer_severity != AER_CORRECTABLE)
> > +	if (aer_severity == AER_CORRECTABLE) {
> > +		pci_info(dev, "aer_layer=%s, aer_agent=%s\n",
> > +			aer_error_layer[layer], aer_agent_string[agent]);
> > +	} else {
> > +		pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
> > +			aer_error_layer[layer], aer_agent_string[agent]);
> >  		pci_err(dev, "aer_uncor_severity: 0x%08x\n",
> >  			aer->uncor_severity);
> > +	}
> >  
> >  	if (tlp_header_valid)
> >  		__print_tlp_header(dev, &aer->header_log);
> 
> -- 
> Sathyanarayanan Kuppuswamy
> Linux Kernel Developer
Grant Grundler March 19, 2023, 6 a.m. UTC | #3
On Fri, Mar 17, 2023 at 12:31 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> On Fri, Mar 17, 2023 at 11:50:22AM -0700, Sathyanarayanan Kuppuswamy wrote:
> > On 3/17/23 10:51 AM, Grant Grundler wrote:
> > > Since correctable errors have been corrected (and counted), the dmesg output
> > > should not be reported as a warning, but rather as "informational".
> > >
> > > Otherwise, using a certain well known vendor's PCIe parts in a USB4 docking
> > > station, the dmesg buffer can be spammed with correctable errors, 717 bytes
> > > per instance, potentially many MB per day.
> >
> > Why don't you investigate why you are getting so many correctable errors?

It's a bit difficult to insert a PCIe logic analyzer on someone else's
soldered down parts. Intel is the only party with all the HW in a form
that allows that level of debugging.

> > Isn't solving the problem preferable to hiding the logs?

Of course! But note that spamming the logs with low priority messages
isn't acceptable either. So we should be doing both.

> I hope there's some effort to find the cause of the errors, too.

Rajat (from Intel) has told me that Intel is investigating this.

I was told the devices are reporting two kinds of correctable
errors:replay rollover and replay timeout.

This may be related to a replay timer value (PCIERTP1) in the PCI
device is too short for PCIe tunneling (USB4).  It's not clear to me
if this value is hardcoded in the Intel NIC or if it's configurable.
But I've been waiting since last December for Intel to give me
guidance on what is implemented in their NIC.

> But I do think KERN_INFO is a reasonable level for errors that have
> already been corrected.  KERN_ERR seems a little bit too severe to me.
>
> Does changing to KERN_INFO keep the messages out of the dmesg log?  I
> don't think it does, because *most* kernel messages are at KERN_INFO.
> This may be just a commit log clarification.

True. I'm not sure why I thought KERN_INFO would not show up.  If they
do, I'll depend on the rate limiting patch to not spam the system
logs.

> I would like to know *which* devices are involved.
> Is there some reason for weasel-wording this?

I don't think there is. Rajat previously mentioned this was for the HP
TBT4 docking station. The two USB4 docking station vendors I'm
familiar with are both using Intel i225 NIC (foxville). This appears
to be a problem only with the NIC behind a PCIe tunnel and not a HW
problem with the docking station overall (which is what I previously
assumed.)

FTR, this same NIC works and does not report correctable errors when
directly connected to a PCI host controller. It's not as happy on the
other side of a USB4 PCIe tunnel but still appears to work correctly
(retries are succeeding). I'm not able to point at any NIC failures
(yet). But I'm certainly not very happy that the same company which is
the primary backer of thunderbolt and USB4 isn't able to
design/configure parts so they play well together.

>  Knowing which devices are involved
> helps in triaging issue reports.  If there are any public reports on
> mailing lists, etc, we could also cite those here to help users find
> this solution.

Yup - that's exactly how I've been helped so often in the past as well.


>
> > > Given the "WARN" priority, these messages have already confused the typical
> > > user that stumbles across them, support staff (triaging feedback reports),
> > > and more than a few linux kernel devs. Changing to INFO will hide these
> > > messages from most audiences.
> > >
> > > Signed-off-by: Grant Grundler <grundler@chromium.org>
> > > ---
> > >  drivers/pci/pcie/aer.c | 29 +++++++++++++++++++----------
> > >  1 file changed, 19 insertions(+), 10 deletions(-)
> > >
> > > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> > > index f6c24ded134c..cb6b96233967 100644
> > > --- a/drivers/pci/pcie/aer.c
> > > +++ b/drivers/pci/pcie/aer.c
> > > @@ -687,23 +687,29 @@ static void __aer_print_error(struct pci_dev *dev,
> > >  {
> > >     const char **strings;
> > >     unsigned long status = info->status & ~info->mask;
> > > -   const char *level, *errmsg;
> > >     int i;
> > >
> > >     if (info->severity == AER_CORRECTABLE) {
> > >             strings = aer_correctable_error_string;
> > > -           level = KERN_WARNING;
> > > +           pci_info(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n",
> > > +                   info->status, info->mask);
> > >     } else {
> > >             strings = aer_uncorrectable_error_string;
> > > -           level = KERN_ERR;
> > > +           pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n",
> > > +                   info->status, info->mask);
> > >     }
> > >
> > >     for_each_set_bit(i, &status, 32) {
> > > -           errmsg = strings[i];
> > > +           const char *errmsg = strings[i];
> > > +
> > >             if (!errmsg)
> > >                     errmsg = "Unknown Error Bit";
> > >
> > > -           pci_printk(level, dev, "   [%2d] %-22s%s\n", i, errmsg,
> > > +           if (info->severity == AER_CORRECTABLE)
> > > +                   pci_info(dev, "   [%2d] %-22s%s\n", i, errmsg,
> > > +                           info->first_error == i ? " (First)" : "");
> > > +           else
> > > +                   pci_err(dev, "   [%2d] %-22s%s\n", i, errmsg,
> > >                             info->first_error == i ? " (First)" : "");
>
> The - 5 lines, + 11 lines diff and repetition of the printk strings
> doesn't seem like an improvement compared to the -1, +1 in the v1
> patch:
>
>   @@ -692,7 +692,7 @@ static void __aer_print_error(struct pci_dev *dev,
>
>           if (info->severity == AER_CORRECTABLE) {
>                   strings = aer_correctable_error_string;
>   -               level = KERN_WARNING;
>   +               level = KERN_INFO;
>           } else {
>
> But maybe there's a reason?

Not intentionally - but Yes - the next patch is simpler when changing
to pci_info_ratelimit. This change will show up in either the first or
second patch.

I can switch that hunk back to V1 code if you prefer that for this patch.

cheers,
grant

>
> > >     }
> > >     pci_dev_aer_stats_incr(dev, info);
> > > @@ -724,7 +730,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
> > >     layer = AER_GET_LAYER_ERROR(info->severity, info->status);
> > >     agent = AER_GET_AGENT(info->severity, info->status);
> > >
> > > -   level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR;
> > > +   level = (info->severity == AER_CORRECTABLE) ? KERN_INFO : KERN_ERR;
> > >
> > >     pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
> > >                aer_error_severity_string[info->severity],
> > > @@ -797,14 +803,17 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
> > >     info.mask = mask;
> > >     info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
> > >
> > > -   pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
> > >     __aer_print_error(dev, &info);
> > > -   pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
> > > -           aer_error_layer[layer], aer_agent_string[agent]);
> > >
> > > -   if (aer_severity != AER_CORRECTABLE)
> > > +   if (aer_severity == AER_CORRECTABLE) {
> > > +           pci_info(dev, "aer_layer=%s, aer_agent=%s\n",
> > > +                   aer_error_layer[layer], aer_agent_string[agent]);
> > > +   } else {
> > > +           pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
> > > +                   aer_error_layer[layer], aer_agent_string[agent]);
> > >             pci_err(dev, "aer_uncor_severity: 0x%08x\n",
> > >                     aer->uncor_severity);
> > > +   }
> > >
> > >     if (tlp_header_valid)
> > >             __print_tlp_header(dev, &aer->header_log);
> >
> > --
> > Sathyanarayanan Kuppuswamy
> > Linux Kernel Developer
diff mbox series

Patch

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index f6c24ded134c..cb6b96233967 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -687,23 +687,29 @@  static void __aer_print_error(struct pci_dev *dev,
 {
 	const char **strings;
 	unsigned long status = info->status & ~info->mask;
-	const char *level, *errmsg;
 	int i;
 
 	if (info->severity == AER_CORRECTABLE) {
 		strings = aer_correctable_error_string;
-		level = KERN_WARNING;
+		pci_info(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n",
+			info->status, info->mask);
 	} else {
 		strings = aer_uncorrectable_error_string;
-		level = KERN_ERR;
+		pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n",
+			info->status, info->mask);
 	}
 
 	for_each_set_bit(i, &status, 32) {
-		errmsg = strings[i];
+		const char *errmsg = strings[i];
+
 		if (!errmsg)
 			errmsg = "Unknown Error Bit";
 
-		pci_printk(level, dev, "   [%2d] %-22s%s\n", i, errmsg,
+		if (info->severity == AER_CORRECTABLE)
+			pci_info(dev, "   [%2d] %-22s%s\n", i, errmsg,
+				info->first_error == i ? " (First)" : "");
+		else
+			pci_err(dev, "   [%2d] %-22s%s\n", i, errmsg,
 				info->first_error == i ? " (First)" : "");
 	}
 	pci_dev_aer_stats_incr(dev, info);
@@ -724,7 +730,7 @@  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
 	layer = AER_GET_LAYER_ERROR(info->severity, info->status);
 	agent = AER_GET_AGENT(info->severity, info->status);
 
-	level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR;
+	level = (info->severity == AER_CORRECTABLE) ? KERN_INFO : KERN_ERR;
 
 	pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
 		   aer_error_severity_string[info->severity],
@@ -797,14 +803,17 @@  void cper_print_aer(struct pci_dev *dev, int aer_severity,
 	info.mask = mask;
 	info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
 
-	pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
 	__aer_print_error(dev, &info);
-	pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
-		aer_error_layer[layer], aer_agent_string[agent]);
 
-	if (aer_severity != AER_CORRECTABLE)
+	if (aer_severity == AER_CORRECTABLE) {
+		pci_info(dev, "aer_layer=%s, aer_agent=%s\n",
+			aer_error_layer[layer], aer_agent_string[agent]);
+	} else {
+		pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
+			aer_error_layer[layer], aer_agent_string[agent]);
 		pci_err(dev, "aer_uncor_severity: 0x%08x\n",
 			aer->uncor_severity);
+	}
 
 	if (tlp_header_valid)
 		__print_tlp_header(dev, &aer->header_log);