diff mbox series

[4/8] PCI/AER: Introduce ratelimit for error logs

Message ID 20250115074301.3514927-5-pandoh@google.com (mailing list archive)
State New
Delegated to: Bjorn Helgaas
Headers show
Series Rate limit AER logs/IRQs | expand

Commit Message

Jon Pan-Doh Jan. 15, 2025, 7:42 a.m. UTC
Spammy devices can flood kernel logs with AER errors and slow/stall
execution. Add per-device ratelimits for AER errors (correctable and
uncorrectable). Set the default rate to the default kernel ratelimit
(10 per 5s).

Tested using aer-inject[1] tool. Sent 11 AER errors. Observed 10 errors
logged while AER stats (cat /sys/bus/pci/devices/<dev>/aer_dev_correctable)
show true count of 11.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/gong.chen/aer-inject.git

Signed-off-by: Jon Pan-Doh <pandoh@google.com>
---
 Documentation/PCI/pcieaer-howto.rst |  6 ++++++
 drivers/pci/pcie/aer.c              | 31 +++++++++++++++++++++++++++--
 2 files changed, 35 insertions(+), 2 deletions(-)

Comments

Karolina Stolarek Jan. 16, 2025, 11:11 a.m. UTC | #1
On 15/01/2025 08:42, Jon Pan-Doh wrote:
> Spammy devices can flood kernel logs with AER errors and slow/stall
> execution. Add per-device ratelimits for AER errors (correctable and
> uncorrectable). Set the default rate to the default kernel ratelimit
> (10 per 5s).
> 
> Tested using aer-inject[1] tool. Sent 11 AER errors. Observed 10 errors
> logged while AER stats (cat /sys/bus/pci/devices/<dev>/aer_dev_correctable)
> show true count of 11.
> 
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/gong.chen/aer-inject.git
> 
> Signed-off-by: Jon Pan-Doh <pandoh@google.com>
> ---
>   Documentation/PCI/pcieaer-howto.rst |  6 ++++++
>   drivers/pci/pcie/aer.c              | 31 +++++++++++++++++++++++++++--
>   2 files changed, 35 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/PCI/pcieaer-howto.rst b/Documentation/PCI/pcieaer-howto.rst
> index f013f3b27c82..5546de60f184 100644
> --- a/Documentation/PCI/pcieaer-howto.rst
> +++ b/Documentation/PCI/pcieaer-howto.rst
> @@ -85,6 +85,12 @@ In the example, 'Requester ID' means the ID of the device that sent
>   the error message to the Root Port. Please refer to PCIe specs for other
>   fields.
>   
> +AER Ratelimits
> +-------------------------
> +
> +Error messages are ratelimited per device and error type. This prevents spammy
> +devices from flooding the console.
> +

Nit: I would create a separate commit for the documentation updates. 
Also, I think it would be good to mention the default interval and, 
maybe, explain why such rate-limiting was introduced in the first place. 
If you don't feel like writing about it, let me know and we can work it 
out together.

>   AER Statistics / Counters
>   -------------------------
>   
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 5ab5cd7368bc..025c50b0f293 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -27,6 +27,7 @@
>   #include <linux/interrupt.h>
>   #include <linux/delay.h>
>   #include <linux/kfifo.h>
> +#include <linux/ratelimit.h>
>   #include <linux/slab.h>
>   #include <acpi/apei.h>
>   #include <acpi/ghes.h>
> @@ -84,6 +85,10 @@ struct aer_info {
>   	u64 rootport_total_cor_errs;
>   	u64 rootport_total_fatal_errs;
>   	u64 rootport_total_nonfatal_errs;
> +
> +	/* Ratelimits for errors */
> +	struct ratelimit_state cor_log_ratelimit;
> +	struct ratelimit_state uncor_log_ratelimit;

My comment for 3/8 applies here as well.

>   };
>   
>   #define AER_LOG_TLP_MASKS		(PCI_ERR_UNC_POISON_TLP|	\
> @@ -374,6 +379,12 @@ void pci_aer_init(struct pci_dev *dev)
>   		return;
>   
>   	dev->aer_info = kzalloc(sizeof(struct aer_info), GFP_KERNEL);
> +	ratelimit_state_init(&dev->aer_info->cor_log_ratelimit,
> +			     DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
> +	ratelimit_state_init(&dev->aer_info->uncor_log_ratelimit,
> +			     DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
> +	ratelimit_set_flags(&dev->aer_info->cor_log_ratelimit, RATELIMIT_MSG_ON_RELEASE);
> +	ratelimit_set_flags(&dev->aer_info->uncor_log_ratelimit, RATELIMIT_MSG_ON_RELEASE);

In some cases, it would be beneficial to keep the "X callbacks 
suppressed" to give an idea of how prevalent the errors are. On some 
devices, we could see just 11 Correctable Errors per 5 seconds, but on 
others this would be >1k (seen such a case myself).

As it's not immediately clear what the defaults are, could you add a 
comment for this?

>   
>   	/*
>   	 * We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER,
> @@ -702,6 +713,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
>   	int layer, agent;
>   	int id = pci_dev_id(dev);
>   	const char *level;
> +	struct ratelimit_state *ratelimit;
>   
>   	if (!info->status) {
>   		pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
> @@ -709,11 +721,20 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
>   		goto out;
>   	}
>   
> +	if (info->severity == AER_CORRECTABLE) {
> +		ratelimit = &dev->aer_info->cor_log_ratelimit;
> +		level = KERN_WARNING;
> +	} else {
> +		ratelimit = &dev->aer_info->uncor_log_ratelimit;
> +		level = KERN_ERR;
> +	}
> +
> +	if (!__ratelimit(ratelimit))
> +		return;
> +

Maybe it's just me but I found "!__ratelimit(..)" expression confusing. 
At first, I read this "if there is not ratelimit", whereas the function 
returns 0 ("hey, you can't fire at this point of time") and we negate 
it. My series attempted to communicate this via a variable, but maybe 
that was too wordy/complicated, so I'm not pushy about introducing a 
similar solution here.

All the best,
Karolina

>   	layer = AER_GET_LAYER_ERROR(info->severity, info->status);
>   	agent = AER_GET_AGENT(info->severity, info->status);
>   
> -	level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR;
> -
>   	pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
>   		   aer_error_severity_string[info->severity],
>   		   aer_error_layer[layer], aer_agent_string[agent]);
> @@ -755,11 +776,14 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
>   	int layer, agent, tlp_header_valid = 0;
>   	u32 status, mask;
>   	struct aer_err_info info;
> +	struct ratelimit_state *ratelimit;
>   
>   	if (aer_severity == AER_CORRECTABLE) {
> +		ratelimit = &dev->aer_info->cor_log_ratelimit;
>   		status = aer->cor_status;
>   		mask = aer->cor_mask;
>   	} else {
> +		ratelimit = &dev->aer_info->uncor_log_ratelimit;
>   		status = aer->uncor_status;
>   		mask = aer->uncor_mask;
>   		tlp_header_valid = status & AER_LOG_TLP_MASKS;
> @@ -776,6 +800,9 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
>   
>   	pci_dev_aer_stats_incr(dev, &info);
>   
> +	if (!__ratelimit(ratelimit))
> +		return;
> +
>   	pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
>   	__aer_print_error(dev, &info);
>   	pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
Jon Pan-Doh Jan. 18, 2025, 1:59 a.m. UTC | #2
On Thu, Jan 16, 2025 at 3:11 AM Karolina Stolarek
<karolina.stolarek@oracle.com> wrote:
> On 15/01/2025 08:42, Jon Pan-Doh wrote:
> > @@ -374,6 +379,12 @@ void pci_aer_init(struct pci_dev *dev)
> > +     ratelimit_set_flags(&dev->aer_info->cor_log_ratelimit, RATELIMIT_MSG_ON_RELEASE);
> > +     ratelimit_set_flags(&dev->aer_info->uncor_log_ratelimit, RATELIMIT_MSG_ON_RELEASE);
>
> In some cases, it would be beneficial to keep the "X callbacks
> suppressed" to give an idea of how prevalent the errors are. On some
> devices, we could see just 11 Correctable Errors per 5 seconds, but on
> others this would be >1k (seen such a case myself).
>
> As it's not immediately clear what the defaults are, could you add a
> comment for this?

It seems like the convention is not to use RATELIMIT_MSG_ON_RELEASE (I
was unfamiliar :)). I'll omit this in the next version. Let me know if
you'd still like the comment in that case.

> > @@ -709,11 +721,20 @@ void aer_print_error(struct pci_dev *dev, struct
> > +     if (!__ratelimit(ratelimit))
> > +             return;
> > +
>
> Maybe it's just me but I found "!__ratelimit(..)" expression confusing.
> At first, I read this "if there is not ratelimit", whereas the function
> returns 0 ("hey, you can't fire at this point of time") and we negate
> it. My series attempted to communicate this via a variable, but maybe
> that was too wordy/complicated, so I'm not pushy about introducing a
> similar solution here.

I see your point. I'm not particularly attached to it, but the
alternatives seem equally unappealing :/:
- put rest of function inside conditional: if
(__ratelimit(ratelimit))) { log errors }
    - another print helper?
- separate variable (as you suggested

FYI the majority of existing usage seems to be split between
__ratelimit() and !__ratelimit() (though majority doesn't always
indicate the right thing). The only instance I see of a variable is in
drivers/iommu/intel/dmar.c:dmar_fault where the variable is used in
repeated conditionals.

Thanks,
Jon


On Thu, Jan 16, 2025 at 3:11 AM Karolina Stolarek
<karolina.stolarek@oracle.com> wrote:
>
> On 15/01/2025 08:42, Jon Pan-Doh wrote:
> > Spammy devices can flood kernel logs with AER errors and slow/stall
> > execution. Add per-device ratelimits for AER errors (correctable and
> > uncorrectable). Set the default rate to the default kernel ratelimit
> > (10 per 5s).
> >
> > Tested using aer-inject[1] tool. Sent 11 AER errors. Observed 10 errors
> > logged while AER stats (cat /sys/bus/pci/devices/<dev>/aer_dev_correctable)
> > show true count of 11.
> >
> > [1] https://git.kernel.org/pub/scm/linux/kernel/git/gong.chen/aer-inject.git
> >
> > Signed-off-by: Jon Pan-Doh <pandoh@google.com>
> > ---
> >   Documentation/PCI/pcieaer-howto.rst |  6 ++++++
> >   drivers/pci/pcie/aer.c              | 31 +++++++++++++++++++++++++++--
> >   2 files changed, 35 insertions(+), 2 deletions(-)
> >
> > diff --git a/Documentation/PCI/pcieaer-howto.rst b/Documentation/PCI/pcieaer-howto.rst
> > index f013f3b27c82..5546de60f184 100644
> > --- a/Documentation/PCI/pcieaer-howto.rst
> > +++ b/Documentation/PCI/pcieaer-howto.rst
> > @@ -85,6 +85,12 @@ In the example, 'Requester ID' means the ID of the device that sent
> >   the error message to the Root Port. Please refer to PCIe specs for other
> >   fields.
> >
> > +AER Ratelimits
> > +-------------------------
> > +
> > +Error messages are ratelimited per device and error type. This prevents spammy
> > +devices from flooding the console.
> > +
>
> Nit: I would create a separate commit for the documentation updates.
> Also, I think it would be good to mention the default interval and,
> maybe, explain why such rate-limiting was introduced in the first place.
> If you don't feel like writing about it, let me know and we can work it
> out together.
>
> >   AER Statistics / Counters
> >   -------------------------
> >
> > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> > index 5ab5cd7368bc..025c50b0f293 100644
> > --- a/drivers/pci/pcie/aer.c
> > +++ b/drivers/pci/pcie/aer.c
> > @@ -27,6 +27,7 @@
> >   #include <linux/interrupt.h>
> >   #include <linux/delay.h>
> >   #include <linux/kfifo.h>
> > +#include <linux/ratelimit.h>
> >   #include <linux/slab.h>
> >   #include <acpi/apei.h>
> >   #include <acpi/ghes.h>
> > @@ -84,6 +85,10 @@ struct aer_info {
> >       u64 rootport_total_cor_errs;
> >       u64 rootport_total_fatal_errs;
> >       u64 rootport_total_nonfatal_errs;
> > +
> > +     /* Ratelimits for errors */
> > +     struct ratelimit_state cor_log_ratelimit;
> > +     struct ratelimit_state uncor_log_ratelimit;
>
> My comment for 3/8 applies here as well.
>
> >   };
> >
> >   #define AER_LOG_TLP_MASKS           (PCI_ERR_UNC_POISON_TLP|        \
> > @@ -374,6 +379,12 @@ void pci_aer_init(struct pci_dev *dev)
> >               return;
> >
> >       dev->aer_info = kzalloc(sizeof(struct aer_info), GFP_KERNEL);
> > +     ratelimit_state_init(&dev->aer_info->cor_log_ratelimit,
> > +                          DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
> > +     ratelimit_state_init(&dev->aer_info->uncor_log_ratelimit,
> > +                          DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
> > +     ratelimit_set_flags(&dev->aer_info->cor_log_ratelimit, RATELIMIT_MSG_ON_RELEASE);
> > +     ratelimit_set_flags(&dev->aer_info->uncor_log_ratelimit, RATELIMIT_MSG_ON_RELEASE);
>
> In some cases, it would be beneficial to keep the "X callbacks
> suppressed" to give an idea of how prevalent the errors are. On some
> devices, we could see just 11 Correctable Errors per 5 seconds, but on
> others this would be >1k (seen such a case myself).
>
> As it's not immediately clear what the defaults are, could you add a
> comment for this?
>
> >
> >       /*
> >        * We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER,
> > @@ -702,6 +713,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
> >       int layer, agent;
> >       int id = pci_dev_id(dev);
> >       const char *level;
> > +     struct ratelimit_state *ratelimit;
> >
> >       if (!info->status) {
> >               pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
> > @@ -709,11 +721,20 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
> >               goto out;
> >       }
> >
> > +     if (info->severity == AER_CORRECTABLE) {
> > +             ratelimit = &dev->aer_info->cor_log_ratelimit;
> > +             level = KERN_WARNING;
> > +     } else {
> > +             ratelimit = &dev->aer_info->uncor_log_ratelimit;
> > +             level = KERN_ERR;
> > +     }
> > +
> > +     if (!__ratelimit(ratelimit))
> > +             return;
> > +
>
> Maybe it's just me but I found "!__ratelimit(..)" expression confusing.
> At first, I read this "if there is not ratelimit", whereas the function
> returns 0 ("hey, you can't fire at this point of time") and we negate
> it. My series attempted to communicate this via a variable, but maybe
> that was too wordy/complicated, so I'm not pushy about introducing a
> similar solution here.
>
> All the best,
> Karolina
>
> >       layer = AER_GET_LAYER_ERROR(info->severity, info->status);
> >       agent = AER_GET_AGENT(info->severity, info->status);
> >
> > -     level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR;
> > -
> >       pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
> >                  aer_error_severity_string[info->severity],
> >                  aer_error_layer[layer], aer_agent_string[agent]);
> > @@ -755,11 +776,14 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
> >       int layer, agent, tlp_header_valid = 0;
> >       u32 status, mask;
> >       struct aer_err_info info;
> > +     struct ratelimit_state *ratelimit;
> >
> >       if (aer_severity == AER_CORRECTABLE) {
> > +             ratelimit = &dev->aer_info->cor_log_ratelimit;
> >               status = aer->cor_status;
> >               mask = aer->cor_mask;
> >       } else {
> > +             ratelimit = &dev->aer_info->uncor_log_ratelimit;
> >               status = aer->uncor_status;
> >               mask = aer->uncor_mask;
> >               tlp_header_valid = status & AER_LOG_TLP_MASKS;
> > @@ -776,6 +800,9 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
> >
> >       pci_dev_aer_stats_incr(dev, &info);
> >
> > +     if (!__ratelimit(ratelimit))
> > +             return;
> > +
> >       pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
> >       __aer_print_error(dev, &info);
> >       pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
>
Karolina Stolarek Jan. 20, 2025, 10:25 a.m. UTC | #3
On 18/01/2025 02:59, Jon Pan-Doh wrote:
> On Thu, Jan 16, 2025 at 3:11 AM Karolina Stolarek
> <karolina.stolarek@oracle.com> wrote:
>> In some cases, it would be beneficial to keep the "X callbacks
>> suppressed" to give an idea of how prevalent the errors are. On some
>> devices, we could see just 11 Correctable Errors per 5 seconds, but on
>> others this would be >1k (seen such a case myself).
>>
>> As it's not immediately clear what the defaults are, could you add a
>> comment for this?
> 
> It seems like the convention is not to use RATELIMIT_MSG_ON_RELEASE (I
> was unfamiliar :)). I'll omit this in the next version. Let me know if
> you'd still like the comment in that case.

Interesting, I wasn't aware of that. There are still a couple of places 
where this flag is used, but maybe it's legacy code.

I'm not too passionate about the comment, it was just a suggestion (I 
tend to be verbose in my code), feel free to omit it.

> FYI the majority of existing usage seems to be split between
> __ratelimit() and !__ratelimit() (though majority doesn't always
> indicate the right thing). The only instance I see of a variable is in
> drivers/iommu/intel/dmar.c:dmar_fault where the variable is used in
> repeated conditionals.

Yeah, it's a problem that's hard to solve at this level.

I thought that introducing a variable to make it easier to read is 
acceptable, but it's hard to defend this approach when the value is 
checked only once. Let's keep things simple and leave it as it is.

All the best,
Karolina

> 
> Thanks,
> Jon
diff mbox series

Patch

diff --git a/Documentation/PCI/pcieaer-howto.rst b/Documentation/PCI/pcieaer-howto.rst
index f013f3b27c82..5546de60f184 100644
--- a/Documentation/PCI/pcieaer-howto.rst
+++ b/Documentation/PCI/pcieaer-howto.rst
@@ -85,6 +85,12 @@  In the example, 'Requester ID' means the ID of the device that sent
 the error message to the Root Port. Please refer to PCIe specs for other
 fields.
 
+AER Ratelimits
+-------------------------
+
+Error messages are ratelimited per device and error type. This prevents spammy
+devices from flooding the console.
+
 AER Statistics / Counters
 -------------------------
 
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 5ab5cd7368bc..025c50b0f293 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -27,6 +27,7 @@ 
 #include <linux/interrupt.h>
 #include <linux/delay.h>
 #include <linux/kfifo.h>
+#include <linux/ratelimit.h>
 #include <linux/slab.h>
 #include <acpi/apei.h>
 #include <acpi/ghes.h>
@@ -84,6 +85,10 @@  struct aer_info {
 	u64 rootport_total_cor_errs;
 	u64 rootport_total_fatal_errs;
 	u64 rootport_total_nonfatal_errs;
+
+	/* Ratelimits for errors */
+	struct ratelimit_state cor_log_ratelimit;
+	struct ratelimit_state uncor_log_ratelimit;
 };
 
 #define AER_LOG_TLP_MASKS		(PCI_ERR_UNC_POISON_TLP|	\
@@ -374,6 +379,12 @@  void pci_aer_init(struct pci_dev *dev)
 		return;
 
 	dev->aer_info = kzalloc(sizeof(struct aer_info), GFP_KERNEL);
+	ratelimit_state_init(&dev->aer_info->cor_log_ratelimit,
+			     DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
+	ratelimit_state_init(&dev->aer_info->uncor_log_ratelimit,
+			     DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
+	ratelimit_set_flags(&dev->aer_info->cor_log_ratelimit, RATELIMIT_MSG_ON_RELEASE);
+	ratelimit_set_flags(&dev->aer_info->uncor_log_ratelimit, RATELIMIT_MSG_ON_RELEASE);
 
 	/*
 	 * We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER,
@@ -702,6 +713,7 @@  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
 	int layer, agent;
 	int id = pci_dev_id(dev);
 	const char *level;
+	struct ratelimit_state *ratelimit;
 
 	if (!info->status) {
 		pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
@@ -709,11 +721,20 @@  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
 		goto out;
 	}
 
+	if (info->severity == AER_CORRECTABLE) {
+		ratelimit = &dev->aer_info->cor_log_ratelimit;
+		level = KERN_WARNING;
+	} else {
+		ratelimit = &dev->aer_info->uncor_log_ratelimit;
+		level = KERN_ERR;
+	}
+
+	if (!__ratelimit(ratelimit))
+		return;
+
 	layer = AER_GET_LAYER_ERROR(info->severity, info->status);
 	agent = AER_GET_AGENT(info->severity, info->status);
 
-	level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR;
-
 	pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
 		   aer_error_severity_string[info->severity],
 		   aer_error_layer[layer], aer_agent_string[agent]);
@@ -755,11 +776,14 @@  void pci_print_aer(struct pci_dev *dev, int aer_severity,
 	int layer, agent, tlp_header_valid = 0;
 	u32 status, mask;
 	struct aer_err_info info;
+	struct ratelimit_state *ratelimit;
 
 	if (aer_severity == AER_CORRECTABLE) {
+		ratelimit = &dev->aer_info->cor_log_ratelimit;
 		status = aer->cor_status;
 		mask = aer->cor_mask;
 	} else {
+		ratelimit = &dev->aer_info->uncor_log_ratelimit;
 		status = aer->uncor_status;
 		mask = aer->uncor_mask;
 		tlp_header_valid = status & AER_LOG_TLP_MASKS;
@@ -776,6 +800,9 @@  void pci_print_aer(struct pci_dev *dev, int aer_severity,
 
 	pci_dev_aer_stats_incr(dev, &info);
 
+	if (!__ratelimit(ratelimit))
+		return;
+
 	pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
 	__aer_print_error(dev, &info);
 	pci_err(dev, "aer_layer=%s, aer_agent=%s\n",