Message ID | 20250214023543.992372-6-pandoh@google.com (mailing list archive) |
---|---|
State | New |
Delegated to: | Bjorn Helgaas |
Headers | show |
Series | Rate limit AER logs | expand |
On 14/02/2025 03:35, Jon Pan-Doh wrote: > Spammy devices can flood kernel logs with AER errors and slow/stall > execution. Add per-device ratelimits for AER errors (correctable and > uncorrectable). Set the default rate to the default kernel ratelimit > (10 per 5s). I'd rephrase the last sentence to say "Add per-device ratelimits for AER correctable and uncorrectable errors that use the kernel defaults (10 bursts per 5s)", but it's just a nit. Overall, it looks good to me: Reviewed-by: Karolina Stolarek <karolina.stolarek@oracle.com> > > Tested using aer-inject[1]. Sent 11 AER errors. Observed 10 errors logged > while AER stats (cat /sys/bus/pci/devices/<dev>/aer_dev_correctable) show > true count of 11. > > [1] https://git.kernel.org/pub/scm/linux/kernel/git/gong.chen/aer-inject.git > > Signed-off-by: Jon Pan-Doh <pandoh@google.com> > --- > drivers/pci/pcie/aer.c | 24 ++++++++++++++++++++++++ > 1 file changed, 24 insertions(+) > > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c > index b4f902fd5ef6..c5b5381e2930 100644 > --- a/drivers/pci/pcie/aer.c > +++ b/drivers/pci/pcie/aer.c > @@ -28,6 +28,7 @@ > #include <linux/interrupt.h> > #include <linux/delay.h> > #include <linux/kfifo.h> > +#include <linux/ratelimit.h> > #include <linux/slab.h> > #include <acpi/apei.h> > #include <acpi/ghes.h> > @@ -88,6 +89,10 @@ struct aer_report { > u64 rootport_total_cor_errs; > u64 rootport_total_fatal_errs; > u64 rootport_total_nonfatal_errs; > + > + /* Ratelimits for errors */ > + struct ratelimit_state cor_log_ratelimit; > + struct ratelimit_state uncor_log_ratelimit; > }; > > #define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \ > @@ -378,6 +383,10 @@ void pci_aer_init(struct pci_dev *dev) > return; > > dev->aer_report = kzalloc(sizeof(struct aer_report), GFP_KERNEL); > + ratelimit_state_init(&dev->aer_report->cor_log_ratelimit, > + DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); > + ratelimit_state_init(&dev->aer_report->uncor_log_ratelimit, > + 
DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); > > /* > * We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER, > @@ -697,6 +706,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, > { > int layer, agent; > int id = pci_dev_id(dev); > + struct ratelimit_state *ratelimit; > > if (!info->status) { > pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n", > @@ -704,6 +714,14 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, > goto out; > } > > + if (info->severity == AER_CORRECTABLE) > + ratelimit = &dev->aer_report->cor_log_ratelimit; > + else > + ratelimit = &dev->aer_report->uncor_log_ratelimit; > + > + if (!__ratelimit(ratelimit)) > + return; > + > layer = AER_GET_LAYER_ERROR(info->severity, info->status); > agent = AER_GET_AGENT(info->severity, info->status); > > @@ -749,12 +767,15 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, > u32 status, mask; > const char *level; > struct aer_err_info info; > + struct ratelimit_state *ratelimit; > > if (aer_severity == AER_CORRECTABLE) { > + ratelimit = &dev->aer_report->cor_log_ratelimit; > status = aer->cor_status; > mask = aer->cor_mask; > level = KERN_WARNING; > } else { > + ratelimit = &dev->aer_report->uncor_log_ratelimit; > status = aer->uncor_status; > mask = aer->uncor_mask; > tlp_header_valid = status & AER_LOG_TLP_MASKS; > @@ -772,6 +793,9 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, > > pci_dev_aer_stats_incr(dev, &info); > > + if (!__ratelimit(ratelimit)) > + return; > + > aer_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask); > __aer_print_error(dev, &info, level); > aer_printk(level, dev, "aer_layer=%s, aer_agent=%s\n",
On Mon, Feb 17, 2025 at 3:29 AM Karolina Stolarek <karolina.stolarek@oracle.com> wrote:
> I'd rephrase the last sentence to say "Add per-device ratelimits for AER
> correctable and uncorrectable errors that use the kernel defaults (10
> bursts per 5s)", but it's just a nit.

Ack. Will change in v3.

Thanks,
Jon
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index b4f902fd5ef6..c5b5381e2930 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -28,6 +28,7 @@ #include <linux/interrupt.h> #include <linux/delay.h> #include <linux/kfifo.h> +#include <linux/ratelimit.h> #include <linux/slab.h> #include <acpi/apei.h> #include <acpi/ghes.h> @@ -88,6 +89,10 @@ struct aer_report { u64 rootport_total_cor_errs; u64 rootport_total_fatal_errs; u64 rootport_total_nonfatal_errs; + + /* Ratelimits for errors */ + struct ratelimit_state cor_log_ratelimit; + struct ratelimit_state uncor_log_ratelimit; }; #define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \ @@ -378,6 +383,10 @@ void pci_aer_init(struct pci_dev *dev) return; dev->aer_report = kzalloc(sizeof(struct aer_report), GFP_KERNEL); + ratelimit_state_init(&dev->aer_report->cor_log_ratelimit, + DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + ratelimit_state_init(&dev->aer_report->uncor_log_ratelimit, + DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); /* * We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER, @@ -697,6 +706,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, { int layer, agent; int id = pci_dev_id(dev); + struct ratelimit_state *ratelimit; if (!info->status) { pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n", @@ -704,6 +714,14 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, goto out; } + if (info->severity == AER_CORRECTABLE) + ratelimit = &dev->aer_report->cor_log_ratelimit; + else + ratelimit = &dev->aer_report->uncor_log_ratelimit; + + if (!__ratelimit(ratelimit)) + return; + layer = AER_GET_LAYER_ERROR(info->severity, info->status); agent = AER_GET_AGENT(info->severity, info->status); @@ -749,12 +767,15 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, u32 status, mask; const char *level; struct aer_err_info info; + struct ratelimit_state *ratelimit; if (aer_severity == 
AER_CORRECTABLE) { + ratelimit = &dev->aer_report->cor_log_ratelimit; status = aer->cor_status; mask = aer->cor_mask; level = KERN_WARNING; } else { + ratelimit = &dev->aer_report->uncor_log_ratelimit; status = aer->uncor_status; mask = aer->uncor_mask; tlp_header_valid = status & AER_LOG_TLP_MASKS; @@ -772,6 +793,9 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, pci_dev_aer_stats_incr(dev, &info); + if (!__ratelimit(ratelimit)) + return; + aer_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask); __aer_print_error(dev, &info, level); aer_printk(level, dev, "aer_layer=%s, aer_agent=%s\n",
Spammy devices can flood kernel logs with AER errors and slow/stall
execution. Add per-device ratelimits for AER errors (correctable and
uncorrectable). Set the default rate to the default kernel ratelimit
(a burst of at most 10 messages per 5-second interval).

Tested using aer-inject[1]. Sent 11 AER errors. Observed 10 errors logged
while AER stats (cat /sys/bus/pci/devices/<dev>/aer_dev_correctable) show
true count of 11.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/gong.chen/aer-inject.git

Signed-off-by: Jon Pan-Doh <pandoh@google.com>
---
 drivers/pci/pcie/aer.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)