Message ID | 20250319084050.366718-4-pandoh@google.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | Rate limit AER logs | expand |
On Wed, Mar 19, 2025 at 01:40:44AM -0700, Jon Pan-Doh wrote: > Decouple stat collection from internal AER print functions. AERs from ghes > or cxl drivers have stat collection in pci_print_aer() as that is where > aer_err_info is populated. This moves pci_dev_aer_stats_incr() from __aer_print_error() to - pci_print_aer(), used by CXL via cxl_handle_rdport_errors() and GHES via aer_recover_queue() and aer_recover_work_func() - aer_process_err_devices(), used by native AER handling via aer_irq(), aer_isr(), aer_isr_one_error(), and - dpc_process_error(), used by native DPC handling via dpc_handler() and by ACPI EDR Notify events via edr_handle_event() And the reason for this is ...? Maybe just to separate stats from printing, which does seem reasonable to me, although pci_print_aer() is still primarily a printing function, albeit also an external interface. In any event, I would like to include the motivation. > Signed-off-by: Jon Pan-Doh <pandoh@google.com> > --- > drivers/pci/pci.h | 1 + > drivers/pci/pcie/aer.c | 10 ++++++---- > drivers/pci/pcie/dpc.c | 1 + > 3 files changed, 8 insertions(+), 4 deletions(-) > > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h > index 75985b96ecc1..9d63d32f041c 100644 > --- a/drivers/pci/pci.h > +++ b/drivers/pci/pci.h > @@ -551,6 +551,7 @@ struct aer_err_info { > }; > > int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info); > +void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info); > void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, const char *level); > > int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2, > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c > index 7eeaad917134..8e4d4f9326e1 100644 > --- a/drivers/pci/pcie/aer.c > +++ b/drivers/pci/pcie/aer.c > @@ -617,8 +617,7 @@ const struct attribute_group aer_stats_attr_group = { > .is_visible = aer_stats_attrs_are_visible, > }; > > -static void pci_dev_aer_stats_incr(struct pci_dev *pdev, > - struct aer_err_info *info) > +void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info) > { > unsigned long status = info->status & ~info->mask; > int i, max = -1; > @@ -691,7 +690,6 @@ static void __aer_print_error(struct pci_dev *dev, > aer_printk(level, dev, " [%2d] %-22s%s\n", i, errmsg, > info->first_error == i ? " (First)" : ""); > } > - pci_dev_aer_stats_incr(dev, info); > } > > void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, > @@ -784,6 +782,8 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, > info.mask = mask; > info.first_error = PCI_ERR_CAP_FEP(aer->cap_control); > > + pci_dev_aer_stats_incr(dev, &info); > + > aer_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask); > __aer_print_error(dev, &info, level); > aer_printk(level, dev, "aer_layer=%s, aer_agent=%s\n", > @@ -1263,8 +1263,10 @@ static inline void aer_process_err_devices(struct aer_err_info *e_info, > > /* Report all before handle them, not to lost records by reset etc. */ > for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) { > - if (aer_get_device_error_info(e_info->dev[i], e_info)) > + if (aer_get_device_error_info(e_info->dev[i], e_info)) { > + pci_dev_aer_stats_incr(e_info->dev[i], e_info); > aer_print_error(e_info->dev[i], e_info, level); > + } > } > for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) { > if (aer_get_device_error_info(e_info->dev[i], e_info)) > diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c > index 9e4c9ac737a7..81cd6e8ff3a4 100644 > --- a/drivers/pci/pcie/dpc.c > +++ b/drivers/pci/pcie/dpc.c > @@ -289,6 +289,7 @@ void dpc_process_error(struct pci_dev *pdev) > else if (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR && > dpc_get_aer_uncorrect_severity(pdev, &info) && > aer_get_device_error_info(pdev, &info)) { > + pci_dev_aer_stats_incr(pdev, &info); > aer_print_error(pdev, &info, KERN_ERR); > pci_aer_clear_nonfatal_status(pdev); > pci_aer_clear_fatal_status(pdev); > -- > 2.49.0.rc1.451.g8f38331e32-goog >
On 3/19/25 1:40 AM, Jon Pan-Doh wrote: > Decouple stat collection from internal AER print functions. AERs from ghes > or cxl drivers have stat collection in pci_print_aer() as that is where > aer_err_info is populated. pci_print_aer() also seems to do more than printing the AER stat. Why only care about stat collection in __aer_print_error(). If the motivation is to just improve the code readability, I am not sure it is worth the effort. Please correct me if my understanding is incorrect. > > Signed-off-by: Jon Pan-Doh <pandoh@google.com> > --- > drivers/pci/pci.h | 1 + > drivers/pci/pcie/aer.c | 10 ++++++---- > drivers/pci/pcie/dpc.c | 1 + > 3 files changed, 8 insertions(+), 4 deletions(-) > > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h > index 75985b96ecc1..9d63d32f041c 100644 > --- a/drivers/pci/pci.h > +++ b/drivers/pci/pci.h > @@ -551,6 +551,7 @@ struct aer_err_info { > }; > > int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info); > +void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info); > void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, const char *level); > > int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2, > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c > index 7eeaad917134..8e4d4f9326e1 100644 > --- a/drivers/pci/pcie/aer.c > +++ b/drivers/pci/pcie/aer.c > @@ -617,8 +617,7 @@ const struct attribute_group aer_stats_attr_group = { > .is_visible = aer_stats_attrs_are_visible, > }; > > -static void pci_dev_aer_stats_incr(struct pci_dev *pdev, > - struct aer_err_info *info) > +void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info) > { > unsigned long status = info->status & ~info->mask; > int i, max = -1; > @@ -691,7 +690,6 @@ static void __aer_print_error(struct pci_dev *dev, > aer_printk(level, dev, " [%2d] %-22s%s\n", i, errmsg, > info->first_error == i ? " (First)" : ""); > } > - pci_dev_aer_stats_incr(dev, info); > } > > void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, > @@ -784,6 +782,8 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, > info.mask = mask; > info.first_error = PCI_ERR_CAP_FEP(aer->cap_control); > > + pci_dev_aer_stats_incr(dev, &info); > + > aer_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask); > __aer_print_error(dev, &info, level); > aer_printk(level, dev, "aer_layer=%s, aer_agent=%s\n", > @@ -1263,8 +1263,10 @@ static inline void aer_process_err_devices(struct aer_err_info *e_info, > > /* Report all before handle them, not to lost records by reset etc. */ > for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) { > - if (aer_get_device_error_info(e_info->dev[i], e_info)) > + if (aer_get_device_error_info(e_info->dev[i], e_info)) { > + pci_dev_aer_stats_incr(e_info->dev[i], e_info); > aer_print_error(e_info->dev[i], e_info, level); > + } > } > for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) { > if (aer_get_device_error_info(e_info->dev[i], e_info)) > diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c > index 9e4c9ac737a7..81cd6e8ff3a4 100644 > --- a/drivers/pci/pcie/dpc.c > +++ b/drivers/pci/pcie/dpc.c > @@ -289,6 +289,7 @@ void dpc_process_error(struct pci_dev *pdev) > else if (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR && > dpc_get_aer_uncorrect_severity(pdev, &info) && > aer_get_device_error_info(pdev, &info)) { > + pci_dev_aer_stats_incr(pdev, &info); > aer_print_error(pdev, &info, KERN_ERR); > pci_aer_clear_nonfatal_status(pdev); > pci_aer_clear_fatal_status(pdev);
On Wed, Mar 19, 2025 at 11:19 AM Bjorn Helgaas <helgaas@kernel.org> wrote: > This moves pci_dev_aer_stats_incr() from __aer_print_error() to > > - pci_print_aer(), used by CXL via cxl_handle_rdport_errors() and > GHES via aer_recover_queue() and aer_recover_work_func() > > - aer_process_err_devices(), used by native AER handling via > aer_irq(), aer_isr(), aer_isr_one_error(), and > > - dpc_process_error(), used by native DPC handling via dpc_handler() > and by ACPI EDR Notify events via edr_handle_event() > > And the reason for this is ...? > > Maybe just to separate stats from printing, which does seem reasonable > to me, although pci_print_aer() is still primarily a printing > function, albeit also an external interface. Separating stats from internal print functions allows us to: - easily add ratelimits to logs while having stats unaffected - simplify control flow as stats collection is no longer buried pci_print_aer() is the odd one out, but that should be temporary as Karolina's log consolidation patch[1] should allow pci_dev_aer_stats_incr() to be pulled out of print functions and called after the newly created populate_aer_err_info(). > In any event, I would like to include the motivation. Ack. Added in v4. [1] https://lore.kernel.org/linux-pci/8bcb8c9a7b38ce3bdaca5a64fe76f08b0b337511.1742202797.git.karolina.stolarek@oracle.com/ Thanks, Jon
On Wed, Mar 19, 2025 at 8:23 PM Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@linux.intel.com> wrote: > pci_print_aer() also seems to do more than printing the AER stat. Why only > care about stat collection in __aer_print_error(). If the motivation is > to just improve the code readability, I am not sure it is worth the > effort. Please correct me if my understanding is incorrect. Bjorn had similar concerns. Hopefully my response[1] answers your questions. [1] https://lore.kernel.org/linux-pci/CAMC_AXW85x_LRT5UTD0C_VvK8WTG6kbfvp5k_7RjnLzGM3Bg-g@mail.gmail.com/ Thanks, Jon
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 75985b96ecc1..9d63d32f041c 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -551,6 +551,7 @@ struct aer_err_info { }; int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info); +void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info); void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, const char *level); int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2, diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 7eeaad917134..8e4d4f9326e1 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -617,8 +617,7 @@ const struct attribute_group aer_stats_attr_group = { .is_visible = aer_stats_attrs_are_visible, }; -static void pci_dev_aer_stats_incr(struct pci_dev *pdev, - struct aer_err_info *info) +void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info) { unsigned long status = info->status & ~info->mask; int i, max = -1; @@ -691,7 +690,6 @@ static void __aer_print_error(struct pci_dev *dev, aer_printk(level, dev, " [%2d] %-22s%s\n", i, errmsg, info->first_error == i ? " (First)" : ""); } - pci_dev_aer_stats_incr(dev, info); } void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, @@ -784,6 +782,8 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, info.mask = mask; info.first_error = PCI_ERR_CAP_FEP(aer->cap_control); + pci_dev_aer_stats_incr(dev, &info); + aer_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask); __aer_print_error(dev, &info, level); aer_printk(level, dev, "aer_layer=%s, aer_agent=%s\n", @@ -1263,8 +1263,10 @@ static inline void aer_process_err_devices(struct aer_err_info *e_info, /* Report all before handle them, not to lost records by reset etc. */ for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) { - if (aer_get_device_error_info(e_info->dev[i], e_info)) + if (aer_get_device_error_info(e_info->dev[i], e_info)) { + pci_dev_aer_stats_incr(e_info->dev[i], e_info); aer_print_error(e_info->dev[i], e_info, level); + } } for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) { if (aer_get_device_error_info(e_info->dev[i], e_info)) diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index 9e4c9ac737a7..81cd6e8ff3a4 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -289,6 +289,7 @@ void dpc_process_error(struct pci_dev *pdev) else if (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR && dpc_get_aer_uncorrect_severity(pdev, &info) && aer_get_device_error_info(pdev, &info)) { + pci_dev_aer_stats_incr(pdev, &info); aer_print_error(pdev, &info, KERN_ERR); pci_aer_clear_nonfatal_status(pdev); pci_aer_clear_fatal_status(pdev);
Decouple stat collection from internal AER print functions. AERs from ghes or cxl drivers have stat collection in pci_print_aer() as that is where aer_err_info is populated. Signed-off-by: Jon Pan-Doh <pandoh@google.com> --- drivers/pci/pci.h | 1 + drivers/pci/pcie/aer.c | 10 ++++++---- drivers/pci/pcie/dpc.c | 1 + 3 files changed, 8 insertions(+), 4 deletions(-)