Message ID | 20250115074301.3514927-4-pandoh@google.com (mailing list archive) |
---|---|
State | New |
Delegated to: | Bjorn Helgaas |
Headers | show |
Series | Rate limit AER logs/IRQs | expand |
On 15/01/2025 08:42, Jon Pan-Doh wrote: > Update name to reflect the broader definition of structs/variables that > are stored (e.g. ratelimits). I understand the intention behind this change, but I'm not fully convinced if we should mix AER error attributes with the tools to control error reporting/generation (ratelimits). I'd argue that "aer_info" name still doesn't express what the structure does. aer_stats sits in pci_dev, so I can see why you decided to use it; it's one of the few available places where we could keep a stateful ratelimit. How about creating a struct to keep all the ratelimits in one place and embedding that in the pci_dev? All the best, Karolina > > Signed-off-by: Jon Pan-Doh <pandoh@google.com> > --- > drivers/pci/pcie/aer.c | 50 +++++++++++++++++++++--------------------- > include/linux/pci.h | 2 +- > 2 files changed, 26 insertions(+), 26 deletions(-) > > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c > index 4bb0b3840402..5ab5cd7368bc 100644 > --- a/drivers/pci/pcie/aer.c > +++ b/drivers/pci/pcie/aer.c > @@ -50,11 +50,11 @@ struct aer_rpc { > DECLARE_KFIFO(aer_fifo, struct aer_err_source, AER_ERROR_SOURCES_MAX); > }; > > -/* AER stats for the device */ > -struct aer_stats { > +/* AER info for the device */ > +struct aer_info { > > /* > - * Fields for all AER capable devices. They indicate the errors > + * Stats for all AER capable devices. They indicate the errors > * "as seen by this device". Note that this may mean that if an > * end point is causing problems, the AER counters may increment > * at its link partner (e.g. root port) because the errors will be > @@ -76,7 +76,7 @@ struct aer_stats { > u64 dev_total_nonfatal_errs; > > /* > - * Fields for Root ports & root complex event collectors only, these > + * Stats for Root ports & root complex event collectors only, these > * indicate the total number of ERR_COR, ERR_FATAL, and ERR_NONFATAL > * messages received by the root port / event collector, INCLUDING the > * ones that are generated internally (by the rootport itself) > @@ -373,7 +373,7 @@ void pci_aer_init(struct pci_dev *dev) > if (!dev->aer_cap) > return; > > - dev->aer_stats = kzalloc(sizeof(struct aer_stats), GFP_KERNEL); > + dev->aer_info = kzalloc(sizeof(struct aer_info), GFP_KERNEL); > > /* > * We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER, > @@ -394,8 +394,8 @@ void pci_aer_init(struct pci_dev *dev) > > void pci_aer_exit(struct pci_dev *dev) > { > - kfree(dev->aer_stats); > - dev->aer_stats = NULL; > + kfree(dev->aer_info); > + dev->aer_info = NULL; > } > > #define AER_AGENT_RECEIVER 0 > @@ -533,10 +533,10 @@ static const char *aer_agent_string[] = { > { \ > unsigned int i; \ > struct pci_dev *pdev = to_pci_dev(dev); \ > - u64 *stats = pdev->aer_stats->stats_array; \ > + u64 *stats = pdev->aer_info->stats_array; \ > size_t len = 0; \ > \ > - for (i = 0; i < ARRAY_SIZE(pdev->aer_stats->stats_array); i++) {\ > + for (i = 0; i < ARRAY_SIZE(pdev->aer_info->stats_array); i++) {\ > if (strings_array[i]) \ > len += sysfs_emit_at(buf, len, "%s %llu\n", \ > strings_array[i], \ > @@ -547,7 +547,7 @@ static const char *aer_agent_string[] = { > i, stats[i]); \ > } \ > len += sysfs_emit_at(buf, len, "TOTAL_%s %llu\n", total_string, \ > - pdev->aer_stats->total_field); \ > + pdev->aer_info->total_field); \ > return len; \ > } \ > static DEVICE_ATTR_RO(name) > @@ -568,7 +568,7 @@ aer_stats_dev_attr(aer_dev_nonfatal, dev_nonfatal_errs, > char *buf) \ > { \ > struct pci_dev *pdev = to_pci_dev(dev); \ > - return sysfs_emit(buf, "%llu\n", pdev->aer_stats->field); \ > + return sysfs_emit(buf, "%llu\n", pdev->aer_info->field); \ > } \ > static DEVICE_ATTR_RO(name) > > @@ -595,7 +595,7 @@ static umode_t aer_stats_attrs_are_visible(struct kobject *kobj, > struct device *dev = kobj_to_dev(kobj); > struct pci_dev *pdev = to_pci_dev(dev); > > - if (!pdev->aer_stats) > + if (!pdev->aer_info) > return 0; > > if ((a == &dev_attr_aer_rootport_total_err_cor.attr || > @@ -619,25 +619,25 @@ static void pci_dev_aer_stats_incr(struct pci_dev *pdev, > unsigned long status = info->status & ~info->mask; > int i, max = -1; > u64 *counter = NULL; > - struct aer_stats *aer_stats = pdev->aer_stats; > + struct aer_info *aer_info = pdev->aer_info; > > - if (!aer_stats) > + if (!aer_info) > return; > > switch (info->severity) { > case AER_CORRECTABLE: > - aer_stats->dev_total_cor_errs++; > - counter = &aer_stats->dev_cor_errs[0]; > + aer_info->dev_total_cor_errs++; > + counter = &aer_info->dev_cor_errs[0]; > max = AER_MAX_TYPEOF_COR_ERRS; > break; > case AER_NONFATAL: > - aer_stats->dev_total_nonfatal_errs++; > - counter = &aer_stats->dev_nonfatal_errs[0]; > + aer_info->dev_total_nonfatal_errs++; > + counter = &aer_info->dev_nonfatal_errs[0]; > max = AER_MAX_TYPEOF_UNCOR_ERRS; > break; > case AER_FATAL: > - aer_stats->dev_total_fatal_errs++; > - counter = &aer_stats->dev_fatal_errs[0]; > + aer_info->dev_total_fatal_errs++; > + counter = &aer_info->dev_fatal_errs[0]; > max = AER_MAX_TYPEOF_UNCOR_ERRS; > break; > } > @@ -649,19 +649,19 @@ static void pci_dev_aer_stats_incr(struct pci_dev *pdev, > static void pci_rootport_aer_stats_incr(struct pci_dev *pdev, > struct aer_err_source *e_src) > { > - struct aer_stats *aer_stats = pdev->aer_stats; > + struct aer_info *aer_info = pdev->aer_info; > > - if (!aer_stats) > + if (!aer_info) > return; > > if (e_src->status & PCI_ERR_ROOT_COR_RCV) > - aer_stats->rootport_total_cor_errs++; > + aer_info->rootport_total_cor_errs++; > > if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) { > if (e_src->status & PCI_ERR_ROOT_FATAL_RCV) > - aer_stats->rootport_total_fatal_errs++; > + aer_info->rootport_total_fatal_errs++; > else > - aer_stats->rootport_total_nonfatal_errs++; > + aer_info->rootport_total_nonfatal_errs++; > } > } > > diff --git a/include/linux/pci.h b/include/linux/pci.h > index db9b47ce3eef..72e6f5560164 100644 > --- a/include/linux/pci.h > +++ b/include/linux/pci.h > @@ -346,7 +346,7 @@ struct pci_dev { > u8 hdr_type; /* PCI header type (`multi' flag masked out) */ > #ifdef CONFIG_PCIEAER > u16 aer_cap; /* AER capability offset */ > - struct aer_stats *aer_stats; /* AER stats for this device */ > + struct aer_info *aer_info; /* AER info for this device */ > #endif > #ifdef CONFIG_PCIEPORTBUS > struct rcec_ea *rcec_ea; /* RCEC cached endpoint association */
On Thu, Jan 16, 2025 at 2:12 AM Karolina Stolarek <karolina.stolarek@oracle.com> wrote: > On 15/01/2025 08:42, Jon Pan-Doh wrote: > > Update name to reflect the broader definition of structs/variables that > > are stored (e.g. ratelimits). > > I understand the intention behind this change, but I'm not fully > convinced if we should mix AER error attributes with the tools to > control error reporting/generation (ratelimits). I'd argue that > "aer_info" name still doesn't express what the structure does. > > aer_stats sits in pci_dev, so I can see why you decided to use it; it's > one of the few available places where we could keep a stateful ratelimit. > > How about creating a struct to keep all the ratelimits in one place and > embedding that in the pci_dev? In a previous version, I had a separate struct aer_ratelimits in pci_dev (similar to your patch[1]): @@ -346,7 +346,7 @@ struct pci_dev { u8 hdr_type; /* PCI header type (`multi' flag masked out) */ #ifdef CONFIG_PCIEAER u16 aer_cap; /* AER capability offset */ struct aer_stats *aer_stats; /* AER stats for this device */ + struct aer_ratelimits *aer_ratelimits; /* AER ratelimits for this device */ However, in an internal review, Bjorn said: > I don't think we need to burn two pointers here since we always populate both at the > same time. Perhaps combine the stats and ratelimits into a single "struct aer_info" or > similar. It seems like the preference is to minimize additional memory in pci_dev. Any suggestions for a more appropriate name than "aer_info"? Otherwise, is this a good enough justification to maintain two pointers Bjorn? Thanks, Jon [1] https://lore.kernel.org/linux-pci/68ef082c855b4e1d094dcfc9a861f43488b64922.1736341506.git.karolina.stolarek@oracle.com/#Z31include:linux:pci.h
On 18/01/2025 02:59, Jon Pan-Doh wrote: > > In a previous version, I had a separate struct aer_ratelimits in > pci_dev (similar to your patch[1]): > > @@ -346,7 +346,7 @@ struct pci_dev { > u8 hdr_type; /* PCI header type (`multi' > flag masked out) */ > #ifdef CONFIG_PCIEAER > u16 aer_cap; /* AER capability offset */ > struct aer_stats *aer_stats; /* AER stats for this device */ > + struct aer_ratelimits *aer_ratelimits; /* AER ratelimits > for this device */ > > However, in an internal review, Bjorn said: >> I don't think we need to burn two pointers here since we always populate both at the >> same time. Perhaps combine the stats and ratelimits into a single "struct aer_info" or >> similar. I see, thanks for sharing it here. My thinking is that it's preferable to have small and specialized structs, with one used for collecting/showing error info and the other to control the rates at which they are reported and generated. I'd propose a similar change to the one above. > It seems like the preference is to minimize additional memory in > pci_dev. Any suggestions for a more appropriate name than "aer_info"? Like I said, my preference would be to leave aer_stats as it is, but if we have to stick to one struct, I'd go with "aer_control" or "aer_report". All the best, Karolina > > Otherwise, is this a good enough justification to maintain two pointers Bjorn? > > Thanks, > Jon > > [1] https://lore.kernel.org/linux-pci/68ef082c855b4e1d094dcfc9a861f43488b64922.1736341506.git.karolina.stolarek@oracle.com/#Z31include:linux:pci.h
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 4bb0b3840402..5ab5cd7368bc 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -50,11 +50,11 @@ struct aer_rpc { DECLARE_KFIFO(aer_fifo, struct aer_err_source, AER_ERROR_SOURCES_MAX); }; -/* AER stats for the device */ -struct aer_stats { +/* AER info for the device */ +struct aer_info { /* - * Fields for all AER capable devices. They indicate the errors + * Stats for all AER capable devices. They indicate the errors * "as seen by this device". Note that this may mean that if an * end point is causing problems, the AER counters may increment * at its link partner (e.g. root port) because the errors will be @@ -76,7 +76,7 @@ struct aer_stats { u64 dev_total_nonfatal_errs; /* - * Fields for Root ports & root complex event collectors only, these + * Stats for Root ports & root complex event collectors only, these * indicate the total number of ERR_COR, ERR_FATAL, and ERR_NONFATAL * messages received by the root port / event collector, INCLUDING the * ones that are generated internally (by the rootport itself) @@ -373,7 +373,7 @@ void pci_aer_init(struct pci_dev *dev) if (!dev->aer_cap) return; - dev->aer_stats = kzalloc(sizeof(struct aer_stats), GFP_KERNEL); + dev->aer_info = kzalloc(sizeof(struct aer_info), GFP_KERNEL); /* * We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER, @@ -394,8 +394,8 @@ void pci_aer_init(struct pci_dev *dev) void pci_aer_exit(struct pci_dev *dev) { - kfree(dev->aer_stats); - dev->aer_stats = NULL; + kfree(dev->aer_info); + dev->aer_info = NULL; } #define AER_AGENT_RECEIVER 0 @@ -533,10 +533,10 @@ static const char *aer_agent_string[] = { { \ unsigned int i; \ struct pci_dev *pdev = to_pci_dev(dev); \ - u64 *stats = pdev->aer_stats->stats_array; \ + u64 *stats = pdev->aer_info->stats_array; \ size_t len = 0; \ \ - for (i = 0; i < ARRAY_SIZE(pdev->aer_stats->stats_array); i++) {\ + for (i = 0; i < ARRAY_SIZE(pdev->aer_info->stats_array); i++) {\ if (strings_array[i]) \ len += sysfs_emit_at(buf, len, "%s %llu\n", \ strings_array[i], \ @@ -547,7 +547,7 @@ static const char *aer_agent_string[] = { i, stats[i]); \ } \ len += sysfs_emit_at(buf, len, "TOTAL_%s %llu\n", total_string, \ - pdev->aer_stats->total_field); \ + pdev->aer_info->total_field); \ return len; \ } \ static DEVICE_ATTR_RO(name) @@ -568,7 +568,7 @@ aer_stats_dev_attr(aer_dev_nonfatal, dev_nonfatal_errs, char *buf) \ { \ struct pci_dev *pdev = to_pci_dev(dev); \ - return sysfs_emit(buf, "%llu\n", pdev->aer_stats->field); \ + return sysfs_emit(buf, "%llu\n", pdev->aer_info->field); \ } \ static DEVICE_ATTR_RO(name) @@ -595,7 +595,7 @@ static umode_t aer_stats_attrs_are_visible(struct kobject *kobj, struct device *dev = kobj_to_dev(kobj); struct pci_dev *pdev = to_pci_dev(dev); - if (!pdev->aer_stats) + if (!pdev->aer_info) return 0; if ((a == &dev_attr_aer_rootport_total_err_cor.attr || @@ -619,25 +619,25 @@ static void pci_dev_aer_stats_incr(struct pci_dev *pdev, unsigned long status = info->status & ~info->mask; int i, max = -1; u64 *counter = NULL; - struct aer_stats *aer_stats = pdev->aer_stats; + struct aer_info *aer_info = pdev->aer_info; - if (!aer_stats) + if (!aer_info) return; switch (info->severity) { case AER_CORRECTABLE: - aer_stats->dev_total_cor_errs++; - counter = &aer_stats->dev_cor_errs[0]; + aer_info->dev_total_cor_errs++; + counter = &aer_info->dev_cor_errs[0]; max = AER_MAX_TYPEOF_COR_ERRS; break; case AER_NONFATAL: - aer_stats->dev_total_nonfatal_errs++; - counter = &aer_stats->dev_nonfatal_errs[0]; + aer_info->dev_total_nonfatal_errs++; + counter = &aer_info->dev_nonfatal_errs[0]; max = AER_MAX_TYPEOF_UNCOR_ERRS; break; case AER_FATAL: - aer_stats->dev_total_fatal_errs++; - counter = &aer_stats->dev_fatal_errs[0]; + aer_info->dev_total_fatal_errs++; + counter = &aer_info->dev_fatal_errs[0]; max = AER_MAX_TYPEOF_UNCOR_ERRS; break; } @@ -649,19 +649,19 @@ static void pci_dev_aer_stats_incr(struct pci_dev *pdev, static void pci_rootport_aer_stats_incr(struct pci_dev *pdev, struct aer_err_source *e_src) { - struct aer_stats *aer_stats = pdev->aer_stats; + struct aer_info *aer_info = pdev->aer_info; - if (!aer_stats) + if (!aer_info) return; if (e_src->status & PCI_ERR_ROOT_COR_RCV) - aer_stats->rootport_total_cor_errs++; + aer_info->rootport_total_cor_errs++; if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) { if (e_src->status & PCI_ERR_ROOT_FATAL_RCV) - aer_stats->rootport_total_fatal_errs++; + aer_info->rootport_total_fatal_errs++; else - aer_stats->rootport_total_nonfatal_errs++; + aer_info->rootport_total_nonfatal_errs++; } } diff --git a/include/linux/pci.h b/include/linux/pci.h index db9b47ce3eef..72e6f5560164 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -346,7 +346,7 @@ struct pci_dev { u8 hdr_type; /* PCI header type (`multi' flag masked out) */ #ifdef CONFIG_PCIEAER u16 aer_cap; /* AER capability offset */ - struct aer_stats *aer_stats; /* AER stats for this device */ + struct aer_info *aer_info; /* AER info for this device */ #endif #ifdef CONFIG_PCIEPORTBUS struct rcec_ea *rcec_ea; /* RCEC cached endpoint association */
Update name to reflect the broader definition of structs/variables that are stored (e.g. ratelimits). Signed-off-by: Jon Pan-Doh <pandoh@google.com> --- drivers/pci/pcie/aer.c | 50 +++++++++++++++++++++--------------------- include/linux/pci.h | 2 +- 2 files changed, 26 insertions(+), 26 deletions(-)