Message ID | 20180620234147.48438-1-rajatja@google.com (mailing list archive) |
---|---|
State | New, archived |
Delegated to: | Bjorn Helgaas |
Headers | show |
On Wed, Jun 20, 2018 at 04:41:43PM -0700, Rajat Jain wrote: > Define a structure to hold the AER statistics. There are 2 groups > of statistics: dev_* counters that are to be collected for all AER > capable devices and rootport_* counters that are collected for all > (AER capable) rootports only. Allocate and free this structure when > device is added or released (thus counters survive the lifetime of the > device). > > Signed-off-by: Rajat Jain <rajatja@google.com> > --- > v5: Same as v4 > v4: Same as v3 > v3: Merge everything in aer.c > > drivers/pci/pcie/aer.c | 60 ++++++++++++++++++++++++++++++++++++++++++ > drivers/pci/probe.c | 1 + > include/linux/pci.h | 3 +++ > 3 files changed, 64 insertions(+) > > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c > index a2e88386af28..f9fa994b6c33 100644 > --- a/drivers/pci/pcie/aer.c > +++ b/drivers/pci/pcie/aer.c > @@ -33,6 +33,9 @@ > #define AER_ERROR_SOURCES_MAX 100 > #define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */ > > +#define AER_MAX_TYPEOF_CORRECTABLE_ERRS 16 /* as per PCI_ERR_COR_STATUS */ > +#define AER_MAX_TYPEOF_UNCORRECTABLE_ERRS 26 /* as per PCI_ERR_UNCOR_STATUS*/ > + > struct aer_err_info { > struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES]; > int error_dev_num; > @@ -76,6 +79,40 @@ struct aer_rpc { > */ > }; > > +/* AER stats for the device */ > +struct aer_stats { > + > + /* > + * Fields for all AER capable devices. They indicate the errors > + * "as seen by this device". Note that this may mean that if an > + * end point is causing problems, the AER counters may increment > + * at its link partner (e.g. root port) because the errors will be > + * "seen" by the link partner and not the the problematic end point > + * itself (which may report all counters as 0 as it never saw any > + * problems). > + */ > + /* Individual counters for different type of correctable errors */ > + u64 dev_cor_errs[AER_MAX_TYPEOF_CORRECTABLE_ERRS]; > + /* Individual counters for different type of uncorrectable errors */ > + u64 dev_uncor_errs[AER_MAX_TYPEOF_UNCORRECTABLE_ERRS]; > + /* Total number of correctable errors seen by this device */ > + u64 dev_total_cor_errs; > + /* Total number of fatal uncorrectable errors seen by this device */ > + u64 dev_total_fatal_errs; > + /* Total number of fatal uncorrectable errors seen by this device */ > + u64 dev_total_nonfatal_errs; > + > + /* > + * Fields for Root ports only, these indicate the total number of > + * ERR_COR, ERR_FATAL, and ERR_NONFATAL messages received by the > + * rootport, INCLUDING the ones that are generated internally (by > + * the rootport itself) Strictly speaking, I think these are applicable for both root ports and root complex event collectors, right? > + */ > + u64 rootport_total_cor_errs; > + u64 rootport_total_fatal_errs; > + u64 rootport_total_nonfatal_errs; > +}; > + > #define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \ > PCI_ERR_UNC_ECRC| \ > PCI_ERR_UNC_UNSUP| \ > @@ -402,12 +439,35 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev) > return 0; > } > > +static int pci_aer_stats_init(struct pci_dev *pdev) > +{ > + pdev->aer_stats = kzalloc(sizeof(struct aer_stats), GFP_KERNEL); > + if (!pdev->aer_stats) { > + dev_err(&pdev->dev, "No memory for aer_stats\n"); pci_err(), if we need the message at all. Based on c7abb2352c29 ("PCI: Remove unnecessary messages for memory allocation failures"), I'd be inclined to drop the message completely. > + return -ENOMEM; > + } > + return 0; > +} > + > +static void pci_aer_stats_exit(struct pci_dev *pdev) > +{ > + kfree(pdev->aer_stats); > + pdev->aer_stats = NULL; > +} > + > int pci_aer_init(struct pci_dev *dev) > { > dev->aer_cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); > + if (!dev->aer_cap || pci_aer_stats_init(dev)) > + return -EIO; This skips pci_cleanup_aer_error_status_regs() if the kzalloc() fails. I think we should still do pci_cleanup_aer_error_status_regs(), even if the alloc fails. Nobody checks the return value of pci_aer_init(), so I think you can simplify this by making these functions void. Maybe even squash them together, i.e., do the kzalloc() directly in pci_aer_init() and the kfree() directly in pci_aer_exit()? > return pci_cleanup_aer_error_status_regs(dev); > } > > +void pci_aer_exit(struct pci_dev *dev) > +{ > + pci_aer_stats_exit(dev); > +} > + > #define AER_AGENT_RECEIVER 0 > #define AER_AGENT_REQUESTER 1 > #define AER_AGENT_COMPLETER 2 > diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c > index ac876e32de4b..48edd0c9e4bc 100644 > --- a/drivers/pci/probe.c > +++ b/drivers/pci/probe.c > @@ -2064,6 +2064,7 @@ static void pci_configure_device(struct pci_dev *dev) > > static void pci_release_capabilities(struct pci_dev *dev) > { > + pci_aer_exit(dev); > pci_vpd_release(dev); > pci_iov_release(dev); > pci_free_cap_save_buffers(dev); > diff --git a/include/linux/pci.h b/include/linux/pci.h > index 340029b2fb38..8d59c6c19a19 100644 > --- a/include/linux/pci.h > +++ b/include/linux/pci.h > @@ -299,6 +299,7 @@ struct pci_dev { > u8 hdr_type; /* PCI header type (`multi' flag masked out) */ > #ifdef CONFIG_PCIEAER > u16 aer_cap; /* AER capability offset */ > + struct aer_stats *aer_stats; /* AER stats for this device */ > #endif > u8 pcie_cap; /* PCIe capability offset */ > u8 msi_cap; /* MSI capability offset */ > @@ -1471,10 +1472,12 @@ static inline bool pcie_aspm_support_enabled(void) { return false; } > void pci_no_aer(void); > bool pci_aer_available(void); > int pci_aer_init(struct pci_dev *dev); > +void pci_aer_exit(struct pci_dev *dev); With the exception of pci_aer_available(), these are only used inside drivers/pci. This might be a good opportunity to move those private things to drivers/pci/pci.h (in a separate patch, of course). > #else > static inline void pci_no_aer(void) { } > static inline bool pci_aer_available(void) { return false; } > static inline int pci_aer_init(struct pci_dev *d) { return -ENODEV; } > +static inline void pci_aer_exit(struct pci_dev *d) { } > #endif > > #ifdef CONFIG_PCIE_ECRC > -- > 2.18.0.rc1.244.gcf134e6275-goog >
On Thu, Jun 21, 2018 at 6:17 AM, Bjorn Helgaas <helgaas@kernel.org> wrote: > On Wed, Jun 20, 2018 at 04:41:43PM -0700, Rajat Jain wrote: >> Define a structure to hold the AER statistics. There are 2 groups >> of statistics: dev_* counters that are to be collected for all AER >> capable devices and rootport_* counters that are collected for all >> (AER capable) rootports only. Allocate and free this structure when >> device is added or released (thus counters survive the lifetime of the >> device). >> >> Signed-off-by: Rajat Jain <rajatja@google.com> >> --- >> v5: Same as v4 >> v4: Same as v3 >> v3: Merge everything in aer.c >> >> drivers/pci/pcie/aer.c | 60 ++++++++++++++++++++++++++++++++++++++++++ >> drivers/pci/probe.c | 1 + >> include/linux/pci.h | 3 +++ >> 3 files changed, 64 insertions(+) >> >> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c >> index a2e88386af28..f9fa994b6c33 100644 >> --- a/drivers/pci/pcie/aer.c >> +++ b/drivers/pci/pcie/aer.c >> @@ -33,6 +33,9 @@ >> #define AER_ERROR_SOURCES_MAX 100 >> #define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */ >> >> +#define AER_MAX_TYPEOF_CORRECTABLE_ERRS 16 /* as per PCI_ERR_COR_STATUS */ >> +#define AER_MAX_TYPEOF_UNCORRECTABLE_ERRS 26 /* as per PCI_ERR_UNCOR_STATUS*/ >> + >> struct aer_err_info { >> struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES]; >> int error_dev_num; >> @@ -76,6 +79,40 @@ struct aer_rpc { >> */ >> }; >> >> +/* AER stats for the device */ >> +struct aer_stats { >> + >> + /* >> + * Fields for all AER capable devices. They indicate the errors >> + * "as seen by this device". Note that this may mean that if an >> + * end point is causing problems, the AER counters may increment >> + * at its link partner (e.g. root port) because the errors will be >> + * "seen" by the link partner and not the the problematic end point >> + * itself (which may report all counters as 0 as it never saw any >> + * problems). >> + */ >> + /* Individual counters for different type of correctable errors */ >> + u64 dev_cor_errs[AER_MAX_TYPEOF_CORRECTABLE_ERRS]; >> + /* Individual counters for different type of uncorrectable errors */ >> + u64 dev_uncor_errs[AER_MAX_TYPEOF_UNCORRECTABLE_ERRS]; >> + /* Total number of correctable errors seen by this device */ >> + u64 dev_total_cor_errs; >> + /* Total number of fatal uncorrectable errors seen by this device */ >> + u64 dev_total_fatal_errs; >> + /* Total number of fatal uncorrectable errors seen by this device */ >> + u64 dev_total_nonfatal_errs; >> + >> + /* >> + * Fields for Root ports only, these indicate the total number of >> + * ERR_COR, ERR_FATAL, and ERR_NONFATAL messages received by the >> + * rootport, INCLUDING the ones that are generated internally (by >> + * the rootport itself) > > Strictly speaking, I think these are applicable for both root ports > and root complex event collectors, right? Correct, I will reword this comment to state this. > >> + */ >> + u64 rootport_total_cor_errs; >> + u64 rootport_total_fatal_errs; >> + u64 rootport_total_nonfatal_errs; >> +}; >> + >> #define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \ >> PCI_ERR_UNC_ECRC| \ >> PCI_ERR_UNC_UNSUP| \ >> @@ -402,12 +439,35 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev) >> return 0; >> } >> >> +static int pci_aer_stats_init(struct pci_dev *pdev) >> +{ >> + pdev->aer_stats = kzalloc(sizeof(struct aer_stats), GFP_KERNEL); >> + if (!pdev->aer_stats) { >> + dev_err(&pdev->dev, "No memory for aer_stats\n"); > > pci_err(), if we need the message at all. > > Based on c7abb2352c29 ("PCI: Remove unnecessary messages for memory > allocation failures"), I'd be inclined to drop the message completely. Will do. > >> + return -ENOMEM; >> + } >> + return 0; >> +} >> + >> +static void pci_aer_stats_exit(struct pci_dev *pdev) >> +{ >> + kfree(pdev->aer_stats); >> + pdev->aer_stats = NULL; >> +} >> + >> int pci_aer_init(struct pci_dev *dev) >> { >> dev->aer_cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); >> + if (!dev->aer_cap || pci_aer_stats_init(dev)) >> + return -EIO; > > This skips pci_cleanup_aer_error_status_regs() if the kzalloc() fails. > I think we should still do pci_cleanup_aer_error_status_regs(), even > if the alloc fails. Will do. > > Nobody checks the return value of pci_aer_init(), so I think you can > simplify this by making these functions void. Will do. > > Maybe even squash them together, i.e., do the kzalloc() directly in > pci_aer_init() and the kfree() directly in pci_aer_exit()? Will do. > >> return pci_cleanup_aer_error_status_regs(dev); >> } >> >> +void pci_aer_exit(struct pci_dev *dev) >> +{ >> + pci_aer_stats_exit(dev); >> +} >> + >> #define AER_AGENT_RECEIVER 0 >> #define AER_AGENT_REQUESTER 1 >> #define AER_AGENT_COMPLETER 2 >> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c >> index ac876e32de4b..48edd0c9e4bc 100644 >> --- a/drivers/pci/probe.c >> +++ b/drivers/pci/probe.c >> @@ -2064,6 +2064,7 @@ static void pci_configure_device(struct pci_dev *dev) >> >> static void pci_release_capabilities(struct pci_dev *dev) >> { >> + pci_aer_exit(dev); >> pci_vpd_release(dev); >> pci_iov_release(dev); >> pci_free_cap_save_buffers(dev); >> diff --git a/include/linux/pci.h b/include/linux/pci.h >> index 340029b2fb38..8d59c6c19a19 100644 >> --- a/include/linux/pci.h >> +++ b/include/linux/pci.h >> @@ -299,6 +299,7 @@ struct pci_dev { >> u8 hdr_type; /* PCI header type (`multi' flag masked out) */ >> #ifdef CONFIG_PCIEAER >> u16 aer_cap; /* AER capability offset */ >> + struct aer_stats *aer_stats; /* AER stats for this device */ >> #endif >> u8 pcie_cap; /* PCIe capability offset */ >> u8 msi_cap; /* MSI capability offset */ >> @@ -1471,10 +1472,12 @@ static inline bool pcie_aspm_support_enabled(void) { return false; } >> void pci_no_aer(void); >> bool pci_aer_available(void); >> int pci_aer_init(struct pci_dev *dev); >> +void pci_aer_exit(struct pci_dev *dev); > > With the exception of pci_aer_available(), these are only used inside > drivers/pci. This might be a good opportunity to move those private > things to drivers/pci/pci.h (in a separate patch, of course). Will do. > >> #else >> static inline void pci_no_aer(void) { } >> static inline bool pci_aer_available(void) { return false; } >> static inline int pci_aer_init(struct pci_dev *d) { return -ENODEV; } >> +static inline void pci_aer_exit(struct pci_dev *d) { } >> #endif >> >> #ifdef CONFIG_PCIE_ECRC >> -- >> 2.18.0.rc1.244.gcf134e6275-goog >>
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index a2e88386af28..f9fa994b6c33 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -33,6 +33,9 @@ #define AER_ERROR_SOURCES_MAX 100 #define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */ +#define AER_MAX_TYPEOF_CORRECTABLE_ERRS 16 /* as per PCI_ERR_COR_STATUS */ +#define AER_MAX_TYPEOF_UNCORRECTABLE_ERRS 26 /* as per PCI_ERR_UNCOR_STATUS*/ + struct aer_err_info { struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES]; int error_dev_num; @@ -76,6 +79,40 @@ struct aer_rpc { */ }; +/* AER stats for the device */ +struct aer_stats { + + /* + * Fields for all AER capable devices. They indicate the errors + * "as seen by this device". Note that this may mean that if an + * end point is causing problems, the AER counters may increment + * at its link partner (e.g. root port) because the errors will be + * "seen" by the link partner and not the the problematic end point + * itself (which may report all counters as 0 as it never saw any + * problems). + */ + /* Individual counters for different type of correctable errors */ + u64 dev_cor_errs[AER_MAX_TYPEOF_CORRECTABLE_ERRS]; + /* Individual counters for different type of uncorrectable errors */ + u64 dev_uncor_errs[AER_MAX_TYPEOF_UNCORRECTABLE_ERRS]; + /* Total number of correctable errors seen by this device */ + u64 dev_total_cor_errs; + /* Total number of fatal uncorrectable errors seen by this device */ + u64 dev_total_fatal_errs; + /* Total number of fatal uncorrectable errors seen by this device */ + u64 dev_total_nonfatal_errs; + + /* + * Fields for Root ports only, these indicate the total number of + * ERR_COR, ERR_FATAL, and ERR_NONFATAL messages received by the + * rootport, INCLUDING the ones that are generated internally (by + * the rootport itself) + */ + u64 rootport_total_cor_errs; + u64 rootport_total_fatal_errs; + u64 rootport_total_nonfatal_errs; +}; + #define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \ PCI_ERR_UNC_ECRC| \ PCI_ERR_UNC_UNSUP| \ @@ -402,12 +439,35 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev) return 0; } +static int pci_aer_stats_init(struct pci_dev *pdev) +{ + pdev->aer_stats = kzalloc(sizeof(struct aer_stats), GFP_KERNEL); + if (!pdev->aer_stats) { + dev_err(&pdev->dev, "No memory for aer_stats\n"); + return -ENOMEM; + } + return 0; +} + +static void pci_aer_stats_exit(struct pci_dev *pdev) +{ + kfree(pdev->aer_stats); + pdev->aer_stats = NULL; +} + int pci_aer_init(struct pci_dev *dev) { dev->aer_cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); + if (!dev->aer_cap || pci_aer_stats_init(dev)) + return -EIO; return pci_cleanup_aer_error_status_regs(dev); } +void pci_aer_exit(struct pci_dev *dev) +{ + pci_aer_stats_exit(dev); +} + #define AER_AGENT_RECEIVER 0 #define AER_AGENT_REQUESTER 1 #define AER_AGENT_COMPLETER 2 diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index ac876e32de4b..48edd0c9e4bc 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2064,6 +2064,7 @@ static void pci_configure_device(struct pci_dev *dev) static void pci_release_capabilities(struct pci_dev *dev) { + pci_aer_exit(dev); pci_vpd_release(dev); pci_iov_release(dev); pci_free_cap_save_buffers(dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index 340029b2fb38..8d59c6c19a19 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -299,6 +299,7 @@ struct pci_dev { u8 hdr_type; /* PCI header type (`multi' flag masked out) */ #ifdef CONFIG_PCIEAER u16 aer_cap; /* AER capability offset */ + struct aer_stats *aer_stats; /* AER stats for this device */ #endif u8 pcie_cap; /* PCIe capability offset */ u8 msi_cap; /* MSI capability offset */ @@ -1471,10 +1472,12 @@ static inline bool pcie_aspm_support_enabled(void) { return false; } void pci_no_aer(void); bool pci_aer_available(void); int pci_aer_init(struct pci_dev *dev); +void pci_aer_exit(struct pci_dev *dev); #else static inline void pci_no_aer(void) { } static inline bool pci_aer_available(void) { return false; } static inline int pci_aer_init(struct pci_dev *d) { return -ENODEV; } +static inline void pci_aer_exit(struct pci_dev *d) { } #endif #ifdef CONFIG_PCIE_ECRC
Define a structure to hold the AER statistics. There are 2 groups of statistics: dev_* counters that are to be collected for all AER capable devices and rootport_* counters that are collected for all (AER capable) rootports only. Allocate and free this structure when device is added or released (thus counters survive the lifetime of the device). Signed-off-by: Rajat Jain <rajatja@google.com> --- v5: Same as v4 v4: Same as v3 v3: Merge everything in aer.c drivers/pci/pcie/aer.c | 60 ++++++++++++++++++++++++++++++++++++++++++ drivers/pci/probe.c | 1 + include/linux/pci.h | 3 +++ 3 files changed, 64 insertions(+)