diff mbox series

[v2,3/8] PCI/AER: Move AER stat collection out of __aer_print_error

Message ID 20250214023543.992372-4-pandoh@google.com (mailing list archive)
State New
Delegated to: Bjorn Helgaas
Headers show
Series Rate limit AER logs | expand

Commit Message

Jon Pan-Doh Feb. 14, 2025, 2:35 a.m. UTC
Decouple stat collection from internal AER print functions. AERs from ghes
or cxl drivers have stat collection in pci_print_aer as that is where
aer_err_info is populated.

Tested using aer-inject[1]. AER sysfs counters still updated correctly.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/gong.chen/aer-inject.git

Signed-off-by: Jon Pan-Doh <pandoh@google.com>
---
 drivers/pci/pci.h      |  1 +
 drivers/pci/pcie/aer.c | 10 ++++++----
 drivers/pci/pcie/dpc.c |  1 +
 3 files changed, 8 insertions(+), 4 deletions(-)

Comments

Karolina Stolarek Feb. 17, 2025, 11:29 a.m. UTC | #1
On 14/02/2025 03:35, Jon Pan-Doh wrote:
> Decouple stat collection from internal AER print functions. AERs from ghes
> or cxl drivers have stat collection in pci_print_aer as that is where
> aer_err_info is populated.
> 
> Tested using aer-inject[1]. AER sysfs counters still updated correctly.

I don't think we have to mention that it was tested. In other patches 
you mention specific examples that illustrate the change nicely, but we 
don't get the same value from the statement above.

> 
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/gong.chen/aer-inject.git
> 
> Signed-off-by: Jon Pan-Doh <pandoh@google.com>
> ---
>   drivers/pci/pci.h      |  1 +
>   drivers/pci/pcie/aer.c | 10 ++++++----
>   drivers/pci/pcie/dpc.c |  1 +
>   3 files changed, 8 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 8cb816ee5388..26104aee06c0 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -550,6 +550,7 @@ struct aer_err_info {
>   };
>   
>   int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
> +void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info);
>   void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, const char *level);
>   
>   int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2,
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index f1fdaa052cf6..d6edb95d468f 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -617,8 +617,7 @@ const struct attribute_group aer_stats_attr_group = {
>   	.is_visible = aer_stats_attrs_are_visible,
>   };
>   
> -static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
> -				   struct aer_err_info *info)
> +void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info)
>   {
>   	unsigned long status = info->status & ~info->mask;
>   	int i, max = -1;
> @@ -691,7 +690,6 @@ static void __aer_print_error(struct pci_dev *dev,
>   		aer_printk(level, dev, "   [%2d] %-22s%s\n", i, errmsg,
>   				info->first_error == i ? " (First)" : "");
>   	}
> -	pci_dev_aer_stats_incr(dev, info);
>   }
>   
>   void aer_print_error(struct pci_dev *dev, struct aer_err_info *info,
> @@ -772,6 +770,8 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
>   	info.mask = mask;
>   	info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
>   
> +	pci_dev_aer_stats_incr(dev, &info);

With this change, we increment the stats when we iterate the recovery 
queue in ghes_handle_aer. Is there a possibility that in the GHES path 
we would increment the stats twice? First via AER module (aer_isr) and 
then in aer_recover_work_func, or is it either/or?

All the best,
Karolina
Jon Pan-Doh Feb. 19, 2025, 2:48 a.m. UTC | #2
On Mon, Feb 17, 2025 at 3:29 AM Karolina Stolarek
<karolina.stolarek@oracle.com> wrote:
> On 14/02/2025 03:35, Jon Pan-Doh wrote:
> > Tested using aer-inject[1]. AER sysfs counters still updated correctly.
>
> I don't think we have to mention that it was tested. In other patches
> you mention specific examples that illustrate the change nicely, but we
> don't get the same value from the statement above.

Ack. Will omit in v3.

> With this change, we increment the stats when we iterate the recovery
> queue in ghes_handle_aer. Is there a possibility that in the GHES path
> we would increment the stats twice? First via AER module (aer_isr) and
> then in aer_recover_work_func, or is it either/or?

It's either/or. aer_isr deals with native AER handling (i.e. by OS).
However, AER errors from GHES originate from ACPI APEI (i.e. FW
notifies OS of error).

Thanks,
Jon
diff mbox series

Patch

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 8cb816ee5388..26104aee06c0 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -550,6 +550,7 @@  struct aer_err_info {
 };
 
 int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
+void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info);
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, const char *level);
 
 int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2,
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index f1fdaa052cf6..d6edb95d468f 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -617,8 +617,7 @@  const struct attribute_group aer_stats_attr_group = {
 	.is_visible = aer_stats_attrs_are_visible,
 };
 
-static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
-				   struct aer_err_info *info)
+void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info)
 {
 	unsigned long status = info->status & ~info->mask;
 	int i, max = -1;
@@ -691,7 +690,6 @@  static void __aer_print_error(struct pci_dev *dev,
 		aer_printk(level, dev, "   [%2d] %-22s%s\n", i, errmsg,
 				info->first_error == i ? " (First)" : "");
 	}
-	pci_dev_aer_stats_incr(dev, info);
 }
 
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info,
@@ -772,6 +770,8 @@  void pci_print_aer(struct pci_dev *dev, int aer_severity,
 	info.mask = mask;
 	info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
 
+	pci_dev_aer_stats_incr(dev, &info);
+
 	aer_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
 	__aer_print_error(dev, &info, level);
 	aer_printk(level, dev, "aer_layer=%s, aer_agent=%s\n",
@@ -1250,8 +1250,10 @@  static inline void aer_process_err_devices(struct aer_err_info *e_info,
 
 	/* Report all before handle them, not to lost records by reset etc. */
 	for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
-		if (aer_get_device_error_info(e_info->dev[i], e_info))
+		if (aer_get_device_error_info(e_info->dev[i], e_info)) {
+			pci_dev_aer_stats_incr(e_info->dev[i], e_info);
 			aer_print_error(e_info->dev[i], e_info, level);
+		}
 	}
 	for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
 		if (aer_get_device_error_info(e_info->dev[i], e_info))
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index f06fad95f2eb..a85ea76b4dea 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -287,6 +287,7 @@  void dpc_process_error(struct pci_dev *pdev)
 	else if (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR &&
 		 dpc_get_aer_uncorrect_severity(pdev, &info) &&
 		 aer_get_device_error_info(pdev, &info)) {
+		pci_dev_aer_stats_incr(pdev, &info);
 		aer_print_error(pdev, &info, KERN_ERR);
 		pci_aer_clear_nonfatal_status(pdev);
 		pci_aer_clear_fatal_status(pdev);