diff mbox series

[v3,3/8] PCI/AER: Move AER stat collection out of __aer_print_error()

Message ID 20250319084050.366718-4-pandoh@google.com (mailing list archive)
State Superseded
Headers show
Series Rate limit AER logs | expand

Commit Message

Jon Pan-Doh March 19, 2025, 8:40 a.m. UTC
Decouple stat collection from internal AER print functions. AERs from ghes
or cxl drivers have stat collection in pci_print_aer() as that is where
aer_err_info is populated.

Signed-off-by: Jon Pan-Doh <pandoh@google.com>
---
 drivers/pci/pci.h      |  1 +
 drivers/pci/pcie/aer.c | 10 ++++++----
 drivers/pci/pcie/dpc.c |  1 +
 3 files changed, 8 insertions(+), 4 deletions(-)

Comments

Bjorn Helgaas March 19, 2025, 6:19 p.m. UTC | #1
On Wed, Mar 19, 2025 at 01:40:44AM -0700, Jon Pan-Doh wrote:
> Decouple stat collection from internal AER print functions. AERs from ghes
> or cxl drivers have stat collection in pci_print_aer() as that is where
> aer_err_info is populated.

This moves pci_dev_aer_stats_incr() from __aer_print_error() to

  - pci_print_aer(), used by CXL via cxl_handle_rdport_errors() and
    GHES via aer_recover_queue() and aer_recover_work_func()

  - aer_process_err_devices(), used by native AER handling via
    aer_irq(), aer_isr(), aer_isr_one_error(), and

  - dpc_process_error(), used by native DPC handling via dpc_handler()
    and by ACPI EDR Notify events via edr_handle_event()

And the reason for this is ...?

Maybe just to separate stats from printing, which does seem reasonable
to me, although pci_print_aer() is still primarily a printing
function, albeit also an external interface.

In any event, I would like to include the motivation.

> Signed-off-by: Jon Pan-Doh <pandoh@google.com>
> ---
>  drivers/pci/pci.h      |  1 +
>  drivers/pci/pcie/aer.c | 10 ++++++----
>  drivers/pci/pcie/dpc.c |  1 +
>  3 files changed, 8 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 75985b96ecc1..9d63d32f041c 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -551,6 +551,7 @@ struct aer_err_info {
>  };
>  
>  int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
> +void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info);
>  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, const char *level);
>  
>  int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2,
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 7eeaad917134..8e4d4f9326e1 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -617,8 +617,7 @@ const struct attribute_group aer_stats_attr_group = {
>  	.is_visible = aer_stats_attrs_are_visible,
>  };
>  
> -static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
> -				   struct aer_err_info *info)
> +void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info)
>  {
>  	unsigned long status = info->status & ~info->mask;
>  	int i, max = -1;
> @@ -691,7 +690,6 @@ static void __aer_print_error(struct pci_dev *dev,
>  		aer_printk(level, dev, "   [%2d] %-22s%s\n", i, errmsg,
>  				info->first_error == i ? " (First)" : "");
>  	}
> -	pci_dev_aer_stats_incr(dev, info);
>  }
>  
>  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info,
> @@ -784,6 +782,8 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
>  	info.mask = mask;
>  	info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
>  
> +	pci_dev_aer_stats_incr(dev, &info);
> +
>  	aer_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
>  	__aer_print_error(dev, &info, level);
>  	aer_printk(level, dev, "aer_layer=%s, aer_agent=%s\n",
> @@ -1263,8 +1263,10 @@ static inline void aer_process_err_devices(struct aer_err_info *e_info,
>  
>  	/* Report all before handle them, not to lost records by reset etc. */
>  	for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
> -		if (aer_get_device_error_info(e_info->dev[i], e_info))
> +		if (aer_get_device_error_info(e_info->dev[i], e_info)) {
> +			pci_dev_aer_stats_incr(e_info->dev[i], e_info);
>  			aer_print_error(e_info->dev[i], e_info, level);
> +		}
>  	}
>  	for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
>  		if (aer_get_device_error_info(e_info->dev[i], e_info))
> diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
> index 9e4c9ac737a7..81cd6e8ff3a4 100644
> --- a/drivers/pci/pcie/dpc.c
> +++ b/drivers/pci/pcie/dpc.c
> @@ -289,6 +289,7 @@ void dpc_process_error(struct pci_dev *pdev)
>  	else if (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR &&
>  		 dpc_get_aer_uncorrect_severity(pdev, &info) &&
>  		 aer_get_device_error_info(pdev, &info)) {
> +		pci_dev_aer_stats_incr(pdev, &info);
>  		aer_print_error(pdev, &info, KERN_ERR);
>  		pci_aer_clear_nonfatal_status(pdev);
>  		pci_aer_clear_fatal_status(pdev);
> -- 
> 2.49.0.rc1.451.g8f38331e32-goog
>
Sathyanarayanan Kuppuswamy March 20, 2025, 3:22 a.m. UTC | #2
On 3/19/25 1:40 AM, Jon Pan-Doh wrote:
> Decouple stat collection from internal AER print functions. AERs from ghes
> or cxl drivers have stat collection in pci_print_aer() as that is where
> aer_err_info is populated.

pci_print_aer() also seems to do more than printing the AER stat. Why only
care about stat collection in __aer_print_error(). If the motivation is 
to just improve the code readability, I am not sure it is worth the 
effort. Please correct me if my understanding is incorrect.
>
> Signed-off-by: Jon Pan-Doh <pandoh@google.com>
> ---
>   drivers/pci/pci.h      |  1 +
>   drivers/pci/pcie/aer.c | 10 ++++++----
>   drivers/pci/pcie/dpc.c |  1 +
>   3 files changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 75985b96ecc1..9d63d32f041c 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -551,6 +551,7 @@ struct aer_err_info {
>   };
>   
>   int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
> +void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info);
>   void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, const char *level);
>   
>   int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2,
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 7eeaad917134..8e4d4f9326e1 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -617,8 +617,7 @@ const struct attribute_group aer_stats_attr_group = {
>   	.is_visible = aer_stats_attrs_are_visible,
>   };
>   
> -static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
> -				   struct aer_err_info *info)
> +void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info)
>   {
>   	unsigned long status = info->status & ~info->mask;
>   	int i, max = -1;
> @@ -691,7 +690,6 @@ static void __aer_print_error(struct pci_dev *dev,
>   		aer_printk(level, dev, "   [%2d] %-22s%s\n", i, errmsg,
>   				info->first_error == i ? " (First)" : "");
>   	}
> -	pci_dev_aer_stats_incr(dev, info);
>   }
>   
>   void aer_print_error(struct pci_dev *dev, struct aer_err_info *info,
> @@ -784,6 +782,8 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
>   	info.mask = mask;
>   	info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
>   
> +	pci_dev_aer_stats_incr(dev, &info);
> +
>   	aer_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
>   	__aer_print_error(dev, &info, level);
>   	aer_printk(level, dev, "aer_layer=%s, aer_agent=%s\n",
> @@ -1263,8 +1263,10 @@ static inline void aer_process_err_devices(struct aer_err_info *e_info,
>   
>   	/* Report all before handle them, not to lost records by reset etc. */
>   	for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
> -		if (aer_get_device_error_info(e_info->dev[i], e_info))
> +		if (aer_get_device_error_info(e_info->dev[i], e_info)) {
> +			pci_dev_aer_stats_incr(e_info->dev[i], e_info);
>   			aer_print_error(e_info->dev[i], e_info, level);
> +		}
>   	}
>   	for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
>   		if (aer_get_device_error_info(e_info->dev[i], e_info))
> diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
> index 9e4c9ac737a7..81cd6e8ff3a4 100644
> --- a/drivers/pci/pcie/dpc.c
> +++ b/drivers/pci/pcie/dpc.c
> @@ -289,6 +289,7 @@ void dpc_process_error(struct pci_dev *pdev)
>   	else if (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR &&
>   		 dpc_get_aer_uncorrect_severity(pdev, &info) &&
>   		 aer_get_device_error_info(pdev, &info)) {
> +		pci_dev_aer_stats_incr(pdev, &info);
>   		aer_print_error(pdev, &info, KERN_ERR);
>   		pci_aer_clear_nonfatal_status(pdev);
>   		pci_aer_clear_fatal_status(pdev);
Jon Pan-Doh March 20, 2025, 8:27 a.m. UTC | #3
On Wed, Mar 19, 2025 at 11:19 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> This moves pci_dev_aer_stats_incr() from __aer_print_error() to
>
>   - pci_print_aer(), used by CXL via cxl_handle_rdport_errors() and
>     GHES via aer_recover_queue() and aer_recover_work_func()
>
>   - aer_process_err_devices(), used by native AER handling via
>     aer_irq(), aer_isr(), aer_isr_one_error(), and
>
>   - dpc_process_error(), used by native DPC handling via dpc_handler()
>     and by ACPI EDR Notify events via edr_handle_event()
>
> And the reason for this is ...?
>
> Maybe just to separate stats from printing, which does seem reasonable
> to me, although pci_print_aer() is still primarily a printing
> function, albeit also an external interface.

Separating stats from internal print functions allows us to:
- easily add ratelimits to logs while having stats unaffected
- simplify control flow as stats collection is no longer buried

pci_print_aer() is the odd one out, but that should be temporary as
Karolina's log
consolidation patch[1] should allow pci_dev_aer_stats_incr() to be pulled out of
print functions and called after the newly created populate_aer_err_info().

> In any event, I would like to include the motivation.

Ack. Added in v4.

[1] https://lore.kernel.org/linux-pci/8bcb8c9a7b38ce3bdaca5a64fe76f08b0b337511.1742202797.git.karolina.stolarek@oracle.com/

Thanks,
Jon
Jon Pan-Doh March 20, 2025, 8:29 a.m. UTC | #4
On Wed, Mar 19, 2025 at 8:23 PM Sathyanarayanan Kuppuswamy
<sathyanarayanan.kuppuswamy@linux.intel.com> wrote:
> pci_print_aer() also seems to do more than printing the AER stat. Why only
> care about stat collection in __aer_print_error(). If the motivation is
> to just improve the code readability, I am not sure it is worth the
> effort. Please correct me if my understanding is incorrect.

Bjorn had similar concerns. Hopefully my response[1] answers your questions.

[1] https://lore.kernel.org/linux-pci/CAMC_AXW85x_LRT5UTD0C_VvK8WTG6kbfvp5k_7RjnLzGM3Bg-g@mail.gmail.com/

Thanks,
Jon
diff mbox series

Patch

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 75985b96ecc1..9d63d32f041c 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -551,6 +551,7 @@  struct aer_err_info {
 };
 
 int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
+void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info);
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, const char *level);
 
 int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2,
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 7eeaad917134..8e4d4f9326e1 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -617,8 +617,7 @@  const struct attribute_group aer_stats_attr_group = {
 	.is_visible = aer_stats_attrs_are_visible,
 };
 
-static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
-				   struct aer_err_info *info)
+void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info)
 {
 	unsigned long status = info->status & ~info->mask;
 	int i, max = -1;
@@ -691,7 +690,6 @@  static void __aer_print_error(struct pci_dev *dev,
 		aer_printk(level, dev, "   [%2d] %-22s%s\n", i, errmsg,
 				info->first_error == i ? " (First)" : "");
 	}
-	pci_dev_aer_stats_incr(dev, info);
 }
 
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info,
@@ -784,6 +782,8 @@  void pci_print_aer(struct pci_dev *dev, int aer_severity,
 	info.mask = mask;
 	info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
 
+	pci_dev_aer_stats_incr(dev, &info);
+
 	aer_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
 	__aer_print_error(dev, &info, level);
 	aer_printk(level, dev, "aer_layer=%s, aer_agent=%s\n",
@@ -1263,8 +1263,10 @@  static inline void aer_process_err_devices(struct aer_err_info *e_info,
 
 	/* Report all before handle them, not to lost records by reset etc. */
 	for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
-		if (aer_get_device_error_info(e_info->dev[i], e_info))
+		if (aer_get_device_error_info(e_info->dev[i], e_info)) {
+			pci_dev_aer_stats_incr(e_info->dev[i], e_info);
 			aer_print_error(e_info->dev[i], e_info, level);
+		}
 	}
 	for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
 		if (aer_get_device_error_info(e_info->dev[i], e_info))
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 9e4c9ac737a7..81cd6e8ff3a4 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -289,6 +289,7 @@  void dpc_process_error(struct pci_dev *pdev)
 	else if (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR &&
 		 dpc_get_aer_uncorrect_severity(pdev, &info) &&
 		 aer_get_device_error_info(pdev, &info)) {
+		pci_dev_aer_stats_incr(pdev, &info);
 		aer_print_error(pdev, &info, KERN_ERR);
 		pci_aer_clear_nonfatal_status(pdev);
 		pci_aer_clear_fatal_status(pdev);