diff mbox

[2/5] PCI/AER: Add sysfs stats for AER capable devices

Message ID 20180522222805.80314-3-rajatja@google.com (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show

Commit Message

Rajat Jain May 22, 2018, 10:28 p.m. UTC
Add the following AER sysfs stats to represent the counters for each
kind of error as seen by the device:

dev_total_cor_errs
dev_total_fatal_errs
dev_total_nonfatal_errs

Signed-off-by: Rajat Jain <rajatja@google.com>
---
 drivers/pci/pci-sysfs.c                |  3 ++
 drivers/pci/pci.h                      |  4 +-
 drivers/pci/pcie/aer/aerdrv.h          |  1 +
 drivers/pci/pcie/aer/aerdrv_errprint.c |  1 +
 drivers/pci/pcie/aer/aerdrv_stats.c    | 72 ++++++++++++++++++++++++++
 5 files changed, 80 insertions(+), 1 deletion(-)

Comments

Alex G. May 22, 2018, 10:50 p.m. UTC | #1
On 05/22/2018 05:28 PM, Rajat Jain wrote:
> Add the following AER sysfs stats to represent the counters for each
> kind of error as seen by the device:
> 
> dev_total_cor_errs
> dev_total_fatal_errs
> dev_total_nonfatal_errs
> 
> Signed-off-by: Rajat Jain <rajatja@google.com>
> ---
>  drivers/pci/pci-sysfs.c                |  3 ++
>  drivers/pci/pci.h                      |  4 +-
>  drivers/pci/pcie/aer/aerdrv.h          |  1 +
>  drivers/pci/pcie/aer/aerdrv_errprint.c |  1 +
>  drivers/pci/pcie/aer/aerdrv_stats.c    | 72 ++++++++++++++++++++++++++
>  5 files changed, 80 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
> index 366d93af051d..730f985a3dc9 100644
> --- a/drivers/pci/pci-sysfs.c
> +++ b/drivers/pci/pci-sysfs.c
> @@ -1743,6 +1743,9 @@ static const struct attribute_group *pci_dev_attr_groups[] = {
>  #endif
>  	&pci_bridge_attr_group,
>  	&pcie_dev_attr_group,
> +#ifdef CONFIG_PCIEAER
> +	&aer_stats_attr_group,
> +#endif
>  	NULL,
>  };

So if the device is removed as part of recovery, then these get reset,
right? So if the device fails intermittently, these counters would keep
getting reset. Is this the intent?

(snip)

>  /**
>   * pci_match_one_device - Tell if a PCI device structure has a matching
> diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h
> index d8b9fba536ed..b5d5ad6f2c03 100644
> --- a/drivers/pci/pcie/aer/aerdrv.h
> +++ b/drivers/pci/pcie/aer/aerdrv.h
> @@ -87,6 +87,7 @@ void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info);
>  irqreturn_t aer_irq(int irq, void *context);
>  int pci_aer_stats_init(struct pci_dev *pdev);
>  void pci_aer_stats_exit(struct pci_dev *pdev);
> +void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info);
>  
>  #ifdef CONFIG_ACPI_APEI
>  int pcie_aer_get_firmware_first(struct pci_dev *pci_dev);
> diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c
> index 21ca5e1b0ded..5e8b98deda08 100644
> --- a/drivers/pci/pcie/aer/aerdrv_errprint.c
> +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c
> @@ -155,6 +155,7 @@ static void __aer_print_error(struct pci_dev *dev,
>  			pci_err(dev, "   [%2d] Unknown Error Bit%s\n",
>  				i, info->first_error == i ? " (First)" : "");
>  	}
> +	pci_dev_aer_stats_incr(dev, info);

What about AER errors that are contained by DPC?

Alex
Rajat Jain May 22, 2018, 11:27 p.m. UTC | #2
On Tue, May 22, 2018 at 3:50 PM, Alex G. <mr.nuke.me@gmail.com> wrote:
>
>
> On 05/22/2018 05:28 PM, Rajat Jain wrote:
>> Add the following AER sysfs stats to represent the counters for each
>> kind of error as seen by the device:
>>
>> dev_total_cor_errs
>> dev_total_fatal_errs
>> dev_total_nonfatal_errs
>>
>> Signed-off-by: Rajat Jain <rajatja@google.com>
>> ---
>>  drivers/pci/pci-sysfs.c                |  3 ++
>>  drivers/pci/pci.h                      |  4 +-
>>  drivers/pci/pcie/aer/aerdrv.h          |  1 +
>>  drivers/pci/pcie/aer/aerdrv_errprint.c |  1 +
>>  drivers/pci/pcie/aer/aerdrv_stats.c    | 72 ++++++++++++++++++++++++++
>>  5 files changed, 80 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
>> index 366d93af051d..730f985a3dc9 100644
>> --- a/drivers/pci/pci-sysfs.c
>> +++ b/drivers/pci/pci-sysfs.c
>> @@ -1743,6 +1743,9 @@ static const struct attribute_group *pci_dev_attr_groups[] = {
>>  #endif
>>       &pci_bridge_attr_group,
>>       &pcie_dev_attr_group,
>> +#ifdef CONFIG_PCIEAER
>> +     &aer_stats_attr_group,
>> +#endif
>>       NULL,
>>  };
>
> So if the device is removed as part of recovery, then these get reset,
> right? So if the device fails intermittently, these counters would keep
> getting reset. Is this the intent?

Umm, kind of.
* One argument is that if a PCI device is removed and then
re-enumerated, how do we know it is the same device and has not been
replaced by another device for e.g.?  Note that the root port counters
that have the cumulative counters for all the errors seen will still
have them logged in the situation you describe.

>
> (snip)
>
>>  /**
>>   * pci_match_one_device - Tell if a PCI device structure has a matching
>> diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h
>> index d8b9fba536ed..b5d5ad6f2c03 100644
>> --- a/drivers/pci/pcie/aer/aerdrv.h
>> +++ b/drivers/pci/pcie/aer/aerdrv.h
>> @@ -87,6 +87,7 @@ void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info);
>>  irqreturn_t aer_irq(int irq, void *context);
>>  int pci_aer_stats_init(struct pci_dev *pdev);
>>  void pci_aer_stats_exit(struct pci_dev *pdev);
>> +void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info);
>>
>>  #ifdef CONFIG_ACPI_APEI
>>  int pcie_aer_get_firmware_first(struct pci_dev *pci_dev);
>> diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c
>> index 21ca5e1b0ded..5e8b98deda08 100644
>> --- a/drivers/pci/pcie/aer/aerdrv_errprint.c
>> +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c
>> @@ -155,6 +155,7 @@ static void __aer_print_error(struct pci_dev *dev,
>>                       pci_err(dev, "   [%2d] Unknown Error Bit%s\n",
>>                               i, info->first_error == i ? " (First)" : "");
>>       }
>> +     pci_dev_aer_stats_incr(dev, info);
>
> What about AER errors that are contained by DPC?

Thanks, You are right, this patch does not take care of the DPC. I'll
try to read up on DPC and can integrate it if it turns out to be easy
enough.

Thanks,

Rajat

>
> Alex
Sinan Kaya May 22, 2018, 11:30 p.m. UTC | #3
On 5/22/2018 7:27 PM, Rajat Jain wrote:
>> What about AER errors that are contained by DPC?
> Thanks, You are right, this patch does not take care of the DPC. I'll
> try to read up on DPC and can integrate it if it turns out to be easy
> enough.
> 

I'd focus on AER for the moment. DPC is going through major restructuring
and will only handle AER_FATAL errors moving forward.

> Thanks,
> 
> Rajat
>
Greg KH May 23, 2018, 8:22 a.m. UTC | #4
On Tue, May 22, 2018 at 03:28:02PM -0700, Rajat Jain wrote:
> Add the following AER sysfs stats to represent the counters for each
> kind of error as seen by the device:
> 
> dev_total_cor_errs
> dev_total_fatal_errs
> dev_total_nonfatal_errs

You need Documentation/ABI/ updates for new sysfs files please.

thanks,

greg k-h
Greg KH May 23, 2018, 8:24 a.m. UTC | #5
On Tue, May 22, 2018 at 03:28:02PM -0700, Rajat Jain wrote:
> +#define aer_stats_aggregate_attr(field)                                        \
> +	static ssize_t                                                         \
> +	field##_show(struct device *dev, struct device_attribute *attr,        \
> +		     char *buf)                                                \
> +{                                                                              \
> +	struct pci_dev *pdev = to_pci_dev(dev);                                \
> +	return sprintf(buf, "0x%llx\n", pdev->aer_stats->field);               \
> +}                                                                              \

Use tabs at the end please, otherwise your trailing \ look horrid.

> +static DEVICE_ATTR_RO(field)
> +
> +aer_stats_aggregate_attr(dev_total_cor_errs);
> +aer_stats_aggregate_attr(dev_total_fatal_errs);
> +aer_stats_aggregate_attr(dev_total_nonfatal_errs);
> +
> +static struct attribute *aer_stats_attrs[] __ro_after_init = {
> +	&dev_attr_dev_total_cor_errs.attr,
> +	&dev_attr_dev_total_fatal_errs.attr,
> +	&dev_attr_dev_total_nonfatal_errs.attr,
> +	NULL
> +};
> +
> +static umode_t aer_stats_attrs_are_visible(struct kobject *kobj,
> +					   struct attribute *a, int n)
> +{
> +	struct device *dev = kobj_to_dev(kobj);
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +
> +	if (!pdev->aer_stats)
> +		return 0;
> +
> +	return a->mode;
> +}
> +
> +const struct attribute_group aer_stats_attr_group = {
> +	.name  = "aer_stats",
> +	.attrs  = aer_stats_attrs,
> +	.is_visible = aer_stats_attrs_are_visible,
> +};
> +
> +void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info)
> +{
> +	int status, i, max = -1;
> +	u64 *counter = NULL;
> +	struct aer_stats *aer_stats = pdev->aer_stats;
> +
> +	if (unlikely(!aer_stats))
> +		return;

Can you measure the speed difference with and without that unlikely()
macro?  If not, please don't use it.  Hint, the cpu and compiler are
always always better at this than we are...

thanks,

greg k-h
diff mbox

Patch

diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 366d93af051d..730f985a3dc9 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -1743,6 +1743,9 @@  static const struct attribute_group *pci_dev_attr_groups[] = {
 #endif
 	&pci_bridge_attr_group,
 	&pcie_dev_attr_group,
+#ifdef CONFIG_PCIEAER
+	&aer_stats_attr_group,
+#endif
 	NULL,
 };
 
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index c358e7a07f3f..9a28ec600225 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -181,7 +181,9 @@  extern const struct attribute_group *pci_dev_groups[];
 extern const struct attribute_group *pcibus_groups[];
 extern const struct device_type pci_dev_type;
 extern const struct attribute_group *pci_bus_groups[];
-
+#ifdef CONFIG_PCIEAER
+extern const struct attribute_group aer_stats_attr_group;
+#endif
 
 /**
  * pci_match_one_device - Tell if a PCI device structure has a matching
diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h
index d8b9fba536ed..b5d5ad6f2c03 100644
--- a/drivers/pci/pcie/aer/aerdrv.h
+++ b/drivers/pci/pcie/aer/aerdrv.h
@@ -87,6 +87,7 @@  void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info);
 irqreturn_t aer_irq(int irq, void *context);
 int pci_aer_stats_init(struct pci_dev *pdev);
 void pci_aer_stats_exit(struct pci_dev *pdev);
+void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info);
 
 #ifdef CONFIG_ACPI_APEI
 int pcie_aer_get_firmware_first(struct pci_dev *pci_dev);
diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c
index 21ca5e1b0ded..5e8b98deda08 100644
--- a/drivers/pci/pcie/aer/aerdrv_errprint.c
+++ b/drivers/pci/pcie/aer/aerdrv_errprint.c
@@ -155,6 +155,7 @@  static void __aer_print_error(struct pci_dev *dev,
 			pci_err(dev, "   [%2d] Unknown Error Bit%s\n",
 				i, info->first_error == i ? " (First)" : "");
 	}
+	pci_dev_aer_stats_incr(dev, info);
 }
 
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
diff --git a/drivers/pci/pcie/aer/aerdrv_stats.c b/drivers/pci/pcie/aer/aerdrv_stats.c
index b9f251992209..87b7119d0a86 100644
--- a/drivers/pci/pcie/aer/aerdrv_stats.c
+++ b/drivers/pci/pcie/aer/aerdrv_stats.c
@@ -47,6 +47,78 @@  struct aer_stats {
 	u64 rootport_total_nonfatal_errs;
 };
 
+#define aer_stats_aggregate_attr(field)                                        \
+	static ssize_t                                                         \
+	field##_show(struct device *dev, struct device_attribute *attr,        \
+		     char *buf)                                                \
+{                                                                              \
+	struct pci_dev *pdev = to_pci_dev(dev);                                \
+	return sprintf(buf, "0x%llx\n", pdev->aer_stats->field);               \
+}                                                                              \
+static DEVICE_ATTR_RO(field)
+
+aer_stats_aggregate_attr(dev_total_cor_errs);
+aer_stats_aggregate_attr(dev_total_fatal_errs);
+aer_stats_aggregate_attr(dev_total_nonfatal_errs);
+
+static struct attribute *aer_stats_attrs[] __ro_after_init = {
+	&dev_attr_dev_total_cor_errs.attr,
+	&dev_attr_dev_total_fatal_errs.attr,
+	&dev_attr_dev_total_nonfatal_errs.attr,
+	NULL
+};
+
+static umode_t aer_stats_attrs_are_visible(struct kobject *kobj,
+					   struct attribute *a, int n)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	if (!pdev->aer_stats)
+		return 0;
+
+	return a->mode;
+}
+
+const struct attribute_group aer_stats_attr_group = {
+	.name  = "aer_stats",
+	.attrs  = aer_stats_attrs,
+	.is_visible = aer_stats_attrs_are_visible,
+};
+
+void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info)
+{
+	int status, i, max = -1;
+	u64 *counter = NULL;
+	struct aer_stats *aer_stats = pdev->aer_stats;
+
+	if (unlikely(!aer_stats))
+		return;
+
+	switch (info->severity) {
+	case AER_CORRECTABLE:
+		aer_stats->dev_total_cor_errs++;
+		counter = &aer_stats->dev_cor_errs[0];
+		max = AER_MAX_TYPEOF_CORRECTABLE_ERRS;
+		break;
+	case AER_NONFATAL:
+		aer_stats->dev_total_nonfatal_errs++;
+		counter = &aer_stats->dev_uncor_errs[0];
+		max = AER_MAX_TYPEOF_UNCORRECTABLE_ERRS;
+		break;
+	case AER_FATAL:
+		aer_stats->dev_total_fatal_errs++;
+		counter = &aer_stats->dev_uncor_errs[0];
+		max = AER_MAX_TYPEOF_UNCORRECTABLE_ERRS;
+		break;
+	}
+
+	status = (info->status & ~info->mask);
+	for (i = 0; i < max; i++)
+		if (status & (1 << i))
+			counter[i]++;
+}
+
 int pci_aer_stats_init(struct pci_dev *pdev)
 {
 	pdev->aer_stats = kzalloc(sizeof(struct aer_stats), GFP_KERNEL);