diff mbox series

[v2,1/3] mm: memory-failure: Add memory failure stats to sysfs

Message ID 20230120034622.2698268-2-jiaqiyan@google.com (mailing list archive)
State New
Headers show
Series Introduce per NUMA node memory error statistics | expand

Commit Message

Jiaqi Yan Jan. 20, 2023, 3:46 a.m. UTC
Today the kernel provides the following memory error info to userspace,
but each has its own disadvantage:
* HardwareCorrupted in /proc/meminfo: number of bytes poisoned in total,
  but no per-NUMA-node stats
* ras:memory_failure_event: only available after being explicitly enabled
* /dev/mcelog: provides much useful info about MCEs, but
  doesn't capture how memory_failure recovered memory MCEs
* kernel logs: userspace needs to process log text

Expose per-NUMA-node memory error stats as sysfs entries:

  /sys/devices/system/node/node${X}/memory_failure/total
  /sys/devices/system/node/node${X}/memory_failure/recovered
  /sys/devices/system/node/node${X}/memory_failure/ignored
  /sys/devices/system/node/node${X}/memory_failure/failed
  /sys/devices/system/node/node${X}/memory_failure/delayed

These counters describe how many raw pages are poisoned and, after the
kernel's attempted recoveries, how they were resolved: how many were
recovered, ignored, failed, or delayed, respectively. The following
invariant holds for the statistics:
* total = recovered + ignored + failed + delayed

Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
---
 drivers/base/node.c    |  3 +++
 include/linux/mm.h     |  5 +++++
 include/linux/mmzone.h | 28 ++++++++++++++++++++++++++++
 mm/memory-failure.c    | 35 +++++++++++++++++++++++++++++++++++
 4 files changed, 71 insertions(+)

Comments

HORIGUCHI NAOYA(堀口 直也) Jan. 23, 2023, 2:42 a.m. UTC | #1
On Fri, Jan 20, 2023 at 03:46:20AM +0000, Jiaqi Yan wrote:
> Today kernel provides following memory error info to userspace, but each
> has its own disadvantage
> * HardwareCorrupted in /proc/meminfo: number of bytes poisoned in total,
>   not per NUMA node stats though
> * ras:memory_failure_event: only available after explicitly enabled
> * /dev/mcelog provides many useful info about the MCEs, but
>   doesn't capture how memory_failure recovered memory MCEs
> * kernel logs: userspace needs to process log text
> 
> Exposes per NUMA node memory error stats as sysfs entries:
> 
>   /sys/devices/system/node/node${X}/memory_failure/total
>   /sys/devices/system/node/node${X}/memory_failure/recovered
>   /sys/devices/system/node/node${X}/memory_failure/ignored
>   /sys/devices/system/node/node${X}/memory_failure/failed
>   /sys/devices/system/node/node${X}/memory_failure/delayed
> 
> These counters describe how many raw pages are poisoned and after the
> attempted recoveries by the kernel, their resolutions: how many are
> recovered, ignored, failed, or delayed respectively. The following
> math holds for the statistics:
> * total = recovered + ignored + failed + delayed
> 
> Acked-by: David Rientjes <rientjes@google.com>
> Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>

Looks good to me, thank you.

Acked-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Kefeng Wang Feb. 2, 2023, 6:54 a.m. UTC | #2
On 2023/1/20 11:46, Jiaqi Yan wrote:
> Today kernel provides following memory error info to userspace, but each
> has its own disadvantage
> * HardwareCorrupted in /proc/meminfo: number of bytes poisoned in total,
>    not per NUMA node stats though
> * ras:memory_failure_event: only available after explicitly enabled
> * /dev/mcelog provides many useful info about the MCEs, but
>    doesn't capture how memory_failure recovered memory MCEs
> * kernel logs: userspace needs to process log text
> 
> Exposes per NUMA node memory error stats as sysfs entries:
> 
>    /sys/devices/system/node/node${X}/memory_failure/total
>    /sys/devices/system/node/node${X}/memory_failure/recovered
>    /sys/devices/system/node/node${X}/memory_failure/ignored
>    /sys/devices/system/node/node${X}/memory_failure/failed
>    /sys/devices/system/node/node${X}/memory_failure/delayed
> 
> These counters describe how many raw pages are poisoned and after the
> attempted recoveries by the kernel, their resolutions: how many are
> recovered, ignored, failed, or delayed respectively. The following
> math holds for the statistics:
> * total = recovered + ignored + failed + delayed
> 
> Acked-by: David Rientjes <rientjes@google.com>
> Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
> ---
>   drivers/base/node.c    |  3 +++
>   include/linux/mm.h     |  5 +++++
>   include/linux/mmzone.h | 28 ++++++++++++++++++++++++++++
>   mm/memory-failure.c    | 35 +++++++++++++++++++++++++++++++++++
>   4 files changed, 71 insertions(+)
> 
> diff --git a/drivers/base/node.c b/drivers/base/node.c
> index faf3597a96da..b46db17124f3 100644
> --- a/drivers/base/node.c
> +++ b/drivers/base/node.c
> @@ -586,6 +586,9 @@ static const struct attribute_group *node_dev_groups[] = {
>   	&node_dev_group,
>   #ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP
>   	&arch_node_dev_group,
> +#endif
> +#ifdef CONFIG_MEMORY_FAILURE
> +	&memory_failure_attr_group,
>   #endif
>   	NULL
>   };
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index f3f196e4d66d..888576884eb9 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3521,6 +3521,11 @@ enum mf_action_page_type {
>   	MF_MSG_UNKNOWN,
>   };
>   
> +/*
> + * Sysfs entries for memory failure handling statistics.
> + */
> +extern const struct attribute_group memory_failure_attr_group;
> +

This should move under CONFIG_MEMORY_FAILURE

>   #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
>   extern void clear_huge_page(struct page *page,
>   			    unsigned long addr_hint,
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index cd28a100d9e4..2c537b31fa7b 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1110,6 +1110,31 @@ struct deferred_split {
>   };
>   #endif
>   
> +#ifdef CONFIG_MEMORY_FAILURE
> +/*
> + * Per NUMA node memory failure handling statistics.
> + */
> +struct memory_failure_stats {
> +	/*
> +	 * Number of raw pages poisoned.
> +	 * Cases not accounted: memory outside kernel control, offline page,
> +	 * arch-specific memory_failure (SGX), hwpoison_filter() filtered
> +	 * error events, and unpoison actions from hwpoison_unpoison.
> +	 */
> +	unsigned long total;
> +	/*
> +	 * Recovery results of poisoned raw pages handled by memory_failure,
> +	 * in sync with mf_result.
> +	 * total = ignored + failed + delayed + recovered.
> +	 * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted.
> +	 */
> +	unsigned long ignored;
> +	unsigned long failed;
> +	unsigned long delayed;
> +	unsigned long recovered;
> +};
> +#endif
> +
>   /*
>    * On NUMA machines, each NUMA node would have a pg_data_t to describe
>    * it's memory layout. On UMA machines there is a single pglist_data which
> @@ -1253,6 +1278,9 @@ typedef struct pglist_data {
>   #ifdef CONFIG_NUMA
>   	struct memory_tier __rcu *memtier;
>   #endif
> +#ifdef CONFIG_MEMORY_FAILURE
> +	struct memory_failure_stats mf_stats;
> +#endif
>   } pg_data_t;
>   
>   #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index c77a9e37e27e..c628f1db3a4d 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -87,6 +87,41 @@ inline void num_poisoned_pages_sub(unsigned long pfn, long i)
>   		memblk_nr_poison_sub(pfn, i);
>   }
>   
> +/**
> + * MF_ATTR_RO - Create sysfs entry for each memory failure statistics.
> + * @_name: name of the file in the per NUMA sysfs directory.
> + */
> +#define MF_ATTR_RO(_name)					\
> +static ssize_t _name##_show(struct device *dev,			\
> +			    struct device_attribute *attr,	\
> +			    char *buf)				\
> +{								\
> +	struct memory_failure_stats *mf_stats =			\
> +		&NODE_DATA(dev->id)->mf_stats;			\
> +	return sprintf(buf, "%lu\n", mf_stats->_name);		\
> +}								\
> +static DEVICE_ATTR_RO(_name)
> +
> +MF_ATTR_RO(total);
> +MF_ATTR_RO(ignored);
> +MF_ATTR_RO(failed);
> +MF_ATTR_RO(delayed);
> +MF_ATTR_RO(recovered);
> +
> +static struct attribute *memory_failure_attr[] = {
> +	&dev_attr_total.attr,
> +	&dev_attr_ignored.attr,
> +	&dev_attr_failed.attr,
> +	&dev_attr_delayed.attr,
> +	&dev_attr_recovered.attr,
> +	NULL,
> +};
> +
> +const struct attribute_group memory_failure_attr_group = {
> +	.name = "memory_failure",
> +	.attrs = memory_failure_attr,
> +};
> +
>   /*
>    * Return values:
>    *   1:   the page is dissolved (if needed) and taken off from buddy,
Jiaqi Yan Feb. 4, 2023, 11:21 p.m. UTC | #3
On Wed, Feb 1, 2023 at 10:54 PM Kefeng Wang <wangkefeng.wang@huawei.com> wrote:
>
>
>
> On 2023/1/20 11:46, Jiaqi Yan wrote:
> > Today kernel provides following memory error info to userspace, but each
> > has its own disadvantage
> > * HardwareCorrupted in /proc/meminfo: number of bytes poisoned in total,
> >    not per NUMA node stats though
> > * ras:memory_failure_event: only available after explicitly enabled
> > * /dev/mcelog provides many useful info about the MCEs, but
> >    doesn't capture how memory_failure recovered memory MCEs
> > * kernel logs: userspace needs to process log text
> >
> > Exposes per NUMA node memory error stats as sysfs entries:
> >
> >    /sys/devices/system/node/node${X}/memory_failure/total
> >    /sys/devices/system/node/node${X}/memory_failure/recovered
> >    /sys/devices/system/node/node${X}/memory_failure/ignored
> >    /sys/devices/system/node/node${X}/memory_failure/failed
> >    /sys/devices/system/node/node${X}/memory_failure/delayed
> >
> > These counters describe how many raw pages are poisoned and after the
> > attempted recoveries by the kernel, their resolutions: how many are
> > recovered, ignored, failed, or delayed respectively. The following
> > math holds for the statistics:
> > * total = recovered + ignored + failed + delayed
> >
> > Acked-by: David Rientjes <rientjes@google.com>
> > Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
> > ---
> >   drivers/base/node.c    |  3 +++
> >   include/linux/mm.h     |  5 +++++
> >   include/linux/mmzone.h | 28 ++++++++++++++++++++++++++++
> >   mm/memory-failure.c    | 35 +++++++++++++++++++++++++++++++++++
> >   4 files changed, 71 insertions(+)
> >
> > diff --git a/drivers/base/node.c b/drivers/base/node.c
> > index faf3597a96da..b46db17124f3 100644
> > --- a/drivers/base/node.c
> > +++ b/drivers/base/node.c
> > @@ -586,6 +586,9 @@ static const struct attribute_group *node_dev_groups[] = {
> >       &node_dev_group,
> >   #ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP
> >       &arch_node_dev_group,
> > +#endif
> > +#ifdef CONFIG_MEMORY_FAILURE
> > +     &memory_failure_attr_group,
> >   #endif
> >       NULL
> >   };
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index f3f196e4d66d..888576884eb9 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -3521,6 +3521,11 @@ enum mf_action_page_type {
> >       MF_MSG_UNKNOWN,
> >   };
> >
> > +/*
> > + * Sysfs entries for memory failure handling statistics.
> > + */
> > +extern const struct attribute_group memory_failure_attr_group;
> > +
>
> This should move under CONFIG_MEMORY_FAILURE

Thanks! I will move it around in the new version.

>
> >   #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
> >   extern void clear_huge_page(struct page *page,
> >                           unsigned long addr_hint,
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index cd28a100d9e4..2c537b31fa7b 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
> > @@ -1110,6 +1110,31 @@ struct deferred_split {
> >   };
> >   #endif
> >
> > +#ifdef CONFIG_MEMORY_FAILURE
> > +/*
> > + * Per NUMA node memory failure handling statistics.
> > + */
> > +struct memory_failure_stats {
> > +     /*
> > +      * Number of raw pages poisoned.
> > +      * Cases not accounted: memory outside kernel control, offline page,
> > +      * arch-specific memory_failure (SGX), hwpoison_filter() filtered
> > +      * error events, and unpoison actions from hwpoison_unpoison.
> > +      */
> > +     unsigned long total;
> > +     /*
> > +      * Recovery results of poisoned raw pages handled by memory_failure,
> > +      * in sync with mf_result.
> > +      * total = ignored + failed + delayed + recovered.
> > +      * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted.
> > +      */
> > +     unsigned long ignored;
> > +     unsigned long failed;
> > +     unsigned long delayed;
> > +     unsigned long recovered;
> > +};
> > +#endif
> > +
> >   /*
> >    * On NUMA machines, each NUMA node would have a pg_data_t to describe
> >    * it's memory layout. On UMA machines there is a single pglist_data which
> > @@ -1253,6 +1278,9 @@ typedef struct pglist_data {
> >   #ifdef CONFIG_NUMA
> >       struct memory_tier __rcu *memtier;
> >   #endif
> > +#ifdef CONFIG_MEMORY_FAILURE
> > +     struct memory_failure_stats mf_stats;
> > +#endif
> >   } pg_data_t;
> >
> >   #define node_present_pages(nid)     (NODE_DATA(nid)->node_present_pages)
> > diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> > index c77a9e37e27e..c628f1db3a4d 100644
> > --- a/mm/memory-failure.c
> > +++ b/mm/memory-failure.c
> > @@ -87,6 +87,41 @@ inline void num_poisoned_pages_sub(unsigned long pfn, long i)
> >               memblk_nr_poison_sub(pfn, i);
> >   }
> >
> > +/**
> > + * MF_ATTR_RO - Create sysfs entry for each memory failure statistics.
> > + * @_name: name of the file in the per NUMA sysfs directory.
> > + */
> > +#define MF_ATTR_RO(_name)                                    \
> > +static ssize_t _name##_show(struct device *dev,                      \
> > +                         struct device_attribute *attr,      \
> > +                         char *buf)                          \
> > +{                                                            \
> > +     struct memory_failure_stats *mf_stats =                 \
> > +             &NODE_DATA(dev->id)->mf_stats;                  \
> > +     return sprintf(buf, "%lu\n", mf_stats->_name);          \
> > +}                                                            \
> > +static DEVICE_ATTR_RO(_name)
> > +
> > +MF_ATTR_RO(total);
> > +MF_ATTR_RO(ignored);
> > +MF_ATTR_RO(failed);
> > +MF_ATTR_RO(delayed);
> > +MF_ATTR_RO(recovered);
> > +
> > +static struct attribute *memory_failure_attr[] = {
> > +     &dev_attr_total.attr,
> > +     &dev_attr_ignored.attr,
> > +     &dev_attr_failed.attr,
> > +     &dev_attr_delayed.attr,
> > +     &dev_attr_recovered.attr,
> > +     NULL,
> > +};
> > +
> > +const struct attribute_group memory_failure_attr_group = {
> > +     .name = "memory_failure",
> > +     .attrs = memory_failure_attr,
> > +};
> > +
> >   /*
> >    * Return values:
> >    *   1:   the page is dissolved (if needed) and taken off from buddy,
diff mbox series

Patch

diff --git a/drivers/base/node.c b/drivers/base/node.c
index faf3597a96da..b46db17124f3 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -586,6 +586,9 @@  static const struct attribute_group *node_dev_groups[] = {
 	&node_dev_group,
 #ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP
 	&arch_node_dev_group,
+#endif
+#ifdef CONFIG_MEMORY_FAILURE
+	&memory_failure_attr_group,
 #endif
 	NULL
 };
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f3f196e4d66d..888576884eb9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3521,6 +3521,11 @@  enum mf_action_page_type {
 	MF_MSG_UNKNOWN,
 };
 
+/*
+ * Sysfs entries for memory failure handling statistics.
+ */
+extern const struct attribute_group memory_failure_attr_group;
+
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
 extern void clear_huge_page(struct page *page,
 			    unsigned long addr_hint,
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index cd28a100d9e4..2c537b31fa7b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1110,6 +1110,31 @@  struct deferred_split {
 };
 #endif
 
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Per NUMA node memory failure handling statistics.
+ */
+struct memory_failure_stats {
+	/*
+	 * Number of raw pages poisoned.
+	 * Cases not accounted: memory outside kernel control, offline page,
+	 * arch-specific memory_failure (SGX), hwpoison_filter() filtered
+	 * error events, and unpoison actions from hwpoison_unpoison.
+	 */
+	unsigned long total;
+	/*
+	 * Recovery results of poisoned raw pages handled by memory_failure,
+	 * in sync with mf_result.
+	 * total = ignored + failed + delayed + recovered. Summed over all
+	 * nodes, total * PAGE_SIZE = /proc/meminfo/HardwareCorrupted.
+	 */
+	unsigned long ignored;
+	unsigned long failed;
+	unsigned long delayed;
+	unsigned long recovered;
+};
+#endif
+
 /*
  * On NUMA machines, each NUMA node would have a pg_data_t to describe
  * it's memory layout. On UMA machines there is a single pglist_data which
@@ -1253,6 +1278,9 @@  typedef struct pglist_data {
 #ifdef CONFIG_NUMA
 	struct memory_tier __rcu *memtier;
 #endif
+#ifdef CONFIG_MEMORY_FAILURE
+	struct memory_failure_stats mf_stats;
+#endif
 } pg_data_t;
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c77a9e37e27e..c628f1db3a4d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -87,6 +87,41 @@  inline void num_poisoned_pages_sub(unsigned long pfn, long i)
 		memblk_nr_poison_sub(pfn, i);
 }
 
+/**
+ * MF_ATTR_RO - Define a read-only sysfs file showing one per-node counter.
+ * @_name: memory_failure_stats member to show; doubles as the file name.
+ */
+#define MF_ATTR_RO(_name)					\
+static ssize_t _name##_show(struct device *dev,			\
+			    struct device_attribute *attr,	\
+			    char *buf)				\
+{								\
+	/* sysfs_emit() keeps output within PAGE_SIZE */	\
+	return sysfs_emit(buf, "%lu\n",				\
+			  NODE_DATA(dev->id)->mf_stats._name);	\
+}								\
+static DEVICE_ATTR_RO(_name)
+
+MF_ATTR_RO(total);
+MF_ATTR_RO(ignored);
+MF_ATTR_RO(failed);
+MF_ATTR_RO(delayed);
+MF_ATTR_RO(recovered);
+
+static struct attribute *memory_failure_attr[] = {
+	&dev_attr_total.attr,
+	&dev_attr_ignored.attr,
+	&dev_attr_failed.attr,
+	&dev_attr_delayed.attr,
+	&dev_attr_recovered.attr,
+	NULL,
+};
+
+const struct attribute_group memory_failure_attr_group = {
+	.name = "memory_failure",
+	.attrs = memory_failure_attr,
+};
+
 /*
  * Return values:
  *   1:   the page is dissolved (if needed) and taken off from buddy,