diff mbox series

[RFC] rasdaemon: Add page offline support for cxl memory

Message ID a4cdc0ddd56c450c9bfa1d950a3a37ac@micron.com
State New
Headers show
Series [RFC] rasdaemon: Add page offline support for cxl memory | expand

Commit Message

Srinivasulu Opensrc Oct. 14, 2024, 10:10 a.m. UTC
From: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com>

CXL Type 3 device implements a threshold for corrected errors as described in
CXL 3.1 specification section 8.2.9.9.11.3. Device can set the threshold field
in the DRAM event descriptor when it detects corrected errors that meet or
exceed the threshold value.

This patch is intended to offline pages for corrected memory errors when the
device sets the threshold in the DRAM event descriptor.
This helps prevent corrected errors from becoming uncorrected.

Record the hpa for given dpa, then do page offline for hpa when corrected
errors threshold is set.

Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com>
---
 ras-cxl-handler.c    | 14 ++++++++++++++
 ras-page-isolation.c |  7 +++++++
 ras-page-isolation.h |  1 +
 ras-record.h         |  1 +
 4 files changed, 23 insertions(+)

Comments

Shiju Jose Oct. 16, 2024, 10:46 a.m. UTC | #1
Hi Srinivas,

Please see few comments inline,

>-----Original Message-----
>From: Srinivasulu Opensrc <sthanneeru.opensrc@micron.com>
>Sent: 14 October 2024 11:11
>To: mchehab@kernel.org; linux-edac@vger.kernel.org; linux-
>cxl@vger.kernel.org
>Cc: Srinivasulu Thanneeru <sthanneeru@micron.com>; Ajay Joshi
><ajayjoshi@micron.com>; Senthil Thangaraj <sthangaraj@micron.com>;
>Vandana Salve <vsalve@micron.com>
>Subject: [RFC PATCH] rasdaemon: Add page offline support for cxl memory
>
>From: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com>
>
>CXL Type 3 device implements a threshold for corrected errors as described in
>CXL 3.1 specification section 8.2.9.9.11.3. Device can set the threshold field in
>the DRAM event descriptor when it detects corrected errors that meet or
>exceed the threshold value.
1. Better mentioning Spec section and table for DRAM Event Record.

2. Section 8.2.9.2.1.1 Table 8-45 General Media Event Record  has threshold event bit
in memory event descriptor field. May need similar page offline support for General Media
Event Record too?

3. General question, Is the threshold check for the corrected errors in a CXL device
    always supported/enabled? If yes, please ignore following question.
    If not,
  1. Do we need to store the corrected errors reported using ras_record_page_error()
     when threshold check is not enabled?  and page would be offline when the total CE count
    exceeds threshold.val by the ras-page-isolation.
      Not sure how rasdaemon get information whether threshold check is enabled/supported?
      May be from Advanced Programmable Corrected Volatile Memory Error Threshold Feature? 
>
>This patch is intended to offline pages for corrected memory errors when the
>device sets the threshold in the DRAM event descriptor.
>This helps prevent corrected errors from becoming uncorrected.
>
>Record the hpa for given dpa, then do page offline for hpa when corrected
>errors threshold is set.
>
>Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com>
>---
> ras-cxl-handler.c    | 14 ++++++++++++++
> ras-page-isolation.c |  7 +++++++
> ras-page-isolation.h |  1 +
> ras-record.h         |  1 +
> 4 files changed, 23 insertions(+)
>
>diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c index 037c19c..c163c6f 100644
>--- a/ras-cxl-handler.c
>+++ b/ras-cxl-handler.c
>@@ -13,6 +13,7 @@
>
> #include "ras-cxl-handler.h"
> #include "ras-logger.h"
>+#include "ras-page-isolation.h"
> #include "ras-record.h"
> #include "ras-report.h"
> #include "types.h"
>@@ -897,6 +898,12 @@ int ras_cxl_dram_event_handler(struct trace_seq *s,
> 	if (trace_seq_printf(s, "dpa:0x%llx ", (unsigned long long)ev.dpa) <= 0)
> 		return -1;
>
>+	if (tep_get_field_val(s, event, "hpa", record, &val, 1) < 0)
>+		return -1;
>+	ev.hpa = val;
>+	if (trace_seq_printf(s, "hpa:0x%llx ", (unsigned long long)ev.hpa) <= 0)
>+		return -1;
>+
Support for the new fields in cxl_general_media and cxl_dram events  including 'hpa' had
submitted in August in the following pull request.
https://github.com/mchehab/rasdaemon/pull/178
https://github.com/mchehab/rasdaemon/pull/178/commits/0b396b47d740c88fbd890213f2d9d56e566e0671 
> 	if (tep_get_field_val(s,  event, "dpa_flags", record, &val, 1) < 0)
> 		return -1;
> 	ev.dpa_flags = val;
>@@ -1005,6 +1012,13 @@ int ras_cxl_dram_event_handler(struct trace_seq *s,
> 		}
> 	}
>
>+#ifdef HAVE_MEMORY_CE_PFA
>+	/* Page offline for CE when threeshold is set */
>+	if (!(ev.descriptor & CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT)
>&&
>+	     (ev.descriptor & CXL_GMER_EVT_DESC_THRESHOLD_EVENT))
I think alignment should match open parenthesis.
>+		ras_hw_threshold_pageoffline(ev.hpa);
>+#endif
>+
> 	/* Insert data into the SGBD */
> #ifdef HAVE_SQLITE3
> 	ras_store_cxl_dram_event(ras, &ev);
>diff --git a/ras-page-isolation.c b/ras-page-isolation.c index bb6b777..6eb45d0
>100644
>--- a/ras-page-isolation.c
>+++ b/ras-page-isolation.c
>@@ -338,3 +338,10 @@ void ras_record_page_error(unsigned long long addr,
>unsigned int count, time_t t
> 		page_record(pr, count, time);
> 	}
> }
>+
>+void ras_hw_threshold_pageoffline(unsigned long long addr) {
>+	time_t now = time(NULL);
>+
>+	ras_record_page_error(addr, threshold.val, now); }
>diff --git a/ras-page-isolation.h b/ras-page-isolation.h index 73c9157..ed2f661
>100644
>--- a/ras-page-isolation.h
>+++ b/ras-page-isolation.h
>@@ -57,5 +57,6 @@ struct isolation {
> void ras_page_account_init(void);
> void ras_record_page_error(unsigned long long addr,
> 			   unsigned int count, time_t time);
>+void ras_hw_threshold_pageoffline(unsigned long long addr);
>
> #endif
>diff --git a/ras-record.h b/ras-record.h index bd861ff..d4969d1 100644
>--- a/ras-record.h
>+++ b/ras-record.h
>@@ -203,6 +203,7 @@ struct ras_cxl_general_media_event {  struct
>ras_cxl_dram_event {
> 	struct ras_cxl_event_common_hdr hdr;
> 	uint64_t dpa;
>+	uint64_t hpa;
> 	uint8_t dpa_flags;
> 	uint8_t descriptor;
> 	uint8_t type;
>--
>2.46.2
>
Thanks,
Shiju
Srinivasulu Opensrc Oct. 23, 2024, 6:09 a.m. UTC | #2
>-----Original Message-----
>From: Shiju Jose <shiju.jose@huawei.com>
>Sent: Wednesday, October 16, 2024 4:16 PM
>To: Srinivasulu Opensrc <sthanneeru.opensrc@micron.com>;
>mchehab@kernel.org; linux-edac@vger.kernel.org; linux-cxl@vger.kernel.org
>Cc: Srinivasulu Thanneeru <sthanneeru@micron.com>; Ajay Joshi
><ajayjoshi@micron.com>; Senthil Thangaraj <sthangaraj@micron.com>; Vandana
>Salve <vsalve@micron.com>
>Subject: [EXT] RE: [RFC PATCH] rasdaemon: Add page offline support for cxl
>memory
>
>CAUTION: EXTERNAL EMAIL. Do not click links or open attachments unless you
>recognize the sender and were expecting this message.
>
>
>Hi Srinivas,
>
>Please see few comments inline,
>
>>-----Original Message-----
>>From: Srinivasulu Opensrc <sthanneeru.opensrc@micron.com>
>>Sent: 14 October 2024 11:11
>>To: mchehab@kernel.org; linux-edac@vger.kernel.org; linux-
>>cxl@vger.kernel.org
>>Cc: Srinivasulu Thanneeru <sthanneeru@micron.com>; Ajay Joshi
>><ajayjoshi@micron.com>; Senthil Thangaraj <sthangaraj@micron.com>;
>>Vandana Salve <vsalve@micron.com>
>>Subject: [RFC PATCH] rasdaemon: Add page offline support for cxl memory
>>
>>From: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com>
>>
>>CXL Type 3 device implements a threshold for corrected errors as described in
>>CXL 3.1 specification section 8.2.9.9.11.3. Device can set the threshold field in
>>the DRAM event descriptor when it detects corrected errors that meet or
>>exceed the threshold value.
>1. Better mentioning Spec section and table for DRAM Event Record.

Sure, will add table information too.
>
>2. Section 8.2.9.2.1.1 Table 8-45 General Media Event Record  has threshold event
>bit
>in memory event descriptor field. May need similar page offline support for
>General Media
>Event Record too?
>

This patch mainly targeted for DRAM events, can be extended for Media Event in future if required.

>3. General question, Is the threshold check for the corrected errors in a CXL device
>    always supported/enabled? If yes, please ignore following question.
>    If not,
>  1. Do we need to store the corrected errors reported using
>ras_record_page_error()
>     when threshold check is not enabled?  and page would be offline when the total
>CE count
>    exceeds threshold.val by the ras-page-isolation.
>      Not sure how rasdaemon get information whether threshold check is
>enabled/supported?
>      May be from Advanced Programmable Corrected Volatile Memory Error
>Threshold Feature?

Yes, it uses ACVME threshold feature, if threshold enabled then flag is set in dram event record,
Depending on the flag, the RASDAEMON can know whether feature is enabled or not.
If enabled, then RASDAEMON will immediately try to page offline. If not enabled, then 
call to ras_record_page_error() wont takes place.

>>
>>This patch is intended to offline pages for corrected memory errors when the
>>device sets the threshold in the DRAM event descriptor.
>>This helps prevent corrected errors from becoming uncorrected.
>>
>>Record the hpa for given dpa, then do page offline for hpa when corrected
>>errors threshold is set.
>>
>>Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com>
>>---
>> ras-cxl-handler.c    | 14 ++++++++++++++
>> ras-page-isolation.c |  7 +++++++
>> ras-page-isolation.h |  1 +
>> ras-record.h         |  1 +
>> 4 files changed, 23 insertions(+)
>>
>>diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c index 037c19c..c163c6f 100644
>>--- a/ras-cxl-handler.c
>>+++ b/ras-cxl-handler.c
>>@@ -13,6 +13,7 @@
>>
>> #include "ras-cxl-handler.h"
>> #include "ras-logger.h"
>>+#include "ras-page-isolation.h"
>> #include "ras-record.h"
>> #include "ras-report.h"
>> #include "types.h"
>>@@ -897,6 +898,12 @@ int ras_cxl_dram_event_handler(struct trace_seq *s,
>>       if (trace_seq_printf(s, "dpa:0x%llx ", (unsigned long long)ev.dpa) <= 0)
>>               return -1;
>>
>>+      if (tep_get_field_val(s, event, "hpa", record, &val, 1) < 0)
>>+              return -1;
>>+      ev.hpa = val;
>>+      if (trace_seq_printf(s, "hpa:0x%llx ", (unsigned long long)ev.hpa) <= 0)
>>+              return -1;
>>+
>Support for the new fields in cxl_general_media and cxl_dram events  including
>'hpa' had
>submitted in August in the following pull request.
>https://github.co/
>m%2Fmchehab%2Frasdaemon%2Fpull%2F178&data=05%7C02%7Csthanneeru.op
>ensrc%40micron.com%7C58611d95cc3244d1019b08dcedcfbd2d%7Cf38a5ecd281
>34862b11bac1d563c806f%7C0%7C0%7C638646723730965866%7CUnknown%7C
>TWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXV
>CI6Mn0%3D%7C0%7C%7C%7C&sdata=A%2B2H1gyx3njUG5U0p3p2IPzNkcPBfaKG
>QPfvcC%2BZRAQ%3D&reserved=0
>https://github.co/
>m%2Fmchehab%2Frasdaemon%2Fpull%2F178%2Fcommits%2F0b396b47d740c88
>fbd890213f2d9d56e566e0671&data=05%7C02%7Csthanneeru.opensrc%40micron
>.com%7C58611d95cc3244d1019b08dcedcfbd2d%7Cf38a5ecd28134862b11bac1d
>563c806f%7C0%7C0%7C638646723730965866%7CUnknown%7CTWFpbGZsb3d8
>eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7
>C0%7C%7C%7C&sdata=djrC%2BSZVrrxsZ8IiC2RJ3E5%2BwZs4QYtqulVBdgPrnyU%
>3D&reserved=0
>>       if (tep_get_field_val(s,  event, "dpa_flags", record, &val, 1) < 0)
>>               return -1;
>>       ev.dpa_flags = val;
>>@@ -1005,6 +1012,13 @@ int ras_cxl_dram_event_handler(struct trace_seq *s,
>>               }
>>       }
>>
>>+#ifdef HAVE_MEMORY_CE_PFA
>>+      /* Page offline for CE when threeshold is set */
>>+      if (!(ev.descriptor & CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT)
>>&&
>>+           (ev.descriptor & CXL_GMER_EVT_DESC_THRESHOLD_EVENT))
>I think alignment should match open parenthesis.
>>+              ras_hw_threshold_pageoffline(ev.hpa);
>>+#endif
>>+
>>       /* Insert data into the SGBD */
>> #ifdef HAVE_SQLITE3
>>       ras_store_cxl_dram_event(ras, &ev);
>>diff --git a/ras-page-isolation.c b/ras-page-isolation.c index bb6b777..6eb45d0
>>100644
>>--- a/ras-page-isolation.c
>>+++ b/ras-page-isolation.c
>>@@ -338,3 +338,10 @@ void ras_record_page_error(unsigned long long addr,
>>unsigned int count, time_t t
>>               page_record(pr, count, time);
>>       }
>> }
>>+
>>+void ras_hw_threshold_pageoffline(unsigned long long addr) {
>>+      time_t now = time(NULL);
>>+
>>+      ras_record_page_error(addr, threshold.val, now); }
>>diff --git a/ras-page-isolation.h b/ras-page-isolation.h index 73c9157..ed2f661
>>100644
>>--- a/ras-page-isolation.h
>>+++ b/ras-page-isolation.h
>>@@ -57,5 +57,6 @@ struct isolation {
>> void ras_page_account_init(void);
>> void ras_record_page_error(unsigned long long addr,
>>                          unsigned int count, time_t time);
>>+void ras_hw_threshold_pageoffline(unsigned long long addr);
>>
>> #endif
>>diff --git a/ras-record.h b/ras-record.h index bd861ff..d4969d1 100644
>>--- a/ras-record.h
>>+++ b/ras-record.h
>>@@ -203,6 +203,7 @@ struct ras_cxl_general_media_event {  struct
>>ras_cxl_dram_event {
>>       struct ras_cxl_event_common_hdr hdr;
>>       uint64_t dpa;
>>+      uint64_t hpa;
>>       uint8_t dpa_flags;
>>       uint8_t descriptor;
>>       uint8_t type;
>>--
>>2.46.2
>>
>Thanks,
>Shiju
diff mbox series

Patch

diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c
index 037c19c..c163c6f 100644
--- a/ras-cxl-handler.c
+++ b/ras-cxl-handler.c
@@ -13,6 +13,7 @@ 
 
 #include "ras-cxl-handler.h"
 #include "ras-logger.h"
+#include "ras-page-isolation.h"
 #include "ras-record.h"
 #include "ras-report.h"
 #include "types.h"
@@ -897,6 +898,12 @@  int ras_cxl_dram_event_handler(struct trace_seq *s,
 	if (trace_seq_printf(s, "dpa:0x%llx ", (unsigned long long)ev.dpa) <= 0)
 		return -1;
 
+	if (tep_get_field_val(s, event, "hpa", record, &val, 1) < 0)
+		return -1;
+	ev.hpa = val;
+	if (trace_seq_printf(s, "hpa:0x%llx ", (unsigned long long)ev.hpa) <= 0)
+		return -1;
+
 	if (tep_get_field_val(s,  event, "dpa_flags", record, &val, 1) < 0)
 		return -1;
 	ev.dpa_flags = val;
@@ -1005,6 +1012,13 @@  int ras_cxl_dram_event_handler(struct trace_seq *s,
 		}
 	}
 
+#ifdef HAVE_MEMORY_CE_PFA
+	/* Page offline for CE when threeshold is set */
+	if (!(ev.descriptor & CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT) &&
+	     (ev.descriptor & CXL_GMER_EVT_DESC_THRESHOLD_EVENT))
+		ras_hw_threshold_pageoffline(ev.hpa);
+#endif
+
 	/* Insert data into the SGBD */
 #ifdef HAVE_SQLITE3
 	ras_store_cxl_dram_event(ras, &ev);
diff --git a/ras-page-isolation.c b/ras-page-isolation.c
index bb6b777..6eb45d0 100644
--- a/ras-page-isolation.c
+++ b/ras-page-isolation.c
@@ -338,3 +338,10 @@  void ras_record_page_error(unsigned long long addr, unsigned int count, time_t t
 		page_record(pr, count, time);
 	}
 }
+
+void ras_hw_threshold_pageoffline(unsigned long long addr)
+{
+	time_t now = time(NULL);
+
+	ras_record_page_error(addr, threshold.val, now);
+}
diff --git a/ras-page-isolation.h b/ras-page-isolation.h
index 73c9157..ed2f661 100644
--- a/ras-page-isolation.h
+++ b/ras-page-isolation.h
@@ -57,5 +57,6 @@  struct isolation {
 void ras_page_account_init(void);
 void ras_record_page_error(unsigned long long addr,
 			   unsigned int count, time_t time);
+void ras_hw_threshold_pageoffline(unsigned long long addr);
 
 #endif
diff --git a/ras-record.h b/ras-record.h
index bd861ff..d4969d1 100644
--- a/ras-record.h
+++ b/ras-record.h
@@ -203,6 +203,7 @@  struct ras_cxl_general_media_event {
 struct ras_cxl_dram_event {
 	struct ras_cxl_event_common_hdr hdr;
 	uint64_t dpa;
+	uint64_t hpa;
 	uint8_t dpa_flags;
 	uint8_t descriptor;
 	uint8_t type;