diff mbox series

[07/13] rasdaemon: cxl: Update CXL DRAM event to CXL spec rev 3.1

Message ID 20241120095923.1891-8-shiju.jose@huawei.com
State New
Headers show
Series rasdaemon: cxl: Update CXL event logging and recording to CXL spec rev 3.1 | expand

Commit Message

Shiju Jose Nov. 20, 2024, 9:59 a.m. UTC
From: Shiju Jose <shiju.jose@huawei.com>

CXL spec 3.1 section 8.2.9.2.1.2 Table 8-46, DRAM Event Record has updated
with following new fields and new types for Memory Event Type, Transaction
Type and Validity Flags fields.
1. Component Identifier
2. Sub-channel
3. Advanced Programmable Corrected Memory Error Threshold Event Flags
4. Corrected Memory Error Count at Event
5. Memory Event Sub-Type

Update the parsing, logging and recording of DRAM event for the above
spec rev 3.1 changes.

Example rasdaemon log for CXL DRAM event,

cxl_dram 2024-11-20 00:18:53 +0000 memdev:mem0 host:0000:0f:00.0 serial:0x3 \
log type:Informational hdr_uuid:601dcbb3-9c06-4eab-b8af-4e9bfb5c9624 \
hdr_handle:0x1 hdr_related_handle:0x0 hdr_timestamp:1970-01-01 00:00:58 +0000 \
hdr_length:128 hdr_maint_op_class:1 hdr_maint_op_sub_class:3 dpa:0x18680 \
dpa_flags:descriptor:'UNCORRECTABLE EVENT' 'THRESHOLD EVENT' \
memory_event_type:Data Path Error memory_event_sub_type:Media Link CRC Error \
transaction_type:Internal Media Scrub hpa:0xffffffffffffffff region: \
region_uuid:00000000-0000-0000-0000-000000000000 channel:3 rank:17 \
nibble_mask:3866802 bank_group:7 bank:11 row:2 column:77
correction_mask:21 00 00 00 00 00 00 00 2c 00 00 00 00 00 00 00 37 00 00 \
00 00 00 00 00 42 00 00 00 00 00 00 00 comp_id:01 74 c5 08 9a 1a 0b fc d2 \
7e 2f 31 9b 3c 81 4d comp_id_pldm_valid_flags:'PLDM Entity ID' \
PLDM Entity ID:74 c5 08 9a 1a 0b \
Advanced Programmable CME threshold Event Flags:'Corrected Memory Errors \
in Multiple Media Components' 'Exceeded Programmable Threshold' \
CVME Count:0x94

Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
 ras-cxl-handler.c | 67 +++++++++++++++++++++++++++++++++++++++++++++--
 ras-record.c      | 18 +++++++++++++
 ras-record.h      |  7 +++++
 ras-report.c      | 12 +++++++--
 4 files changed, 100 insertions(+), 4 deletions(-)

Comments

Jonathan Cameron Nov. 21, 2024, 3:29 p.m. UTC | #1
On Wed, 20 Nov 2024 09:59:17 +0000
<shiju.jose@huawei.com> wrote:

> From: Shiju Jose <shiju.jose@huawei.com>
> 
> CXL spec 3.1 section 8.2.9.2.1.2 Table 8-46, DRAM Event Record has updated
> with following new fields and new types for Memory Event Type, Transaction
> Type and Validity Flags fields.
> 1. Component Identifier
> 2. Sub-channel
> 3. Advanced Programmable Corrected Memory Error Threshold Event Flags
> 4. Corrected Memory Error Count at Event
> 5. Memory Event Sub-Type
> 
> Update the parsing, logging and recording of DRAM event for the above
> spec rev 3.1 changes.
> 
> Example rasdaemon log for CXL DRAM event,
> 
> cxl_dram 2024-11-20 00:18:53 +0000 memdev:mem0 host:0000:0f:00.0 serial:0x3 \
> log type:Informational hdr_uuid:601dcbb3-9c06-4eab-b8af-4e9bfb5c9624 \
> hdr_handle:0x1 hdr_related_handle:0x0 hdr_timestamp:1970-01-01 00:00:58 +0000 \
> hdr_length:128 hdr_maint_op_class:1 hdr_maint_op_sub_class:3 dpa:0x18680 \
> dpa_flags:descriptor:'UNCORRECTABLE EVENT' 'THRESHOLD EVENT' \
> memory_event_type:Data Path Error memory_event_sub_type:Media Link CRC Error \
> transaction_type:Internal Media Scrub hpa:0xffffffffffffffff region: \
> region_uuid:00000000-0000-0000-0000-000000000000 channel:3 rank:17 \
> nibble_mask:3866802 bank_group:7 bank:11 row:2 column:77
> correction_mask:21 00 00 00 00 00 00 00 2c 00 00 00 00 00 00 00 37 00 00 \
> 00 00 00 00 00 42 00 00 00 00 00 00 00 comp_id:01 74 c5 08 9a 1a 0b fc d2 \
> 7e 2f 31 9b 3c 81 4d comp_id_pldm_valid_flags:'PLDM Entity ID' \
> PLDM Entity ID:74 c5 08 9a 1a 0b \
> Advanced Programmable CME threshold Event Flags:'Corrected Memory Errors \
> in Multiple Media Components' 'Exceeded Programmable Threshold' \
> CVME Count:0x94
> 
> Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Also LGTM
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
diff mbox series

Patch

diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c
index c64c540..c028203 100644
--- a/ras-cxl-handler.c
+++ b/ras-cxl-handler.c
@@ -1003,7 +1003,7 @@  int ras_cxl_general_media_event_handler(struct trace_seq *s,
 /*
  * DRAM Event Record - DER
  *
- * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44
+ * CXL rev 3.1 section 8.2.9.2.1.2; Table 8-46
  */
 #define CXL_DER_VALID_CHANNEL			BIT(0)
 #define CXL_DER_VALID_RANK			BIT(1)
@@ -1013,19 +1013,25 @@  int ras_cxl_general_media_event_handler(struct trace_seq *s,
 #define CXL_DER_VALID_ROW			BIT(5)
 #define CXL_DER_VALID_COLUMN			BIT(6)
 #define CXL_DER_VALID_CORRECTION_MASK		BIT(7)
+#define CXL_DER_VALID_COMPONENT_ID		BIT(8)
+#define CXL_DER_VALID_COMPONENT_ID_FORMAT	BIT(9)
+#define CXL_DER_VALID_SUB_CHANNEL		BIT(10)
 
 static const char * const cxl_der_mem_event_type[] = {
 	"Media ECC Error",
 	"Scrub Media ECC Error",
 	"Invalid Address",
 	"Data Path Error",
+	"TE State Violation",
+	"Advanced Programmable CME Counter Expiration",
+	"CKID Violation",
 };
 
 int ras_cxl_dram_event_handler(struct trace_seq *s,
 			       struct tep_record *record,
 			       struct tep_event *event, void *context)
 {
-	int len, i;
+	int len, i, rc;
 	unsigned long long val;
 	struct ras_events *ras = context;
 	struct ras_cxl_dram_event ev;
@@ -1067,6 +1073,15 @@  int ras_cxl_dram_event_handler(struct trace_seq *s,
 					      ev.type)) <= 0)
 		return -1;
 
+	if (tep_get_field_val(s,  event, "sub_type", record, &val, 1) < 0)
+		return -1;
+	ev.sub_type = val;
+	if (trace_seq_printf(s, "memory_event_sub_type:%s ",
+			     get_cxl_type_str(cxl_mem_event_sub_type,
+					      ARRAY_SIZE(cxl_mem_event_sub_type),
+					      ev.sub_type)) <= 0)
+		return -1;
+
 	if (tep_get_field_val(s,  event, "transaction_type", record, &val, 1) < 0)
 		return -1;
 	ev.transaction_type = val;
@@ -1108,6 +1123,14 @@  int ras_cxl_dram_event_handler(struct trace_seq *s,
 			return -1;
 	}
 
+	if (ev.validity_flags & CXL_DER_VALID_SUB_CHANNEL) {
+		if (tep_get_field_val(s,  event, "sub_channel", record, &val, 1) < 0)
+			return -1;
+		ev.sub_channel = val;
+		if (trace_seq_printf(s, "sub_channel:%u ", ev.sub_channel) <= 0)
+			return -1;
+	}
+
 	if (ev.validity_flags & CXL_DER_VALID_RANK) {
 		if (tep_get_field_val(s,  event, "rank", record, &val, 1) < 0)
 			return -1;
@@ -1168,6 +1191,46 @@  int ras_cxl_dram_event_handler(struct trace_seq *s,
 		}
 	}
 
+	if (ev.validity_flags & CXL_DER_VALID_COMPONENT_ID) {
+		ev.comp_id = tep_get_field_raw(s, event, "comp_id", record, &len, 1);
+		if (!ev.comp_id)
+			return -1;
+		if (trace_seq_printf(s, "comp_id:") <= 0)
+			return -1;
+		for (i = 0; i < CXL_EVENT_GEN_MED_COMP_ID_SIZE; i++) {
+			if (trace_seq_printf(s, "%02x ", ev.comp_id[i]) <= 0)
+				break;
+		}
+
+		if (ev.validity_flags & CXL_DER_VALID_COMPONENT_ID_FORMAT) {
+			if (trace_seq_printf(s, "comp_id_pldm_valid_flags:") <= 0)
+				return -1;
+			if (decode_cxl_event_flags(s, ev.comp_id[0], cxl_pldm_comp_id_flags,
+						   ARRAY_SIZE(cxl_pldm_comp_id_flags)) < 0)
+				return -1;
+
+			rc = ras_cxl_print_component_id(s, ev.comp_id, ev.entity_id, ev.res_id);
+			if (rc)
+				return rc;
+		}
+	}
+
+	if (tep_get_field_val(s,  event, "cme_threshold_ev_flags", record, &val, 1) < 0)
+		return -1;
+	ev.cme_threshold_ev_flags = val;
+	if (trace_seq_printf(s, "Advanced Programmable CME threshold Event Flags:") <= 0)
+		return -1;
+	if (decode_cxl_event_flags(s, ev.cme_threshold_ev_flags,
+				   cxl_cme_threshold_ev_flags,
+				   ARRAY_SIZE(cxl_cme_threshold_ev_flags)) < 0)
+		return -1;
+
+	if (tep_get_field_val(s,  event, "cvme_count", record, &val, 1) < 0)
+		return -1;
+	ev.cvme_count = val;
+	if (trace_seq_printf(s, "CVME Count:0x%x ", ev.cvme_count) <= 0)
+		return -1;
+
 	/* Insert data into the SGBD */
 #ifdef HAVE_SQLITE3
 	ras_store_cxl_dram_event(ras, &ev);
diff --git a/ras-record.c b/ras-record.c
index e24562e..82939e7 100644
--- a/ras-record.c
+++ b/ras-record.c
@@ -985,6 +985,13 @@  static const struct db_fields cxl_dram_event_fields[] = {
 	{ .name = "hpa",		.type = "INTEGER" },
 	{ .name = "region",		.type = "TEXT" },
 	{ .name = "region_uuid",	.type = "TEXT" },
+	{ .name = "comp_id",		.type = "BLOB" },
+	{ .name = "pldm_entity_id",	.type = "BLOB" },
+	{ .name = "pldm_resource_id",	.type = "BLOB" },
+	{ .name = "sub_type",		.type = "INTEGER" },
+	{ .name = "sub_channel",	.type = "INTEGER" },
+	{ .name = "cme_threshold_ev_flags",	.type = "INTEGER" },
+	{ .name = "cvme_count",		.type = "INTEGER" },
 };
 
 static const struct db_table_descriptor cxl_dram_event_tab = {
@@ -1024,6 +1031,17 @@  int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *
 	sqlite3_bind_int64(priv->stmt_cxl_dram_event, idx++, ev->hpa);
 	sqlite3_bind_text(priv->stmt_cxl_dram_event, idx++, ev->region, -1, NULL);
 	sqlite3_bind_text(priv->stmt_cxl_dram_event, idx++, ev->region_uuid, -1, NULL);
+	sqlite3_bind_blob(priv->stmt_cxl_dram_event, idx++, ev->comp_id,
+			  CXL_EVENT_GEN_MED_COMP_ID_SIZE, NULL);
+	sqlite3_bind_blob(priv->stmt_cxl_dram_event, idx++, ev->entity_id,
+			  CXL_PLDM_ENTITY_ID_LEN, NULL);
+	sqlite3_bind_blob(priv->stmt_cxl_dram_event, idx++, ev->res_id,
+			  CXL_PLDM_RES_ID_LEN, NULL);
+	sqlite3_bind_int(priv->stmt_cxl_dram_event, idx++, ev->sub_type);
+	sqlite3_bind_int(priv->stmt_cxl_dram_event, idx++, ev->sub_channel);
+	sqlite3_bind_int(priv->stmt_cxl_dram_event, idx++,
+			 ev->cme_threshold_ev_flags);
+	sqlite3_bind_int(priv->stmt_cxl_dram_event, idx++, ev->cvme_count);
 
 	rc = sqlite3_step(priv->stmt_cxl_dram_event);
 	if (rc != SQLITE_OK && rc != SQLITE_DONE)
diff --git a/ras-record.h b/ras-record.h
index 12e693b..3aec063 100644
--- a/ras-record.h
+++ b/ras-record.h
@@ -218,8 +218,10 @@  struct ras_cxl_dram_event {
 	uint8_t dpa_flags;
 	uint8_t descriptor;
 	uint8_t type;
+	uint8_t sub_type;
 	uint8_t transaction_type;
 	uint8_t channel;
+	uint8_t sub_channel;
 	uint8_t rank;
 	uint32_t nibble_mask;
 	uint8_t bank_group;
@@ -231,6 +233,11 @@  struct ras_cxl_dram_event {
 	uint64_t hpa;
 	const char *region;
 	const char *region_uuid;
+	uint8_t *comp_id;
+	uint8_t entity_id[CXL_PLDM_ENTITY_ID_LEN];
+	uint8_t res_id[CXL_PLDM_RES_ID_LEN];
+	uint8_t cme_threshold_ev_flags;
+	uint32_t cvme_count;
 };
 
 struct ras_cxl_memory_module_event {
diff --git a/ras-report.c b/ras-report.c
index ed1f4b8..8e343fc 100644
--- a/ras-report.c
+++ b/ras-report.c
@@ -624,17 +624,21 @@  static int set_cxl_dram_event_backtrace(char *buf, struct ras_cxl_dram_event *ev
 		"dpa_flags=%u\n"
 		"descriptor=%u\n"
 		"type=%u\n"
+		"sub_type=0x%x\n"
 		"transaction_type=%u\n"
 		"hpa=0x%lx\n"
 		"region=%s\n"
 		"region_uuid=%s\n"
 		"channel=%u\n"
+		"sub_channel=%u\n"
 		"rank=%u\n"
 		"nibble_mask=%u\n"
 		"bank_group=%u\n"
 		"bank=%u\n"
 		"row=%u\n"
-		"column=%u\n",
+		"column=%u\n"
+		"cme_threshold_ev_flags=0x%x\n"
+		"cvme_count=0x%x\n",
 		ev->hdr.timestamp,
 		ev->hdr.memdev,
 		ev->hdr.host,
@@ -651,17 +655,21 @@  static int set_cxl_dram_event_backtrace(char *buf, struct ras_cxl_dram_event *ev
 		ev->dpa_flags,
 		ev->descriptor,
 		ev->type,
+		ev->sub_type,
 		ev->transaction_type,
 		ev->hpa,
 		ev->region,
 		ev->region_uuid,
 		ev->channel,
+		ev->sub_channel,
 		ev->rank,
 		ev->nibble_mask,
 		ev->bank_group,
 		ev->bank,
 		ev->row,
-		ev->column);
+		ev->column,
+		ev->cme_threshold_ev_flags,
+		ev->cvme_count);
 
 	return 0;
 }