From patchwork Wed Apr 12 08:33:05 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Shiju Jose X-Patchwork-Id: 13208679 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id EEA3BC7619A for ; Wed, 12 Apr 2023 09:23:29 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S229817AbjDLJX2 (ORCPT ); Wed, 12 Apr 2023 05:23:28 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:38370 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S229585AbjDLJX0 (ORCPT ); Wed, 12 Apr 2023 05:23:26 -0400 Received: from frasgout.his.huawei.com (frasgout.his.huawei.com [185.176.79.56]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id BA04F61B5; Wed, 12 Apr 2023 02:23:25 -0700 (PDT) Received: from lhrpeml500006.china.huawei.com (unknown [172.18.147.201]) by frasgout.his.huawei.com (SkyGuard) with ESMTP id 4PxGBt2lVVz6J7Dj; Wed, 12 Apr 2023 16:31:18 +0800 (CST) Received: from SecurePC30232.china.huawei.com (10.122.247.234) by lhrpeml500006.china.huawei.com (7.191.161.198) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.23; Wed, 12 Apr 2023 09:33:40 +0100 From: To: , , CC: , , Subject: [RFC PATCH 1/7] rasdaemon: Add common function to convert timestamp in the CXL event records to the broken-down time format Date: Wed, 12 Apr 2023 16:33:05 +0800 Message-ID: <20230412083312.1384-2-shiju.jose@huawei.com> X-Mailer: git-send-email 2.35.1.windows.2 In-Reply-To: <20230412083312.1384-1-shiju.jose@huawei.com> References: <20230412083312.1384-1-shiju.jose@huawei.com> MIME-Version: 1.0 X-Originating-IP: [10.122.247.234] X-ClientProxiedBy: lhrpeml100004.china.huawei.com (7.191.162.219) To 
lhrpeml500006.china.huawei.com (7.191.161.198) X-CFilter-Loop: Reflected Precedence: bulk List-ID: X-Mailing-List: linux-edac@vger.kernel.org From: Shiju Jose Add common function to convert the timestamp in the CXL event records in nanoseconds to the broken-down time format. Signed-off-by: Shiju Jose --- ras-cxl-handler.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c index adc2fa3..ad93558 100644 --- a/ras-cxl-handler.c +++ b/ras-cxl-handler.c @@ -23,6 +23,25 @@ #include "ras-report.h" #include +/* Common Functions */ +static void convert_timestamp(unsigned long long ts, char *ts_ptr, uint16_t size) +{ + /* CXL Specification 3.0 + * Overflow timestamp - The number of unsigned nanoseconds + * that have elapsed since midnight, 01-Jan-1970 UTC + */ + time_t ts_secs = ts / 1000000000ULL; + struct tm *tm; + + tm = localtime(&ts_secs); + if (tm) + strftime(ts_ptr, size, "%Y-%m-%d %H:%M:%S %z", tm); + + if (!ts || !tm) + strncpy(ts_ptr, "1970-01-01 00:00:00 +0000", + size); +} + /* Poison List: Payload out flags */ #define CXL_POISON_FLAG_MORE BIT(0) #define CXL_POISON_FLAG_OVERFLOW BIT(1) @@ -160,22 +179,7 @@ int ras_cxl_poison_event_handler(struct trace_seq *s, if (ev.flags & CXL_POISON_FLAG_OVERFLOW) { if (tep_get_field_val(s, event, "overflow_t", record, &val, 1) < 0) return -1; - if (val) { - /* CXL Specification 3.0 - * Overflow timestamp - The number of unsigned nanoseconds - * that have elapsed since midnight, 01-Jan-1970 UTC - */ - time_t ovf_ts_secs = val / 1000000000ULL; - - tm = localtime(&ovf_ts_secs); - if (tm) { - strftime(ev.overflow_ts, sizeof(ev.overflow_ts), - "%Y-%m-%d %H:%M:%S %z", tm); - } - } - if (!val || !tm) - strncpy(ev.overflow_ts, "1970-01-01 00:00:00 +0000", - sizeof(ev.overflow_ts)); + convert_timestamp(val, ev.overflow_ts, sizeof(ev.overflow_ts)); } else strncpy(ev.overflow_ts, "1970-01-01 00:00:00 +0000", sizeof(ev.overflow_ts)); if 
(trace_seq_printf(s, "overflow timestamp:%s\n", ev.overflow_ts) <= 0) From patchwork Wed Apr 12 08:33:06 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Shiju Jose X-Patchwork-Id: 13208663 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 5B2EFC77B6E for ; Wed, 12 Apr 2023 08:54:44 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231480AbjDLIyn (ORCPT ); Wed, 12 Apr 2023 04:54:43 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:44990 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S231455AbjDLIyi (ORCPT ); Wed, 12 Apr 2023 04:54:38 -0400 Received: from frasgout.his.huawei.com (frasgout.his.huawei.com [185.176.79.56]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id B3DE9A5C4; Wed, 12 Apr 2023 01:54:16 -0700 (PDT) Received: from lhrpeml500006.china.huawei.com (unknown [172.18.147.201]) by frasgout.his.huawei.com (SkyGuard) with ESMTP id 4PxGBt3fksz6J7DV; Wed, 12 Apr 2023 16:31:18 +0800 (CST) Received: from SecurePC30232.china.huawei.com (10.122.247.234) by lhrpeml500006.china.huawei.com (7.191.161.198) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.23; Wed, 12 Apr 2023 09:33:40 +0100 From: To: , , CC: , , Subject: [RFC PATCH 2/7] rasdaemon: Add common function to get timestamp for the event Date: Wed, 12 Apr 2023 16:33:06 +0800 Message-ID: <20230412083312.1384-3-shiju.jose@huawei.com> X-Mailer: git-send-email 2.35.1.windows.2 In-Reply-To: <20230412083312.1384-1-shiju.jose@huawei.com> References: <20230412083312.1384-1-shiju.jose@huawei.com> MIME-Version: 1.0 X-Originating-IP: [10.122.247.234] X-ClientProxiedBy: lhrpeml100004.china.huawei.com (7.191.162.219) To 
lhrpeml500006.china.huawei.com (7.191.161.198) X-CFilter-Loop: Reflected Precedence: bulk List-ID: X-Mailing-List: linux-edac@vger.kernel.org From: Shiju Jose Add common function to get the timestamp for the event reported. Signed-off-by: Shiju Jose --- ras-cxl-handler.c | 44 +++++++++++++++++--------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c index ad93558..025e582 100644 --- a/ras-cxl-handler.c +++ b/ras-cxl-handler.c @@ -42,6 +42,20 @@ static void convert_timestamp(unsigned long long ts, char *ts_ptr, uint16_t size size); } +static void get_timestamp(struct trace_seq *s, struct tep_record *record, + struct ras_events *ras, char *ts_ptr, uint16_t size) +{ + time_t now; + struct tm *tm; + + now = record->ts / user_hz + ras->uptime_diff; + tm = localtime(&now); + if (tm) + strftime(ts_ptr, size, "%Y-%m-%d %H:%M:%S %z", tm); + else + strncpy(ts_ptr, "1970-01-01 00:00:00 +0000", size); +} + /* Poison List: Payload out flags */ #define CXL_POISON_FLAG_MORE BIT(0) #define CXL_POISON_FLAG_OVERFLOW BIT(1) @@ -68,17 +82,9 @@ int ras_cxl_poison_event_handler(struct trace_seq *s, int len; unsigned long long val; struct ras_events *ras = context; - time_t now; - struct tm *tm; struct ras_cxl_poison_event ev; - now = record->ts / user_hz + ras->uptime_diff; - tm = localtime(&now); - if (tm) - strftime(ev.timestamp, sizeof(ev.timestamp), - "%Y-%m-%d %H:%M:%S %z", tm); - else - strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); + get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) return -1; @@ -277,19 +283,11 @@ int ras_cxl_aer_ue_event_handler(struct trace_seq *s, { int len, i; unsigned long long val; - time_t now; - struct tm *tm; struct ras_events *ras = context; struct ras_cxl_aer_ue_event ev; memset(&ev, 0, sizeof(ev)); - now = record->ts / user_hz + ras->uptime_diff; - tm = localtime(&now); - if 
(tm) - strftime(ev.timestamp, sizeof(ev.timestamp), - "%Y-%m-%d %H:%M:%S %z", tm); - else - strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); + get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) return -1; @@ -372,18 +370,10 @@ int ras_cxl_aer_ce_event_handler(struct trace_seq *s, { int len; unsigned long long val; - time_t now; - struct tm *tm; struct ras_events *ras = context; struct ras_cxl_aer_ce_event ev; - now = record->ts / user_hz + ras->uptime_diff; - tm = localtime(&now); - if (tm) - strftime(ev.timestamp, sizeof(ev.timestamp), - "%Y-%m-%d %H:%M:%S %z", tm); - else - strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); + get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) return -1; From patchwork Wed Apr 12 08:33:07 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Shiju Jose X-Patchwork-Id: 13208643 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 704DFC77B6E for ; Wed, 12 Apr 2023 08:36:44 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S230341AbjDLIgn (ORCPT ); Wed, 12 Apr 2023 04:36:43 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:39458 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S230197AbjDLIgS (ORCPT ); Wed, 12 Apr 2023 04:36:18 -0400 Received: from frasgout.his.huawei.com (frasgout.his.huawei.com [185.176.79.56]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 3C77E65BA; Wed, 12 Apr 2023 01:34:45 -0700 (PDT) Received: from lhrpeml500006.china.huawei.com (unknown [172.18.147.226]) by 
frasgout.his.huawei.com (SkyGuard) with ESMTP id 4PxGDW6hBvz67m7K; Wed, 12 Apr 2023 16:32:43 +0800 (CST) Received: from SecurePC30232.china.huawei.com (10.122.247.234) by lhrpeml500006.china.huawei.com (7.191.161.198) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.23; Wed, 12 Apr 2023 09:33:40 +0100 From: To: , , CC: , , Subject: [RFC PATCH 3/7] rasdaemon: Add support for the CXL overflow events Date: Wed, 12 Apr 2023 16:33:07 +0800 Message-ID: <20230412083312.1384-4-shiju.jose@huawei.com> X-Mailer: git-send-email 2.35.1.windows.2 In-Reply-To: <20230412083312.1384-1-shiju.jose@huawei.com> References: <20230412083312.1384-1-shiju.jose@huawei.com> MIME-Version: 1.0 X-Originating-IP: [10.122.247.234] X-ClientProxiedBy: lhrpeml100004.china.huawei.com (7.191.162.219) To lhrpeml500006.china.huawei.com (7.191.161.198) X-CFilter-Loop: Reflected Precedence: bulk List-ID: X-Mailing-List: linux-edac@vger.kernel.org From: Shiju Jose Add support to log and record the CXL overflow events. 
Signed-off-by: Shiju Jose --- ras-cxl-handler.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++ ras-cxl-handler.h | 3 ++ ras-events.c | 9 +++++ ras-events.h | 1 + ras-record.c | 69 +++++++++++++++++++++++++++++++++ ras-record.h | 15 ++++++++ ras-report.c | 77 +++++++++++++++++++++++++++++++++++++ ras-report.h | 2 + 8 files changed, 274 insertions(+) diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c index 025e582..b08c5e3 100644 --- a/ras-cxl-handler.c +++ b/ras-cxl-handler.c @@ -418,3 +418,101 @@ int ras_cxl_aer_ce_event_handler(struct trace_seq *s, return 0; } + +/* + * CXL rev 3.0 section 8.2.9.2.2; Table 8-49 + */ +enum cxl_event_log_type { + CXL_EVENT_TYPE_INFO = 0x00, + CXL_EVENT_TYPE_WARN, + CXL_EVENT_TYPE_FAIL, + CXL_EVENT_TYPE_FATAL, + CXL_EVENT_TYPE_UNKNOWN +}; + +static char *cxl_event_log_type_str(uint32_t log_type) +{ + + switch (log_type) { + case CXL_EVENT_TYPE_INFO: + return "Informational"; + case CXL_EVENT_TYPE_WARN: + return "Warning"; + case CXL_EVENT_TYPE_FAIL: + return "Failure"; + case CXL_EVENT_TYPE_FATAL: + return "Fatal"; + default: + break; + } + + return "Unknown"; +} + +int ras_cxl_overflow_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context) +{ + int len; + unsigned long long val; + struct ras_events *ras = context; + struct ras_cxl_overflow_event ev; + + memset(&ev, 0, sizeof(ev)); + get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); + if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) + return -1; + + ev.memdev = tep_get_field_raw(s, event, "memdev", record, &len, 1); + if (!ev.memdev) + return -1; + if (trace_seq_printf(s, "memdev:%s ", ev.memdev) <= 0) + return -1; + + ev.host = tep_get_field_raw(s, event, "host", record, &len, 1); + if (!ev.host) + return -1; + if (trace_seq_printf(s, "host:%s ", ev.host) <= 0) + return -1; + + if (tep_get_field_val(s, event, "serial", record, &val, 1) < 0) + return -1; + ev.serial = val; + if (trace_seq_printf(s, 
"serial:0x%llx ", (unsigned long long)ev.serial) <= 0) + return -1; + + if (tep_get_field_val(s, event, "log", record, &val, 1) < 0) + return -1; + ev.log_type = cxl_event_log_type_str(val); + if (trace_seq_printf(s, "log type:%s ", ev.log_type) <= 0) + return -1; + + if (tep_get_field_val(s, event, "count", record, &val, 1) < 0) + return -1; + ev.count = val; + + if (tep_get_field_val(s, event, "first_ts", record, &val, 1) < 0) + return -1; + convert_timestamp(val, ev.first_ts, sizeof(ev.first_ts)); + + if (tep_get_field_val(s, event, "last_ts", record, &val, 1) < 0) + return -1; + convert_timestamp(val, ev.last_ts, sizeof(ev.last_ts)); + + if (ev.count) { + if (trace_seq_printf(s, "%u errors from %s to %s\n", + ev.count, ev.first_ts, ev.last_ts) <= 0) + return -1; + } + /* Insert data into the SGBD */ +#ifdef HAVE_SQLITE3 + ras_store_cxl_overflow_event(ras, &ev); +#endif + +#ifdef HAVE_ABRT_REPORT + /* Report event to ABRT */ + ras_report_cxl_overflow_event(ras, &ev); +#endif + + return 0; +} diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h index 711daf4..e7847ec 100644 --- a/ras-cxl-handler.h +++ b/ras-cxl-handler.h @@ -29,4 +29,7 @@ int ras_cxl_aer_ue_event_handler(struct trace_seq *s, int ras_cxl_aer_ce_event_handler(struct trace_seq *s, struct tep_record *record, struct tep_event *event, void *context); +int ras_cxl_overflow_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); #endif diff --git a/ras-events.c b/ras-events.c index 716317b..ded8648 100644 --- a/ras-events.c +++ b/ras-events.c @@ -248,6 +248,7 @@ int toggle_ras_mc_event(int enable) rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_poison", enable); rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_uncorrectable_error", enable); rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_correctable_error", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_overflow", enable); #endif free_ras: @@ -1004,6 +1005,14 @@ int handle_ras_events(int 
record_events) else log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", "cxl", "cxl_aer_correctable_error"); + + rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_overflow", + ras_cxl_overflow_event_handler, NULL, CXL_OVERFLOW_EVENT); + if (!rc) + num_events++; + else + log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", + "cxl", "cxl_overflow"); #endif if (!num_events) { diff --git a/ras-events.h b/ras-events.h index dc7bdfb..ea590c9 100644 --- a/ras-events.h +++ b/ras-events.h @@ -42,6 +42,7 @@ enum { CXL_POISON_EVENT, CXL_AER_UE_EVENT, CXL_AER_CE_EVENT, + CXL_OVERFLOW_EVENT, NR_EVENTS }; diff --git a/ras-record.c b/ras-record.c index 82e310b..57fe117 100644 --- a/ras-record.c +++ b/ras-record.c @@ -720,6 +720,59 @@ int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_eve return rc; } + +/* + * Table and functions to handle cxl:cxl_overflow + */ +static const struct db_fields cxl_overflow_event_fields[] = { + { .name = "id", .type = "INTEGER PRIMARY KEY" }, + { .name = "timestamp", .type = "TEXT" }, + { .name = "memdev", .type = "TEXT" }, + { .name = "host", .type = "TEXT" }, + { .name = "serial", .type = "INTEGER" }, + { .name = "log_type", .type = "TEXT" }, + { .name = "count", .type = "INTEGER" }, + { .name = "first_ts", .type = "TEXT" }, + { .name = "last_ts", .type = "TEXT" }, +}; + +static const struct db_table_descriptor cxl_overflow_event_tab = { + .name = "cxl_overflow_event", + .fields = cxl_overflow_event_fields, + .num_fields = ARRAY_SIZE(cxl_overflow_event_fields), +}; + +int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) +{ + int rc; + struct sqlite3_priv *priv = ras->db_priv; + + if (!priv || !priv->stmt_cxl_overflow_event) + return 0; + log(TERM, LOG_INFO, "cxl_overflow_event store: %p\n", priv->stmt_cxl_overflow_event); + + sqlite3_bind_text(priv->stmt_cxl_overflow_event, 1, ev->timestamp, -1, NULL); + sqlite3_bind_text(priv->stmt_cxl_overflow_event, 2, ev->memdev, -1, NULL); 
+ sqlite3_bind_text(priv->stmt_cxl_overflow_event, 3, ev->host, -1, NULL); + sqlite3_bind_int64(priv->stmt_cxl_overflow_event, 4, ev->serial); + sqlite3_bind_text(priv->stmt_cxl_overflow_event, 5, ev->log_type, -1, NULL); + sqlite3_bind_int(priv->stmt_cxl_overflow_event, 6, ev->count); + sqlite3_bind_text(priv->stmt_cxl_overflow_event, 7, ev->first_ts, -1, NULL); + sqlite3_bind_text(priv->stmt_cxl_overflow_event, 8, ev->last_ts, -1, NULL); + + rc = sqlite3_step(priv->stmt_cxl_overflow_event); + if (rc != SQLITE_OK && rc != SQLITE_DONE) + log(TERM, LOG_ERR, + "Failed to do cxl_overflow_event step on sqlite: error = %d\n", rc); + rc = sqlite3_reset(priv->stmt_cxl_overflow_event); + if (rc != SQLITE_OK && rc != SQLITE_DONE) + log(TERM, LOG_ERR, + "Failed reset cxl_overflow_event on sqlite: error = %d\n", + rc); + log(TERM, LOG_INFO, "register inserted at db\n"); + + return rc; +} #endif /* @@ -1083,6 +1136,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) if (rc != SQLITE_OK) goto error; } + + rc = ras_mc_create_table(priv, &cxl_overflow_event_tab); + if (rc == SQLITE_OK) { + rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_overflow_event, + &cxl_overflow_event_tab); + if (rc != SQLITE_OK) + goto error; + } #endif ras->db_priv = priv; @@ -1221,6 +1282,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) "cpu %u: Failed to finalize cxl_aer_ce_event sqlite: error = %d\n", cpu, rc); } + + if (priv->stmt_cxl_overflow_event) { + rc = sqlite3_finalize(priv->stmt_cxl_overflow_event); + if (rc != SQLITE_OK) + log(TERM, LOG_ERR, + "cpu %u: Failed to finalize cxl_overflow_event sqlite: error = %d\n", + cpu, rc); + } #endif rc = sqlite3_close_v2(db); diff --git a/ras-record.h b/ras-record.h index ab7153d..90db6ad 100644 --- a/ras-record.h +++ b/ras-record.h @@ -152,6 +152,17 @@ struct ras_cxl_aer_ce_event { uint32_t error_status; }; +struct ras_cxl_overflow_event { + char timestamp[64]; + const char *memdev; + const char *host; + uint64_t 
serial; + const char *log_type; + char first_ts[64]; + char last_ts[64]; + uint16_t count; +}; + struct ras_mc_event; struct ras_aer_event; struct ras_extlog_event; @@ -164,6 +175,7 @@ struct ras_mf_event; struct ras_cxl_poison_event; struct ras_cxl_aer_ue_event; struct ras_cxl_aer_ce_event; +struct ras_cxl_overflow_event; #ifdef HAVE_SQLITE3 @@ -200,6 +212,7 @@ struct sqlite3_priv { sqlite3_stmt *stmt_cxl_poison_event; sqlite3_stmt *stmt_cxl_aer_ue_event; sqlite3_stmt *stmt_cxl_aer_ce_event; + sqlite3_stmt *stmt_cxl_overflow_event; #endif }; @@ -231,6 +244,7 @@ int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev); int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev); int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev); int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); +int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); #else static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; @@ -247,6 +261,7 @@ static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event static inline int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; }; static inline int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; }; static inline int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; +static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; #endif diff --git a/ras-report.c b/ras-report.c index 63b47f5..dbed454 100644 --- a/ras-report.c +++ b/ras-report.c @@ -421,6 +421,36 @@ static int set_cxl_aer_ce_event_backtrace(char *buf, struct ras_cxl_aer_ce_event return 0; } +static int set_cxl_overflow_event_backtrace(char *buf, struct ras_cxl_overflow_event *ev) +{ + 
char bt_buf[MAX_BACKTRACE_SIZE]; + + if (!buf || !ev) + return -1; + + sprintf(bt_buf, "BACKTRACE=" \ + "timestamp=%s\n" \ + "memdev=%s\n" \ + "host=%s\n" \ + "serial=0x%lx\n" \ + "log_type=%s\n" \ + "count=%u\n" \ + "first_ts=%s\n" \ + "last_ts=%s\n", \ + ev->timestamp, \ + ev->memdev, \ + ev->host, \ + ev->serial, \ + ev->log_type, \ + ev->count, \ + ev->first_ts, \ + ev->last_ts); + + strcat(buf, bt_buf); + + return 0; +} + static int commit_report_backtrace(int sockfd, int type, void *ev){ char buf[MAX_BACKTRACE_SIZE]; char *pbuf = buf; @@ -467,6 +497,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ case CXL_AER_CE_EVENT: rc = set_cxl_aer_ce_event_backtrace(buf, (struct ras_cxl_aer_ce_event *)ev); break; + case CXL_OVERFLOW_EVENT: + rc = set_cxl_overflow_event_backtrace(buf, (struct ras_cxl_overflow_event *)ev); + break; default: return -1; } @@ -1007,3 +1040,47 @@ cxl_aer_ce_fail: else return -1; } + +int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) +{ + char buf[MAX_MESSAGE_SIZE]; + int sockfd = 0; + int done = 0; + int rc = -1; + + memset(buf, 0, sizeof(buf)); + + sockfd = setup_report_socket(); + if (sockfd < 0) + return -1; + + rc = commit_report_basic(sockfd); + if (rc < 0) + goto cxl_overflow_fail; + + rc = commit_report_backtrace(sockfd, CXL_OVERFLOW_EVENT, ev); + if (rc < 0) + goto cxl_overflow_fail; + + sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl-overflow"); + rc = write(sockfd, buf, strlen(buf) + 1); + if (rc < strlen(buf) + 1) + goto cxl_overflow_fail; + + sprintf(buf, "REASON=%s", "CXL overflow"); + rc = write(sockfd, buf, strlen(buf) + 1); + if (rc < strlen(buf) + 1) + goto cxl_overflow_fail; + + done = 1; + +cxl_overflow_fail: + + if (sockfd >= 0) + close(sockfd); + + if (done) + return 0; + else + return -1; +} diff --git a/ras-report.h b/ras-report.h index 46155ee..204d485 100644 --- a/ras-report.h +++ b/ras-report.h @@ -42,6 +42,7 @@ int ras_report_mf_event(struct ras_events 
*ras, struct ras_mf_event *ev); int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev); int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev); int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); +int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); #else @@ -56,6 +57,7 @@ static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_even static inline int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; }; static inline int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; }; static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; +static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; #endif From patchwork Wed Apr 12 08:33:08 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Shiju Jose X-Patchwork-Id: 13208655 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 0C422C7619A for ; Wed, 12 Apr 2023 08:53:14 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231428AbjDLIxM (ORCPT ); Wed, 12 Apr 2023 04:53:12 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:41874 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S231397AbjDLIxM (ORCPT ); Wed, 12 Apr 2023 04:53:12 -0400 Received: from frasgout.his.huawei.com (frasgout.his.huawei.com [185.176.79.56]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 9D0649EDB; Wed, 12 Apr 2023 01:52:49 -0700 (PDT) 
Received: from lhrpeml500006.china.huawei.com (unknown [172.18.147.226]) by frasgout.his.huawei.com (SkyGuard) with ESMTP id 4PxGDX0q3xz67n97; Wed, 12 Apr 2023 16:32:44 +0800 (CST) Received: from SecurePC30232.china.huawei.com (10.122.247.234) by lhrpeml500006.china.huawei.com (7.191.161.198) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.23; Wed, 12 Apr 2023 09:33:41 +0100 From: To: , , CC: , , Subject: [RFC PATCH 4/7] rasdaemon: Add support for the CXL generic events Date: Wed, 12 Apr 2023 16:33:08 +0800 Message-ID: <20230412083312.1384-5-shiju.jose@huawei.com> X-Mailer: git-send-email 2.35.1.windows.2 In-Reply-To: <20230412083312.1384-1-shiju.jose@huawei.com> References: <20230412083312.1384-1-shiju.jose@huawei.com> MIME-Version: 1.0 X-Originating-IP: [10.122.247.234] X-ClientProxiedBy: lhrpeml100004.china.huawei.com (7.191.162.219) To lhrpeml500006.china.huawei.com (7.191.161.198) X-CFilter-Loop: Reflected Precedence: bulk List-ID: X-Mailing-List: linux-edac@vger.kernel.org From: Shiju Jose Add support to log and record the CXL generic events. 
Signed-off-by: Shiju Jose --- ras-cxl-handler.c | 185 ++++++++++++++++++++++++++++++++++++++++++++++ ras-cxl-handler.h | 3 + ras-events.c | 9 +++ ras-events.h | 1 + ras-record.c | 89 ++++++++++++++++++++++ ras-record.h | 25 +++++++ ras-report.c | 86 +++++++++++++++++++++ ras-report.h | 2 + 8 files changed, 400 insertions(+) diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c index b08c5e3..59f87c0 100644 --- a/ras-cxl-handler.c +++ b/ras-cxl-handler.c @@ -56,6 +56,49 @@ static void get_timestamp(struct trace_seq *s, struct tep_record *record, strncpy(ts_ptr, "1970-01-01 00:00:00 +0000", size); } +struct cxl_event_flags { + uint32_t bit; + const char *flag; +}; + +static int decode_cxl_event_flags(struct trace_seq *s, uint32_t flags, + const struct cxl_event_flags *cxl_ev_flags, + uint8_t num_elems) +{ + int i; + + for (i = 0; i < num_elems; i++) { + if (flags & cxl_ev_flags[i].bit) + if (trace_seq_printf(s, "\'%s\' ", cxl_ev_flags[i].flag) <= 0) + return -1; + } + return 0; +} + +static char *uuid_be(const char *uu) +{ + static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; + char *p = uuid; + int i; + static const unsigned char be[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + for (i = 0; i < 16; i++) { + p += sprintf(p, "%.2x", (unsigned char) uu[be[i]]); + switch (i) { + case 3: + case 5: + case 7: + case 9: + *p++ = '-'; + break; + } + } + + *p = 0; + + return uuid; +} + /* Poison List: Payload out flags */ #define CXL_POISON_FLAG_MORE BIT(0) #define CXL_POISON_FLAG_OVERFLOW BIT(1) @@ -516,3 +559,145 @@ int ras_cxl_overflow_event_handler(struct trace_seq *s, return 0; } + +/* + * Common Event Record Format + * CXL 3.0 section 8.2.9.2.1; Table 8-42 + */ +#define CXL_EVENT_RECORD_FLAG_PERMANENT BIT(2) +#define CXL_EVENT_RECORD_FLAG_MAINT_NEEDED BIT(3) +#define CXL_EVENT_RECORD_FLAG_PERF_DEGRADED BIT(4) +#define CXL_EVENT_RECORD_FLAG_HW_REPLACE BIT(5) + +static const struct cxl_event_flags cxl_hdr_flags[] = { + { .bit = 
CXL_EVENT_RECORD_FLAG_PERMANENT, .flag = "PERMANENT_CONDITION" }, + { .bit = CXL_EVENT_RECORD_FLAG_MAINT_NEEDED, .flag = "MAINTENANCE_NEEDED" }, + { .bit = CXL_EVENT_RECORD_FLAG_PERF_DEGRADED, .flag = "PERFORMANCE_DEGRADED" }, + { .bit = CXL_EVENT_RECORD_FLAG_HW_REPLACE, .flag = "HARDWARE_REPLACEMENT_NEEDED" }, +}; + +static int handle_ras_cxl_common_hdr(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context, + struct ras_cxl_event_common_hdr *hdr) +{ + int len; + unsigned long long val; + struct ras_events *ras = context; + + get_timestamp(s, record, ras, (char *)&hdr->timestamp, sizeof(hdr->timestamp)); + if (trace_seq_printf(s, "%s ", hdr->timestamp) <= 0) + return -1; + + hdr->memdev = tep_get_field_raw(s, event, "memdev", record, &len, 1); + if (!hdr->memdev) + return -1; + if (trace_seq_printf(s, "memdev:%s ", hdr->memdev) <= 0) + return -1; + + hdr->host = tep_get_field_raw(s, event, "host", record, &len, 1); + if (!hdr->host) + return -1; + if (trace_seq_printf(s, "host:%s ", hdr->host) <= 0) + return -1; + + if (tep_get_field_val(s, event, "serial", record, &val, 1) < 0) + return -1; + hdr->serial = val; + if (trace_seq_printf(s, "serial:0x%llx ", (unsigned long long)hdr->serial) <= 0) + return -1; + + if (tep_get_field_val(s, event, "log", record, &val, 1) < 0) + return -1; + hdr->log_type = cxl_event_log_type_str(val); + if (trace_seq_printf(s, "log type:%s ", hdr->log_type) <= 0) + return -1; + + hdr->hdr_uuid = tep_get_field_raw(s, event, "hdr_uuid", record, &len, 1); + if (!hdr->hdr_uuid) + return -1; + hdr->hdr_uuid = uuid_be(hdr->hdr_uuid); + if (trace_seq_printf(s, "hdr_uuid:%s ", hdr->hdr_uuid) <= 0) + return -1; + + if (tep_get_field_val(s, event, "hdr_flags", record, &val, 1) < 0) + return -1; + hdr->hdr_flags = val; + if (decode_cxl_event_flags(s, hdr->hdr_flags, cxl_hdr_flags, + ARRAY_SIZE(cxl_hdr_flags)) < 0) + return -1; + + if (tep_get_field_val(s, event, "hdr_handle", record, &val, 1) < 0) + return -1; + 
hdr->hdr_handle = val; + if (trace_seq_printf(s, "hdr_handle:0x%x ", hdr->hdr_handle) <= 0) + return -1; + + if (tep_get_field_val(s, event, "hdr_related_handle", record, &val, 1) < 0) + return -1; + hdr->hdr_related_handle = val; + if (trace_seq_printf(s, "hdr_related_handle:0x%x ", hdr->hdr_related_handle) <= 0) + return -1; + + if (tep_get_field_val(s, event, "hdr_timestamp", record, &val, 1) < 0) + return -1; + convert_timestamp(val, hdr->hdr_timestamp, sizeof(hdr->hdr_timestamp)); + if (trace_seq_printf(s, "hdr_timestamp:%s ", hdr->hdr_timestamp) <= 0) + return -1; + + if (tep_get_field_val(s, event, "hdr_length", record, &val, 1) < 0) + return -1; + hdr->hdr_length = val; + if (trace_seq_printf(s, "hdr_length:%u ", hdr->hdr_length) <= 0) + return -1; + + if (tep_get_field_val(s, event, "hdr_maint_op_class", record, &val, 1) < 0) + return -1; + hdr->hdr_maint_op_class = val; + if (trace_seq_printf(s, "hdr_maint_op_class:%u ", hdr->hdr_maint_op_class) <= 0) + return -1; + + return 0; +} + +int ras_cxl_generic_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context) +{ + int len, i; + struct ras_events *ras = context; + struct ras_cxl_generic_event ev; + const uint8_t *buf; + + memset(&ev, 0, sizeof(ev)); + if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0) + return -1; + + ev.data = tep_get_field_raw(s, event, "data", record, &len, 1); + if (!ev.data) + return -1; + i = 0; + buf = ev.data; + if (trace_seq_printf(s, "\ndata:\n %08x: ", i) <= 0) + return -1; + for (i = 0; i < CXL_EVENT_RECORD_DATA_LENGTH; i += 4) { + if ((i > 0) && ((i % 16) == 0)) + if (trace_seq_printf(s, "\n %08x: ", i) <= 0) + break; + if (trace_seq_printf(s, "%02x%02x%02x%02x ", + buf[i], buf[i+1], buf[i+2], buf[i+3]) <= 0) + break; + } + + /* Insert data into the SGBD */ +#ifdef HAVE_SQLITE3 + ras_store_cxl_generic_event(ras, &ev); +#endif + +#ifdef HAVE_ABRT_REPORT + /* Report event to ABRT */ + 
ras_report_cxl_generic_event(ras, &ev); +#endif + + return 0; +} diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h index e7847ec..9f77cb7 100644 --- a/ras-cxl-handler.h +++ b/ras-cxl-handler.h @@ -32,4 +32,7 @@ int ras_cxl_aer_ce_event_handler(struct trace_seq *s, int ras_cxl_overflow_event_handler(struct trace_seq *s, struct tep_record *record, struct tep_event *event, void *context); +int ras_cxl_generic_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); #endif diff --git a/ras-events.c b/ras-events.c index ded8648..debdc87 100644 --- a/ras-events.c +++ b/ras-events.c @@ -249,6 +249,7 @@ int toggle_ras_mc_event(int enable) rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_uncorrectable_error", enable); rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_correctable_error", enable); rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_overflow", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_generic_event", enable); #endif free_ras: @@ -1013,6 +1014,14 @@ int handle_ras_events(int record_events) else log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", "cxl", "cxl_overflow"); + + rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_generic_event", + ras_cxl_generic_event_handler, NULL, CXL_GENERIC_EVENT); + if (!rc) + num_events++; + else + log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", + "cxl", "cxl_generic_event"); #endif if (!num_events) { diff --git a/ras-events.h b/ras-events.h index ea590c9..989ab29 100644 --- a/ras-events.h +++ b/ras-events.h @@ -43,6 +43,7 @@ enum { CXL_AER_UE_EVENT, CXL_AER_CE_EVENT, CXL_OVERFLOW_EVENT, + CXL_GENERIC_EVENT, NR_EVENTS }; diff --git a/ras-record.c b/ras-record.c index 57fe117..36665aa 100644 --- a/ras-record.c +++ b/ras-record.c @@ -773,6 +773,79 @@ int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow return rc; } + +static int ras_store_cxl_common_hdr(sqlite3_stmt *stmt, struct ras_cxl_event_common_hdr *hdr) +{ + if (!stmt || !hdr) + 
return 0; + + sqlite3_bind_text(stmt, 1, hdr->timestamp, -1, NULL); + sqlite3_bind_text(stmt, 2, hdr->memdev, -1, NULL); + sqlite3_bind_text(stmt, 3, hdr->host, -1, NULL); + sqlite3_bind_int64(stmt, 4, hdr->serial); + sqlite3_bind_text(stmt, 5, hdr->log_type, -1, NULL); + sqlite3_bind_text(stmt, 6, hdr->hdr_uuid, -1, NULL); + sqlite3_bind_int(stmt, 7, hdr->hdr_flags); + sqlite3_bind_int(stmt, 8, hdr->hdr_handle); + sqlite3_bind_int(stmt, 9, hdr->hdr_related_handle); + sqlite3_bind_text(stmt, 10, hdr->hdr_timestamp, -1, NULL); + sqlite3_bind_int(stmt, 11, hdr->hdr_length); + sqlite3_bind_int(stmt, 12, hdr->hdr_maint_op_class); + + return 0; +} + +/* + * Table and functions to handle cxl:cxl_generic_event + */ +static const struct db_fields cxl_generic_event_fields[] = { + { .name = "id", .type = "INTEGER PRIMARY KEY" }, + { .name = "timestamp", .type = "TEXT" }, + { .name = "memdev", .type = "TEXT" }, + { .name = "host", .type = "TEXT" }, + { .name = "serial", .type = "INTEGER" }, + { .name = "log_type", .type = "TEXT" }, + { .name = "hdr_uuid", .type = "TEXT" }, + { .name = "hdr_flags", .type = "INTEGER" }, + { .name = "hdr_handle", .type = "INTEGER" }, + { .name = "hdr_related_handle", .type = "INTEGER" }, + { .name = "hdr_ts", .type = "TEXT" }, + { .name = "hdr_length", .type = "INTEGER" }, + { .name = "hdr_maint_op_class", .type = "INTEGER" }, + { .name = "data", .type = "BLOB" }, +}; + +static const struct db_table_descriptor cxl_generic_event_tab = { + .name = "cxl_generic_event", + .fields = cxl_generic_event_fields, + .num_fields = ARRAY_SIZE(cxl_generic_event_fields), +}; + +int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) +{ + int rc; + struct sqlite3_priv *priv = ras->db_priv; + + if (!priv || !priv->stmt_cxl_generic_event) + return 0; + log(TERM, LOG_INFO, "cxl_generic_event store: %p\n", priv->stmt_cxl_generic_event); + + ras_store_cxl_common_hdr(priv->stmt_cxl_generic_event, &ev->hdr); + 
sqlite3_bind_blob(priv->stmt_cxl_generic_event, 13, ev->data, + CXL_EVENT_RECORD_DATA_LENGTH, NULL); + + rc = sqlite3_step(priv->stmt_cxl_generic_event); + if (rc != SQLITE_OK && rc != SQLITE_DONE) + log(TERM, LOG_ERR, + "Failed to do stmt_cxl_generic_event step on sqlite: error = %d\n", rc); + rc = sqlite3_reset(priv->stmt_cxl_generic_event); + if (rc != SQLITE_OK && rc != SQLITE_DONE) + log(TERM, LOG_ERR, + "Failed reset stmt_cxl_generic_event on sqlite: error = %d\n", rc); + log(TERM, LOG_INFO, "register inserted at db\n"); + + return rc; +} #endif /* @@ -1144,6 +1217,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) if (rc != SQLITE_OK) goto error; } + + rc = ras_mc_create_table(priv, &cxl_generic_event_tab); + if (rc == SQLITE_OK) { + rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_generic_event, + &cxl_generic_event_tab); + if (rc != SQLITE_OK) + goto error; + } #endif ras->db_priv = priv; @@ -1290,6 +1371,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) "cpu %u: Failed to finalize cxl_overflow_event sqlite: error = %d\n", cpu, rc); } + + if (priv->stmt_cxl_generic_event) { + rc = sqlite3_finalize(priv->stmt_cxl_generic_event); + if (rc != SQLITE_OK) + log(TERM, LOG_ERR, + "cpu %u: Failed to finalize cxl_generic_event sqlite: error = %d\n", + cpu, rc); + } #endif rc = sqlite3_close_v2(db); diff --git a/ras-record.h b/ras-record.h index 90db6ad..9ecfcda 100644 --- a/ras-record.h +++ b/ras-record.h @@ -133,6 +133,7 @@ struct ras_cxl_poison_event { #define SZ_512 0x200 #define CXL_HEADERLOG_SIZE SZ_512 #define CXL_HEADERLOG_SIZE_U32 (SZ_512 / sizeof(uint32_t)) +#define CXL_EVENT_RECORD_DATA_LENGTH 0x50 struct ras_cxl_aer_ue_event { char timestamp[64]; @@ -163,6 +164,26 @@ struct ras_cxl_overflow_event { uint16_t count; }; +struct ras_cxl_event_common_hdr { + char timestamp[64]; + const char *memdev; + const char *host; + uint64_t serial; + const char *log_type; + const char *hdr_uuid; + uint32_t hdr_flags; + uint16_t 
hdr_handle; + uint16_t hdr_related_handle; + char hdr_timestamp[64]; + uint8_t hdr_length; + uint8_t hdr_maint_op_class; +}; + +struct ras_cxl_generic_event { + struct ras_cxl_event_common_hdr hdr; + uint8_t *data; +}; + struct ras_mc_event; struct ras_aer_event; struct ras_extlog_event; @@ -176,6 +197,7 @@ struct ras_cxl_poison_event; struct ras_cxl_aer_ue_event; struct ras_cxl_aer_ce_event; struct ras_cxl_overflow_event; +struct ras_cxl_generic_event; #ifdef HAVE_SQLITE3 @@ -213,6 +235,7 @@ struct sqlite3_priv { sqlite3_stmt *stmt_cxl_aer_ue_event; sqlite3_stmt *stmt_cxl_aer_ce_event; sqlite3_stmt *stmt_cxl_overflow_event; + sqlite3_stmt *stmt_cxl_generic_event; #endif }; @@ -245,6 +268,7 @@ int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_eve int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev); int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); +int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); #else static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; @@ -262,6 +286,7 @@ static inline int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_ static inline int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; }; static inline int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; +static inline int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; #endif diff --git a/ras-report.c b/ras-report.c index dbed454..8d7b76a 100644 --- a/ras-report.c +++ b/ras-report.c @@ -451,6 +451,44 @@ static int 
set_cxl_overflow_event_backtrace(char *buf, struct ras_cxl_overflow_e return 0; } +static int set_cxl_generic_event_backtrace(char *buf, struct ras_cxl_generic_event *ev) +{ + char bt_buf[MAX_BACKTRACE_SIZE]; + + if (!buf || !ev) + return -1; + + sprintf(bt_buf, "BACKTRACE=" \ + "timestamp=%s\n" \ + "memdev=%s\n" \ + "host=%s\n" \ + "serial=0x%lx\n" \ + "log_type=%s\n" \ + "hdr_uuid=%s\n" \ + "hdr_flags=0x%x\n" \ + "hdr_handle=0x%x\n" \ + "hdr_related_handle=0x%x\n" \ + "hdr_timestamp=%s\n" \ + "hdr_length=%u\n" \ + "hdr_maint_op_class=%u\n", \ + ev->hdr.timestamp, \ + ev->hdr.memdev, \ + ev->hdr.host, \ + ev->hdr.serial, \ + ev->hdr.log_type, \ + ev->hdr.hdr_uuid, \ + ev->hdr.hdr_flags, \ + ev->hdr.hdr_handle, \ + ev->hdr.hdr_related_handle, \ + ev->hdr.hdr_timestamp, \ + ev->hdr.hdr_length, \ + ev->hdr.hdr_maint_op_class); + + strcat(buf, bt_buf); + + return 0; +} + static int commit_report_backtrace(int sockfd, int type, void *ev){ char buf[MAX_BACKTRACE_SIZE]; char *pbuf = buf; @@ -500,6 +538,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ case CXL_OVERFLOW_EVENT: rc = set_cxl_overflow_event_backtrace(buf, (struct ras_cxl_overflow_event *)ev); break; + case CXL_GENERIC_EVENT: + rc = set_cxl_generic_event_backtrace(buf, (struct ras_cxl_generic_event *)ev); + break; default: return -1; } @@ -1084,3 +1125,48 @@ cxl_overflow_fail: else return -1; } + +int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) +{ + char buf[MAX_MESSAGE_SIZE]; + int sockfd = 0; + int done = 0; + int rc = -1; + + memset(buf, 0, sizeof(buf)); + + sockfd = setup_report_socket(); + if (sockfd < 0) + return -1; + + rc = commit_report_basic(sockfd); + if (rc < 0) + goto cxl_generic_fail; + + rc = commit_report_backtrace(sockfd, CXL_GENERIC_EVENT, ev); + if (rc < 0) + goto cxl_generic_fail; + + sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl_generic_event"); + rc = write(sockfd, buf, strlen(buf) + 1); + if (rc < strlen(buf) + 1) + goto 
cxl_generic_fail; + + sprintf(buf, "REASON=%s", "CXL Generic Event "); + rc = write(sockfd, buf, strlen(buf) + 1); + if (rc < strlen(buf) + 1) + goto cxl_generic_fail; + + done = 1; + +cxl_generic_fail: + + if (sockfd >= 0) + close(sockfd); + + if (done) + return 0; + else + return -1; + +} diff --git a/ras-report.h b/ras-report.h index 204d485..bf591a6 100644 --- a/ras-report.h +++ b/ras-report.h @@ -43,6 +43,7 @@ int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_ev int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev); int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); +int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); #else @@ -58,6 +59,7 @@ static inline int ras_report_cxl_poison_event(struct ras_events *ras, struct ras static inline int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; }; static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; +static inline int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; #endif From patchwork Wed Apr 12 08:33:09 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Shiju Jose X-Patchwork-Id: 13208678 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 00976C77B71 for ; Wed, 12 Apr 2023 09:22:18 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via 
listexpand id S229933AbjDLJWR (ORCPT ); Wed, 12 Apr 2023 05:22:17 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:36952 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S229962AbjDLJWO (ORCPT ); Wed, 12 Apr 2023 05:22:14 -0400 Received: from frasgout.his.huawei.com (frasgout.his.huawei.com [185.176.79.56]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id D427676BB; Wed, 12 Apr 2023 02:22:03 -0700 (PDT) Received: from lhrpeml500006.china.huawei.com (unknown [172.18.147.226]) by frasgout.his.huawei.com (SkyGuard) with ESMTP id 4PxGDX2c8Bz67lH1; Wed, 12 Apr 2023 16:32:44 +0800 (CST) Received: from SecurePC30232.china.huawei.com (10.122.247.234) by lhrpeml500006.china.huawei.com (7.191.161.198) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.23; Wed, 12 Apr 2023 09:33:41 +0100 From: To: , , CC: , , Subject: [RFC PATCH 5/7] rasdaemon: Add support for the CXL general media events Date: Wed, 12 Apr 2023 16:33:09 +0800 Message-ID: <20230412083312.1384-6-shiju.jose@huawei.com> X-Mailer: git-send-email 2.35.1.windows.2 In-Reply-To: <20230412083312.1384-1-shiju.jose@huawei.com> References: <20230412083312.1384-1-shiju.jose@huawei.com> MIME-Version: 1.0 X-Originating-IP: [10.122.247.234] X-ClientProxiedBy: lhrpeml100004.china.huawei.com (7.191.162.219) To lhrpeml500006.china.huawei.com (7.191.161.198) X-CFilter-Loop: Reflected Precedence: bulk List-ID: X-Mailing-List: linux-edac@vger.kernel.org From: Shiju Jose Add support to log and record the CXL general media events. 
Signed-off-by: Shiju Jose --- ras-cxl-handler.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++ ras-cxl-handler.h | 3 + ras-events.c | 9 +++ ras-events.h | 1 + ras-record.c | 85 +++++++++++++++++++++++++ ras-record.h | 19 ++++++ ras-report.c | 101 ++++++++++++++++++++++++++++++ ras-report.h | 2 + 8 files changed, 376 insertions(+) diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c index 59f87c0..e2e80ff 100644 --- a/ras-cxl-handler.c +++ b/ras-cxl-handler.c @@ -99,6 +99,14 @@ static char *uuid_be(const char *uu) return uuid; } +static const char* get_cxl_type_str(const char** type_array, uint8_t num_elems, uint8_t type) +{ + if (type >= num_elems) + return "Unknown"; + + return type_array[type]; +} + /* Poison List: Payload out flags */ #define CXL_POISON_FLAG_MORE BIT(0) #define CXL_POISON_FLAG_OVERFLOW BIT(1) @@ -701,3 +709,151 @@ int ras_cxl_generic_event_handler(struct trace_seq *s, return 0; } + +#define CXL_DPA_VOLATILE BIT(0) +#define CXL_DPA_NOT_REPAIRABLE BIT(1) + +static const struct cxl_event_flags cxl_dpa_flags[] = { + { .bit = CXL_DPA_VOLATILE, .flag = "VOLATILE" }, + { .bit = CXL_DPA_NOT_REPAIRABLE, .flag = "NOT_REPAIRABLE" }, +}; + +/* + * General Media Event Record - GMER + * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 + */ +#define CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT BIT(0) +#define CXL_GMER_EVT_DESC_THRESHOLD_EVENT BIT(1) +#define CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW BIT(2) + +static const struct cxl_event_flags cxl_gmer_event_desc_flags[] = { + { .bit = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT, .flag = "UNCORRECTABLE EVENT" }, + { .bit = CXL_GMER_EVT_DESC_THRESHOLD_EVENT, .flag = "THRESHOLD EVENT" }, + { .bit = CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW, .flag = "POISON LIST OVERFLOW" }, +}; + +#define CXL_GMER_VALID_CHANNEL BIT(0) +#define CXL_GMER_VALID_RANK BIT(1) +#define CXL_GMER_VALID_DEVICE BIT(2) +#define CXL_GMER_VALID_COMPONENT BIT(3) + +static const char* cxl_gmer_mem_event_type[] = { + "ECC Error", + "Invalid Address", + "Data Path 
Error", +}; + +static const char* cxl_gmer_trans_type[] = { + "Unknown", + "Host Read", + "Host Write", + "Host Scan Media", + "Host Inject Poison", + "Internal Media Scrub", + "Internal Media Management", +}; + +int ras_cxl_general_media_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context) +{ + int len, i; + unsigned long long val; + struct ras_events *ras = context; + struct ras_cxl_general_media_event ev; + + memset(&ev, 0, sizeof(ev)); + if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0) + return -1; + + if (tep_get_field_val(s, event, "dpa", record, &val, 1) < 0) + return -1; + ev.dpa = val; + if (trace_seq_printf(s, "dpa:0x%llx ", (unsigned long long)ev.dpa) <= 0) + return -1; + + if (tep_get_field_val(s, event, "dpa_flags", record, &val, 1) < 0) + return -1; + ev.dpa_flags = val; + if (trace_seq_printf(s, "dpa_flags:") <= 0) + return -1; + if (decode_cxl_event_flags(s, ev.dpa_flags, cxl_dpa_flags, ARRAY_SIZE(cxl_dpa_flags)) < 0) + return -1; + + if (tep_get_field_val(s, event, "descriptor", record, &val, 1) < 0) + return -1; + ev.descriptor = val; + if (trace_seq_printf(s, "descriptor:") <= 0) + return -1; + if (decode_cxl_event_flags(s, ev.descriptor, cxl_gmer_event_desc_flags, + ARRAY_SIZE(cxl_gmer_event_desc_flags)) < 0) + return -1; + + if (tep_get_field_val(s, event, "type", record, &val, 1) < 0) + return -1; + ev.type = val; + if (trace_seq_printf(s, "type:%s ", get_cxl_type_str(cxl_gmer_mem_event_type, + ARRAY_SIZE(cxl_gmer_mem_event_type), ev.type)) <= 0) + return -1; + + if (tep_get_field_val(s, event, "transaction_type", record, &val, 1) < 0) + return -1; + ev.transaction_type = val; + if (trace_seq_printf(s, "transaction_type:%s ", + get_cxl_type_str(cxl_gmer_trans_type, + ARRAY_SIZE(cxl_gmer_trans_type), + ev.transaction_type)) <= 0) + return -1; + + if (tep_get_field_val(s, event, "validity_flags", record, &val, 1) < 0) + return -1; + ev.validity_flags = val; + + if 
(ev.validity_flags & CXL_GMER_VALID_CHANNEL) { + if (tep_get_field_val(s, event, "channel", record, &val, 1) < 0) + return -1; + ev.channel = val; + if (trace_seq_printf(s, "channel:%u ", ev.channel) <= 0) + return -1; + } + + if (ev.validity_flags & CXL_GMER_VALID_RANK) { + if (tep_get_field_val(s, event, "rank", record, &val, 1) < 0) + return -1; + ev.rank = val; + if (trace_seq_printf(s, "rank:%u ", ev.rank) <= 0) + return -1; + } + + if (ev.validity_flags & CXL_GMER_VALID_DEVICE) { + if (tep_get_field_val(s, event, "device", record, &val, 1) < 0) + return -1; + ev.device = val; + if (trace_seq_printf(s, "device:%x ", ev.device) <= 0) + return -1; + } + + if (ev.validity_flags & CXL_GMER_VALID_COMPONENT) { + ev.comp_id = tep_get_field_raw(s, event, "comp_id", record, &len, 1); + if (!ev.comp_id) + return -1; + if (trace_seq_printf(s, "comp_id:") <= 0) + return -1; + for (i = 0; i < CXL_EVENT_GEN_MED_COMP_ID_SIZE; i++) { + if (trace_seq_printf(s, "%02x ", ev.comp_id[i]) <= 0) + break; + } + } + + /* Insert data into the SGBD */ +#ifdef HAVE_SQLITE3 + ras_store_cxl_general_media_event(ras, &ev); +#endif + +#ifdef HAVE_ABRT_REPORT + /* Report event to ABRT */ + ras_report_cxl_general_media_event(ras, &ev); +#endif + + return 0; +} diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h index 9f77cb7..3adca4a 100644 --- a/ras-cxl-handler.h +++ b/ras-cxl-handler.h @@ -35,4 +35,7 @@ int ras_cxl_overflow_event_handler(struct trace_seq *s, int ras_cxl_generic_event_handler(struct trace_seq *s, struct tep_record *record, struct tep_event *event, void *context); +int ras_cxl_general_media_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); #endif diff --git a/ras-events.c b/ras-events.c index debdc87..0858b51 100644 --- a/ras-events.c +++ b/ras-events.c @@ -250,6 +250,7 @@ int toggle_ras_mc_event(int enable) rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_correctable_error", enable); rc |= __toggle_ras_mc_event(ras, 
"cxl", "cxl_overflow", enable); rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_generic_event", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_general_media", enable); #endif free_ras: @@ -1022,6 +1023,14 @@ int handle_ras_events(int record_events) else log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", "cxl", "cxl_generic_event"); + + rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_general_media", + ras_cxl_general_media_event_handler, NULL, CXL_GENERAL_MEDIA_EVENT); + if (!rc) + num_events++; + else + log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", + "cxl", "cxl_general_media"); #endif if (!num_events) { diff --git a/ras-events.h b/ras-events.h index 989ab29..0a3edf5 100644 --- a/ras-events.h +++ b/ras-events.h @@ -44,6 +44,7 @@ enum { CXL_AER_CE_EVENT, CXL_OVERFLOW_EVENT, CXL_GENERIC_EVENT, + CXL_GENERAL_MEDIA_EVENT, NR_EVENTS }; diff --git a/ras-record.c b/ras-record.c index 36665aa..0546b29 100644 --- a/ras-record.c +++ b/ras-record.c @@ -846,6 +846,75 @@ int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_e return rc; } + +/* + * Table and functions to handle cxl:cxl_general_media_event + */ +static const struct db_fields cxl_general_media_event_fields[] = { + { .name = "id", .type = "INTEGER PRIMARY KEY" }, + { .name = "timestamp", .type = "TEXT" }, + { .name = "memdev", .type = "TEXT" }, + { .name = "host", .type = "TEXT" }, + { .name = "serial", .type = "INTEGER" }, + { .name = "log_type", .type = "TEXT" }, + { .name = "hdr_uuid", .type = "TEXT" }, + { .name = "hdr_flags", .type = "INTEGER" }, + { .name = "hdr_handle", .type = "INTEGER" }, + { .name = "hdr_related_handle", .type = "INTEGER" }, + { .name = "hdr_ts", .type = "TEXT" }, + { .name = "hdr_length", .type = "INTEGER" }, + { .name = "hdr_maint_op_class", .type = "INTEGER" }, + { .name = "dpa", .type = "INTEGER" }, + { .name = "dpa_flags", .type = "INTEGER" }, + { .name = "descriptor", .type = "INTEGER" }, + { .name = "type", .type = "INTEGER" }, + { 
.name = "transaction_type", .type = "INTEGER" }, + { .name = "channel", .type = "INTEGER" }, + { .name = "rank", .type = "INTEGER" }, + { .name = "device", .type = "INTEGER" }, + { .name = "comp_id", .type = "BLOB" }, +}; + +static const struct db_table_descriptor cxl_general_media_event_tab = { + .name = "cxl_general_media_event", + .fields = cxl_general_media_event_fields, + .num_fields = ARRAY_SIZE(cxl_general_media_event_fields), +}; + +int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) +{ + int rc; + struct sqlite3_priv *priv = ras->db_priv; + + if (!priv || !priv->stmt_cxl_general_media_event) + return 0; + log(TERM, LOG_INFO, "cxl_general_media_event store: %p\n", + priv->stmt_cxl_general_media_event); + + ras_store_cxl_common_hdr(priv->stmt_cxl_general_media_event, &ev->hdr); + sqlite3_bind_int64(priv->stmt_cxl_general_media_event, 13, ev->dpa); + sqlite3_bind_int(priv->stmt_cxl_general_media_event, 14, ev->dpa_flags); + sqlite3_bind_int(priv->stmt_cxl_general_media_event, 15, ev->descriptor); + sqlite3_bind_int(priv->stmt_cxl_general_media_event, 16, ev->type); + sqlite3_bind_int(priv->stmt_cxl_general_media_event, 17, ev->transaction_type); + sqlite3_bind_int(priv->stmt_cxl_general_media_event, 18, ev->channel); + sqlite3_bind_int(priv->stmt_cxl_general_media_event, 19, ev->rank); + sqlite3_bind_int(priv->stmt_cxl_general_media_event, 20, ev->device); + sqlite3_bind_blob(priv->stmt_cxl_general_media_event, 21, ev->comp_id, + CXL_EVENT_GEN_MED_COMP_ID_SIZE, NULL); + + rc = sqlite3_step(priv->stmt_cxl_general_media_event); + if (rc != SQLITE_OK && rc != SQLITE_DONE) + log(TERM, LOG_ERR, + "Failed to do stmt_cxl_general_media_event step on sqlite: error = %d\n", rc); + rc = sqlite3_reset(priv->stmt_cxl_general_media_event); + if (rc != SQLITE_OK && rc != SQLITE_DONE) + log(TERM, LOG_ERR, + "Failed reset stmt_cxl_general_media_event on sqlite: error = %d\n", rc); + log(TERM, LOG_INFO, "register inserted at 
db\n"); + + return rc; +} #endif /* @@ -1225,6 +1294,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) if (rc != SQLITE_OK) goto error; } + + rc = ras_mc_create_table(priv, &cxl_general_media_event_tab); + if (rc == SQLITE_OK) { + rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_general_media_event, + &cxl_general_media_event_tab); + if (rc != SQLITE_OK) + goto error; + } #endif ras->db_priv = priv; @@ -1379,6 +1456,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) "cpu %u: Failed to finalize cxl_generic_event sqlite: error = %d\n", cpu, rc); } + + if (priv->stmt_cxl_general_media_event) { + rc = sqlite3_finalize(priv->stmt_cxl_general_media_event); + if (rc != SQLITE_OK) + log(TERM, LOG_ERR, + "cpu %u: Failed to finalize cxl_general_media_event sqlite: error = %d\n", + cpu, rc); + } #endif rc = sqlite3_close_v2(db); diff --git a/ras-record.h b/ras-record.h index 9ecfcda..37c32de 100644 --- a/ras-record.h +++ b/ras-record.h @@ -134,6 +134,7 @@ struct ras_cxl_poison_event { #define CXL_HEADERLOG_SIZE SZ_512 #define CXL_HEADERLOG_SIZE_U32 (SZ_512 / sizeof(uint32_t)) #define CXL_EVENT_RECORD_DATA_LENGTH 0x50 +#define CXL_EVENT_GEN_MED_COMP_ID_SIZE 0x10 struct ras_cxl_aer_ue_event { char timestamp[64]; @@ -184,6 +185,20 @@ struct ras_cxl_generic_event { uint8_t *data; }; +struct ras_cxl_general_media_event { + struct ras_cxl_event_common_hdr hdr; + uint64_t dpa; + uint8_t dpa_flags; + uint8_t descriptor; + uint8_t type; + uint8_t transaction_type; + uint8_t channel; + uint8_t rank; + uint32_t device; + uint8_t *comp_id; + uint16_t validity_flags; +}; + struct ras_mc_event; struct ras_aer_event; struct ras_extlog_event; @@ -198,6 +213,7 @@ struct ras_cxl_aer_ue_event; struct ras_cxl_aer_ce_event; struct ras_cxl_overflow_event; struct ras_cxl_generic_event; +struct ras_cxl_general_media_event; #ifdef HAVE_SQLITE3 @@ -236,6 +252,7 @@ struct sqlite3_priv { sqlite3_stmt *stmt_cxl_aer_ce_event; sqlite3_stmt *stmt_cxl_overflow_event; 
sqlite3_stmt *stmt_cxl_generic_event; + sqlite3_stmt *stmt_cxl_general_media_event; #endif }; @@ -269,6 +286,7 @@ int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_eve int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); +int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev); #else static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; @@ -287,6 +305,7 @@ static inline int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_ static inline int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; static inline int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; +static inline int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; }; #endif diff --git a/ras-report.c b/ras-report.c index 8d7b76a..725dc9b 100644 --- a/ras-report.c +++ b/ras-report.c @@ -489,6 +489,60 @@ static int set_cxl_generic_event_backtrace(char *buf, struct ras_cxl_generic_eve return 0; } +static int set_cxl_general_media_event_backtrace(char *buf, struct ras_cxl_general_media_event *ev) +{ + char bt_buf[MAX_BACKTRACE_SIZE]; + + if (!buf || !ev) + return -1; + + sprintf(bt_buf, "BACKTRACE=" \ + "timestamp=%s\n" \ + "memdev=%s\n" \ + "host=%s\n" \ + "serial=0x%lx\n" \ + "log_type=%s\n" \ + "hdr_uuid=%s\n" \ + "hdr_flags=0x%x\n" \ + "hdr_handle=0x%x\n" \ + "hdr_related_handle=0x%x\n" \ + "hdr_timestamp=%s\n" \ + "hdr_length=%u\n" \ + "hdr_maint_op_class=%u\n" \ + "dpa=0x%lx\n" \ + 
"dpa_flags=%u\n" \ + "descriptor=%u\n" \ + "type=%u\n" \ + "transaction_type=%u\n" \ + "channel=%u\n" \ + "rank=%u\n" \ + "device=0x%x\n", \ + ev->hdr.timestamp, \ + ev->hdr.memdev, \ + ev->hdr.host, \ + ev->hdr.serial, \ + ev->hdr.log_type, \ + ev->hdr.hdr_uuid, \ + ev->hdr.hdr_flags, \ + ev->hdr.hdr_handle, \ + ev->hdr.hdr_related_handle, \ + ev->hdr.hdr_timestamp, \ + ev->hdr.hdr_length, \ + ev->hdr.hdr_maint_op_class, \ + ev->dpa, \ + ev->dpa_flags, \ + ev->descriptor, \ + ev->type, \ + ev->transaction_type, \ + ev->channel, \ + ev->rank, \ + ev->device); + + strcat(buf, bt_buf); + + return 0; +} + static int commit_report_backtrace(int sockfd, int type, void *ev){ char buf[MAX_BACKTRACE_SIZE]; char *pbuf = buf; @@ -541,6 +595,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ case CXL_GENERIC_EVENT: rc = set_cxl_generic_event_backtrace(buf, (struct ras_cxl_generic_event *)ev); break; + case CXL_GENERAL_MEDIA_EVENT: + rc = set_cxl_general_media_event_backtrace(buf, (struct ras_cxl_general_media_event *)ev); + break; default: return -1; } @@ -1170,3 +1227,47 @@ cxl_generic_fail: return -1; } + +int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) +{ + char buf[MAX_MESSAGE_SIZE]; + int sockfd = 0; + int done = 0; + int rc = -1; + + memset(buf, 0, sizeof(buf)); + + sockfd = setup_report_socket(); + if (sockfd < 0) + return -1; + + rc = commit_report_basic(sockfd); + if (rc < 0) + goto cxl_general_media_fail; + + rc = commit_report_backtrace(sockfd, CXL_GENERAL_MEDIA_EVENT, ev); + if (rc < 0) + goto cxl_general_media_fail; + + sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl_general_media_event"); + rc = write(sockfd, buf, strlen(buf) + 1); + if (rc < strlen(buf) + 1) + goto cxl_general_media_fail; + + sprintf(buf, "REASON=%s", "CXL General Media Event"); + rc = write(sockfd, buf, strlen(buf) + 1); + if (rc < strlen(buf) + 1) + goto cxl_general_media_fail; + + done = 1; + +cxl_general_media_fail: + + 
if (sockfd >= 0) + close(sockfd); + + if (done) + return 0; + else + return -1; +} diff --git a/ras-report.h b/ras-report.h index bf591a6..d9ec7df 100644 --- a/ras-report.h +++ b/ras-report.h @@ -44,6 +44,7 @@ int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_ev int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); +int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev); #else @@ -60,6 +61,7 @@ static inline int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; static inline int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; +static inline int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; }; #endif From patchwork Wed Apr 12 08:33:10 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Shiju Jose X-Patchwork-Id: 13208656 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id E495EC77B79 for ; Wed, 12 Apr 2023 08:53:43 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231429AbjDLIxn (ORCPT ); Wed, 12 Apr 2023 04:53:43 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:43098 "EHLO lindbergh.monkeyblade.net" 
rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S231444AbjDLIxj (ORCPT ); Wed, 12 Apr 2023 04:53:39 -0400 Received: from frasgout.his.huawei.com (frasgout.his.huawei.com [185.176.79.56]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 713B2A25C; Wed, 12 Apr 2023 01:53:17 -0700 (PDT) Received: from lhrpeml500006.china.huawei.com (unknown [172.18.147.226]) by frasgout.his.huawei.com (SkyGuard) with ESMTP id 4PxGDX3fYZz67gR6; Wed, 12 Apr 2023 16:32:44 +0800 (CST) Received: from SecurePC30232.china.huawei.com (10.122.247.234) by lhrpeml500006.china.huawei.com (7.191.161.198) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.23; Wed, 12 Apr 2023 09:33:41 +0100 From: To: , , CC: , , Subject: [RFC PATCH 6/7] rasdaemon: Add support for the CXL dram events Date: Wed, 12 Apr 2023 16:33:10 +0800 Message-ID: <20230412083312.1384-7-shiju.jose@huawei.com> X-Mailer: git-send-email 2.35.1.windows.2 In-Reply-To: <20230412083312.1384-1-shiju.jose@huawei.com> References: <20230412083312.1384-1-shiju.jose@huawei.com> MIME-Version: 1.0 X-Originating-IP: [10.122.247.234] X-ClientProxiedBy: lhrpeml100004.china.huawei.com (7.191.162.219) To lhrpeml500006.china.huawei.com (7.191.161.198) X-CFilter-Loop: Reflected Precedence: bulk List-ID: X-Mailing-List: linux-edac@vger.kernel.org From: Shiju Jose Add support to log and record the CXL dram events. 
Signed-off-by: Shiju Jose --- ras-cxl-handler.c | 151 ++++++++++++++++++++++++++++++++++++++++++++++ ras-cxl-handler.h | 3 + ras-events.c | 9 +++ ras-events.h | 1 + ras-record.c | 93 ++++++++++++++++++++++++++++ ras-record.h | 23 +++++++ ras-report.c | 109 +++++++++++++++++++++++++++++++++ ras-report.h | 2 + 8 files changed, 391 insertions(+) diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c index e2e80ff..fadf5db 100644 --- a/ras-cxl-handler.c +++ b/ras-cxl-handler.c @@ -857,3 +857,154 @@ int ras_cxl_general_media_event_handler(struct trace_seq *s, return 0; } + +/* + * DRAM Event Record - DER + * + * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44 + */ +#define CXL_DER_VALID_CHANNEL BIT(0) +#define CXL_DER_VALID_RANK BIT(1) +#define CXL_DER_VALID_NIBBLE BIT(2) +#define CXL_DER_VALID_BANK_GROUP BIT(3) +#define CXL_DER_VALID_BANK BIT(4) +#define CXL_DER_VALID_ROW BIT(5) +#define CXL_DER_VALID_COLUMN BIT(6) +#define CXL_DER_VALID_CORRECTION_MASK BIT(7) + +int ras_cxl_dram_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context) +{ + int len, i; + unsigned long long val; + struct ras_events *ras = context; + struct ras_cxl_dram_event ev; + + memset(&ev, 0, sizeof(ev)); + if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0) + return -1; + + if (tep_get_field_val(s, event, "dpa", record, &val, 1) < 0) + return -1; + ev.dpa = val; + if (trace_seq_printf(s, "dpa:0x%llx ", (unsigned long long)ev.dpa) <= 0) + return -1; + + if (tep_get_field_val(s, event, "dpa_flags", record, &val, 1) < 0) + return -1; + ev.dpa_flags = val; + if (trace_seq_printf(s, "dpa_flags:") <= 0) + return -1; + if (decode_cxl_event_flags(s, ev.dpa_flags, cxl_dpa_flags, ARRAY_SIZE(cxl_dpa_flags)) < 0) + return -1; + + if (tep_get_field_val(s, event, "descriptor", record, &val, 1) < 0) + return -1; + ev.descriptor = val; + if (trace_seq_printf(s, "descriptor:") <= 0) + return -1; + if (decode_cxl_event_flags(s, ev.descriptor, 
cxl_gmer_event_desc_flags, + ARRAY_SIZE(cxl_gmer_event_desc_flags)) < 0) + return -1; + + if (tep_get_field_val(s, event, "type", record, &val, 1) < 0) + return -1; + ev.type = val; + if (trace_seq_printf(s, "type:%s ", get_cxl_type_str(cxl_gmer_mem_event_type, + ARRAY_SIZE(cxl_gmer_mem_event_type), ev.type)) <= 0) + return -1; + + if (tep_get_field_val(s, event, "transaction_type", record, &val, 1) < 0) + return -1; + ev.transaction_type = val; + if (trace_seq_printf(s, "transaction_type:%s ", + get_cxl_type_str(cxl_gmer_trans_type, + ARRAY_SIZE(cxl_gmer_trans_type), + ev.transaction_type)) <= 0) + return -1; + + if (tep_get_field_val(s, event, "validity_flags", record, &val, 1) < 0) + return -1; + ev.validity_flags = val; + + if (ev.validity_flags & CXL_DER_VALID_CHANNEL) { + if (tep_get_field_val(s, event, "channel", record, &val, 1) < 0) + return -1; + ev.channel = val; + if (trace_seq_printf(s, "channel:%u ", ev.channel) <= 0) + return -1; + } + + if (ev.validity_flags & CXL_DER_VALID_RANK) { + if (tep_get_field_val(s, event, "rank", record, &val, 1) < 0) + return -1; + ev.rank = val; + if (trace_seq_printf(s, "rank:%u ", ev.rank) <= 0) + return -1; + } + + if (ev.validity_flags & CXL_DER_VALID_NIBBLE) { + if (tep_get_field_val(s, event, "nibble_mask", record, &val, 1) < 0) + return -1; + ev.nibble_mask = val; + if (trace_seq_printf(s, "nibble_mask:%u ", ev.nibble_mask) <= 0) + return -1; + } + + if (ev.validity_flags & CXL_DER_VALID_BANK_GROUP) { + if (tep_get_field_val(s, event, "bank_group", record, &val, 1) < 0) + return -1; + ev.bank_group = val; + if (trace_seq_printf(s, "bank_group:%u ", ev.bank_group) <= 0) + return -1; + } + + if (ev.validity_flags & CXL_DER_VALID_BANK) { + if (tep_get_field_val(s, event, "bank", record, &val, 1) < 0) + return -1; + ev.bank = val; + if (trace_seq_printf(s, "bank:%u ", ev.bank) <= 0) + return -1; + } + + if (ev.validity_flags & CXL_DER_VALID_ROW) { + if (tep_get_field_val(s, event, "row", record, &val, 1) < 0) + return 
-1; + ev.row = val; + if (trace_seq_printf(s, "row:%u ", ev.row) <= 0) + return -1; + } + + if (ev.validity_flags & CXL_DER_VALID_COLUMN) { + if (tep_get_field_val(s, event, "column", record, &val, 1) < 0) + return -1; + ev.column = val; + if (trace_seq_printf(s, "column:%u ", ev.column) <= 0) + return -1; + } + + if (ev.validity_flags & CXL_DER_VALID_CORRECTION_MASK) { + ev.cor_mask = tep_get_field_raw(s, event, "cor_mask", record, &len, 1); + if (!ev.cor_mask) + return -1; + if (trace_seq_printf(s, "correction_mask:") <= 0) + return -1; + for (i = 0; i < CXL_EVENT_DER_CORRECTION_MASK_SIZE; i++) { + if (trace_seq_printf(s, "%02x ", ev.cor_mask[i]) <= 0) + break; + } + } + + /* Insert data into the SGBD */ +#ifdef HAVE_SQLITE3 + ras_store_cxl_dram_event(ras, &ev); +#endif + +#ifdef HAVE_ABRT_REPORT + /* Report event to ABRT */ + ras_report_cxl_dram_event(ras, &ev); +#endif + + return 0; +} diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h index 3adca4a..35455af 100644 --- a/ras-cxl-handler.h +++ b/ras-cxl-handler.h @@ -38,4 +38,7 @@ int ras_cxl_generic_event_handler(struct trace_seq *s, int ras_cxl_general_media_event_handler(struct trace_seq *s, struct tep_record *record, struct tep_event *event, void *context); +int ras_cxl_dram_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); #endif diff --git a/ras-events.c b/ras-events.c index 0858b51..00159e6 100644 --- a/ras-events.c +++ b/ras-events.c @@ -251,6 +251,7 @@ int toggle_ras_mc_event(int enable) rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_overflow", enable); rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_generic_event", enable); rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_general_media", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_dram", enable); #endif free_ras: @@ -1031,6 +1032,14 @@ int handle_ras_events(int record_events) else log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", "cxl", "cxl_general_media"); + + rc = add_event_handler(ras, 
pevent, page_size, "cxl", "cxl_dram", + ras_cxl_dram_event_handler, NULL, CXL_DRAM_EVENT); + if (!rc) + num_events++; + else + log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", + "cxl", "cxl_dram"); #endif if (!num_events) { diff --git a/ras-events.h b/ras-events.h index 0a3edf5..3fe28da 100644 --- a/ras-events.h +++ b/ras-events.h @@ -45,6 +45,7 @@ enum { CXL_OVERFLOW_EVENT, CXL_GENERIC_EVENT, CXL_GENERAL_MEDIA_EVENT, + CXL_DRAM_EVENT, NR_EVENTS }; diff --git a/ras-record.c b/ras-record.c index 0546b29..36f43cf 100644 --- a/ras-record.c +++ b/ras-record.c @@ -915,6 +915,83 @@ int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_gen return rc; } + +/* + * Table and functions to handle cxl:cxl_dram_event + */ +static const struct db_fields cxl_dram_event_fields[] = { + { .name = "id", .type = "INTEGER PRIMARY KEY" }, + { .name = "timestamp", .type = "TEXT" }, + { .name = "memdev", .type = "TEXT" }, + { .name = "host", .type = "TEXT" }, + { .name = "serial", .type = "INTEGER" }, + { .name = "log_type", .type = "TEXT" }, + { .name = "hdr_uuid", .type = "TEXT" }, + { .name = "hdr_flags", .type = "INTEGER" }, + { .name = "hdr_handle", .type = "INTEGER" }, + { .name = "hdr_related_handle", .type = "INTEGER" }, + { .name = "hdr_ts", .type = "TEXT" }, + { .name = "hdr_length", .type = "INTEGER" }, + { .name = "hdr_maint_op_class", .type = "INTEGER" }, + { .name = "dpa", .type = "INTEGER" }, + { .name = "dpa_flags", .type = "INTEGER" }, + { .name = "descriptor", .type = "INTEGER" }, + { .name = "type", .type = "INTEGER" }, + { .name = "transaction_type", .type = "INTEGER" }, + { .name = "channel", .type = "INTEGER" }, + { .name = "rank", .type = "INTEGER" }, + { .name = "nibble_mask", .type = "INTEGER" }, + { .name = "bank_group", .type = "INTEGER" }, + { .name = "bank", .type = "INTEGER" }, + { .name = "row", .type = "INTEGER" }, + { .name = "column", .type = "INTEGER" }, + { .name = "cor_mask", .type = "BLOB" }, +}; + +static const struct 
db_table_descriptor cxl_dram_event_tab = { + .name = "cxl_dram_event", + .fields = cxl_dram_event_fields, + .num_fields = ARRAY_SIZE(cxl_dram_event_fields), +}; + +int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) +{ + int rc; + struct sqlite3_priv *priv = ras->db_priv; + + if (!priv || !priv->stmt_cxl_dram_event) + return 0; + log(TERM, LOG_INFO, "cxl_dram_event store: %p\n", + priv->stmt_cxl_dram_event); + + ras_store_cxl_common_hdr(priv->stmt_cxl_dram_event, &ev->hdr); + sqlite3_bind_int64(priv->stmt_cxl_dram_event, 13, ev->dpa); + sqlite3_bind_int(priv->stmt_cxl_dram_event, 14, ev->dpa_flags); + sqlite3_bind_int(priv->stmt_cxl_dram_event, 15, ev->descriptor); + sqlite3_bind_int(priv->stmt_cxl_dram_event, 16, ev->type); + sqlite3_bind_int(priv->stmt_cxl_dram_event, 17, ev->transaction_type); + sqlite3_bind_int(priv->stmt_cxl_dram_event, 18, ev->channel); + sqlite3_bind_int(priv->stmt_cxl_dram_event, 19, ev->rank); + sqlite3_bind_int(priv->stmt_cxl_dram_event, 20, ev->nibble_mask); + sqlite3_bind_int(priv->stmt_cxl_dram_event, 21, ev->bank_group); + sqlite3_bind_int(priv->stmt_cxl_dram_event, 22, ev->bank); + sqlite3_bind_int(priv->stmt_cxl_dram_event, 23, ev->row); + sqlite3_bind_int(priv->stmt_cxl_dram_event, 24, ev->column); + sqlite3_bind_blob(priv->stmt_cxl_dram_event, 25, ev->cor_mask, + CXL_EVENT_DER_CORRECTION_MASK_SIZE, NULL); + + rc = sqlite3_step(priv->stmt_cxl_dram_event); + if (rc != SQLITE_OK && rc != SQLITE_DONE) + log(TERM, LOG_ERR, + "Failed to do stmt_cxl_dram_event step on sqlite: error = %d\n", rc); + rc = sqlite3_reset(priv->stmt_cxl_dram_event); + if (rc != SQLITE_OK && rc != SQLITE_DONE) + log(TERM, LOG_ERR, + "Failed reset stmt_cxl_dram_event on sqlite: error = %d\n", rc); + log(TERM, LOG_INFO, "register inserted at db\n"); + + return rc; +} #endif /* @@ -1302,6 +1379,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) if (rc != SQLITE_OK) goto error; } + + rc = ras_mc_create_table(priv, 
&cxl_dram_event_tab); + if (rc == SQLITE_OK) { + rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_dram_event, + &cxl_dram_event_tab); + if (rc != SQLITE_OK) + goto error; + } #endif ras->db_priv = priv; @@ -1464,6 +1549,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) "cpu %u: Failed to finalize cxl_general_media_event sqlite: error = %d\n", cpu, rc); } + + if (priv->stmt_cxl_dram_event) { + rc = sqlite3_finalize(priv->stmt_cxl_dram_event); + if (rc != SQLITE_OK) + log(TERM, LOG_ERR, + "cpu %u: Failed to finalize cxl_dram_event sqlite: error = %d\n", + cpu, rc); + } #endif rc = sqlite3_close_v2(db); diff --git a/ras-record.h b/ras-record.h index 37c32de..480ff92 100644 --- a/ras-record.h +++ b/ras-record.h @@ -135,6 +135,7 @@ struct ras_cxl_poison_event { #define CXL_HEADERLOG_SIZE_U32 (SZ_512 / sizeof(uint32_t)) #define CXL_EVENT_RECORD_DATA_LENGTH 0x50 #define CXL_EVENT_GEN_MED_COMP_ID_SIZE 0x10 +#define CXL_EVENT_DER_CORRECTION_MASK_SIZE 0x20 struct ras_cxl_aer_ue_event { char timestamp[64]; @@ -199,6 +200,24 @@ struct ras_cxl_general_media_event { uint16_t validity_flags; }; +struct ras_cxl_dram_event { + struct ras_cxl_event_common_hdr hdr; + uint64_t dpa; + uint8_t dpa_flags; + uint8_t descriptor; + uint8_t type; + uint8_t transaction_type; + uint8_t channel; + uint8_t rank; + uint32_t nibble_mask; + uint8_t bank_group; + uint8_t bank; + uint32_t row; + uint16_t column; + uint8_t *cor_mask; + uint16_t validity_flags; +}; + struct ras_mc_event; struct ras_aer_event; struct ras_extlog_event; @@ -214,6 +233,7 @@ struct ras_cxl_aer_ce_event; struct ras_cxl_overflow_event; struct ras_cxl_generic_event; struct ras_cxl_general_media_event; +struct ras_cxl_dram_event; #ifdef HAVE_SQLITE3 @@ -253,6 +273,7 @@ struct sqlite3_priv { sqlite3_stmt *stmt_cxl_overflow_event; sqlite3_stmt *stmt_cxl_generic_event; sqlite3_stmt *stmt_cxl_general_media_event; + sqlite3_stmt *stmt_cxl_dram_event; #endif }; @@ -287,6 +308,7 @@ int 
ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_eve int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev); +int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev); #else static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; @@ -306,6 +328,7 @@ static inline int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_ static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; static inline int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; static inline int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; }; +static inline int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) { return 0; }; #endif diff --git a/ras-report.c b/ras-report.c index 725dc9b..21180b1 100644 --- a/ras-report.c +++ b/ras-report.c @@ -543,6 +543,68 @@ static int set_cxl_general_media_event_backtrace(char *buf, struct ras_cxl_gener return 0; } +static int set_cxl_dram_event_backtrace(char *buf, struct ras_cxl_dram_event *ev) +{ + char bt_buf[MAX_BACKTRACE_SIZE]; + + if (!buf || !ev) + return -1; + + sprintf(bt_buf, "BACKTRACE=" \ + "timestamp=%s\n" \ + "memdev=%s\n" \ + "host=%s\n" \ + "serial=0x%lx\n" \ + "log_type=%s\n" \ + "hdr_uuid=%s\n" \ + "hdr_flags=0x%x\n" \ + "hdr_handle=0x%x\n" \ + "hdr_related_handle=0x%x\n" \ + "hdr_timestamp=%s\n" \ + "hdr_length=%u\n" \ + "hdr_maint_op_class=%u\n" \ + "dpa=0x%lx\n" \ + "dpa_flags=%u\n" \ + "descriptor=%u\n" \ + "type=%u\n" \ + "transaction_type=%u\n" \ + "channel=%u\n" \ + "rank=%u\n" \ + "nibble_mask=%u\n" \ + 
"bank_group=%u\n" \ + "bank=%u\n" \ + "row=%u\n" \ + "column=%u\n", \ + ev->hdr.timestamp, \ + ev->hdr.memdev, \ + ev->hdr.host, \ + ev->hdr.serial, \ + ev->hdr.log_type, \ + ev->hdr.hdr_uuid, \ + ev->hdr.hdr_flags, \ + ev->hdr.hdr_handle, \ + ev->hdr.hdr_related_handle, \ + ev->hdr.hdr_timestamp, \ + ev->hdr.hdr_length, \ + ev->hdr.hdr_maint_op_class, \ + ev->dpa, \ + ev->dpa_flags, \ + ev->descriptor, \ + ev->type, \ + ev->transaction_type, \ + ev->channel, \ + ev->rank, \ + ev->nibble_mask, \ + ev->bank_group, \ + ev->bank, \ + ev->row, \ + ev->column); + + strcat(buf, bt_buf); + + return 0; +} + static int commit_report_backtrace(int sockfd, int type, void *ev){ char buf[MAX_BACKTRACE_SIZE]; char *pbuf = buf; @@ -598,6 +660,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ case CXL_GENERAL_MEDIA_EVENT: rc = set_cxl_general_media_event_backtrace(buf, (struct ras_cxl_general_media_event *)ev); break; + case CXL_DRAM_EVENT: + rc = set_cxl_dram_event_backtrace(buf, (struct ras_cxl_dram_event *)ev); + break; default: return -1; } @@ -1271,3 +1336,47 @@ cxl_general_media_fail: else return -1; } + +int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) +{ + char buf[MAX_MESSAGE_SIZE]; + int sockfd = 0; + int done = 0; + int rc = -1; + + memset(buf, 0, sizeof(buf)); + + sockfd = setup_report_socket(); + if (sockfd < 0) + return -1; + + rc = commit_report_basic(sockfd); + if (rc < 0) + goto cxl_dram_fail; + + rc = commit_report_backtrace(sockfd, CXL_DRAM_EVENT, ev); + if (rc < 0) + goto cxl_dram_fail; + + sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl_dram_event"); + rc = write(sockfd, buf, strlen(buf) + 1); + if (rc < strlen(buf) + 1) + goto cxl_dram_fail; + + sprintf(buf, "REASON=%s", "CXL DRAM Event"); + rc = write(sockfd, buf, strlen(buf) + 1); + if (rc < strlen(buf) + 1) + goto cxl_dram_fail; + + done = 1; + +cxl_dram_fail: + + if (sockfd >= 0) + close(sockfd); + + if (done) + return 0; + else + return -1; +} diff 
--git a/ras-report.h b/ras-report.h index d9ec7df..1ad00e0 100644 --- a/ras-report.h +++ b/ras-report.h @@ -45,6 +45,7 @@ int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_ev int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev); +int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev); #else @@ -62,6 +63,7 @@ static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; static inline int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; static inline int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; }; +static inline int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) { return 0; }; #endif From patchwork Wed Apr 12 08:33:11 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Shiju Jose X-Patchwork-Id: 13208662 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id D7F79C7619A for ; Wed, 12 Apr 2023 08:54:39 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231452AbjDLIyh (ORCPT ); Wed, 12 Apr 2023 04:54:37 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:44964 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S231443AbjDLIyg (ORCPT ); Wed, 12 Apr 2023 04:54:36 
-0400 Received: from frasgout.his.huawei.com (frasgout.his.huawei.com [185.176.79.56]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 3521AA265; Wed, 12 Apr 2023 01:54:15 -0700 (PDT) Received: from lhrpeml500006.china.huawei.com (unknown [172.18.147.201]) by frasgout.his.huawei.com (SkyGuard) with ESMTP id 4PxGBw57Zlz6J7Dy; Wed, 12 Apr 2023 16:31:20 +0800 (CST) Received: from SecurePC30232.china.huawei.com (10.122.247.234) by lhrpeml500006.china.huawei.com (7.191.161.198) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.23; Wed, 12 Apr 2023 09:33:42 +0100 From: To: , , CC: , , Subject: [RFC PATCH 7/7] rasdaemon: Add support for the CXL memory module events Date: Wed, 12 Apr 2023 16:33:11 +0800 Message-ID: <20230412083312.1384-8-shiju.jose@huawei.com> X-Mailer: git-send-email 2.35.1.windows.2 In-Reply-To: <20230412083312.1384-1-shiju.jose@huawei.com> References: <20230412083312.1384-1-shiju.jose@huawei.com> MIME-Version: 1.0 X-Originating-IP: [10.122.247.234] X-ClientProxiedBy: lhrpeml100004.china.huawei.com (7.191.162.219) To lhrpeml500006.china.huawei.com (7.191.161.198) X-CFilter-Loop: Reflected Precedence: bulk List-ID: X-Mailing-List: linux-edac@vger.kernel.org From: Shiju Jose Add support to log and record the CXL memory module events. 
Signed-off-by: Shiju Jose --- ras-cxl-handler.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++ ras-cxl-handler.h | 3 + ras-events.c | 9 +++ ras-events.h | 1 + ras-record.c | 84 +++++++++++++++++++++++++ ras-record.h | 17 +++++ ras-report.c | 103 ++++++++++++++++++++++++++++++ ras-report.h | 2 + 8 files changed, 375 insertions(+) diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c index fadf5db..ca23b97 100644 --- a/ras-cxl-handler.c +++ b/ras-cxl-handler.c @@ -1008,3 +1008,159 @@ int ras_cxl_dram_event_handler(struct trace_seq *s, return 0; } + +/* + * Memory Module Event Record - MMER + * + * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45 + */ +static const char* cxl_dev_evt_type[] = { + "Health Status Change", + "Media Status Change", + "Life Used Change", + "Temperature Change", + "Data Path Error", + "LSA Error", +}; + +/* + * Device Health Information - DHI + * + * CXL rev 3.0 section 8.2.9.8.3.1; Table 8-100 + */ +#define CXL_DHI_HS_MAINTENANCE_NEEDED BIT(0) +#define CXL_DHI_HS_PERFORMANCE_DEGRADED BIT(1) +#define CXL_DHI_HS_HW_REPLACEMENT_NEEDED BIT(2) + +static const struct cxl_event_flags cxl_health_status[] = { + { .bit = CXL_DHI_HS_MAINTENANCE_NEEDED, .flag = "MAINTENANCE_NEEDED" }, + { .bit = CXL_DHI_HS_PERFORMANCE_DEGRADED, .flag = "PERFORMANCE_DEGRADED" }, + { .bit = CXL_DHI_HS_HW_REPLACEMENT_NEEDED, .flag = "REPLACEMENT_NEEDED" }, +}; + +static const char* cxl_media_status[] = { + "Normal", + "Not Ready", + "Write Persistency Lost", + "All Data Lost", + "Write Persistency Loss in the Event of Power Loss", + "Write Persistency Loss in Event of Shutdown", + "Write Persistency Loss Imminent", + "All Data Loss in Event of Power Loss", + "All Data loss in the Event of Shutdown", + "All Data Loss Imminent", +}; + +static const char* cxl_two_bit_status[] = { + "Normal", + "Warning", + "Critical", +}; + +static const char* cxl_one_bit_status[] = { + "Normal", + "Warning", +}; + +#define CXL_DHI_AS_LIFE_USED(as) (as & 0x3) +#define CXL_DHI_AS_DEV_TEMP(as)
((as & 0xC) >> 2) +#define CXL_DHI_AS_COR_VOL_ERR_CNT(as) ((as & 0x10) >> 4) +#define CXL_DHI_AS_COR_PER_ERR_CNT(as) ((as & 0x20) >> 5) + +int ras_cxl_memory_module_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context) +{ + unsigned long long val; + struct ras_events *ras = context; + struct ras_cxl_memory_module_event ev; + + memset(&ev, 0, sizeof(ev)); + if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0) + return -1; + + if (tep_get_field_val(s, event, "event_type", record, &val, 1) < 0) + return -1; + ev.event_type = val; + if (trace_seq_printf(s, "event_type:%s ", get_cxl_type_str(cxl_dev_evt_type, + ARRAY_SIZE(cxl_dev_evt_type), ev.event_type)) <= 0) + return -1; + + if (tep_get_field_val(s, event, "health_status", record, &val, 1) < 0) + return -1; + ev.health_status = val; + if (trace_seq_printf(s, "health_status:") <= 0) + return -1; + if (decode_cxl_event_flags(s, ev.health_status, cxl_health_status, + ARRAY_SIZE(cxl_health_status)) < 0) + return -1; + + if (tep_get_field_val(s, event, "media_status", record, &val, 1) < 0) + return -1; + ev.media_status = val; + if (trace_seq_printf(s, "media_status:%s ", get_cxl_type_str(cxl_media_status, + ARRAY_SIZE(cxl_media_status), ev.media_status)) <= 0) + return -1; + + if (tep_get_field_val(s, event, "add_status", record, &val, 1) < 0) + return -1; + ev.add_status = val; + if (trace_seq_printf(s, "as_life_used:%s ", get_cxl_type_str(cxl_two_bit_status, + ARRAY_SIZE(cxl_two_bit_status), + CXL_DHI_AS_LIFE_USED(ev.add_status))) <= 0) + return -1; + if (trace_seq_printf(s, "as_dev_temp:%s ", get_cxl_type_str(cxl_two_bit_status, + ARRAY_SIZE(cxl_two_bit_status), + CXL_DHI_AS_DEV_TEMP(ev.add_status))) <= 0) + return -1; + if (trace_seq_printf(s, "as_cor_vol_err_cnt:%s ", get_cxl_type_str(cxl_one_bit_status, + ARRAY_SIZE(cxl_one_bit_status), + CXL_DHI_AS_COR_VOL_ERR_CNT(ev.add_status))) <= 0) + return -1; + if (trace_seq_printf(s, 
"as_cor_per_err_cnt:%s ", get_cxl_type_str(cxl_one_bit_status, + ARRAY_SIZE(cxl_one_bit_status), + CXL_DHI_AS_COR_PER_ERR_CNT(ev.add_status))) <= 0) + return -1; + + if (tep_get_field_val(s, event, "life_used", record, &val, 1) < 0) + return -1; + ev.life_used = val; + if (trace_seq_printf(s, "life_used:%u ", ev.life_used) <= 0) + return -1; + + if (tep_get_field_val(s, event, "device_temp", record, &val, 1) < 0) + return -1; + ev.device_temp = val; + if (trace_seq_printf(s, "device_temp:%u ", ev.device_temp) <= 0) + return -1; + + if (tep_get_field_val(s, event, "dirty_shutdown_cnt", record, &val, 1) < 0) + return -1; + ev.dirty_shutdown_cnt = val; + if (trace_seq_printf(s, "dirty_shutdown_cnt:%u ", ev.dirty_shutdown_cnt) <= 0) + return -1; + + if (tep_get_field_val(s, event, "cor_vol_err_cnt", record, &val, 1) < 0) + return -1; + ev.cor_vol_err_cnt = val; + if (trace_seq_printf(s, "cor_vol_err_cnt:%u ", ev.cor_vol_err_cnt) <= 0) + return -1; + + if (tep_get_field_val(s, event, "cor_per_err_cnt", record, &val, 1) < 0) + return -1; + ev.cor_per_err_cnt = val; + if (trace_seq_printf(s, "cor_per_err_cnt:%u ", ev.cor_per_err_cnt) <= 0) + return -1; + + /* Insert data into the SGBD */ +#ifdef HAVE_SQLITE3 + ras_store_cxl_memory_module_event(ras, &ev); +#endif + +#ifdef HAVE_ABRT_REPORT + /* Report event to ABRT */ + ras_report_cxl_memory_module_event(ras, &ev); +#endif + + return 0; +} diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h index 35455af..1ea0f93 100644 --- a/ras-cxl-handler.h +++ b/ras-cxl-handler.h @@ -41,4 +41,7 @@ int ras_cxl_general_media_event_handler(struct trace_seq *s, int ras_cxl_dram_event_handler(struct trace_seq *s, struct tep_record *record, struct tep_event *event, void *context); +int ras_cxl_memory_module_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); #endif diff --git a/ras-events.c b/ras-events.c index 00159e6..182f28f 100644 --- a/ras-events.c +++ b/ras-events.c @@ -252,6 +252,7 
@@ int toggle_ras_mc_event(int enable) rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_generic_event", enable); rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_general_media", enable); rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_dram", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_memory_module", enable); #endif free_ras: @@ -1040,6 +1041,14 @@ int handle_ras_events(int record_events) else log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", "cxl", "cxl_dram"); + + rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_memory_module", + ras_cxl_memory_module_event_handler, NULL, CXL_MEMORY_MODULE_EVENT); + if (!rc) + num_events++; + else + log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", + "cxl", "memory_module"); #endif if (!num_events) { diff --git a/ras-events.h b/ras-events.h index 3fe28da..ccc1336 100644 --- a/ras-events.h +++ b/ras-events.h @@ -46,6 +46,7 @@ enum { CXL_GENERIC_EVENT, CXL_GENERAL_MEDIA_EVENT, CXL_DRAM_EVENT, + CXL_MEMORY_MODULE_EVENT, NR_EVENTS }; diff --git a/ras-record.c b/ras-record.c index 36f43cf..89fca74 100644 --- a/ras-record.c +++ b/ras-record.c @@ -992,6 +992,74 @@ int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event * return rc; } + +/* + * Table and functions to handle cxl:cxl_memory_module_event + */ +static const struct db_fields cxl_memory_module_event_fields[] = { + { .name = "id", .type = "INTEGER PRIMARY KEY" }, + { .name = "timestamp", .type = "TEXT" }, + { .name = "memdev", .type = "TEXT" }, + { .name = "host", .type = "TEXT" }, + { .name = "serial", .type = "INTEGER" }, + { .name = "log_type", .type = "TEXT" }, + { .name = "hdr_uuid", .type = "TEXT" }, + { .name = "hdr_flags", .type = "INTEGER" }, + { .name = "hdr_handle", .type = "INTEGER" }, + { .name = "hdr_related_handle", .type = "INTEGER" }, + { .name = "hdr_ts", .type = "TEXT" }, + { .name = "hdr_length", .type = "INTEGER" }, + { .name = "hdr_maint_op_class", .type = "INTEGER" }, + { .name = "event_type", .type = "INTEGER" }, + { 
.name = "health_status", .type = "INTEGER" }, + { .name = "media_status", .type = "INTEGER" }, + { .name = "life_used", .type = "INTEGER" }, + { .name = "dirty_shutdown_cnt", .type = "INTEGER" }, + { .name = "cor_vol_err_cnt", .type = "INTEGER" }, + { .name = "cor_per_err_cnt", .type = "INTEGER" }, + { .name = "device_temp", .type = "INTEGER" }, + { .name = "add_status", .type = "INTEGER" }, +}; + +static const struct db_table_descriptor cxl_memory_module_event_tab = { + .name = "cxl_memory_module_event", + .fields = cxl_memory_module_event_fields, + .num_fields = ARRAY_SIZE(cxl_memory_module_event_fields), +}; + +int ras_store_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev) +{ + int rc; + struct sqlite3_priv *priv = ras->db_priv; + + if (!priv || !priv->stmt_cxl_memory_module_event) + return 0; + log(TERM, LOG_INFO, "cxl_memory_module_event store: %p\n", + priv->stmt_cxl_memory_module_event); + + ras_store_cxl_common_hdr(priv->stmt_cxl_memory_module_event, &ev->hdr); + sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 13, ev->event_type); + sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 14, ev->health_status); + sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 15, ev->media_status); + sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 16, ev->life_used); + sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 17, ev->dirty_shutdown_cnt); + sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 18, ev->cor_vol_err_cnt); + sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 19, ev->cor_per_err_cnt); + sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 20, ev->device_temp); + sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 21, ev->add_status); + + rc = sqlite3_step(priv->stmt_cxl_memory_module_event); + if (rc != SQLITE_OK && rc != SQLITE_DONE) + log(TERM, LOG_ERR, + "Failed to do stmt_cxl_memory_module_event step on sqlite: error = %d\n", rc); + rc = sqlite3_reset(priv->stmt_cxl_memory_module_event); + if 
(rc != SQLITE_OK && rc != SQLITE_DONE) + log(TERM, LOG_ERR, + "Failed reset stmt_cxl_memory_module_event on sqlite: error = %d\n", rc); + log(TERM, LOG_INFO, "register inserted at db\n"); + + return rc; +} #endif /* @@ -1387,6 +1455,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) if (rc != SQLITE_OK) goto error; } + + rc = ras_mc_create_table(priv, &cxl_memory_module_event_tab); + if (rc == SQLITE_OK) { + rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_memory_module_event, + &cxl_memory_module_event_tab); + if (rc != SQLITE_OK) + goto error; + } #endif ras->db_priv = priv; @@ -1557,6 +1633,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) "cpu %u: Failed to finalize cxl_dram_event sqlite: error = %d\n", cpu, rc); } + + if (priv->stmt_cxl_memory_module_event) { + rc = sqlite3_finalize(priv->stmt_cxl_memory_module_event); + if (rc != SQLITE_OK) + log(TERM, LOG_ERR, + "cpu %u: Failed to finalize stmt_cxl_memory_module_event sqlite: error = %d\n", + cpu, rc); + } #endif rc = sqlite3_close_v2(db); diff --git a/ras-record.h b/ras-record.h index 480ff92..a7b9ab9 100644 --- a/ras-record.h +++ b/ras-record.h @@ -218,6 +218,19 @@ struct ras_cxl_dram_event { uint16_t validity_flags; }; +struct ras_cxl_memory_module_event { + struct ras_cxl_event_common_hdr hdr; + uint8_t event_type; + uint8_t health_status; + uint8_t media_status; + uint8_t life_used; + uint32_t dirty_shutdown_cnt; + uint32_t cor_vol_err_cnt; + uint32_t cor_per_err_cnt; + int16_t device_temp; + uint8_t add_status; +}; + struct ras_mc_event; struct ras_aer_event; struct ras_extlog_event; @@ -234,6 +247,7 @@ struct ras_cxl_overflow_event; struct ras_cxl_generic_event; struct ras_cxl_general_media_event; struct ras_cxl_dram_event; +struct ras_cxl_memory_module_event; #ifdef HAVE_SQLITE3 @@ -274,6 +288,7 @@ struct sqlite3_priv { sqlite3_stmt *stmt_cxl_generic_event; sqlite3_stmt *stmt_cxl_general_media_event; sqlite3_stmt *stmt_cxl_dram_event; + sqlite3_stmt 
*stmt_cxl_memory_module_event;
 #endif
 };
 
@@ -309,6 +324,7 @@ int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow
 int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev);
 int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev);
 int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev);
+int ras_store_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev);
 
 #else
 static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
@@ -329,6 +345,7 @@ static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ra
 static inline int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; };
 static inline int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; };
 static inline int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) { return 0; };
+static inline int ras_store_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev) { return 0; };
 
 #endif
 
diff --git a/ras-report.c b/ras-report.c
index 21180b1..a30b66d 100644
--- a/ras-report.c
+++ b/ras-report.c
@@ -605,6 +605,69 @@ static int set_cxl_dram_event_backtrace(char *buf, struct ras_cxl_dram_event *ev
 	return 0;
 }
 
+/*
+ * Format the fields of a CXL memory module event as "key=value" lines,
+ * prefixed with "BACKTRACE=", and append the result to buf with strcat().
+ *
+ * Returns 0 on success, -1 if buf or ev is NULL.
+ */
+static int set_cxl_memory_module_event_backtrace(char *buf, struct ras_cxl_memory_module_event *ev)
+{
+	char bt_buf[MAX_BACKTRACE_SIZE];
+
+	if (!buf || !ev)
+		return -1;
+
+	/* Bounded write: long string fields are truncated instead of overrunning bt_buf */
+	snprintf(bt_buf, sizeof(bt_buf), "BACKTRACE=" \
+		"timestamp=%s\n" \
+		"memdev=%s\n" \
+		"host=%s\n" \
+		"serial=0x%lx\n" \
+		"log_type=%s\n" \
+		"hdr_uuid=%s\n" \
+		"hdr_flags=0x%x\n" \
+		"hdr_handle=0x%x\n" \
+		"hdr_related_handle=0x%x\n" \
+		"hdr_timestamp=%s\n" \
+		"hdr_length=%u\n" \
+		"hdr_maint_op_class=%u\n" \
+		"event_type=%u\n" \
+		"health_status=%u\n" \
+		"media_status=%u\n" \
+		"life_used=%u\n" \
+		"dirty_shutdown_cnt=%u\n" \
+		"cor_vol_err_cnt=%u\n" \
+		"cor_per_err_cnt=%u\n" \
+		"device_temp=%d\n" \
+		"add_status=%u\n", \
+		ev->hdr.timestamp, \
+		ev->hdr.memdev, \
+		ev->hdr.host, \
+		ev->hdr.serial, \
+		ev->hdr.log_type, \
+		ev->hdr.hdr_uuid, \
+		ev->hdr.hdr_flags, \
+		ev->hdr.hdr_handle, \
+		ev->hdr.hdr_related_handle, \
+		ev->hdr.hdr_timestamp, \
+		ev->hdr.hdr_length, \
+		ev->hdr.hdr_maint_op_class, \
+		ev->event_type, \
+		ev->health_status, \
+		ev->media_status, \
+		ev->life_used, \
+		ev->dirty_shutdown_cnt, \
+		ev->cor_vol_err_cnt, \
+		ev->cor_per_err_cnt, \
+		ev->device_temp, \
+		ev->add_status);
+
+	strcat(buf, bt_buf);
+
+	return 0;
+}
+
 static int commit_report_backtrace(int sockfd, int type, void *ev){
 	char buf[MAX_BACKTRACE_SIZE];
 	char *pbuf = buf;
@@ -663,6 +726,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){
 	case CXL_DRAM_EVENT:
 		rc = set_cxl_dram_event_backtrace(buf, (struct ras_cxl_dram_event *)ev);
 		break;
+	case CXL_MEMORY_MODULE_EVENT:
+		rc = set_cxl_memory_module_event_backtrace(buf, (struct ras_cxl_memory_module_event *)ev);
+		break;
 	default:
 		return -1;
 	}
@@ -1380,3 +1446,54 @@ cxl_dram_fail:
 	else
 		return -1;
 }
+
+/*
+ * Report a CXL memory module event on the report socket: calls
+ * commit_report_basic() and commit_report_backtrace(), then writes the
+ * ANALYZER and REASON items. The ras argument is not used here.
+ *
+ * Returns 0 if every step succeeded and each item was written in full,
+ * -1 otherwise. The socket is closed before returning.
+ */
+int ras_report_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev)
+{
+	char buf[MAX_MESSAGE_SIZE];
+	int sockfd = 0;
+	int done = 0;
+	int rc = -1;
+
+	memset(buf, 0, sizeof(buf));
+
+	sockfd = setup_report_socket();
+	if (sockfd < 0)
+		return -1;
+
+	rc = commit_report_basic(sockfd);
+	if (rc < 0)
+		goto cxl_memory_module_fail;
+
+	rc = commit_report_backtrace(sockfd, CXL_MEMORY_MODULE_EVENT, ev);
+	if (rc < 0)
+		goto cxl_memory_module_fail;
+
+	snprintf(buf, sizeof(buf), "ANALYZER=%s", "rasdaemon-cxl_memory_module_event");
+	rc = write(sockfd, buf, strlen(buf) + 1);
+	/* Test rc < 0 first: a negative int compared with size_t becomes a huge value */
+	if (rc < 0 || (size_t)rc < strlen(buf) + 1)
+		goto cxl_memory_module_fail;
+
+	snprintf(buf, sizeof(buf), "REASON=%s", "CXL Memory Module Event");
+	rc = write(sockfd, buf, strlen(buf) + 1);
+	if (rc < 0 || (size_t)rc < strlen(buf) + 1)
+		goto cxl_memory_module_fail;
+
+	done = 1;
+
+cxl_memory_module_fail:
+
+	if (sockfd >= 0)
+		close(sockfd);
+
+	if (done)
+		return 0;
+	else
+		return -1;
+}
diff --git a/ras-report.h b/ras-report.h
index 1ad00e0..e401850 100644
--- a/ras-report.h
+++ b/ras-report.h
@@ -46,6 +46,7 @@ int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflo
 int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev);
 int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev);
 int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev);
+int ras_report_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev);
 
 #else
 
@@ -64,6 +65,7 @@ static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct r
 static inline int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; };
 static inline int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; };
 static inline int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) { return 0; };
+static inline int ras_report_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev) { return 0; };
 
 #endif
 