@@ -88,3 +88,7 @@ TRIGGER_DIR=
# MC_UE_TRIGGER=mc_event_trigger
MC_CE_TRIGGER=
MC_UE_TRIGGER=
+
+# CXL memory auto repair control
+# Whether to enable CXL memory auto repair (yes|no).
+CXL_AUTO_REPAIR_ENABLE="no"
@@ -4,7 +4,9 @@
* Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved.
*/
+#include <dirent.h>
#include <endian.h>
+#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -722,6 +724,140 @@ static int handle_ras_cxl_common_hdr(struct trace_seq *s,
return 0;
}
+/* memory repair */
+/*
+ * Common Event Record Format
+ * CXL 3.1 section 8.2.9.2.1; Table 8-43
+ *
+ * Maintenance operation class and sub-class codes. cxl_dram_sparing()
+ * compares them against the hdr_maint_op_class and hdr_maint_op_sub_class
+ * fields of the event's common header.
+ */
+#define CXL_MAINT_CLASS_SPARING 0x02
+#define CXL_MAINT_SUBCLASS_CACHE_SPARING 0x00
+#define CXL_MAINT_SUBCLASS_ROW_SPARING 0x01
+#define CXL_MAINT_SUBCLASS_BANK_SPARING 0x02
+#define CXL_MAINT_SUBCLASS_RANK_SPARING 0x03
+
+/* Size of the buffers that hold sysfs paths and sysfs attribute contents. */
+#define CXL_CMD_BUF_SIZE 256
+
+/*
+ * Sparing granularity. cxl_dram_sparing() uses it to select which
+ * fill_*_sparing_attrs() helper writes the repair attributes.
+ */
+enum cxl_mem_sparing_type {
+ CXL_CACHE_SPARING,
+ CXL_ROW_SPARING,
+ CXL_BANK_SPARING,
+ CXL_RANK_SPARING,
+};
+
+/*
+ * Repair control directories are addressed as
+ * <edac_bus_path><EDAC_CXL_DEV_PREFIX><memdev>/mem_repair<N>.
+ */
+static const char *edac_bus_path = "/sys/bus/edac/devices/";
+#define EDAC_CXL_DEV_PREFIX "cxl_"
+
+/*
+ * Auto repair is disabled by default.
+ * 'export CXL_AUTO_REPAIR_ENABLE=yes' to enable auto repair.
+ */
+static bool auto_repair;
+
+/*
+ * Set auto_repair when the CXL_AUTO_REPAIR_ENABLE environment variable
+ * equals "yes" (compared case-insensitively). The flag is only ever set
+ * here, never cleared.
+ */
+static void check_config_status(void)
+{
+	const char *setting = getenv("CXL_AUTO_REPAIR_ENABLE");
+
+	if (setting && strcasecmp(setting, "yes") == 0)
+		auto_repair = true;
+}
+
+/*
+ * Read the first whitespace-delimited token of a sysfs attribute.
+ *
+ * @dir:  directory that contains the attribute
+ * @file: attribute file name; the file opened is "<dir>/<file>"
+ * @out:  receives the NUL-terminated token; the caller must provide at
+ *        least CXL_CMD_BUF_SIZE bytes
+ *
+ * Returns 0 on success. Returns -1 when the path does not fit in the
+ * path buffer, the file cannot be opened, nothing can be read from it,
+ * or the data read contains no token.
+ */
+static int get_sysfs_data_str(const char *dir, const char *file, char *out)
+{
+	char path[CXL_CMD_BUF_SIZE];
+	char buf[CXL_CMD_BUF_SIZE];
+	ssize_t nread;
+	int rc = -1;
+	int len;
+	int fd;
+
+	len = snprintf(path, sizeof(path), "%s/%s", dir, file);
+	if (len < 0 || (size_t)len >= sizeof(path)) {
+		log(TERM, LOG_ERR, "[%s]:path: %s/%s is too long\n", __func__, dir, file);
+		return -1;
+	}
+
+	fd = open(path, O_RDONLY);
+	if (fd == -1) {
+		log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, path);
+		return -1;
+	}
+
+	/*
+	 * Read at most sizeof(buf) - 1 bytes and terminate explicitly, so
+	 * sscanf() never runs past the buffer. This also bounds the token
+	 * copied into @out to CXL_CMD_BUF_SIZE bytes including the NUL.
+	 */
+	nread = read(fd, buf, sizeof(buf) - 1);
+	if (nread > 0) {
+		buf[nread] = '\0';
+		if (sscanf(buf, "%s", out) == 1)
+			rc = 0;
+	}
+
+	close(fd);
+	return rc;
+}
+
+/*
+ * Write a 32-bit unsigned value, formatted in decimal, to a sysfs attribute.
+ *
+ * @dir:  directory that contains the attribute
+ * @file: attribute file name; the file opened is "<dir>/<file>"
+ * @data: value to write
+ *
+ * Returns 0 on success, -1 when the path does not fit in the path buffer,
+ * the file cannot be opened, or the write fails.
+ */
+static int set_sysfs_data_uint32(const char *dir, const char *file, uint32_t data)
+{
+	char path[CXL_CMD_BUF_SIZE];
+	int len;
+	int fd;
+
+	len = snprintf(path, sizeof(path), "%s/%s", dir, file);
+	if (len < 0 || (size_t)len >= sizeof(path)) {
+		log(TERM, LOG_ERR, "[%s]:path: %s/%s is too long\n", __func__, dir, file);
+		return -1;
+	}
+
+	fd = open(path, O_WRONLY);
+	if (fd == -1) {
+		log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, path);
+		return -1;
+	}
+
+	/* Unsigned conversion: "%d" would emit values above INT_MAX as negative. */
+	if (dprintf(fd, "%u", (unsigned int)data) <= 0) {
+		log(TERM, LOG_ERR, "[%s]: write data to [%s] failed, errno:%d\n",
+		    __func__, path, errno);
+		close(fd);
+		return -1;
+	}
+	close(fd);
+
+	return 0;
+}
+
+/*
+ * Write a 64-bit unsigned value, formatted as "0x<hex>", to a sysfs attribute.
+ *
+ * @dir:  directory that contains the attribute
+ * @file: attribute file name; the file opened is "<dir>/<file>"
+ * @data: value to write
+ *
+ * Returns 0 on success, -1 when the path does not fit in the path buffer,
+ * the file cannot be opened, or the write fails.
+ */
+static int set_sysfs_data_uint64(const char *dir, const char *file, uint64_t data)
+{
+	char path[CXL_CMD_BUF_SIZE];
+	int len;
+	int fd;
+
+	len = snprintf(path, sizeof(path), "%s/%s", dir, file);
+	if (len < 0 || (size_t)len >= sizeof(path)) {
+		log(TERM, LOG_ERR, "[%s]:path: %s/%s is too long\n", __func__, dir, file);
+		return -1;
+	}
+
+	fd = open(path, O_WRONLY);
+	if (fd == -1) {
+		log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, path);
+		return -1;
+	}
+
+	/*
+	 * "%lx" only matches uint64_t where long is 64 bits wide; go through
+	 * unsigned long long so the conversion is correct on every ABI.
+	 */
+	if (dprintf(fd, "0x%llx", (unsigned long long)data) <= 0) {
+		log(TERM, LOG_ERR, "[%s]: write data to [%s] failed, errno:%d\n",
+		    __func__, path, errno);
+		close(fd);
+		return -1;
+	}
+	close(fd);
+
+	return 0;
+}
+
+/*
+ * Find the mem_repair instance of a CXL memory device whose "repair_type"
+ * attribute equals @repair_type.
+ *
+ * Instances are probed in order, starting at mem_repair0. The search ends
+ * at the first instance whose "repair_type" attribute cannot be read.
+ *
+ * Returns the index of the matching instance, or -1 when none matched.
+ */
+static int cxl_find_spare(const char *repair_dev, const char *repair_type)
+{
+	char instance[CXL_CMD_BUF_SIZE];
+	char type[CXL_CMD_BUF_SIZE];
+	int n;
+
+	for (n = 0; ; n++) {
+		snprintf(instance, CXL_CMD_BUF_SIZE, "%s%s%s/mem_repair%d",
+			 edac_bus_path, EDAC_CXL_DEV_PREFIX, repair_dev, n);
+
+		if (get_sysfs_data_str(instance, "repair_type", type) != 0)
+			break;
+
+		if (strcmp(repair_type, type) == 0)
+			return n;
+	}
+
+	return -1;
+}
+
int ras_cxl_generic_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context)
@@ -1027,6 +1163,155 @@ static const char * const cxl_der_mem_event_type[] = {
"CKID Violation",
};
+/*
+ * Each type of sparing requires a superset of the info needed for
+ * coarser grained sparing.
+ */
+
+/*
+ * Write the attributes needed for rank sparing into @dir: "dpa",
+ * "channel" and "rank", plus "nibble_mask" when the event marks it valid.
+ *
+ * Returns 0 on success, -1 as soon as one write fails.
+ */
+static int fill_rank_sparing_attrs(struct ras_cxl_dram_event *ev,
+				   const char *dir)
+{
+	if (set_sysfs_data_uint64(dir, "dpa", ev->dpa) ||
+	    set_sysfs_data_uint32(dir, "channel", ev->channel) ||
+	    set_sysfs_data_uint32(dir, "rank", ev->rank))
+		return -1;
+
+	if ((ev->validity_flags & CXL_DER_VALID_NIBBLE) &&
+	    set_sysfs_data_uint32(dir, "nibble_mask", ev->nibble_mask))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Write the attributes needed for bank sparing into @dir: everything
+ * fill_rank_sparing_attrs() writes, followed by "bank_group" and "bank".
+ *
+ * Returns 0 on success, -1 as soon as one write fails.
+ */
+static int fill_bank_sparing_attrs(struct ras_cxl_dram_event *ev,
+				   const char *dir)
+{
+	if (fill_rank_sparing_attrs(ev, dir) ||
+	    set_sysfs_data_uint32(dir, "bank_group", ev->bank_group) ||
+	    set_sysfs_data_uint32(dir, "bank", ev->bank))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Write the attributes needed for row sparing into @dir: everything
+ * fill_bank_sparing_attrs() writes, followed by "row".
+ *
+ * Returns 0 on success, -1 as soon as one write fails.
+ */
+static int fill_row_sparing_attrs(struct ras_cxl_dram_event *ev,
+				  const char *dir)
+{
+	if (fill_bank_sparing_attrs(ev, dir) ||
+	    set_sysfs_data_uint32(dir, "row", ev->row))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Write the attributes needed for cacheline sparing into @dir: everything
+ * fill_row_sparing_attrs() writes, followed by "column", plus
+ * "sub_channel" when the event marks it valid.
+ *
+ * Returns 0 on success, -1 as soon as one write fails.
+ */
+static int fill_cacheline_sparing_attrs(struct ras_cxl_dram_event *ev,
+					const char *dir)
+{
+	if (fill_row_sparing_attrs(ev, dir) ||
+	    set_sysfs_data_uint32(dir, "column", ev->column))
+		return -1;
+
+	if ((ev->validity_flags & CXL_DER_VALID_SUB_CHANNEL) &&
+	    set_sysfs_data_uint32(dir, "sub_channel", ev->sub_channel))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Request a memory sparing repair for the location reported by a CXL DRAM
+ * event, using the device's mem_repair sysfs directory.
+ *
+ * Nothing is attempted unless auto repair is enabled, the event header
+ * flags a needed maintenance operation of the sparing class with a valid
+ * sub-class, the DPA is not flagged as not repairable, and every event
+ * field required by the requested sparing type is marked valid.
+ *
+ * @ev: decoded DRAM event record
+ *
+ * Returns 0 when the repair request was written, -1 when the event does
+ * not qualify, no matching mem_repair instance exists, or any sysfs write
+ * fails.
+ */
+static int cxl_dram_sparing(struct ras_cxl_dram_event *ev)
+{
+	struct ras_cxl_event_common_hdr *hdr = &ev->hdr;
+	/* Every sparing type needs at least a valid channel and rank. */
+	unsigned int required = CXL_DER_VALID_CHANNEL | CXL_DER_VALID_RANK;
+	enum cxl_mem_sparing_type sparing_type;
+	char dir[CXL_CMD_BUF_SIZE];
+	const char *repair_type;
+	int idx;
+	int len;
+	int rc;
+
+	check_config_status();
+	if (!auto_repair)
+		return -1;
+
+	if (!(hdr->hdr_flags & CXL_EVENT_RECORD_FLAG_MAINT_NEEDED) ||
+	    !(hdr->hdr_flags & CXL_EVENT_RECORD_FLAG_MAINT_OP_SUB_CLASS_VALID) ||
+	    hdr->hdr_maint_op_class != CXL_MAINT_CLASS_SPARING ||
+	    (ev->dpa_flags & CXL_DPA_NOT_REPAIRABLE))
+		return -1;
+
+	/*
+	 * CXL device reports the type of the repair in the event record.
+	 * Finer grained sparing needs more of the event fields to be valid.
+	 */
+	switch (hdr->hdr_maint_op_sub_class) {
+	case CXL_MAINT_SUBCLASS_CACHE_SPARING:
+		required |= CXL_DER_VALID_BANK_GROUP | CXL_DER_VALID_BANK |
+			    CXL_DER_VALID_ROW | CXL_DER_VALID_COLUMN;
+		repair_type = "cacheline-sparing";
+		sparing_type = CXL_CACHE_SPARING;
+		break;
+	case CXL_MAINT_SUBCLASS_ROW_SPARING:
+		required |= CXL_DER_VALID_BANK_GROUP | CXL_DER_VALID_BANK |
+			    CXL_DER_VALID_ROW;
+		repair_type = "row-sparing";
+		sparing_type = CXL_ROW_SPARING;
+		break;
+	case CXL_MAINT_SUBCLASS_BANK_SPARING:
+		required |= CXL_DER_VALID_BANK_GROUP | CXL_DER_VALID_BANK;
+		repair_type = "bank-sparing";
+		sparing_type = CXL_BANK_SPARING;
+		break;
+	case CXL_MAINT_SUBCLASS_RANK_SPARING:
+		repair_type = "rank-sparing";
+		sparing_type = CXL_RANK_SPARING;
+		break;
+	default:
+		return -1;
+	}
+
+	if ((ev->validity_flags & required) != required)
+		return -1;
+
+	idx = cxl_find_spare(hdr->memdev, repair_type);
+	if (idx < 0)
+		return -1;
+
+	len = snprintf(dir, sizeof(dir), "%s%s%s/mem_repair%d",
+		       edac_bus_path, EDAC_CXL_DEV_PREFIX, hdr->memdev, idx);
+	if (len < 0 || (size_t)len >= sizeof(dir))
+		return -1;
+
+	switch (sparing_type) {
+	case CXL_CACHE_SPARING:
+		rc = fill_cacheline_sparing_attrs(ev, dir);
+		break;
+	case CXL_ROW_SPARING:
+		rc = fill_row_sparing_attrs(ev, dir);
+		break;
+	case CXL_BANK_SPARING:
+		rc = fill_bank_sparing_attrs(ev, dir);
+		break;
+	case CXL_RANK_SPARING:
+		rc = fill_rank_sparing_attrs(ev, dir);
+		break;
+	default:
+		rc = -1;
+		break;
+	}
+
+	/*
+	 * Do not trigger the repair when the attributes were only partially
+	 * written: the remaining ones would not describe this event.
+	 */
+	if (rc)
+		return -1;
+
+	return set_sysfs_data_uint32(dir, "repair", 1);
+}
+
int ras_cxl_dram_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context)
@@ -1231,6 +1516,8 @@ int ras_cxl_dram_event_handler(struct trace_seq *s,
if (trace_seq_printf(s, "CVME Count:%u ", ev.cvme_count) <= 0)
return -1;
+ cxl_dram_sparing(&ev);
+
/* Insert data into the SGBD */
#ifdef HAVE_SQLITE3
ras_store_cxl_dram_event(ras, &ev);