diff mbox series

[v2,02/10] rasdaemon: Support cpu fault isolation for recoverable errors

Message ID 20221003161742.1697-3-shiju.jose@huawei.com (mailing list archive)
State New, archived
Headers show
Series rasdaemon: Add cpu fault isolation support and improvements to the HiSilicon vendor specific code | expand

Commit Message

Shiju Jose Oct. 3, 2022, 4:17 p.m. UTC
From: Shengwei Luo <luoshengwei@huawei.com>

When a recoverable error occurs in a CPU core, try to offline
the affected CPU core.

Signed-off-by: Shengwei Luo <luoshengwei@huawei.com>
Signed-off-by: Junchong Pan <panjunchong@hisilicon.com>
Signed-off-by: Lei Feng <fenglei47@h-partners.com>
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
 ras-arm-handler.c   | 22 +++++++++++++++++++---
 ras-cpu-isolation.c | 17 +++++++++++++++++
 ras-cpu-isolation.h |  4 +++-
 3 files changed, 39 insertions(+), 4 deletions(-)
diff mbox series

Patch

diff --git a/ras-arm-handler.c b/ras-arm-handler.c
index 9c7a3c3..a0dfc51 100644
--- a/ras-arm-handler.c
+++ b/ras-arm-handler.c
@@ -26,6 +26,7 @@ 
 
 #define ARM_ERR_VALID_ERROR_COUNT BIT(0)
 #define ARM_ERR_VALID_FLAGS BIT(1)
+#define BIT2 2
 
 void display_raw_data(struct trace_seq *s,
 		const uint8_t *buf,
@@ -47,7 +48,20 @@  void display_raw_data(struct trace_seq *s,
 }
 
 #ifdef HAVE_CPU_FAULT_ISOLATION
-static int count_errors(struct ras_arm_event *ev)
+static int is_core_failure(struct ras_arm_err_info *err_info)
+{
+	if (err_info->validation_bits & ARM_ERR_VALID_FLAGS) {
+		/*
+		 * core failure:
+		 * Bit 0\1\3: (at least 1)
+		 * Bit 2: 0
+		 */
+		return (err_info->flags & 0xf) && !(err_info->flags & (0x1 << BIT2));
+	}
+	return 0;
+}
+
+static int count_errors(struct ras_arm_event *ev, int sev)
 {
 	struct ras_arm_err_info *err_info;
 	int num_pei;
@@ -75,6 +89,8 @@  static int count_errors(struct ras_arm_event *ev)
 			 */
 			error_count = err_info->multiple_error + 1;
 		}
+		if (sev == GHES_SEV_RECOVERABLE && !is_core_failure(err_info))
+			error_count = 0;
 
 		num += error_count;
 		err_info += 1;
@@ -118,8 +134,8 @@  static int ras_handle_cpu_error(struct trace_seq *s,
 	}
 	trace_seq_printf(s, "\n severity: %s", severity);
 
-	if (val == GHES_SEV_CORRECTED) {
-		int nums = count_errors(ev);
+	if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) {
+		int nums = count_errors(ev, val);
 
 		if (nums > 0) {
 			err_info.nums = nums;
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
index 1694a08..90633fd 100644
--- a/ras-cpu-isolation.c
+++ b/ras-cpu-isolation.c
@@ -126,6 +126,7 @@  static int init_cpu_info(unsigned int cpus)
 
 	for (unsigned int i = 0; i < cpus; ++i) {
 		cpu_infos[i].ce_nums = 0;
+		cpu_infos[i].uce_nums = 0;
 		cpu_infos[i].state = get_cpu_status(i);
 		cpu_infos[i].ce_queue = init_queue();
 
@@ -306,6 +307,15 @@  static int do_ce_handler(unsigned int cpu)
 	return HANDLE_NOTHING;
 }
 
+static int do_uce_handler(unsigned int cpu)
+{
+	if (cpu_infos[cpu].uce_nums > 0) {
+		log(TERM, LOG_INFO, "Uncorrected Errors occurred, try to offline cpu%u\n", cpu);
+		return do_cpu_offline(cpu);
+	}
+	return HANDLE_NOTHING;
+}
+
 static int error_handler(unsigned int cpu, struct error_info *err_info)
 {
 	int ret = HANDLE_NOTHING;
@@ -314,6 +324,9 @@  static int error_handler(unsigned int cpu, struct error_info *err_info)
 	case CE:
 		ret = do_ce_handler(cpu);
 		break;
+	case UCE:
+		ret = do_uce_handler(cpu);
+		break;
 	default:
 		break;
 	}
@@ -336,6 +349,9 @@  static void record_error_info(unsigned int cpu, struct error_info *err_info)
 		cpu_infos[cpu].ce_nums += err_info->nums;
 		break;
 	}
+	case UCE:
+		cpu_infos[cpu].uce_nums++;
+		break;
 	default:
 		break;
 	}
@@ -382,6 +398,7 @@  void ras_record_cpu_error(struct error_info *err_info, int cpu)
 			cpu, cpu_state[cpu_infos[cpu].state]);
 		clear_queue(cpu_infos[cpu].ce_queue);
 		cpu_infos[cpu].ce_nums = 0;
+		cpu_infos[cpu].uce_nums = 0;
 	} else
 		log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n",
 			cpu, cpu_state[cpu_infos[cpu].state]);
diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h
index 35b5225..5682106 100644
--- a/ras-cpu-isolation.h
+++ b/ras-cpu-isolation.h
@@ -45,10 +45,12 @@  enum error_handle_result {
 };
 
 enum error_type {
-	CE = 1
+	CE = 1,
+	UCE
 };
 
 struct cpu_info {
+	unsigned long uce_nums;
 	unsigned long ce_nums;
 	struct link_queue *ce_queue;
 	enum cpu_state state;