diff mbox

[3/7,v4] CPER: Adjust code flow of some functions

Message ID 20140523101143.GC21332@pd.tnic (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Borislav Petkov May 23, 2014, 10:11 a.m. UTC
On Fri, May 23, 2014 at 11:37:03AM +0200, Borislav Petkov wrote:
> Or if you don't wanna, I can do it instead - it'll be much easier for
> me than reviewing it again.

Here's a version with the suggested changes incorporated that builds
fine here:

--

Comments

Chen Gong May 26, 2014, 1:59 a.m. UTC | #1
On Fri, May 23, 2014 at 12:11:43PM +0200, Borislav Petkov wrote:
> +	TP_printk("%d %s error: %s %s %llx (mask lsb: %x), %s%s",
What if pa_mask_lsb does not exist? It will show something like:

extlog_mem_event: 1 corrected error: unknown DIMM location:
Memriser1 CHANNEL A DIMM 0 0x0000000074516000 (mask lsb: ), node: 0 card: 0
module: 0 rank: 0 bank: 0 row:
7329 column: 656 FRU: 00000000-0000-0000-0000-000000000000

Even worse, if pa does not exist, it will show:

extlog_mem_event: 1 corrected error: unknown DIMM location:
Memriser1 CHANNEL A DIMM 0  (mask lsb: ), node: 0 card: 0
module: 0 rank: 0 bank: 0 row:
7329 column: 656 FRU: 00000000-0000-0000-0000-000000000000

What I want to do is to make the output format more graceful.

> +		  __entry->error_number,
> +		  cper_severity_str(__entry->severity),
> +		  cper_mem_err_type_str(__entry->etype),
> +		  __get_str(dimm_info),
> +		  __entry->pa,
> +		  __entry->pa_mask_lsb,
> +		  __get_str(mem_loc),
> +		  __get_str(fru))
> +);
>
Borislav Petkov May 26, 2014, 10:21 a.m. UTC | #2
On Sun, May 25, 2014 at 09:59:44PM -0400, Chen, Gong wrote:
> On Fri, May 23, 2014 at 12:11:43PM +0200, Borislav Petkov wrote:
> > +	TP_printk("%d %s error: %s %s %llx (mask lsb: %x), %s%s",
> What if pa_mask_lsb does not exist?

Then you make it the default which says that all bits in the mask are
invalid: -1, i.e. 255.

This becomes part of the interface then, just like phys_addr is
0xfffff..., i.e. -1 in the invalid case.
Chen Gong May 26, 2014, 10:42 a.m. UTC | #3
On Mon, May 26, 2014 at 12:21:51PM +0200, Borislav Petkov wrote:
> Date: Mon, 26 May 2014 12:21:51 +0200
> From: Borislav Petkov <bp@alien8.de>
> To: "Chen, Gong" <gong.chen@linux.intel.com>
> Cc: tony.luck@intel.com, m.chehab@samsung.com, linux-acpi@vger.kernel.org
> Subject: Re: [PATCH 3/7 v4] CPER: Adjust code flow of some functions
> User-Agent: Mutt/1.5.23 (2014-03-12)
> 
> On Sun, May 25, 2014 at 09:59:44PM -0400, Chen, Gong wrote:
> > On Fri, May 23, 2014 at 12:11:43PM +0200, Borislav Petkov wrote:
> > > +	TP_printk("%d %s error: %s %s %llx (mask lsb: %x), %s%s",
> > What if pa_mask_lsb does not exist?
> 
> Then you make it the default which says that all bits in the mask are
> invalid: -1, i.e. 255.
> 
> This becomes part of the interface then, just like phys_addr is
> 0xfffff..., i.e. -1 in the invalid case.
> 
OK, fine with me. I will update it soon.
diff mbox

Patch

Index: linux/drivers/acpi/Kconfig
===================================================================
--- linux.orig/drivers/acpi/Kconfig	2014-05-23 11:14:33.856625534 +0200
+++ linux/drivers/acpi/Kconfig	2014-05-23 11:14:33.840625534 +0200
@@ -370,6 +370,7 @@  config ACPI_EXTLOG
 	tristate "Extended Error Log support"
 	depends on X86_MCE && X86_LOCAL_APIC
 	select UEFI_CPER
+	select RAS_TRACE
 	default n
 	help
 	  Certain usages such as Predictive Failure Analysis (PFA) require
@@ -384,6 +385,7 @@  config ACPI_EXTLOG
 
 	  Enhanced MCA Logging allows firmware to provide additional error
 	  information to system software, synchronous with MCE or CMCI. This
-	  driver adds support for that functionality.
+	  driver adds support for that functionality with a corresponding
+	  tracepoint which carries that information to userspace.
 
 endif	# ACPI
Index: linux/drivers/acpi/acpi_extlog.c
===================================================================
--- linux.orig/drivers/acpi/acpi_extlog.c	2014-05-23 11:14:33.856625534 +0200
+++ linux/drivers/acpi/acpi_extlog.c	2014-05-23 12:09:56.000000000 +0200
@@ -16,6 +16,7 @@ 
 #include <asm/mce.h>
 
 #include "apei/apei-internal.h"
+#include <ras/ras_event.h>
 
 #define EXT_ELOG_ENTRY_MASK	GENMASK_ULL(51, 0) /* elog entry address mask */
 
@@ -43,6 +44,9 @@  struct extlog_l1_head {
 
 static int old_edac_report_status;
 
+static char mem_location[CPER_REC_LEN];
+static char dimm_location[CPER_REC_LEN];
+
 static u8 extlog_dsm_uuid[] __initdata = "663E35AF-CC10-41A4-88EA-5470AF055295";
 
 /* L1 table related physical address */
@@ -69,6 +73,30 @@  static u32 l1_percpu_entry;
 #define ELOG_ENTRY_ADDR(phyaddr) \
 	(phyaddr - elog_base + (u8 *)elog_addr)
 
+static void __trace_mem_error(const uuid_le *fru_id, char *fru_text,
+			       u64 err_count, u32 severity,
+			       struct cper_sec_mem_err *mem)
+{
+	u8 etype = -1, pa_mask_lsb = 0;
+	u64 pa = 0;
+
+	if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE)
+		etype = mem->error_type;
+
+	if (mem->validation_bits & CPER_MEM_VALID_PA)
+		pa = mem->physical_addr;
+
+	if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
+		pa_mask_lsb = (u8)__ffs64(mem->physical_addr_mask);
+
+	memset(mem_location, 0, CPER_REC_LEN);
+	cper_mem_err_location(mem, mem_location);
+	memset(dimm_location, 0, CPER_REC_LEN);
+	cper_dimm_err_location(mem, dimm_location);
+	trace_extlog_mem_event(err_count, etype, severity, pa, pa_mask_lsb,
+			       fru_id, dimm_location, mem_location, fru_text);
+}
+
 static struct acpi_generic_status *extlog_elog_entry_check(int cpu, int bank)
 {
 	int idx;
@@ -137,8 +165,12 @@  static int extlog_print(struct notifier_
 	struct mce *mce = (struct mce *)data;
 	int	bank = mce->bank;
 	int	cpu = mce->extcpu;
-	struct acpi_generic_status *estatus;
-	int rc;
+	struct acpi_generic_status *estatus, *tmp;
+	struct acpi_generic_data *gdata;
+	const uuid_le *fru_id = &NULL_UUID_LE;
+	char *fru_text = "";
+	uuid_le *sec_type;
+	static u64 err_count;
 
 	estatus = extlog_elog_entry_check(cpu, bank);
 	if (estatus == NULL)
@@ -148,7 +180,23 @@  static int extlog_print(struct notifier_
 	/* clear record status to enable BIOS to update it again */
 	estatus->block_status = 0;
 
-	rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu);
+	tmp = (struct acpi_generic_status *)elog_buf;
+	print_extlog_rcd(NULL, tmp, cpu);
+
+	/* log event via trace */
+	err_count++;
+	gdata = (struct acpi_generic_data *)(tmp + 1);
+	if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
+		fru_id = (uuid_le *)gdata->fru_id;
+	if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
+		fru_text = gdata->fru_text;
+	sec_type = (uuid_le *)gdata->section_type;
+	if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) {
+		struct cper_sec_mem_err *mem_err = (void *)(gdata + 1);
+		if (gdata->error_data_length >= sizeof(*mem_err))
+			__trace_mem_error(fru_id, fru_text, err_count,
+					  gdata->error_severity, mem_err);
+	}
 
 	return NOTIFY_STOP;
 }
Index: linux/drivers/ras/ras.c
===================================================================
--- linux.orig/drivers/ras/ras.c	2014-05-23 11:14:33.856625534 +0200
+++ linux/drivers/ras/ras.c	2014-05-23 11:14:33.840625534 +0200
@@ -23,4 +23,5 @@  static int __init ras_init(void)
 }
 subsys_initcall(ras_init);
 
+EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
 EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
Index: linux/include/ras/ras_event.h
===================================================================
--- linux.orig/include/ras/ras_event.h	2014-05-23 11:14:33.856625534 +0200
+++ linux/include/ras/ras_event.h	2014-05-23 12:10:45.252569816 +0200
@@ -9,6 +9,63 @@ 
 #include <linux/edac.h>
 #include <linux/ktime.h>
 #include <linux/aer.h>
+#include <linux/cper.h>
+
+
+/*
+ * MCE Extended Error Log trace event
+ *
+ * These events are generated when hardware detects a corrected or
+ * uncorrected event.
+ */
+
+TRACE_EVENT(extlog_mem_event,
+	TP_PROTO(u32 error_number,
+		 u8 etype,
+		 u8 severity,
+		 u64 pa,
+		 u8 pa_mask_lsb,
+		 const uuid_le *fru_id,
+		 const char *dimm_info,
+		 const char *mem_loc,
+		 const char *fru_text),
+
+	TP_ARGS(error_number, etype, severity, pa, pa_mask_lsb,
+		fru_id, dimm_info, mem_loc, fru_text),
+
+	TP_STRUCT__entry(
+		__field(u32, error_number)
+		__field(u8, etype)
+		__field(u8, severity)
+		__field(u64, pa)
+		__field(u8, pa_mask_lsb)
+		__string(dimm_info, dimm_info)
+		__string(mem_loc, mem_loc)
+		__dynamic_array(char, fru, CPER_REC_LEN)
+	),
+
+	TP_fast_assign(
+		__entry->error_number = error_number;
+		__entry->etype = etype;
+		__entry->severity = severity;
+		__entry->pa = pa;
+		__entry->pa_mask_lsb = pa_mask_lsb;
+		__assign_str(dimm_info, dimm_info);
+		__assign_str(mem_loc, mem_loc);
+		snprintf(__get_dynamic_array(fru), CPER_REC_LEN - 1,
+			 "FRU: %pUl %.20s", fru_id, fru_text);
+	),
+
+	TP_printk("%d %s error: %s %s %llx (mask lsb: %x), %s%s",
+		  __entry->error_number,
+		  cper_severity_str(__entry->severity),
+		  cper_mem_err_type_str(__entry->etype),
+		  __get_str(dimm_info),
+		  __entry->pa,
+		  __entry->pa_mask_lsb,
+		  __get_str(mem_loc),
+		  __get_str(fru))
+);
 
 /*
  * Hardware Events Report