diff mbox

[5/7,v6] trace, RAS: Add eMCA trace event interface

Message ID 1401247938-22125-2-git-send-email-gong.chen@linux.intel.com (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Chen Gong May 28, 2014, 3:32 a.m. UTC
Add trace interface to elaborate all H/W error related information.

v6 -> v5: format adjustment.
v5 -> v4: Add physical mask(LSB) in trace.
v4 -> v3: change ras trace dependency rule.
v3 -> v2: minor adjustment according to the suggestion from Boris.
v2 -> v1: spinlock is not needed anymore.

Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
---
 drivers/acpi/Kconfig       |  4 +++-
 drivers/acpi/acpi_extlog.c | 54 +++++++++++++++++++++++++++++++++++++++---
 drivers/ras/ras.c          |  1 +
 include/ras/ras_event.h    | 59 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 114 insertions(+), 4 deletions(-)

Comments

Steven Rostedt May 28, 2014, 3:28 p.m. UTC | #1
Added LKML

On Tue, 27 May 2014 23:32:18 -0400
"Chen, Gong" <gong.chen@linux.intel.com> wrote:

> Add trace interface to elaborate all H/W error related information.
> 
> v6 -> v5: format adjustment.
> v5 -> v4: Add physical mask(LSB) in trace.
> v4 -> v3: change ras trace dependency rule.
> v3 -> v2: minor adjustment according to the suggestion from Boris.
> v2 -> v1: spinlock is not needed anymore.
> 
> Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
> ---
>  drivers/acpi/Kconfig       |  4 +++-
>  drivers/acpi/acpi_extlog.c | 54 +++++++++++++++++++++++++++++++++++++++---
>  drivers/ras/ras.c          |  1 +
>  include/ras/ras_event.h    | 59 ++++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 114 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
> index a34a228..099a2d5 100644
> --- a/drivers/acpi/Kconfig
> +++ b/drivers/acpi/Kconfig
> @@ -370,6 +370,7 @@ config ACPI_EXTLOG
>  	tristate "Extended Error Log support"
>  	depends on X86_MCE && X86_LOCAL_APIC
>  	select UEFI_CPER
> +	select RAS_TRACE
>  	default n
>  	help
>  	  Certain usages such as Predictive Failure Analysis (PFA) require
> @@ -384,6 +385,7 @@ config ACPI_EXTLOG
>  
>  	  Enhanced MCA Logging allows firmware to provide additional error
>  	  information to system software, synchronous with MCE or CMCI. This
> -	  driver adds support for that functionality.
> +	  driver adds support for that functionality with corresponding
> +	  tracepoint which carries that information to userspace.
>  
>  endif	# ACPI
> diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c
> index c4a5d87..8815b73 100644
> --- a/drivers/acpi/acpi_extlog.c
> +++ b/drivers/acpi/acpi_extlog.c
> @@ -16,6 +16,7 @@
>  #include <asm/mce.h>
>  
>  #include "apei/apei-internal.h"
> +#include <ras/ras_event.h>
>  
>  #define EXT_ELOG_ENTRY_MASK	GENMASK_ULL(51, 0) /* elog entry address mask */
>  
> @@ -43,6 +44,9 @@ struct extlog_l1_head {
>  
>  static int old_edac_report_status;
>  
> +static char mem_location[CPER_REC_LEN];
> +static char dimm_location[CPER_REC_LEN];
> +
>  static u8 extlog_dsm_uuid[] __initdata = "663E35AF-CC10-41A4-88EA-5470AF055295";
>  
>  /* L1 table related physical address */
> @@ -69,6 +73,30 @@ static u32 l1_percpu_entry;
>  #define ELOG_ENTRY_ADDR(phyaddr) \
>  	(phyaddr - elog_base + (u8 *)elog_addr)
>  
> +static void __trace_mem_error(const uuid_le *fru_id, char *fru_text,
> +			       u32 err_number, u8 severity,
> +			       struct cper_sec_mem_err *mem)
> +{
> +	u8 etype = ~0, pa_mask_lsb = ~0;
> +	u64 pa = ~0ull;
> +
> +	if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE)
> +		etype = mem->error_type;
> +
> +	if (mem->validation_bits & CPER_MEM_VALID_PA)
> +		pa = mem->physical_addr;
> +
> +	if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
> +		pa_mask_lsb = (u8)__ffs64(mem->physical_addr_mask);
> +
> +	memset(mem_location, 0, CPER_REC_LEN);
> +	cper_mem_err_location(mem, mem_location);
> +	memset(dimm_location, 0, CPER_REC_LEN);
> +	cper_dimm_err_location(mem, dimm_location);
> +	trace_extlog_mem_event(err_number, etype, severity, pa, pa_mask_lsb,
> +			       fru_id, dimm_location, mem_location, fru_text);

This seems like a lot of work for a tracepoint. Why all the strings?
Ideally, you want to record in the fast path the minimum raw data and
reconstruct it at the time it is read.

> +}
> +
>  static struct acpi_generic_status *extlog_elog_entry_check(int cpu, int bank)
>  {
>  	int idx;
> @@ -137,8 +165,12 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
>  	struct mce *mce = (struct mce *)data;
>  	int	bank = mce->bank;
>  	int	cpu = mce->extcpu;
> -	struct acpi_generic_status *estatus;
> -	int rc;
> +	struct acpi_generic_status *estatus, *tmp;
> +	struct acpi_generic_data *gdata;
> +	const uuid_le *fru_id = &NULL_UUID_LE;
> +	char *fru_text = "";
> +	uuid_le *sec_type;
> +	static u32 err_number;
>  
>  	estatus = extlog_elog_entry_check(cpu, bank);
>  	if (estatus == NULL)
> @@ -148,7 +180,23 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
>  	/* clear record status to enable BIOS to update it again */
>  	estatus->block_status = 0;
>  
> -	rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu);
> +	tmp = (struct acpi_generic_status *)elog_buf;
> +	print_extlog_rcd(NULL, tmp, cpu);
> +
> +	/* log event via trace */
> +	err_number++;
> +	gdata = (struct acpi_generic_data *)(tmp + 1);
> +	if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
> +		fru_id = (uuid_le *)gdata->fru_id;
> +	if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
> +		fru_text = gdata->fru_text;
> +	sec_type = (uuid_le *)gdata->section_type;
> +	if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) {
> +		struct cper_sec_mem_err *mem_err = (void *)(gdata + 1);
> +		if (gdata->error_data_length >= sizeof(*mem_err))
> +			__trace_mem_error(fru_id, fru_text, err_number,
> +					  (u8)gdata->error_severity, mem_err);
> +	}
>  
>  	return NOTIFY_STOP;
>  }
> diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
> index 4cac43a..da227a3 100644
> --- a/drivers/ras/ras.c
> +++ b/drivers/ras/ras.c
> @@ -23,4 +23,5 @@ static int __init ras_init(void)
>  }
>  subsys_initcall(ras_init);
>  
> +EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
>  EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
> diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
> index acbcbb8..4d3bc92 100644
> --- a/include/ras/ras_event.h
> +++ b/include/ras/ras_event.h
> @@ -9,6 +9,65 @@
>  #include <linux/edac.h>
>  #include <linux/ktime.h>
>  #include <linux/aer.h>
> +#include <linux/cper.h>
> +
> +
> +/*
> + * MCE Extended Error Log trace event
> + *
> + * These events are generated when hardware detects a corrected or
> + * uncorrected event.
> + */
> +
> +/* memory trace event */
> +
> +TRACE_EVENT(extlog_mem_event,
> +	TP_PROTO(u32 error_number,
> +		 u8 etype,
> +		 u8 severity,
> +		 u64 pa,
> +		 u8 pa_mask_lsb,
> +		 const uuid_le *fru_id,
> +		 const char *dimm_info,
> +		 const char *mem_loc,
> +		 const char *fru_text),
> +
> +	TP_ARGS(error_number, etype, severity, pa, pa_mask_lsb, fru_id,
> +		dimm_info, mem_loc, fru_text),
> +
> +	TP_STRUCT__entry(
> +		__field(u32, error_number)
> +		__field(u8, etype)
> +		__field(u8, severity)
> +		__field(u64, pa)
> +		__field(u8, pa_mask_lsb)
> +		__string(dimm_info, dimm_info)
> +		__string(mem_loc, mem_loc)
> +		__dynamic_array(char, fru, CPER_REC_LEN)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->error_number = error_number;
> +		__entry->etype = etype;
> +		__entry->severity = severity;
> +		__entry->pa = pa;
> +		__entry->pa_mask_lsb = pa_mask_lsb;
> +		__assign_str(dimm_info, dimm_info);
> +		__assign_str(mem_loc, mem_loc);
> +		snprintf(__get_dynamic_array(fru), CPER_REC_LEN - 1,
> +			 "FRU: %pUl %.20s", fru_id, fru_text);

For example, here don't use snprintf(). Save that processing for the
TP_printk(), as that is done at time of read. Again, only store the
minimum raw data, and reconstruct it later. Why slow down the fast path?

-- Steve

> +	),
> +
> +	TP_printk("%d %s error: %s %s physical addr: %016llx (mask lsb: %x), %s%s",
> +		  __entry->error_number,
> +		  cper_severity_str(__entry->severity),
> +		  cper_mem_err_type_str(__entry->etype),
> +		  __get_str(dimm_info),
> +		  __entry->pa,
> +		  __entry->pa_mask_lsb,
> +		  __get_str(mem_loc),
> +		  __get_str(fru))
> +);
>  
>  /*
>   * Hardware Events Report

--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Borislav Petkov May 28, 2014, 4:34 p.m. UTC | #2
On Wed, May 28, 2014 at 11:28:32AM -0400, Steven Rostedt wrote:
> > +static void __trace_mem_error(const uuid_le *fru_id, char *fru_text,
> > +			       u32 err_number, u8 severity,
> > +			       struct cper_sec_mem_err *mem)
> > +{
> > +	u8 etype = ~0, pa_mask_lsb = ~0;
> > +	u64 pa = ~0ull;
> > +
> > +	if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE)
> > +		etype = mem->error_type;
> > +
> > +	if (mem->validation_bits & CPER_MEM_VALID_PA)
> > +		pa = mem->physical_addr;
> > +
> > +	if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
> > +		pa_mask_lsb = (u8)__ffs64(mem->physical_addr_mask);
> > +
> > +	memset(mem_location, 0, CPER_REC_LEN);
> > +	cper_mem_err_location(mem, mem_location);
> > +	memset(dimm_location, 0, CPER_REC_LEN);
> > +	cper_dimm_err_location(mem, dimm_location);
> > +	trace_extlog_mem_event(err_number, etype, severity, pa, pa_mask_lsb,
> > +			       fru_id, dimm_location, mem_location, fru_text);
> 
> This seems like a lot of work for a tracepoint. Why all the strings?
> Ideally, you want to record in the fast path the minimum raw data and
> reconstruct it at the time it is read.

Well, they're constructed from a bunch of values which are checked for
validity first:

http://lkml.kernel.org/r/1400142646-10127-4-git-send-email-gong.chen@linux.intel.com

We probably could get rid of the fru* things by reading them out from
ACPI and enumerating them and issuing only an index here which the slow
path decodes.

The others we can split into fields again which should definitely make
the record smaller. Fields are defined in struct cper_sec_mem_err.

The thing is, this TP needs to be designed properly before we expose it
to the world and so that hoperfully most future uses are covered.

Thanks.
Steven Rostedt May 28, 2014, 4:56 p.m. UTC | #3
On Wed, 28 May 2014 18:34:52 +0200
Borislav Petkov <bp@alien8.de> wrote:
 
> Well, they're constructed from a bunch of values which are checked for
> validity first:
> 
> http://lkml.kernel.org/r/1400142646-10127-4-git-send-email-gong.chen@linux.intel.com

OK, looks like you are saving a bunch of integers.

> 
> We probably could get rid of the fru* things by reading them out from
> ACPI and enumerating them and issuing only an index here which the slow
> path decodes.
> 
> The others we can split into fields again which should definitely make
> the record smaller. Fields are defined in struct cper_sec_mem_err.
> 
> The thing is, this TP needs to be designed properly before we expose it
> to the world and so that hoperfully most future uses are covered.

My concern is passing in a large string and wasting a lot of the ring
buffer space. The max you can hold per event is just under a page size
(4k). And all these strings add up. If it happens to be 512bytes, then
you end up with one event per page.

Instead of making that a huge string, what about a dynamic array of
special structures?


struct __attribute__((__packed__)) cper_sec_mem_rec {
	short type;
	int data;
};


static struct cper_sec_mem_rec mem_location[CPER_REC_LEN];

then have the:

 	if (mem->validation_bits & CPER_MEM_VALID_NODE) {
		msg[n].type = CPER_MEM_VALID_NODE_TYPE;
		msg[n++].data = mem->node;
	}
	if (mem->validation_bits & CPER_MEM_VALID_CARD) {
		msg[n].type = CPER_MEM_VALID_CARD_TYPE;
		msg[n++].data = mem->card;
	}
	if (mem->validation_bits & CPER_MEM_VALID_MODULE) {
[ and so on ]


Then return the array of msg and the length and you can record this
into the dynamic array in the tracepoint.

Then you have the type and data saved, the TP_printk() can parse it.
You can create your own handler for it to do it nicely (see the ftrace
version of __print_symbol() and friends).

Makes the tracepoint much more compact.

-- Steve
--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chen Gong May 29, 2014, 7:43 a.m. UTC | #4
On Wed, May 28, 2014 at 12:56:25PM -0400, Steven Rostedt wrote:
> My concern is passing in a large string and wasting a lot of the ring
> buffer space. The max you can hold per event is just under a page size
> (4k). And all these strings add up. If it happens to be 512bytes, then
> you end up with one event per page.
I just don't understand why you say wasting memory. I just pass
a char * not a string array. And most of time these strings are partial full,
about 1/5 ~ 1/4 spaces are used.

> 
> Instead of making that a huge string, what about a dynamic array of
> special structures?
> 
> 
> struct __attribute__((__packed__)) cper_sec_mem_rec {
> 	short type;
> 	int data;
> };
> 
> 
> static struct cper_sec_mem_rec mem_location[CPER_REC_LEN];
> 
> then have the:
> 
>  	if (mem->validation_bits & CPER_MEM_VALID_NODE) {
> 		msg[n].type = CPER_MEM_VALID_NODE_TYPE;
> 		msg[n++].data = mem->node;
> 	}
> 	if (mem->validation_bits & CPER_MEM_VALID_CARD) {
> 		msg[n].type = CPER_MEM_VALID_CARD_TYPE;
> 		msg[n++].data = mem->card;
> 	}
> 	if (mem->validation_bits & CPER_MEM_VALID_MODULE) {
> [ and so on ]
> 

This function is not only for perf but for dmesg. So key is how
to handle two strings: dimm_location and mem_location.

I read some __print_symbolic implementations like btrfs trace,

#define show_ref_type(type)                                     \
        __print_symbolic(type,                                  \
        { BTRFS_TREE_BLOCK_REF_KEY,     "TREE_BLOCK_REF" },     \
        { BTRFS_EXTENT_DATA_REF_KEY,    "EXTENT_DATA_REF" },    \
        { BTRFS_EXTENT_REF_V0_KEY,      "EXTENT_REF_V0" },      \
        { BTRFS_SHARED_BLOCK_REF_KEY,   "SHARED_BLOCK_REF" },   \
        { BTRFS_SHARED_DATA_REF_KEY,    "SHARED_DATA_REF" })

So for this case, maybe we need a macro like:

#define show_dimm_location(type)                                \
        __print_symbolic(type,                                  \
        { CPER_MEM_VALID_NODE,          "node" },               \
        { CPER_MEM_VALID_CARD,          "card" },               \
        { CPER_MEM_VALID_MODULE,        "module" },             \
        ...

IMO, it is just another implementation method, maybe more graceful,
but I don't know how it can save space. Again, original functions
work both for trace and dmesg. If we add such interface, it looks
a little bit repeated.
Borislav Petkov May 29, 2014, 10:35 a.m. UTC | #5
On Thu, May 29, 2014 at 03:43:45AM -0400, Chen, Gong wrote:
> On Wed, May 28, 2014 at 12:56:25PM -0400, Steven Rostedt wrote:
> > My concern is passing in a large string and wasting a lot of the ring
> > buffer space. The max you can hold per event is just under a page size
> > (4k). And all these strings add up. If it happens to be 512bytes, then
> > you end up with one event per page.
> I just don't understand why you say wasting memory. I just pass
> a char * not a string array.

And this char * points to where?

+               __string(dimm_info, dimm_info)
+               __string(mem_loc, mem_loc)
Steven Rostedt May 29, 2014, 1:12 p.m. UTC | #6
On Thu, 29 May 2014 03:43:45 -0400
"Chen, Gong" <gong.chen@linux.intel.com> wrote:

> On Wed, May 28, 2014 at 12:56:25PM -0400, Steven Rostedt wrote:
> > My concern is passing in a large string and wasting a lot of the ring
> > buffer space. The max you can hold per event is just under a page size
> > (4k). And all these strings add up. If it happens to be 512bytes, then
> > you end up with one event per page.
> I just don't understand why you say wasting memory. I just pass
> a char * not a string array. And most of time these strings are partial full,
> about 1/5 ~ 1/4 spaces are used.

What do you think gets recorded in the ring buffer? The pointer to the
string? No! You copy the entire string into the ring buffer, with
markers and all. How big is that string? 60 chars? 80? I see you
recording meta data there too:

 	if (mem->validation_bits & CPER_MEM_VALID_BIT_POSITION)
-		pr_debug("bit_position: %d\n", mem->bit_pos);
+		n += scnprintf(msg + n, len - n, "bit_position: %d ",
+			       mem->bit_pos);

You record "bit_position: <number>\n"

That by itself is 15 characters not counting the number of characters
used to write the number. All you need to record for that is a two byte
type (perhaps even one byte) and a int (4 bytes) for a total of 6
bytes.  And this is just one example, you have this for all the cases.
That will fill up fast!

> 
> > 
> > Instead of making that a huge string, what about a dynamic array of
> > special structures?
> > 
> > 
> > struct __attribute__((__packed__)) cper_sec_mem_rec {
> > 	short type;
> > 	int data;
> > };
> > 
> > 
> > static struct cper_sec_mem_rec mem_location[CPER_REC_LEN];
> > 
> > then have the:
> > 
> >  	if (mem->validation_bits & CPER_MEM_VALID_NODE) {
> > 		msg[n].type = CPER_MEM_VALID_NODE_TYPE;
> > 		msg[n++].data = mem->node;
> > 	}
> > 	if (mem->validation_bits & CPER_MEM_VALID_CARD) {
> > 		msg[n].type = CPER_MEM_VALID_CARD_TYPE;
> > 		msg[n++].data = mem->card;
> > 	}
> > 	if (mem->validation_bits & CPER_MEM_VALID_MODULE) {
> > [ and so on ]
> > 
> 
> This function is not only for perf but for dmesg. So key is how
> to handle two strings: dimm_location and mem_location.
> 
> I read some __print_symbolic implementations like btrfs trace,
> 
> #define show_ref_type(type)                                     \
>         __print_symbolic(type,                                  \
>         { BTRFS_TREE_BLOCK_REF_KEY,     "TREE_BLOCK_REF" },     \
>         { BTRFS_EXTENT_DATA_REF_KEY,    "EXTENT_DATA_REF" },    \
>         { BTRFS_EXTENT_REF_V0_KEY,      "EXTENT_REF_V0" },      \
>         { BTRFS_SHARED_BLOCK_REF_KEY,   "SHARED_BLOCK_REF" },   \
>         { BTRFS_SHARED_DATA_REF_KEY,    "SHARED_DATA_REF" })
> 
> So for this case, maybe we need a macro like:
> 
> #define show_dimm_location(type)                                \
>         __print_symbolic(type,                                  \
>         { CPER_MEM_VALID_NODE,          "node" },               \
>         { CPER_MEM_VALID_CARD,          "card" },               \
>         { CPER_MEM_VALID_MODULE,        "module" },             \
>         ...
> 
> IMO, it is just another implementation method, maybe more graceful,
> but I don't know how it can save space. Again, original functions
> work both for trace and dmesg. If we add such interface, it looks
> a little bit repeated.

I'm not sure you could use that, as you save an array of data. The
print_symbolic() wont work with an array.

You can still use the same code for both the tracepoint (perf and
ftrace) and for dmesg. You need to write a packed array that is
returned as well as a way to convert that array into a human readable
string for later processing. The dmesg version would just have them
both together where as the tracepoint records the packed version on the
ring buffer and the TP_printk() will use the extraction.

That is, dmesg has the compress and extraction in one place where the
tracepoint has them in two different places.

Understand?

-- Steve
--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chen Gong May 30, 2014, 2:56 a.m. UTC | #7
On Thu, May 29, 2014 at 09:12:51AM -0400, Steven Rostedt wrote:
> What do you think gets recorded in the ring buffer? The pointer to the
> string? No! You copy the entire string into the ring buffer, with
> markers and all. How big is that string? 60 chars? 80? I see you
> recording meta data there too:
I'm not the expert of trace. If I am wrong please fix me. So it looks
like all trace data shhould be pushed into the ring buffer as raw
material and when it is needed to be printed via TP_printk, these
raw data will be expanded(unzip them as you told) into another
temp buffer, maybe kernel printk buffer directly.

[...]
> > #define show_dimm_location(type)                                \
> >         __print_symbolic(type,                                  \
> >         { CPER_MEM_VALID_NODE,          "node" },               \
> >         { CPER_MEM_VALID_CARD,          "card" },               \
> >         { CPER_MEM_VALID_MODULE,        "module" },             \
> >         ...
> > 
> 
> I'm not sure you could use that, as you save an array of data. The
> print_symbolic() wont work with an array.
Just FWIW, call it again and again. But it looks too awkward I will
use other style.

> 
> You can still use the same code for both the tracepoint (perf and
> ftrace) and for dmesg. You need to write a packed array that is
> returned as well as a way to convert that array into a human readable
> string for later processing. The dmesg version would just have them
> both together where as the tracepoint records the packed version on the
> ring buffer and the TP_printk() will use the extraction.
> 
> That is, dmesg has the compress and extraction in one place where the
> tracepoint has them in two different places.
> 
> Understand?
Yes, I feel the same way. Zip/unzip is unavoidable.
Chen Gong May 30, 2014, 9:22 a.m. UTC | #8
On Wed, May 28, 2014 at 12:56:25PM -0400, Steven Rostedt wrote:
> Instead of making that a huge string, what about a dynamic array of
> special structures?
> 
> 
> struct __attribute__((__packed__)) cper_sec_mem_rec {
> 	short type;
> 	int data;
> };
> 
> 
HI, Steven & Boris

We have two big chunk string. One for memory error location, the other
for DIMM error location. Since DIMM error location depends on some
other conditions, how about just converting memory error location
to a compact mode but leaving DIMM error location alone? 

For memory error location, I will utilize type offset to save one
more byte, furthermore, I want to drop requestor_id, responder_id
and target_id. 1) They are very rare (I've never seen them by now)
2) They are u64 but not u16. So to keep whole struct clean I want
to use following struct. We can extend it later when necessary.

struct __attribute__((__packed__)) cper_sec_mem_rec {
	u8 type;
	u16 data;
};

So whole struct is just 3 bytes. Even if all fields are valid, we
have 3 * 9 = 27 bytes in total for a record in the ring buffer.

Make sense?
Borislav Petkov May 30, 2014, 10:07 a.m. UTC | #9
On Fri, May 30, 2014 at 05:22:32AM -0400, Chen, Gong wrote:
> We have two big chunk string. One for memory error location, the other
> for DIMM error location. Since DIMM error location depends on some
> other conditions, how about just converting memory error location to a
> compact mode but leaving DIMM error location alone?

Please elaborate, what conditions? DIMM silk screen labels or so? Maybe
we can generate a mapping between text labels and indices and we can
dump the indices in the tracepoint and do the mapping back to strings in
userspace...?

> For memory error location, I will utilize type offset to save one
> more byte, furthermore, I want to drop requestor_id, responder_id
> and target_id. 1) They are very rare (I've never seen them by now)

My concern is, are we sure we're never going to need them at all? Tony,
what's your take on this?

> 2) They are u64 but not u16. So to keep whole struct clean I want
> to use following struct. We can extend it later when necessary.
> 
> struct __attribute__((__packed__)) cper_sec_mem_rec {
> 	u8 type;
> 	u16 data;
> };
> 
> So whole struct is just 3 bytes. Even if all fields are valid, we
> have 3 * 9 = 27 bytes in total for a record in the ring buffer.
> 
> Make sense?

That is definitely much better than what we have now.
Tony Luck May 30, 2014, 9:16 p.m. UTC | #10
On Fri, May 30, 2014 at 3:07 AM, Borislav Petkov <bp@alien8.de> wrote:
> Please elaborate, what conditions? DIMM silk screen labels or so? Maybe
> we can generate a mapping between text labels and indices and we can
> dump the indices in the tracepoint and do the mapping back to strings in
> userspace...?

The UEFI error record gives us the SMBIOS "handle" (2-byte index). We
use that to look up the bank and device locator strings ... which should be
the silk-screen labels (in a correctly written BIOS).

So we could just have the tracepoint save the "handle" and do the
decode later.  If we want to keep doing mappings in the kernel (so console
logs can say "DIMM location: CPU 0 DIMM_C1" rather than
"SMBIOS handle 0x0015") - and would like to make things easier
for ourselves - we could have dmi_memdev_walk() do a bit more
work so we can just index an allocated array of strings that are
the concatenation of the bank/device locators.

-Tony
--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Borislav Petkov May 30, 2014, 9:26 p.m. UTC | #11
On Fri, May 30, 2014 at 02:16:06PM -0700, Tony Luck wrote:
> On Fri, May 30, 2014 at 3:07 AM, Borislav Petkov <bp@alien8.de> wrote:
> > Please elaborate, what conditions? DIMM silk screen labels or so? Maybe
> > we can generate a mapping between text labels and indices and we can
> > dump the indices in the tracepoint and do the mapping back to strings in
> > userspace...?
> 
> The UEFI error record gives us the SMBIOS "handle" (2-byte index). We
> use that to look up the bank and device locator strings ... which should be
> the silk-screen labels (in a correctly written BIOS).

Ok, sounds straightforward.

> So we could just have the tracepoint save the "handle" and do the
> decode later. If we want to keep doing mappings in the kernel (so
> console logs can say "DIMM location: CPU 0 DIMM_C1" rather than
> "SMBIOS handle 0x0015") - and would like to make things easier for
> ourselves - we could have dmi_memdev_walk() do a bit more work
> so we can just index an allocated array of strings that are the
> concatenation of the bank/device locators.

Right, we probably would need both: if a userspace agent is active, it'd
need to resolve the handle and otherwise we'll be doing it in the kernel
with dmi.

But it all sounds very doable and nice to me.
Tony Luck May 30, 2014, 11:03 p.m. UTC | #12
>> For memory error location, I will utilize type offset to save one

>> more byte, furthermore, I want to drop requestor_id, responder_id

>> and target_id. 1) They are very rare (I've never seen them by now)

>

> My concern is, are we sure we're never going to need them at all? Tony,

> what's your take on this?


They may seem rare because our BIOS doesn't bother to provide them.
Other BIOS writers may be more diligent.

I flip-flop on the issue of how much detail to log.  For the majority
of users it is enough to just point at the DIMM.  That's the thing that
they can easily replace.

But OEMs and large scale users often want to know every tiny detail
so they can look for patterns between errors reported across a large
fleet. So I hate to drop information on the floor that might be useful
to someone later.

All of this stuff only applies to server systems - so quibbling over
a handful of *bytes* in an error record on a system that has tens,
hundreds or even thousands of *gigabytes* of memory seems
a bit pointless.

-Tony
Steven Rostedt May 31, 2014, 1:07 a.m. UTC | #13
On Fri, 30 May 2014 23:03:27 +0000
"Luck, Tony" <tony.luck@intel.com> wrote:

 
> All of this stuff only applies to server systems - so quibbling over
> a handful of *bytes* in an error record on a system that has tens,
> hundreds or even thousands of *gigabytes* of memory seems
> a bit pointless.

But there's still only a limited number of bytes in the ring buffer no
matter what the system, thus we still need to quibble over it.

-- Steve
--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tony Luck June 2, 2014, 4:22 p.m. UTC | #14
>> All of this stuff only applies to server systems - so quibbling over
>> a handful of *bytes* in an error record on a system that has tens,
>> hundreds or even thousands of *gigabytes* of memory seems
>> a bit pointless.
>
> But there's still only a limited number of bytes in the ring buffer no
> matter what the system, thus we still need to quibble over it.

To which I'll counter that the trace ring buffer can handle tracing of
events like page faults and context switches (can't it?) that happen
at a rate of thousands per second.  Our eMCA records will normally
happen at a rate of X per month (where X may well be less than one).
If there is a storm of errors - we disable CMCI interrupts and revert
to polling.  We declare a "storm" as just 15 events in a second. If we
switch to polling, then we won't poll faster than once per second.

So worst case is that we are seeing some steady flow of events that
don't quite trigger the storm detector ... about 14 events per second.

-Tony
--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steven Rostedt June 2, 2014, 4:57 p.m. UTC | #15
On Mon, 2 Jun 2014 16:22:19 +0000
"Luck, Tony" <tony.luck@intel.com> wrote:

 
> To which I'll counter that the trace ring buffer can handle tracing of
> events like page faults and context switches (can't it?) that happen
> at a rate of thousands per second.  Our eMCA records will normally
> happen at a rate of X per month (where X may well be less than one).
> If there is a storm of errors - we disable CMCI interrupts and revert
> to polling.  We declare a "storm" as just 15 events in a second. If we
> switch to polling, then we won't poll faster than once per second.
> 
> So worst case is that we are seeing some steady flow of events that
> don't quite trigger the storm detector ... about 14 events per second.

Also matters how big you expect these events to be. If you get a
"christmas tree" set of flags, how big will that event grow with all
the descriptions attached?

The max event size after all headers is 4056 bytes. If you go over
that, the event is ignored.

-- Steve
--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chen Gong June 3, 2014, 8:36 a.m. UTC | #16
On Mon, Jun 02, 2014 at 12:57:48PM -0400, Steven Rostedt wrote:
> Also matters how big you expect these events to be. If you get a
> "christmas tree" set of flags, how big will that event grow with all
> the descriptions attached?
> 
> The max event size after all headers is 4056 bytes. If you go over
> that, the event is ignored.
> 
Hi, Steven

Normally, the length of one eMCA trace record is between 200 and 256 bytes.
Once CMCI storm happens, before it is turned into poll mode, there are
about ~15 CMCI events are recorded, because I don't use rate limit for
trace so they should be recorded so seriously, some records will be lost.
But they are repeated and similar records so maybe the *lost* is not a 
big issue.

Return to how to print trace record. To avoid buffer waste, I need to
print data when TP_printk called, in the meanwhile, the print content
is an array of [name, value], but we don't know how many items are
valid. Here is the question: I can't create a dynamic printk format
like "%s %d, %s %d, ..." in TP_printk. So the only way to me is
printking them all, even some of them are invalid, which means an 12
group "%s %d", or somthing like "%.*s" to make output format graceful.
This is what we want?
Steven Rostedt June 3, 2014, 2:35 p.m. UTC | #17
On Tue, 3 Jun 2014 04:36:07 -0400
"Chen, Gong" <gong.chen@linux.intel.com> wrote:

> On Mon, Jun 02, 2014 at 12:57:48PM -0400, Steven Rostedt wrote:
> > Also matters how big you expect these events to be. If you get a
> > "christmas tree" set of flags, how big will that event grow with all
> > the descriptions attached?
> > 
> > The max event size after all headers is 4056 bytes. If you go over
> > that, the event is ignored.
> > 
> Hi, Steven
> 
> Normally, the length of one eMCA trace record is between 200 and 256 bytes.
> Once CMCI storm happens, before it is turned into poll mode, there are
> about ~15 CMCI events are recorded, because I don't use rate limit for
> trace so they should be recorded so seriously, some records will be lost.
> But they are repeated and similar records so maybe the *lost* is not a 
> big issue.
> 
> Return to how to print trace record. To avoid buffer waste, I need to
> print data when TP_printk called, in the meanwhile, the print content
> is an array of [name, value], but we don't know how many items are
> valid. Here is the question: I can't create a dynamic printk format
> like "%s %d, %s %d, ..." in TP_printk. So the only way to me is
> printking them all, even some of them are invalid, which means an 12
> group "%s %d", or somthing like "%.*s" to make output format graceful.
> This is what we want?

You can create a helper function to call (needs to be placed in a .c
file).

Note, there's a pointer to a trace_seq structure "p" that is available.
Hmm, I should add a get_dynamic_array_len(field), to give you the
length. I'll add that now. I also don't like the trace_seq being "p" as
that is too generic. Maybe I'll change that to "__trace_seq" or
something not so generic.


Anyway, have something like this:


	TP_printk("%s", emca_parse_events(p, __get_dynamic_array(field),
			__get_dynamic_array_len(field)));

I'll still need to add that __get_dynamic_array_len() helper. I'll send
you something tonight.

Then you write the emca_parse_events() as:


const char *emca_parse_events(struct trace_seq *p,
		struct cper_sec_mem_rec *data, int len)
{
	const char *ret = p->buffer + p->len;
	int i;

	len = len / sizeof(struct cper_sec_mem_rec);
	for (i = 0; i < len; i++) {
		switch (data[i].type) {
		case FOO:
			trace_seq_printf(p, "BAR: %d\n", data[i].data);
			break;
		[..]
		}
	}
	trace_seq_putc('\0');

	return ret;
}

-- Steve
--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index a34a228..099a2d5 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -370,6 +370,7 @@  config ACPI_EXTLOG
 	tristate "Extended Error Log support"
 	depends on X86_MCE && X86_LOCAL_APIC
 	select UEFI_CPER
+	select RAS_TRACE
 	default n
 	help
 	  Certain usages such as Predictive Failure Analysis (PFA) require
@@ -384,6 +385,7 @@  config ACPI_EXTLOG
 
 	  Enhanced MCA Logging allows firmware to provide additional error
 	  information to system software, synchronous with MCE or CMCI. This
-	  driver adds support for that functionality.
+	  driver adds support for that functionality with corresponding
+	  tracepoint which carries that information to userspace.
 
 endif	# ACPI
diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c
index c4a5d87..8815b73 100644
--- a/drivers/acpi/acpi_extlog.c
+++ b/drivers/acpi/acpi_extlog.c
@@ -16,6 +16,7 @@ 
 #include <asm/mce.h>
 
 #include "apei/apei-internal.h"
+#include <ras/ras_event.h>
 
 #define EXT_ELOG_ENTRY_MASK	GENMASK_ULL(51, 0) /* elog entry address mask */
 
@@ -43,6 +44,9 @@  struct extlog_l1_head {
 
 static int old_edac_report_status;
 
+static char mem_location[CPER_REC_LEN];
+static char dimm_location[CPER_REC_LEN];
+
 static u8 extlog_dsm_uuid[] __initdata = "663E35AF-CC10-41A4-88EA-5470AF055295";
 
 /* L1 table related physical address */
@@ -69,6 +73,30 @@  static u32 l1_percpu_entry;
 #define ELOG_ENTRY_ADDR(phyaddr) \
 	(phyaddr - elog_base + (u8 *)elog_addr)
 
+static void __trace_mem_error(const uuid_le *fru_id, char *fru_text,
+			       u32 err_number, u8 severity,
+			       struct cper_sec_mem_err *mem)
+{
+	u8 etype = ~0, pa_mask_lsb = ~0;
+	u64 pa = ~0ull;
+
+	if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE)
+		etype = mem->error_type;
+
+	if (mem->validation_bits & CPER_MEM_VALID_PA)
+		pa = mem->physical_addr;
+
+	if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
+		pa_mask_lsb = (u8)__ffs64(mem->physical_addr_mask);
+
+	memset(mem_location, 0, CPER_REC_LEN);
+	cper_mem_err_location(mem, mem_location);
+	memset(dimm_location, 0, CPER_REC_LEN);
+	cper_dimm_err_location(mem, dimm_location);
+	trace_extlog_mem_event(err_number, etype, severity, pa, pa_mask_lsb,
+			       fru_id, dimm_location, mem_location, fru_text);
+}
+
 static struct acpi_generic_status *extlog_elog_entry_check(int cpu, int bank)
 {
 	int idx;
@@ -137,8 +165,12 @@  static int extlog_print(struct notifier_block *nb, unsigned long val,
 	struct mce *mce = (struct mce *)data;
 	int	bank = mce->bank;
 	int	cpu = mce->extcpu;
-	struct acpi_generic_status *estatus;
-	int rc;
+	struct acpi_generic_status *estatus, *tmp;
+	struct acpi_generic_data *gdata;
+	const uuid_le *fru_id = &NULL_UUID_LE;
+	char *fru_text = "";
+	uuid_le *sec_type;
+	static u32 err_number;
 
 	estatus = extlog_elog_entry_check(cpu, bank);
 	if (estatus == NULL)
@@ -148,7 +180,23 @@  static int extlog_print(struct notifier_block *nb, unsigned long val,
 	/* clear record status to enable BIOS to update it again */
 	estatus->block_status = 0;
 
-	rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu);
+	tmp = (struct acpi_generic_status *)elog_buf;
+	print_extlog_rcd(NULL, tmp, cpu);
+
+	/* log event via trace */
+	err_number++;
+	gdata = (struct acpi_generic_data *)(tmp + 1);
+	if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
+		fru_id = (uuid_le *)gdata->fru_id;
+	if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
+		fru_text = gdata->fru_text;
+	sec_type = (uuid_le *)gdata->section_type;
+	if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) {
+		struct cper_sec_mem_err *mem_err = (void *)(gdata + 1);
+		if (gdata->error_data_length >= sizeof(*mem_err))
+			__trace_mem_error(fru_id, fru_text, err_number,
+					  (u8)gdata->error_severity, mem_err);
+	}
 
 	return NOTIFY_STOP;
 }
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index 4cac43a..da227a3 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -23,4 +23,5 @@  static int __init ras_init(void)
 }
 subsys_initcall(ras_init);
 
+EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
 EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index acbcbb8..4d3bc92 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -9,6 +9,65 @@ 
 #include <linux/edac.h>
 #include <linux/ktime.h>
 #include <linux/aer.h>
+#include <linux/cper.h>
+
+
+/*
+ * MCE Extended Error Log trace event
+ *
+ * These events are generated when hardware detects a corrected or
+ * uncorrected event.
+ */
+
+/* memory trace event */
+
+TRACE_EVENT(extlog_mem_event,
+	TP_PROTO(u32 error_number,
+		 u8 etype,
+		 u8 severity,
+		 u64 pa,
+		 u8 pa_mask_lsb,
+		 const uuid_le *fru_id,
+		 const char *dimm_info,
+		 const char *mem_loc,
+		 const char *fru_text),
+
+	TP_ARGS(error_number, etype, severity, pa, pa_mask_lsb, fru_id,
+		dimm_info, mem_loc, fru_text),
+
+	TP_STRUCT__entry(
+		__field(u32, error_number)
+		__field(u8, etype)
+		__field(u8, severity)
+		__field(u64, pa)
+		__field(u8, pa_mask_lsb)
+		__string(dimm_info, dimm_info)
+		__string(mem_loc, mem_loc)
+		__dynamic_array(char, fru, CPER_REC_LEN)
+	),
+
+	TP_fast_assign(
+		__entry->error_number = error_number;
+		__entry->etype = etype;
+		__entry->severity = severity;
+		__entry->pa = pa;
+		__entry->pa_mask_lsb = pa_mask_lsb;
+		__assign_str(dimm_info, dimm_info);
+		__assign_str(mem_loc, mem_loc);
+		snprintf(__get_dynamic_array(fru), CPER_REC_LEN - 1,
+			 "FRU: %pUl %.20s", fru_id, fru_text);
+	),
+
+	TP_printk("%d %s error: %s %s physical addr: %016llx (mask lsb: %x), %s%s",
+		  __entry->error_number,
+		  cper_severity_str(__entry->severity),
+		  cper_mem_err_type_str(__entry->etype),
+		  __get_str(dimm_info),
+		  __entry->pa,
+		  __entry->pa_mask_lsb,
+		  __get_str(mem_loc),
+		  __get_str(fru))
+);
 
 /*
  * Hardware Events Report