Message ID | 1381473166-29303-9-git-send-email-gong.chen@linux.intel.com (mailing list archive) |
---|---|
State | Not Applicable, archived |
Headers | show |
On Fri, Oct 11, 2013 at 02:32:46AM -0400, Chen, Gong wrote: > diff --git a/drivers/acpi/extlog_trace.h b/drivers/acpi/extlog_trace.h > new file mode 100644 > index 0000000..21f0887 > --- /dev/null > +++ b/drivers/acpi/extlog_trace.h > @@ -0,0 +1,77 @@ > +#if !defined(_TRACE_EXTLOG_H) || defined(TRACE_HEADER_MULTI_READ) > +#define _TRACE_EXTLOG_H > + > +#include <linux/tracepoint.h> > +#include <linux/cper.h> > + > +#undef TRACE_SYSTEM > +#define TRACE_SYSTEM extlog > + > +/* > + * MCE Extended Error Log Trace event Right, we have a perfectly good header for ras TPs: include/ras/ras_event.h Mind adding this TP there please? > + * > + * These events are generated when hardware detects a corrected or > + * uncorrected event. > + * > + */ > + > +/* memory trace event */ > + > +#define LOC_LEN 512 > +#define MSG_LEN ((LOC_LEN) * 2) > + > +TRACE_EVENT(extlog_mem_event, > + TP_PROTO(u32 etype, > + char *dimm_loc, > + const uuid_le *fru_id, > + char *fru_text, > + u64 error_count, > + u32 severity, > + u64 phy_addr, > + char *mem_loc), > + > + TP_ARGS(etype, dimm_loc, fru_id, fru_text, error_count, severity, > + phy_addr, mem_loc), > + > + TP_STRUCT__entry( > + __field(u32, etype) > + __dynamic_array(char, dimm_info, LOC_LEN) > + __field(u64, error_count) > + __field(u32, severity) > + __dynamic_array(char, msg, MSG_LEN) > + ), > + > + TP_fast_assign( > + __entry->error_count = error_count; > + __entry->severity = severity; > + __entry->etype = etype; > + if (dimm_loc[0] != '\0') > + snprintf(__get_dynamic_array(dimm_info), LOC_LEN - 1, > + "on %s", dimm_loc); > + else > + __assign_str(dimm_info, ""); > + if (phy_addr != 0) > + snprintf(__get_dynamic_array(msg), MSG_LEN - 1, > + "(FRU: %pUl %.20s physical addr: 0x%016llx%s)", > + fru_id, fru_text, phy_addr, mem_loc); > + else > + __assign_str(msg, ""); > + ), > + > + TP_printk("%llu %s error%s:%s %s%s", > + __entry->error_count, > + cper_severity_str(__entry->severity), > + __entry->error_count > 1 ? "s" : "", > + cper_mem_err_type_str(__entry->etype), > + __get_str(dimm_info), > + __get_str(msg)) > +); > + > +#endif /* _TRACE_EXTLOG_H */ > + > +/* This part must be outside protection */ > +#undef TRACE_INCLUDE_PATH > +#define TRACE_INCLUDE_PATH . > +#undef TRACE_INCLUDE_FILE > +#define TRACE_INCLUDE_FILE extlog_trace > +#include <trace/define_trace.h> > diff --git a/include/linux/cper.h b/include/linux/cper.h > index bd01c9a..c00eb55 100644 > --- a/include/linux/cper.h > +++ b/include/linux/cper.h > @@ -395,6 +395,8 @@ struct cper_sec_pcie { > #pragma pack() > > u64 cper_next_record_id(void); > +const char *cper_severity_str(unsigned int); > +const char *cper_mem_err_type_str(unsigned int); > void cper_print_bits(const char *prefix, unsigned int bits, > const char *strs[], unsigned int strs_size); > > -- > 1.8.4.rc3 > >
On Fri, Oct 11, 2013 at 02:32:46AM -0400, Chen, Gong wrote: > Use trace interface to elaborate all H/W error related > information. > > Signed-off-by: Chen, Gong <gong.chen@linux.intel.com> > --- > drivers/acpi/Kconfig | 7 ++- > drivers/acpi/Makefile | 4 ++ > drivers/acpi/acpi_extlog.c | 28 +++++++++++- > drivers/acpi/apei/cper.c | 13 ++++-- > drivers/acpi/debug_extlog.h | 16 +++++++ > drivers/acpi/extlog_trace.c | 105 ++++++++++++++++++++++++++++++++++++++++++++ > drivers/acpi/extlog_trace.h | 77 ++++++++++++++++++++++++++++++++ > include/linux/cper.h | 2 + > 8 files changed, 246 insertions(+), 6 deletions(-) > create mode 100644 drivers/acpi/debug_extlog.h > create mode 100644 drivers/acpi/extlog_trace.c > create mode 100644 drivers/acpi/extlog_trace.h > > diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig > index 1465fa8..9ea343e 100644 > --- a/drivers/acpi/Kconfig > +++ b/drivers/acpi/Kconfig > @@ -372,12 +372,17 @@ config ACPI_BGRT > > source "drivers/acpi/apei/Kconfig" > > +config EXTLOG_TRACE > + def_bool n Why the separate config item? > + > config ACPI_EXTLOG > tristate "Extended Error Log support" > depends on X86 && X86_MCE > + select EXTLOG_TRACE > default n > help > This driver adds support for decoding extended errors from hardware. > - which allows the operating system to obtain data from trace. > + which allows the operating system to obtain data from trace. It will > + appear under /sys/kernel/debug/tracing/ras/ . > > endif # ACPI > diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile > index bce34af..a6e41b7 100644 > --- a/drivers/acpi/Makefile > +++ b/drivers/acpi/Makefile > @@ -83,4 +83,8 @@ obj-$(CONFIG_ACPI_PROCESSOR_AGGREGATOR) += acpi_pad.o > > obj-$(CONFIG_ACPI_APEI) += apei/ > > +# extended log support > +acpi-$(CONFIG_EXTLOG_TRACE) += extlog_trace.o You can simply do obj-$(CONFIG_ACPI_EXTLOG) += acpi_extlog.o extlog_trace.o > +CFLAGS_extlog_trace.o := -I$(src) > + > obj-$(CONFIG_ACPI_EXTLOG) += acpi_extlog.o [ ... ] > diff --git a/drivers/acpi/extlog_trace.c b/drivers/acpi/extlog_trace.c > new file mode 100644 > index 0000000..2b2824c > --- /dev/null > +++ b/drivers/acpi/extlog_trace.c > @@ -0,0 +1,105 @@ > +#include <linux/export.h> > +#include <linux/dmi.h> > +#include "debug_extlog.h" > + > +#define CREATE_TRACE_POINTS > +#include "extlog_trace.h" > + > +static char mem_location[LOC_LEN]; > +static char dimm_location[LOC_LEN]; > + > +static void mem_err_location(struct cper_sec_mem_err *mem) > +{ > + char *p; > + u32 n = 0; > + > + memset(mem_location, 0, LOC_LEN); > + p = mem_location; > + if (mem->validation_bits & CPER_MEM_VALID_NODE) > + n += sprintf(p + n, " node: %d", mem->node); > + if (n >= LOC_LEN) > + goto end; > + if (mem->validation_bits & CPER_MEM_VALID_CARD) > + n += sprintf(p + n, " card: %d", mem->card); > + if (n >= LOC_LEN) > + goto end; > + if (mem->validation_bits & CPER_MEM_VALID_MODULE) > + n += sprintf(p + n, " module: %d", mem->module); > + if (n >= LOC_LEN) > + goto end; > + if (mem->validation_bits & CPER_MEM_VALID_RANK_NUMBER) > + n += sprintf(p + n, " rank: %d", mem->rank); > + if (n >= LOC_LEN) > + goto end; > + if (mem->validation_bits & CPER_MEM_VALID_BANK) > + n += sprintf(p + n, " bank: %d", mem->bank); > + if (n >= LOC_LEN) > + goto end; > + if (mem->validation_bits & CPER_MEM_VALID_DEVICE) > + n += sprintf(p + n, " device: %d", mem->device); > + if (n >= LOC_LEN) > + goto end; > + if (mem->validation_bits & CPER_MEM_VALID_ROW) > + n += sprintf(p + n, " row: %d", mem->row); > + if (n >= LOC_LEN) > + goto end; > + if (mem->validation_bits & CPER_MEM_VALID_COLUMN) > + n += sprintf(p + n, " column: %d", mem->column); > + if (n >= LOC_LEN) > + goto end; > + if (mem->validation_bits & CPER_MEM_VALID_BIT_POSITION) > + n += sprintf(p + n, " bit_position: %d", mem->bit_pos); > + if (n >= LOC_LEN) > + goto end; > + if (mem->validation_bits & CPER_MEM_VALID_REQUESTOR_ID) > + n += sprintf(p + n, " requestor_id: 0x%016llx", > + mem->requestor_id); > + if (n >= LOC_LEN) > + goto end; > + if (mem->validation_bits & CPER_MEM_VALID_RESPONDER_ID) > + n += sprintf(p + n, " responder_id: 0x%016llx", > + mem->responder_id); > + if (n >= LOC_LEN) > + goto end; > + if (mem->validation_bits & CPER_MEM_VALID_TARGET_ID) > + n += sprintf(p + n, " target_id: 0x%016llx", mem->target_id); > +end: > + return; > +} > + > +static void dimm_err_location(struct cper_sec_mem_err *mem) > +{ > + memset(dimm_location, 0, LOC_LEN); > + if (mem->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) { By reversing this test you can save yourself an indentation level and a superfluous memset: if (!(mem->validation_bits & CPER_MEM_VALID_MODULE_HANDLE)) return; memset(dimm_location, 0, LOC_LEN); dmi_memdev_name... ... > + const char *bank = NULL, *device = NULL; > + dmi_memdev_name(mem->mem_dev_handle, &bank, &device); > + if (bank != NULL && device != NULL) > + snprintf(dimm_location, LOC_LEN - 1, > + "%s %s", bank, device); > + else > + snprintf(dimm_location, LOC_LEN - 1, > + "DMI handle: 0x%.4x", mem->mem_dev_handle); > + } > +} > + > +void trace_mem_error(const uuid_le *fru_id, char *fru_text, > + u64 err_count, u32 severity, struct cper_sec_mem_err *mem) > +{ > + u32 etype = ~0U; > + u64 phy_addr = 0; > + > + if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) > + etype = mem->error_type; > + if (mem->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) { > + phy_addr = mem->physical_addr; > + if (mem->validation_bits & > + CPER_MEM_VALID_PHYSICAL_ADDRESS_MASK) This mask could use some slimming: CPER_MEM_VALID_PA_MASK or CPER_MEM_VALID_PHYS_ADDR_MASK or so so that it fits in 80 cols. > + phy_addr &= mem->physical_addr_mask; > + } > + mem_err_location(mem); > + dimm_err_location(mem); > + > + trace_extlog_mem_event(etype, dimm_location, fru_id, fru_text, > + err_count, severity, phy_addr, mem_location); arg alignment > +} > +EXPORT_SYMBOL_GPL(trace_mem_error); > diff --git a/drivers/acpi/extlog_trace.h b/drivers/acpi/extlog_trace.h > new file mode 100644 > index 0000000..21f0887 > --- /dev/null > +++ b/drivers/acpi/extlog_trace.h > @@ -0,0 +1,77 @@ > +#if !defined(_TRACE_EXTLOG_H) || defined(TRACE_HEADER_MULTI_READ) > +#define _TRACE_EXTLOG_H > + > +#include <linux/tracepoint.h> > +#include <linux/cper.h> > + > +#undef TRACE_SYSTEM > +#define TRACE_SYSTEM extlog Shouldn't that be TRACE_SYSTEM ras ... Thanks.
On Fri, Oct 11, 2013 at 06:14:36PM +0200, Borislav Petkov wrote: > Date: Fri, 11 Oct 2013 18:14:36 +0200 > From: Borislav Petkov <bp@alien8.de> > To: "Chen, Gong" <gong.chen@linux.intel.com> > Cc: tony.luck@intel.com, linux-kernel@vger.kernel.org, > linux-acpi@vger.kernel.org > Subject: Re: [PATCH 8/8] ACPI / trace: Add trace interface for eMCA driver > User-Agent: Mutt/1.5.21 (2010-09-15) > ... > > +static void dimm_err_location(struct cper_sec_mem_err *mem) > > +{ > > + memset(dimm_location, 0, LOC_LEN); > > + if (mem->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) { > > By reversing this test you can save yourself an indentation level and a > superfluous memset: > > if (!(mem->validation_bits & CPER_MEM_VALID_MODULE_HANDLE)) > return; > > memset(dimm_location, 0, LOC_LEN); > dmi_memdev_name... > ... > > memset should be called before return, otherwise the values from last time will happen again in this time.
On 2013/10/11 02:32AM, Chen Gong wrote: > Use trace interface to elaborate all H/W error related > information. > > Signed-off-by: Chen, Gong <gong.chen@linux.intel.com> > --- <snip> > +TRACE_EVENT(extlog_mem_event, > + TP_PROTO(u32 etype, > + char *dimm_loc, > + const uuid_le *fru_id, > + char *fru_text, > + u64 error_count, > + u32 severity, > + u64 phy_addr, > + char *mem_loc), [Adding Mauro...] This looks very similar to the trace event I wrote a while back, which was similar to the one provided by ghes_edac: http://thread.gmane.org/gmane.linux.kernel.pci/24616 Seems to me this has the same issues we previously discussed w.r.t EDAC conflicts... Regards, Naveen -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Oct 15, 2013 at 10:24:35PM +0530, Naveen N. Rao wrote: > On 2013/10/11 02:32AM, Chen Gong wrote: > > Use trace interface to elaborate all H/W error related > > information. > > > > Signed-off-by: Chen, Gong <gong.chen@linux.intel.com> > > --- > <snip> > > +TRACE_EVENT(extlog_mem_event, > > + TP_PROTO(u32 etype, > > + char *dimm_loc, > > + const uuid_le *fru_id, > > + char *fru_text, > > + u64 error_count, > > + u32 severity, > > + u64 phy_addr, > > + char *mem_loc), > > [Adding Mauro...] > > This looks very similar to the trace event I wrote a while back, > which was similar to the one provided by ghes_edac: > http://thread.gmane.org/gmane.linux.kernel.pci/24616 > > Seems to me this has the same issues we previously discussed w.r.t > EDAC conflicts... Right, I'm inclined to leave this trace_mc_event in ras_event.h to edac use alone because of all those layers which don't mean whit for GHES and eMCA error sources. And maybe define a trace_mem_event which is shared by GHES and eMCA and not use the edac tracepoint there not load ghes_edac on such systems which have sufficient decoding capability in firmware. Thoughts?
On 10/15/2013 10:30 PM, Borislav Petkov wrote: > On Tue, Oct 15, 2013 at 10:24:35PM +0530, Naveen N. Rao wrote: >> On 2013/10/11 02:32AM, Chen Gong wrote: >>> Use trace interface to elaborate all H/W error related >>> information. >>> >>> Signed-off-by: Chen, Gong <gong.chen@linux.intel.com> >>> --- >> <snip> >>> +TRACE_EVENT(extlog_mem_event, >>> + TP_PROTO(u32 etype, >>> + char *dimm_loc, >>> + const uuid_le *fru_id, >>> + char *fru_text, >>> + u64 error_count, >>> + u32 severity, >>> + u64 phy_addr, >>> + char *mem_loc), >> >> [Adding Mauro...] >> >> This looks very similar to the trace event I wrote a while back, >> which was similar to the one provided by ghes_edac: >> http://thread.gmane.org/gmane.linux.kernel.pci/24616 >> >> Seems to me this has the same issues we previously discussed w.r.t >> EDAC conflicts... > > Right, I'm inclined to leave this trace_mc_event in ras_event.h to edac > use alone because of all those layers which don't mean whit for GHES and > eMCA error sources. > > And maybe define a trace_mem_event which is shared by GHES and eMCA and > not use the edac tracepoint there not load ghes_edac on such systems > which have sufficient decoding capability in firmware. > > Thoughts? I thought the primary problem was the conflict with edac core itself. So, if I'm not mistaken, we would have to prevent all edac drivers from loading. Thanks, Naveen -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Oct 15, 2013 at 11:00:53PM +0530, Naveen N. Rao wrote: > I thought the primary problem was the conflict with edac core itself. > So, if I'm not mistaken, we would have to prevent all edac drivers > from loading. That too - I don't see the need for them if the firmware does the decoding.
I see a few problems on this patchset: Em Tue, 15 Oct 2013 23:00:53 +0530 "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> escreveu: > On 10/15/2013 10:30 PM, Borislav Petkov wrote: > > On Tue, Oct 15, 2013 at 10:24:35PM +0530, Naveen N. Rao wrote: > >> On 2013/10/11 02:32AM, Chen Gong wrote: > >>> Use trace interface to elaborate all H/W error related > >>> information. > >>> > >>> Signed-off-by: Chen, Gong <gong.chen@linux.intel.com> > >>> --- > >> <snip> > >>> +TRACE_EVENT(extlog_mem_event, > >>> + TP_PROTO(u32 etype, > >>> + char *dimm_loc, > >>> + const uuid_le *fru_id, Using a custom typedef here seems problematic, as that can make userspace interface more complicated. > >>> + char *fru_text, > >>> + u64 error_count, > >>> + u32 severity, > >>> + u64 phy_addr, > >>> + char *mem_loc), By looking on the rest of the changes, the mem_loc can now contain the right location of the memory error, including on what DIMM the error happened. It can also (optionally) contain the DIMM label. Mangling those information on just one string field seems to be a very bad idea to me, as it prevents to write a generic logic, on userspace, that would apply a per-DIMM threshold policy. Also, userspace needs to know what's the granularity for the error that an eMCA driver will give, in order to adjust its policies. > >> > >> [Adding Mauro...] > >> > >> This looks very similar to the trace event I wrote a while back, > >> which was similar to the one provided by ghes_edac: > >> http://thread.gmane.org/gmane.linux.kernel.pci/24616 > >> > >> Seems to me this has the same issues we previously discussed w.r.t > >> EDAC conflicts... Agreed. > > > > Right, I'm inclined to leave this trace_mc_event in ras_event.h to edac > > use alone because of all those layers which don't mean whit for GHES and > > eMCA error sources. If you don't create the EDAC nodes, it means that userspace doesn't have any glue about what error information will be provided. The right thing to do is, IMHO, add some additional EDAC sysfs nodes that shows what kind of error information will be provided by the device, e. g.: - a complete hardware-based type of information directly obtained from the hardware; - a very poor BIOS-based type of error information, where the provided data is not sufficient to pinpoint to the DIMM where the error actually occurred (what's currently there at ghes_edac); - an eMCA-based type of error information, where the BIOS and ACPI will provide the complete error path, allowing userspace to properly parse the errors as if they come from the hardware-based approach. In any case, this is provided by the EDAC core functions that describe the memory in details. So, IMHO, get rid of EDAC is a big mistake. > > And maybe define a trace_mem_event which is shared by GHES and eMCA and > > not use the edac tracepoint there not load ghes_edac on such systems > > which have sufficient decoding capability in firmware. > > > > Thoughts? > > I thought the primary problem was the conflict with edac core itself. > So, if I'm not mistaken, we would have to prevent all edac drivers from > loading. Yes, this is another aspect of this approach: whatever provided mechanism, the Kernel should assure that one error path won't conflict with the other ones. We know by experience that enabling both BIOS-based and hardware-based mechanisms cause race conditions, with affects both ways. It is also nice to allow the user to choose his preferred mechanism, when more than one is properly supported on a given system. Regards, Mauro (c/c Aristeu, as he might also being working with similar stuff) -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Oct 15, 2013 at 09:43:46PM -0300, Mauro Carvalho Chehab wrote: > Using a custom typedef here seems problematic, as that can make userspace > interface more complicated. It is defined in a userspace header: include/uapi/linux/uuid.h > > >>> + char *fru_text, > > >>> + u64 error_count, > > >>> + u32 severity, > > >>> + u64 phy_addr, > > >>> + char *mem_loc), > > By looking on the rest of the changes, the mem_loc can now contain the > right location of the memory error, including on what DIMM the error > happened. It can also (optionally) contain the DIMM label. No, dimm_loc contains the label. > Also, userspace needs to know what's the granularity for the error > that an eMCA driver will give, in order to adjust its policies. u32 error_count > If you don't create the EDAC nodes, it means that userspace doesn't > have any glue about what error information will be provided. Of course it does - it is a tracepoint. There's no need for EDAC at all with eMCA present on the system. > In any case, this is provided by the EDAC core functions that describe > the memory in details. So, IMHO, get rid of EDAC is a big mistake. No one said we're getting rid of EDAC - we're basically disabling its services on machines with eMCA because we simply don't need it. > It is also nice to allow the user to choose his preferred mechanism, > when more than one is properly supported on a given system. We did this with firmware-first reporting so I don't see any need for user interaction. When you have eMCA on the system, you disable everything else reporting memory errors and go to sleep. So, similar to firmware first. If eMCA turns out to have f*cked BIOS implementations - which I don't doubt - then we can add a chicken bit similar to "acpi=nocmcff" It is that simple.
On Tue, Oct 15, 2013 at 10:24:35PM +0530, Naveen N. Rao wrote: > Date: Tue, 15 Oct 2013 22:24:35 +0530 > From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> > To: "Chen, Gong" <gong.chen@linux.intel.com> > Cc: tony.luck@intel.com, bp@alien8.de, linux-kernel@vger.kernel.org, > linux-acpi@vger.kernel.org, m.chehab@samsung.com > Subject: Re: [PATCH 8/8] ACPI / trace: Add trace interface for eMCA driver > User-Agent: Mutt/1.5.21 (2010-09-15) > > On 2013/10/11 02:32AM, Chen Gong wrote: > > Use trace interface to elaborate all H/W error related > > information. > > > > Signed-off-by: Chen, Gong <gong.chen@linux.intel.com> > > --- > <snip> > > +TRACE_EVENT(extlog_mem_event, > > + TP_PROTO(u32 etype, > > + char *dimm_loc, > > + const uuid_le *fru_id, > > + char *fru_text, > > + u64 error_count, > > + u32 severity, > > + u64 phy_addr, > > + char *mem_loc), > > [Adding Mauro...] > > This looks very similar to the trace event I wrote a while back, > which was similar to the one provided by ghes_edac: > http://thread.gmane.org/gmane.linux.kernel.pci/24616 > > Seems to me this has the same issues we previously discussed w.r.t > EDAC conflicts... > This thread is so long. I have to say I'm lost ... Anyway, it looks like there are many different opnions on this last patch. I will send the 2nd version for further discussion soon.
Em Wed, 16 Oct 2013 11:16:40 +0200 Borislav Petkov <bp@alien8.de> escreveu: > On Tue, Oct 15, 2013 at 09:43:46PM -0300, Mauro Carvalho Chehab wrote: > > > + const uuid_le *fru_id, > > Using a custom typedef here seems problematic, as that can make userspace > > interface more complicated. > It is defined in a userspace header: include/uapi/linux/uuid.h Hmm... a non-packed structure is defined there. Ok, it has 16 bytes. I generally avoid using non-packed structs on uapi, as the compiler alignment rules can cause troubles if kernelspace is compiled with 64 bits, and userspace with 32 bits. In this specific case, it looks ok. > > > > >>> + char *fru_text, > > > >>> + u64 error_count, > > > >>> + u32 severity, > > > >>> + u64 phy_addr, > > > >>> + char *mem_loc), > > > > By looking on the rest of the changes, the mem_loc can now contain the > > right location of the memory error, including on what DIMM the error > > happened. It can also (optionally) contain the DIMM label. > > No, dimm_loc contains the label. Yeah, right. This is badly named, then. Anyway, the dimm_loc is filled from DMI table by dmi_memdev_name(). Based on past experiences, this can be problematic, as manufactures tend to re-use the same DMI table on machines with different layouts. Tony/Chen, Are there any changes with regards to that, like some enforcement policy for BIOS manufacturers to make it right? If not, then I think we still need EDAC's code to allow overriding those labels by the ones actually found on the system. > > Also, userspace needs to know what's the granularity for the error > > that an eMCA driver will give, in order to adjust its policies. > > u32 error_count I'm talking about granularity, not error count. I mean: how the memory is organized, what happens with errors that could be affecting more than one DIMM (for example, on mirrored memories), etc. Userspace can only do something useful if it knows what kind of support the hardware error mechanism is providing. > > If you don't create the EDAC nodes, it means that userspace doesn't > > have any glue about what error information will be provided. > > Of course it does - it is a tracepoint. There's no need for EDAC at all > with eMCA present on the system. Well, try to write some code on userspace to discover what's the error. An error threshold mechanism on userspace will only work if userspace knows that the error belongs to the same DIMM. If several different DIMMs are showing errors at the very same channel, and replacing those dimms don't happen, that likely means that the channel BUS is damaged. What I'm saying is that hiding those information from userspace doesn't help at all to improve the quality of the error report mechanism. I can't see any reason why hiding those information from userspace. The tracepoint interface is not for that. Such data is provided via sysfs, and should be used for the application to properly tune their algorithms. > > In any case, this is provided by the EDAC core functions that describe > > the memory in details. So, IMHO, get rid of EDAC is a big mistake. > > No one said we're getting rid of EDAC - we're basically disabling its > services on machines with eMCA because we simply don't need it. We do need, if we want do do a good job decoding the errors. > > It is also nice to allow the user to choose his preferred mechanism, > > when more than one is properly supported on a given system. > > We did this with firmware-first reporting so I don't see any need > for user interaction. When you have eMCA on the system, you disable > everything else reporting memory errors and go to sleep. So, similar to > firmware first. > > If eMCA turns out to have f*cked BIOS implementations - which I don't > doubt - then we can add a chicken bit similar to "acpi=nocmcff" > > It is that simple. > Works for me. Regards, Mauro -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Oct 16, 2013 at 07:35:39AM -0300, Mauro Carvalho Chehab wrote: > Well, try to write some code on userspace to discover what's the error. > > An error threshold mechanism on userspace will only work if userspace > knows that the error belongs to the same DIMM. Just read the first mail again: <idle>-0 [000] d.h. 56068.488759: extlog_mem_event: 3 corrected errors:unknown on Memriser1 CHANNEL A DIMM 0(FRU: 00000000-0000 -0000-0000-000000000000 physical addr: 0x0000000851fe0000 node: 0 card: 0 module: 0 rank: 0 bank: 0 row: 28927 column: 1296)
Btw, I don't know what's the problem but when I hit reply-to-all to your emails, mutt drops your email address from the To: and makes the CC: list become the To: list. Strange. On Wed, Oct 16, 2013 at 05:50:30AM -0400, Chen Gong wrote: > This thread is so long. I have to say I'm lost ... Anyway, it looks > like there are many different opnions on this last patch. Why, I think it is fine. The only compatibility issue I see is if we're going to share the tracepoint with Naveen's GHES reporting stuff we were discussing a couple of weeks ago. But AFAIR, we still needed EDAC assistance there while here there's no need for that. > I will send the 2nd version for further discussion soon. Yes please. :) Thanks.
Em Wed, 16 Oct 2013 12:42:21 +0200 Borislav Petkov <bp@alien8.de> escreveu: > On Wed, Oct 16, 2013 at 07:35:39AM -0300, Mauro Carvalho Chehab wrote: > > Well, try to write some code on userspace to discover what's the error. > > > > An error threshold mechanism on userspace will only work if userspace > > knows that the error belongs to the same DIMM. > > Just read the first mail again: > > <idle>-0 [000] d.h. 56068.488759: extlog_mem_event: 3 corrected errors:unknown on Memriser1 CHANNEL A DIMM 0(FRU: 00000000-0000 > -0000-0000-000000000000 physical addr: 0x0000000851fe0000 node: 0 card: 0 module: 0 rank: 0 bank: 0 row: 28927 column: 1296) On that log, "physical addr: 0x0000000851fe0000 node: 0 card: 0 module: 0 rank: 0 bank: 0 row: 28927 column: 1296" is a string, instead of an hierarchical position, like what it is provided on EDAC. Worse than that, not all data may be available, as CPER allows to ommit some data. Also, I suspect that, if an error happens to affect more than one DIMM (e. g. part of the location is not available for a given error), that the DIMM label will also not be properly shown. Also, writing the userspace counterpart that would work properly is extremely hard, if the information about the memory layout is not known in advance. So, in practice, if the above memory error is provided, all userspace will likely be able to do is to store it and require someone to manually identify what's happening. On the other hand, if node, channel and dimm number information is properly filled (like it happens on EDAC), usersapce can rely on those data, in order to apply per dimm, per channel and per node thresholds. It may even use the physical address to identify if the problem is only on a certain region of a physical DIMM and poison that region, while it is not possible to replace the damaged component. Regards, Mauro -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Oct 16, 2013 at 08:55:58AM -0300, Mauro Carvalho Chehab wrote: > On that log, "physical addr: 0x0000000851fe0000 node: 0 card: 0 module: 0 rank: 0 bank: 0 row: 28927 column: 1296" > is a string, instead of an hierarchical position, like what it is provided > on EDAC. So you can't split strings in userspace? > Also, I suspect that, if an error happens to affect more than one DIMM > (e. g. part of the location is not available for a given error), that > the DIMM label will also not be properly shown. That's unrelated to these patches and needs to be reported properly by the firmware.
> Are there any changes with regards to that, like some enforcement policy > for BIOS manufacturers to make it right? I am using a cricket bat to beat BIOS teams that implement eMCA to make sure they get the labels in SMBIOS right. :-) It's a non-trivial implementation effort to get all the decoding done for the eMCA log. It's typically a simple string change to fix the SMBIOS table ... there has been some resistance - but so far I have been victorious. Wish me luck as I'm about to engage with another team. -Tony -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> Also, I suspect that, if an error happens to affect more than one DIMM > (e. g. part of the location is not available for a given error), > that the DIMM label will also not be properly shown. There are a couple of cases here: 1) There are a number of DIMMs behind some flaky h/w that introduces errors that are apparently blamed onto each of those DIMMs. All we can do here is statistical correlations ... each error is reported independently, it is up to some entity to notice the higher level topology connection. There is enough information in the UEFI error record to do that (assuming that BIOS filled out the necessary fields). 2) There is a single reported error that spans more than one DIMM. This can happen with a UC error in a pair of lock-step DIMMs. Since the error is UC we know that two (or more) bits are bad. But we have no way to tell whether the bad bits came from the same DIMM, or one bit from each (because we don't know which bits are bad - if we knew that, we could fix them :-) The eMCA case should log two subsections in this case - one for each of the lockstep DIMMs involved. A user seeing this will should probably just replace both DIMMs to be safe. If they wanted to diagnose further they should swap DIMMs around so this pair are no longer lockstepped and see if they start seeing correctable errors from each of the split pair - or if the UC errors move with one or the other of the DIMMs -Tony -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Em Wed, 16 Oct 2013 20:35:32 +0000 "Luck, Tony" <tony.luck@intel.com> escreveu: > > Are there any changes with regards to that, like some enforcement policy > > for BIOS manufacturers to make it right? > > I am using a cricket bat to beat BIOS teams that implement eMCA to make > sure they get the labels in SMBIOS right. :-) :) > It's a non-trivial implementation effort to get all the decoding done for > the eMCA log. It's typically a simple string change to fix the SMBIOS > table ... there has been some resistance - but so far I have been victorious. > Wish me luck as I'm about to engage with another team. Yes, the DMI changes are trivial. Let's hope that they'll do it right. Yet, I think we should keep (at least for a while) a mechanism similar to what EDAC does, allowing those names to be overridden by loading a different table, where needed. Regards, Mauro -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Em Wed, 16 Oct 2013 20:47:05 +0000 "Luck, Tony" <tony.luck@intel.com> escreveu: > > Also, I suspect that, if an error happens to affect more than one DIMM > > (e. g. part of the location is not available for a given error), > > that the DIMM label will also not be properly shown. > > There are a couple of cases here: > > 1) There are a number of DIMMs behind some flaky h/w that introduces errors > that are apparently blamed onto each of those DIMMs. > > All we can do here is statistical correlations ... each error is reported independently, > it is up to some entity to notice the higher level topology connection. There is enough > information in the UEFI error record to do that (assuming that BIOS filled out the > necessary fields). > > 2) There is a single reported error that spans more than one DIMM. > > This can happen with a UC error in a pair of lock-step DIMMs. Since the error is UC > we know that two (or more) bits are bad. But we have no way to tell whether the > bad bits came from the same DIMM, or one bit from each (because we don't know > which bits are bad - if we knew that, we could fix them :-) The eMCA case should > log two subsections in this case - one for each of the lockstep DIMMs involved. A user > seeing this will should probably just replace both DIMMs to be safe. If they wanted to > diagnose further they should swap DIMMs around so this pair are no longer lockstepped > and see if they start seeing correctable errors from each of the split pair - or if the UC > errors move with one or the other of the DIMMs There's also a third case: mirrored memories. As a matter of coherency with hw-based reports, for cases (2) and (3), the error tracing should be displaying both memories that are affected by a UC error (or a CE error on a mirrored address space). Regards, Mauro -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> There's also a third case: mirrored memories.
Mirrors are currently something of a mess - we don't get any useful notification when one breaks.
We do need to fix this - and make sure reporting is properly integrated with everything else - I'm
just not sure how to do this.
-Tony
--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
On 10/16/2013 04:19 PM, Borislav Petkov wrote: > Btw, I don't know what's the problem but when I hit reply-to-all to your > emails, mutt drops your email address from the To: and makes the CC: > list become the To: list. Strange. I'm seeing the same thing. Looking at the headers, Chen Gong's email includes a Mail-Follow-Up field which could be the problem. - Naveen -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 1465fa8..9ea343e 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -372,12 +372,17 @@ config ACPI_BGRT source "drivers/acpi/apei/Kconfig" +config EXTLOG_TRACE + def_bool n + config ACPI_EXTLOG tristate "Extended Error Log support" depends on X86 && X86_MCE + select EXTLOG_TRACE default n help This driver adds support for decoding extended errors from hardware. - which allows the operating system to obtain data from trace. + which allows the operating system to obtain data from trace. It will + appear under /sys/kernel/debug/tracing/ras/ . endif # ACPI diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index bce34af..a6e41b7 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -83,4 +83,8 @@ obj-$(CONFIG_ACPI_PROCESSOR_AGGREGATOR) += acpi_pad.o obj-$(CONFIG_ACPI_APEI) += apei/ +# extended log support +acpi-$(CONFIG_EXTLOG_TRACE) += extlog_trace.o +CFLAGS_extlog_trace.o := -I$(src) + obj-$(CONFIG_ACPI_EXTLOG) += acpi_extlog.o diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index 3e3e286..ca51eb0 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -26,6 +26,7 @@ #include <asm/mce.h> #include "apei/apei-internal.h" +#include "debug_extlog.h" #define EXT_ELOG_ENTRY_MASK 0xfffffffffffff /* elog entry address mask */ @@ -55,6 +56,8 @@ struct extlog_l1_head { static u8 extlog_dsm_uuid[] = "663E35AF-CC10-41A4-88EA-5470AF055295"; +static const uuid_le invalid_uuid = NULL_UUID_LE; + /* L1 table related physical address */ static u64 elog_base; static size_t elog_size; @@ -143,7 +146,12 @@ static int print_extlog_rcd(const char *pfx, static int extlog_print(const char *pfx, int cpu, int bank) { - struct acpi_generic_status *estatus; + struct acpi_generic_status *estatus, *tmp; + struct acpi_generic_data *gdata; + const uuid_le *fru_id = &invalid_uuid; + char *fru_text = ""; + uuid_le *sec_type; + static u64 err_count; int rc; estatus = extlog_elog_entry_check(cpu, bank); @@ -154,7 +162,23 @@ static int extlog_print(const char *pfx, int cpu, int bank) /* clear record status to enable BIOS to update it again */ estatus->block_status = 0; - rc = print_extlog_rcd(pfx, (struct acpi_generic_status *)elog_buf, cpu); + tmp = (struct acpi_generic_status *)elog_buf; + gdata = (struct acpi_generic_data *)(tmp + 1); + rc = print_extlog_rcd(pfx, tmp, cpu); + + /* trace extended error log */ + err_count++; + if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID) + fru_id = (uuid_le *)gdata->fru_id; + if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT) + fru_text = gdata->fru_text; + sec_type = (uuid_le *)gdata->section_type; + if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) { + struct cper_sec_mem_err *mem_err = (void *)(gdata + 1); + if (gdata->error_data_length >= sizeof(*mem_err)) + trace_mem_error(fru_id, fru_text, err_count, + gdata->error_severity, mem_err); + } return rc; } diff --git a/drivers/acpi/apei/cper.c b/drivers/acpi/apei/cper.c index 567410e..0b4cfad 100644 --- a/drivers/acpi/apei/cper.c +++ b/drivers/acpi/apei/cper.c @@ -56,11 +56,12 @@ static const char *cper_severity_strs[] = { "info", }; -static const char *cper_severity_str(unsigned int severity) +const char *cper_severity_str(unsigned int severity) { return severity < ARRAY_SIZE(cper_severity_strs) ? cper_severity_strs[severity] : "unknown"; } +EXPORT_SYMBOL_GPL(cper_severity_str); /* * cper_print_bits - print strings for set bits @@ -195,6 +196,13 @@ static const char *cper_mem_err_type_strs[] = { "Physical Memory Map-out event", }; +const char *cper_mem_err_type_str(unsigned int etype) +{ + return etype < ARRAY_SIZE(cper_mem_err_type_strs) ? + cper_mem_err_type_strs[etype] : "unknown"; +} +EXPORT_SYMBOL_GPL(cper_mem_err_type_str); + static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem) { if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS) @@ -232,8 +240,7 @@ static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem) if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) { u8 etype = mem->error_type; printk("%s""error_type: %d, %s\n", pfx, etype, - etype < ARRAY_SIZE(cper_mem_err_type_strs) ? - cper_mem_err_type_strs[etype] : "unknown"); + cper_mem_err_type_str(etype)); } if (mem->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) { const char *bank = NULL, *device = NULL; diff --git a/drivers/acpi/debug_extlog.h b/drivers/acpi/debug_extlog.h new file mode 100644 index 0000000..67bb2c5 --- /dev/null +++ b/drivers/acpi/debug_extlog.h @@ -0,0 +1,16 @@ +#ifndef __DEBUG_EXTLOG_H +#define __DEBUG_EXTLOG_H + +#include <linux/cper.h> + +#ifdef CONFIG_EXTLOG_TRACE +extern void trace_mem_error(const uuid_le *fru_id, char *fru_text, + u64 err_count, u32 severity, struct cper_sec_mem_err *mem); +#else +void trace_mem_error(const uuid_le *fru_id, char *fru_text, + u64 err_count, u32 severity, struct cper_sec_mem_err *mem) +{ +} +#endif + +#endif diff --git a/drivers/acpi/extlog_trace.c b/drivers/acpi/extlog_trace.c new file mode 100644 index 0000000..2b2824c --- /dev/null +++ b/drivers/acpi/extlog_trace.c @@ -0,0 +1,105 @@ +#include <linux/export.h> +#include <linux/dmi.h> +#include "debug_extlog.h" + +#define CREATE_TRACE_POINTS +#include "extlog_trace.h" + +static char mem_location[LOC_LEN]; +static char dimm_location[LOC_LEN]; + +static void mem_err_location(struct cper_sec_mem_err *mem) +{ + char *p; + u32 n = 0; + + memset(mem_location, 0, LOC_LEN); + p = mem_location; + if (mem->validation_bits & CPER_MEM_VALID_NODE) + n += sprintf(p + n, " node: %d", mem->node); + if (n >= LOC_LEN) + goto end; + if (mem->validation_bits & CPER_MEM_VALID_CARD) + n += sprintf(p + n, " card: %d", mem->card); + if (n >= LOC_LEN) + goto end; + if (mem->validation_bits & CPER_MEM_VALID_MODULE) + n += sprintf(p + n, " module: %d", mem->module); + if (n >= LOC_LEN) + goto end; + if (mem->validation_bits & CPER_MEM_VALID_RANK_NUMBER) + n += sprintf(p + n, " rank: %d", mem->rank); + if (n >= LOC_LEN) + goto end; + if (mem->validation_bits & CPER_MEM_VALID_BANK) + n += sprintf(p + n, " bank: %d", mem->bank); + if (n >= LOC_LEN) + goto end; + if (mem->validation_bits & CPER_MEM_VALID_DEVICE) + n += sprintf(p + n, " device: %d", mem->device); + if (n >= LOC_LEN) + goto end; + if (mem->validation_bits & CPER_MEM_VALID_ROW) + n += sprintf(p + n, " row: %d", mem->row); + if (n >= LOC_LEN) + goto end; + if (mem->validation_bits & CPER_MEM_VALID_COLUMN) + n += sprintf(p + n, " column: %d", mem->column); + if (n >= LOC_LEN) + goto end; + if (mem->validation_bits & CPER_MEM_VALID_BIT_POSITION) + n += sprintf(p + n, " bit_position: %d", mem->bit_pos); + if (n >= LOC_LEN) + goto end; + if (mem->validation_bits & CPER_MEM_VALID_REQUESTOR_ID) + n += sprintf(p + n, " requestor_id: 0x%016llx", + mem->requestor_id); + if (n >= LOC_LEN) + goto end; + if (mem->validation_bits & CPER_MEM_VALID_RESPONDER_ID) + n += sprintf(p + n, " responder_id: 0x%016llx", + mem->responder_id); + if (n >= LOC_LEN) + goto end; + if (mem->validation_bits & CPER_MEM_VALID_TARGET_ID) + n += sprintf(p + n, " target_id: 0x%016llx", mem->target_id); +end: + return; +} + +static void dimm_err_location(struct cper_sec_mem_err *mem) +{ + memset(dimm_location, 0, LOC_LEN); + if (mem->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) { + const char *bank = NULL, *device = NULL; + dmi_memdev_name(mem->mem_dev_handle, &bank, &device); + if (bank != NULL && device != NULL) + snprintf(dimm_location, LOC_LEN - 1, + "%s %s", bank, device); + else + snprintf(dimm_location, LOC_LEN - 1, + "DMI handle: 0x%.4x", mem->mem_dev_handle); + } +} + +void trace_mem_error(const uuid_le *fru_id, char *fru_text, + u64 err_count, u32 severity, struct cper_sec_mem_err *mem) +{ + u32 etype = ~0U; + u64 phy_addr = 0; + + if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) + etype = mem->error_type; + if (mem->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) { + phy_addr = mem->physical_addr; + if (mem->validation_bits & + CPER_MEM_VALID_PHYSICAL_ADDRESS_MASK) + phy_addr &= mem->physical_addr_mask; + } + mem_err_location(mem); + dimm_err_location(mem); + + trace_extlog_mem_event(etype, dimm_location, fru_id, fru_text, + err_count, severity, phy_addr, mem_location); +} +EXPORT_SYMBOL_GPL(trace_mem_error); diff --git a/drivers/acpi/extlog_trace.h b/drivers/acpi/extlog_trace.h new file mode 100644 index 0000000..21f0887 --- /dev/null +++ b/drivers/acpi/extlog_trace.h @@ -0,0 +1,77 @@ +#if !defined(_TRACE_EXTLOG_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EXTLOG_H + +#include <linux/tracepoint.h> +#include <linux/cper.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM extlog + +/* + * MCE Extended Error Log Trace event + * + * These events are generated when hardware detects a corrected or + * uncorrected event. + * + */ + +/* memory trace event */ + +#define LOC_LEN 512 +#define MSG_LEN ((LOC_LEN) * 2) + +TRACE_EVENT(extlog_mem_event, + TP_PROTO(u32 etype, + char *dimm_loc, + const uuid_le *fru_id, + char *fru_text, + u64 error_count, + u32 severity, + u64 phy_addr, + char *mem_loc), + + TP_ARGS(etype, dimm_loc, fru_id, fru_text, error_count, severity, + phy_addr, mem_loc), + + TP_STRUCT__entry( + __field(u32, etype) + __dynamic_array(char, dimm_info, LOC_LEN) + __field(u64, error_count) + __field(u32, severity) + __dynamic_array(char, msg, MSG_LEN) + ), + + TP_fast_assign( + __entry->error_count = error_count; + __entry->severity = severity; + __entry->etype = etype; + if (dimm_loc[0] != '\0') + snprintf(__get_dynamic_array(dimm_info), LOC_LEN - 1, + "on %s", dimm_loc); + else + __assign_str(dimm_info, ""); + if (phy_addr != 0) + snprintf(__get_dynamic_array(msg), MSG_LEN - 1, + "(FRU: %pUl %.20s physical addr: 0x%016llx%s)", + fru_id, fru_text, phy_addr, mem_loc); + else + __assign_str(msg, ""); + ), + + TP_printk("%llu %s error%s:%s %s%s", + __entry->error_count, + cper_severity_str(__entry->severity), + __entry->error_count > 1 ? "s" : "", + cper_mem_err_type_str(__entry->etype), + __get_str(dimm_info), + __get_str(msg)) +); + +#endif /* _TRACE_EXTLOG_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE extlog_trace +#include <trace/define_trace.h> diff --git a/include/linux/cper.h b/include/linux/cper.h index bd01c9a..c00eb55 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -395,6 +395,8 @@ struct cper_sec_pcie { #pragma pack() u64 cper_next_record_id(void); +const char *cper_severity_str(unsigned int); +const char *cper_mem_err_type_str(unsigned int); void cper_print_bits(const char *prefix, unsigned int bits, const char *strs[], unsigned int strs_size);
Use trace interface to elaborate all H/W error related information. Signed-off-by: Chen, Gong <gong.chen@linux.intel.com> --- drivers/acpi/Kconfig | 7 ++- drivers/acpi/Makefile | 4 ++ drivers/acpi/acpi_extlog.c | 28 +++++++++++- drivers/acpi/apei/cper.c | 13 ++++-- drivers/acpi/debug_extlog.h | 16 +++++++ drivers/acpi/extlog_trace.c | 105 ++++++++++++++++++++++++++++++++++++++++++++ drivers/acpi/extlog_trace.h | 77 ++++++++++++++++++++++++++++++++ include/linux/cper.h | 2 + 8 files changed, 246 insertions(+), 6 deletions(-) create mode 100644 drivers/acpi/debug_extlog.h create mode 100644 drivers/acpi/extlog_trace.c create mode 100644 drivers/acpi/extlog_trace.h