diff mbox

[V8,08/10] ras: acpi / apei: generate trace event for unrecognized CPER section

Message ID 1485969413-23577-9-git-send-email-tbaicar@codeaurora.org (mailing list archive)
State New, archived
Headers show

Commit Message

Tyler Baicar Feb. 1, 2017, 5:16 p.m. UTC
UEFI spec allows for non-standard section in Common Platform Error
Record. This is defined in section N.2.3 of UEFI version 2.5.

Currently if the CPER section's type (UUID) does not match with
any section type that the kernel knows how to parse, trace event
is not generated for such section. And thus user is not able to know
happening of such hardware error, including error record of
non-standard section.

This commit generates a trace event which contains raw error data
for unrecognized CPER section.

Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
Signed-off-by: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
---
 drivers/acpi/apei/ghes.c | 22 ++++++++++++++++++++--
 drivers/ras/ras.c        |  1 +
 include/ras/ras_event.h  | 45 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 2 deletions(-)

Comments

kernel test robot Feb. 1, 2017, 11:20 p.m. UTC | #1
Hi Tyler,

[auto build test ERROR on pm/linux-next]
[also build test ERROR on v4.10-rc6 next-20170201]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Tyler-Baicar/Add-UEFI-2-6-and-ACPI-6-1-updates-for-RAS-on-ARM64/20170202-020320
base:   https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git linux-next
config: x86_64-randconfig-r0-02020102 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

   drivers/built-in.o: In function `ghes_do_proc.isra.9':
>> ghes.c:(.text+0x92f3a): undefined reference to `__tracepoint_unknown_sec_event'
   ghes.c:(.text+0x92f73): undefined reference to `__tracepoint_unknown_sec_event'
   ghes.c:(.text+0x92fdb): undefined reference to `__tracepoint_unknown_sec_event'

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Steven Rostedt Feb. 15, 2017, 3:52 p.m. UTC | #2
On Wed,  1 Feb 2017 10:16:51 -0700
Tyler Baicar <tbaicar@codeaurora.org> wrote:

> @@ -452,11 +454,21 @@ static void ghes_do_proc(struct ghes *ghes,
>  {
>  	int sev, sec_sev;
>  	struct acpi_hest_generic_data *gdata;
> +	uuid_le sec_type;
> +	uuid_le *fru_id = &NULL_UUID_LE;
> +	char *fru_text = "";
>  
>  	sev = ghes_severity(estatus->error_severity);
>  	apei_estatus_for_each_section(estatus, gdata) {
>  		sec_sev = ghes_severity(gdata->error_severity);
> -		if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
> +		sec_type = *(uuid_le *)gdata->section_type;
> +
> +		if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
> +			fru_id = (uuid_le *)gdata->fru_id;
> +		if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
> +			fru_text = gdata->fru_text;
> +
> +		if (!uuid_le_cmp(sec_type,
>  				 CPER_SEC_PLATFORM_MEM)) {
>  			struct cper_sec_mem_err *mem_err;
>  
> @@ -467,7 +479,7 @@ static void ghes_do_proc(struct ghes *ghes,
>  			ghes_handle_memory_failure(gdata, sev);
>  		}
>  #ifdef CONFIG_ACPI_APEI_PCIEAER
> -		else if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
> +		else if (!uuid_le_cmp(sec_type,
>  				      CPER_SEC_PCIE)) {
>  			struct cper_sec_pcie *pcie_err;
>  
> @@ -500,6 +512,12 @@ static void ghes_do_proc(struct ghes *ghes,
>  
>  		}
>  #endif
> +		else {

As an optimization, you can add:

		else if (trace_unknown_sec_event_enabled()) {

instead, as then this wont be called unless the tracepoint is
activated. Will keep the logic from doing anything with
acpi_hest_generic_data_payload().

Note, that trace_*_enabled() is activated via a jump_label, thus
there's no branches involved.

-- Steve

> +			void *unknown_err = acpi_hest_generic_data_payload(gdata);
> +			trace_unknown_sec_event(&sec_type,
> +					fru_id, fru_text, sec_sev,
> +					unknown_err, gdata->error_data_length);
> +		}
>  	}
>  }
>  
>
Tyler Baicar Feb. 15, 2017, 4:54 p.m. UTC | #3
On 2/15/2017 8:52 AM, Steven Rostedt wrote:
> On Wed,  1 Feb 2017 10:16:51 -0700
> Tyler Baicar <tbaicar@codeaurora.org> wrote:
>
>> @@ -452,11 +454,21 @@ static void ghes_do_proc(struct ghes *ghes,
>>   {
>>   	int sev, sec_sev;
>>   	struct acpi_hest_generic_data *gdata;
>> +	uuid_le sec_type;
>> +	uuid_le *fru_id = &NULL_UUID_LE;
>> +	char *fru_text = "";
>>   
>>   	sev = ghes_severity(estatus->error_severity);
>>   	apei_estatus_for_each_section(estatus, gdata) {
>>   		sec_sev = ghes_severity(gdata->error_severity);
>> -		if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
>> +		sec_type = *(uuid_le *)gdata->section_type;
>> +
>> +		if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
>> +			fru_id = (uuid_le *)gdata->fru_id;
>> +		if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
>> +			fru_text = gdata->fru_text;
>> +
>> +		if (!uuid_le_cmp(sec_type,
>>   				 CPER_SEC_PLATFORM_MEM)) {
>>   			struct cper_sec_mem_err *mem_err;
>>   
>> @@ -467,7 +479,7 @@ static void ghes_do_proc(struct ghes *ghes,
>>   			ghes_handle_memory_failure(gdata, sev);
>>   		}
>>   #ifdef CONFIG_ACPI_APEI_PCIEAER
>> -		else if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
>> +		else if (!uuid_le_cmp(sec_type,
>>   				      CPER_SEC_PCIE)) {
>>   			struct cper_sec_pcie *pcie_err;
>>   
>> @@ -500,6 +512,12 @@ static void ghes_do_proc(struct ghes *ghes,
>>   
>>   		}
>>   #endif
>> +		else {
> As an optimization, you can add:
>
> 		else if (trace_unknown_sec_event_enabled()) {
>
> instead, as then this wont be called unless the tracepoint is
> activated. Will keep the logic from doing anything with
> acpi_hest_generic_data_payload().
>
> Note, that trace_*_enabled() is activated via a jump_label, thus
> there's no branches involved.
>
> -- Steve
Hello Steve,

In v9 I currently have this and the ARM trace event from this series 
both wrapped in an
ifdef verifying that CONFIG_RAS is enabled. This resolves the kbuild 
failures and
will have this code compiled out when that config isn't enabled. Do you 
think I
should use the ifdef or use these *_enabled() functions?

Thanks,
Tyler
>> +			void *unknown_err = acpi_hest_generic_data_payload(gdata);
>> +			trace_unknown_sec_event(&sec_type,
>> +					fru_id, fru_text, sec_sev,
>> +					unknown_err, gdata->error_data_length);
>> +		}
>>   	}
>>   }
>>   
>>
Steven Rostedt Feb. 15, 2017, 5:03 p.m. UTC | #4
On Wed, 15 Feb 2017 09:54:09 -0700
"Baicar, Tyler" <tbaicar@codeaurora.org> wrote:
\
> 
> In v9 I currently have this and the ARM trace event from this series 
> both wrapped in an
> ifdef verifying that CONFIG_RAS is enabled. This resolves the kbuild 
> failures and
> will have this code compiled out when that config isn't enabled. Do you 
> think I
> should use the ifdef or use these *_enabled() functions?
> 

I believe you need both. The *_enabled() functions wont prevent build
errors. But since tracing is seldom enabled (usually only when you want
to see what's happening), the *_enabled() functions optimize the code
to not perform tasks that are only needed when tracing is enabled. When
I say "enabled" I mean actively tracing, as supposed to being just
enabled in the kernel.

-- Steve
Tyler Baicar Feb. 15, 2017, 5:06 p.m. UTC | #5
On 2/15/2017 10:03 AM, Steven Rostedt wrote:
> On Wed, 15 Feb 2017 09:54:09 -0700
> "Baicar, Tyler" <tbaicar@codeaurora.org> wrote:
> \
>> In v9 I currently have this and the ARM trace event from this series
>> both wrapped in an
>> ifdef verifying that CONFIG_RAS is enabled. This resolves the kbuild
>> failures and
>> will have this code compiled out when that config isn't enabled. Do you
>> think I
>> should use the ifdef or use these *_enabled() functions?
>>
> I believe you need both. The *_enabled() functions wont prevent build
> errors. But since tracing is seldom enabled (usually only when you want
> to see what's happening), the *_enabled() functions optimize the code
> to not perform tasks that are only needed when tracing is enabled. When
> I say "enabled" I mean actively tracing, as supposed to being just
> enabled in the kernel.
>
> -- Steve
Got it, I will add both!

Thanks,
Tyler
diff mbox

Patch

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 86c1f15..a989345 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -44,11 +44,13 @@ 
 #include <linux/pci.h>
 #include <linux/aer.h>
 #include <linux/nmi.h>
+#include <linux/uuid.h>
 
 #include <acpi/actbl1.h>
 #include <acpi/ghes.h>
 #include <acpi/apei.h>
 #include <asm/tlbflush.h>
+#include <ras/ras_event.h>
 
 #include "apei-internal.h"
 
@@ -452,11 +454,21 @@  static void ghes_do_proc(struct ghes *ghes,
 {
 	int sev, sec_sev;
 	struct acpi_hest_generic_data *gdata;
+	uuid_le sec_type;
+	uuid_le *fru_id = &NULL_UUID_LE;
+	char *fru_text = "";
 
 	sev = ghes_severity(estatus->error_severity);
 	apei_estatus_for_each_section(estatus, gdata) {
 		sec_sev = ghes_severity(gdata->error_severity);
-		if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
+		sec_type = *(uuid_le *)gdata->section_type;
+
+		if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
+			fru_id = (uuid_le *)gdata->fru_id;
+		if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
+			fru_text = gdata->fru_text;
+
+		if (!uuid_le_cmp(sec_type,
 				 CPER_SEC_PLATFORM_MEM)) {
 			struct cper_sec_mem_err *mem_err;
 
@@ -467,7 +479,7 @@  static void ghes_do_proc(struct ghes *ghes,
 			ghes_handle_memory_failure(gdata, sev);
 		}
 #ifdef CONFIG_ACPI_APEI_PCIEAER
-		else if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
+		else if (!uuid_le_cmp(sec_type,
 				      CPER_SEC_PCIE)) {
 			struct cper_sec_pcie *pcie_err;
 
@@ -500,6 +512,12 @@  static void ghes_do_proc(struct ghes *ghes,
 
 		}
 #endif
+		else {
+			void *unknown_err = acpi_hest_generic_data_payload(gdata);
+			trace_unknown_sec_event(&sec_type,
+					fru_id, fru_text, sec_sev,
+					unknown_err, gdata->error_data_length);
+		}
 	}
 }
 
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index b67dd36..fb2500b 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -27,3 +27,4 @@  static int __init ras_init(void)
 EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
 #endif
 EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
+EXPORT_TRACEPOINT_SYMBOL_GPL(unknown_sec_event);
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index 1791a12..5861b6f 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -162,6 +162,51 @@ 
 );
 
 /*
+ * Unknown Section Report
+ *
+ * This event is generated when hardware detected a hardware
+ * error event, which may be of non-standard section as defined
+ * in UEFI spec appendix "Common Platform Error Record", or may
+ * be of sections for which TRACE_EVENT is not defined.
+ *
+ */
+TRACE_EVENT(unknown_sec_event,
+
+	TP_PROTO(const uuid_le *sec_type,
+		 const uuid_le *fru_id,
+		 const char *fru_text,
+		 const u8 sev,
+		 const u8 *err,
+		 const u32 len),
+
+	TP_ARGS(sec_type, fru_id, fru_text, sev, err, len),
+
+	TP_STRUCT__entry(
+		__array(char, sec_type, 16)
+		__array(char, fru_id, 16)
+		__string(fru_text, fru_text)
+		__field(u8, sev)
+		__field(u32, len)
+		__dynamic_array(u8, buf, len)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->sec_type, sec_type, sizeof(uuid_le));
+		memcpy(__entry->fru_id, fru_id, sizeof(uuid_le));
+		__assign_str(fru_text, fru_text);
+		__entry->sev = sev;
+		__entry->len = len;
+		memcpy(__get_dynamic_array(buf), err, len);
+	),
+
+	TP_printk("severity: %d; sec type:%pU; FRU: %pU %s; data len:%d; raw data:%s",
+		  __entry->sev, __entry->sec_type,
+		  __entry->fru_id, __get_str(fru_text),
+		  __entry->len,
+		  __print_hex(__get_dynamic_array(buf), __entry->len))
+);
+
+/*
  * PCIe AER Trace event
  *
  * These events are generated when hardware detects a corrected or