diff mbox

[v3,1/8] trace: ras: add ARM processor error information trace event

Message ID 1490869877-118713-11-git-send-email-xiexiuqi@huawei.com (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Xie XiuQi March 30, 2017, 10:31 a.m. UTC
Add a new trace event for ARM processor error information, so that
the user will know what error occurred. With this information the
user may take appropriate action.

These trace events are consistent with the ARM processor error
information table which is defined in UEFI 2.6 spec section N.2.4.4.1.

---
v2: add trace enabled condition per Steven's suggestion.
    fix a typo.
---

Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tyler Baicar <tbaicar@codeaurora.org>
Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 drivers/acpi/apei/ghes.c | 10 ++++++
 include/linux/cper.h     |  5 +++
 include/ras/ras_event.h  | 87 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 102 insertions(+)

Comments

Tyler Baicar April 14, 2017, 8:36 p.m. UTC | #1
On 3/30/2017 4:31 AM, Xie XiuQi wrote:
> Add a new trace event for ARM processor error information, so that
> the user will know what error occurred. With this information the
> user may take appropriate action.
>
> These trace events are consistent with the ARM processor error
> information table which defined in UEFI 2.6 spec section N.2.4.4.1.
>
> ---
> v2: add trace enabled condition as Steven's suggestion.
>      fix a typo.
> ---
>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Tyler Baicar <tbaicar@codeaurora.org>
> Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
> ---
...
>   
> +#define ARM_PROC_ERR_TYPE	\
> +	EM ( CPER_ARM_INFO_TYPE_CACHE, "cache error" )	\
> +	EM ( CPER_ARM_INFO_TYPE_TLB,  "TLB error" )	\
> +	EM ( CPER_ARM_INFO_TYPE_BUS, "bus error" )	\
> +	EMe ( CPER_ARM_INFO_TYPE_UARCH, "micro-architectural error" )
> +
> +#define ARM_PROC_ERR_FLAGS	\
> +	EM ( CPER_ARM_INFO_FLAGS_FIRST, "First error captured" )	\
> +	EM ( CPER_ARM_INFO_FLAGS_LAST,  "Last error captured" )	\
> +	EM ( CPER_ARM_INFO_FLAGS_PROPAGATED, "Propagated" )	\
> +	EMe ( CPER_ARM_INFO_FLAGS_OVERFLOW, "Overflow" )
> +
Hello Xie XiuQi,

This isn't compiling for me because of these definitions. Here you are 
using ARM_*, but below in the TP_printk you are using ARCH_*. The 
compiler complains the ARCH_* ones are undefined:

./include/trace/../../include/ras/ras_event.h:278:37: error: 
'ARCH_PROC_ERR_TYPE' undeclared (first use in this function)
      __print_symbolic(__entry->type, ARCH_PROC_ERR_TYPE),
./include/trace/../../include/ras/ras_event.h:280:38: error: 
'ARCH_PROC_ERR_FLAGS' undeclared (first use in this function)
      __print_symbolic(__entry->flags, ARCH_PROC_ERR_FLAGS),

> +/*
> + * First define the enums in MM_ACTION_RESULT to be exported to userspace
> + * via TRACE_DEFINE_ENUM().
> + */
> +#undef EM
> +#undef EMe
> +#define EM(a, b) TRACE_DEFINE_ENUM(a);
> +#define EMe(a, b)	TRACE_DEFINE_ENUM(a);
> +
> +ARM_PROC_ERR_TYPE
> +ARM_PROC_ERR_FLAGS
Are the above two lines supposed to be here?
> +
> +/*
> + * Now redefine the EM() and EMe() macros to map the enums to the strings
> + * that will be printed in the output.
> + */
> +#undef EM
> +#undef EMe
> +#define EM(a, b)		{ a, b },
> +#define EMe(a, b)	{ a, b }
> +
> +TRACE_EVENT(arm_proc_err,
I think it would be better to keep this similar to the naming of the 
current RAS trace events (right now we have mc_event, arm_event, 
aer_event, etc.). I would suggest using "arm_err_info_event" since this 
is handling the error information structures of the arm errors.
> +
> +	TP_PROTO(const struct cper_arm_err_info *err),
> +
> +	TP_ARGS(err),
> +
> +	TP_STRUCT__entry(
> +		__field(u8, type)
> +		__field(u16, multiple_error)
> +		__field(u8, flags)
> +		__field(u64, error_info)
> +		__field(u64, virt_fault_addr)
> +		__field(u64, physical_fault_addr)
Validation bits should also be a part of this structure that way user 
space tools will know which of these fields are valid.
> +	),
> +
> +	TP_fast_assign(
> +		__entry->type = err->type;
> +
> +		if (err->validation_bits & CPER_ARM_INFO_VALID_MULTI_ERR)
> +			__entry->multiple_error = err->multiple_error;
> +		else
> +			__entry->multiple_error = ~0;
> +
> +		if (err->validation_bits & CPER_ARM_INFO_VALID_FLAGS)
> +			__entry->flags = err->flags;
> +		else
> +			__entry->flags = ~0;
> +
> +		if (err->validation_bits & CPER_ARM_INFO_VALID_ERR_INFO)
> +			__entry->error_info = err->error_info;
> +		else
> +			__entry->error_info = 0ULL;
> +
> +		if (err->validation_bits & CPER_ARM_INFO_VALID_VIRT_ADDR)
> +			__entry->virt_fault_addr = err->virt_fault_addr;
> +		else
> +			__entry->virt_fault_addr = 0ULL;
> +
> +		if (err->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR)
> +			__entry->physical_fault_addr = err->physical_fault_addr;
> +		else
> +			__entry->physical_fault_addr = 0ULL;
> +	),
> +
> +	TP_printk("ARM Processor Error: type %s; count: %u; flags: %s;"
I think the "ARM Processor Error:" part of this should just be removed. 
Here's the output with this removed and the trace event renamed to 
arm_err_info_event. I think this looks much cleaner and matches the 
style used with the arm_event.

           <idle>-0     [020] .ns.   366.592434: arm_event: affinity 
level: 2; MPIDR: 0000000000000000; MIDR: 00000000510f8000; running 
state: 1; PSCI state: 0
           <idle>-0     [020] .ns.   366.592437: arm_err_info_event: 
type cache error; count: 0; flags: 0x3; error info: 0000000000c20058; 
virtual address: 0000000000000000; physical address: 0000000000000000

Thanks,
Tyler
Xie XiuQi April 17, 2017, 3:08 a.m. UTC | #2
Hi Tyler,

Thanks for your comments and testing.

On 2017/4/15 4:36, Baicar, Tyler wrote:
> On 3/30/2017 4:31 AM, Xie XiuQi wrote:
>> Add a new trace event for ARM processor error information, so that
>> the user will know what error occurred. With this information the
>> user may take appropriate action.
>>
>> These trace events are consistent with the ARM processor error
>> information table which defined in UEFI 2.6 spec section N.2.4.4.1.
>>
>> ---
>> v2: add trace enabled condition as Steven's suggestion.
>>      fix a typo.
>> ---
>>
>> Cc: Steven Rostedt <rostedt@goodmis.org>
>> Cc: Tyler Baicar <tbaicar@codeaurora.org>
>> Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
>> ---
> ...
>>   +#define ARM_PROC_ERR_TYPE    \
>> +    EM ( CPER_ARM_INFO_TYPE_CACHE, "cache error" )    \
>> +    EM ( CPER_ARM_INFO_TYPE_TLB,  "TLB error" )    \
>> +    EM ( CPER_ARM_INFO_TYPE_BUS, "bus error" )    \
>> +    EMe ( CPER_ARM_INFO_TYPE_UARCH, "micro-architectural error" )
>> +
>> +#define ARM_PROC_ERR_FLAGS    \
>> +    EM ( CPER_ARM_INFO_FLAGS_FIRST, "First error captured" )    \
>> +    EM ( CPER_ARM_INFO_FLAGS_LAST,  "Last error captured" )    \
>> +    EM ( CPER_ARM_INFO_FLAGS_PROPAGATED, "Propagated" )    \
>> +    EMe ( CPER_ARM_INFO_FLAGS_OVERFLOW, "Overflow" )
>> +
> Hello Xie XiuQi,
> 
> This isn't compiling for me because of these definitions. Here you are using ARM_*, but below in the TP_printk you are using ARCH_*. The compiler complains the ARCH_* ones are undefined:
> 
> ./include/trace/../../include/ras/ras_event.h:278:37: error: 'ARCH_PROC_ERR_TYPE' undeclared (first use in this function)
>      __print_symbolic(__entry->type, ARCH_PROC_ERR_TYPE),
> ./include/trace/../../include/ras/ras_event.h:280:38: error: 'ARCH_PROC_ERR_FLAGS' undeclared (first use in this function)
>      __print_symbolic(__entry->flags, ARCH_PROC_ERR_FLAGS),

Sorry, it's a typo. It should be ARM_xxx.

> 
>> +/*
>> + * First define the enums in MM_ACTION_RESULT to be exported to userspace
>> + * via TRACE_DEFINE_ENUM().
>> + */
>> +#undef EM
>> +#undef EMe
>> +#define EM(a, b) TRACE_DEFINE_ENUM(a);
>> +#define EMe(a, b)    TRACE_DEFINE_ENUM(a);
>> +
>> +ARM_PROC_ERR_TYPE
>> +ARM_PROC_ERR_FLAGS
> Are the above two lines supposed to be here?
>> +
>> +/*
>> + * Now redefine the EM() and EMe() macros to map the enums to the strings
>> + * that will be printed in the output.
>> + */
>> +#undef EM
>> +#undef EMe
>> +#define EM(a, b)        { a, b },
>> +#define EMe(a, b)    { a, b }
>> +
>> +TRACE_EVENT(arm_proc_err,
> I think it would be better to keep this similar to the naming of the current RAS trace events (right now we have mc_event, arm_event, aer_event, etc.). I would suggest using "arm_err_info_event" since this is handling the error information structures of the arm errors.
>> +
>> +    TP_PROTO(const struct cper_arm_err_info *err),
>> +
>> +    TP_ARGS(err),
>> +
>> +    TP_STRUCT__entry(
>> +        __field(u8, type)
>> +        __field(u16, multiple_error)
>> +        __field(u8, flags)
>> +        __field(u64, error_info)
>> +        __field(u64, virt_fault_addr)
>> +        __field(u64, physical_fault_addr)
> Validation bits should also be a part of this structure that way user space tools will know which of these fields are valid.

Could we use the default values, which are assigned in TP_fast_assign after checking the validation bits, to tell which fields are valid?

>> +    ),
>> +
>> +    TP_fast_assign(
>> +        __entry->type = err->type;
>> +
>> +        if (err->validation_bits & CPER_ARM_INFO_VALID_MULTI_ERR)
>> +            __entry->multiple_error = err->multiple_error;
>> +        else
>> +            __entry->multiple_error = ~0;
>> +
>> +        if (err->validation_bits & CPER_ARM_INFO_VALID_FLAGS)
>> +            __entry->flags = err->flags;
>> +        else
>> +            __entry->flags = ~0;
>> +
>> +        if (err->validation_bits & CPER_ARM_INFO_VALID_ERR_INFO)
>> +            __entry->error_info = err->error_info;
>> +        else
>> +            __entry->error_info = 0ULL;
>> +
>> +        if (err->validation_bits & CPER_ARM_INFO_VALID_VIRT_ADDR)
>> +            __entry->virt_fault_addr = err->virt_fault_addr;
>> +        else
>> +            __entry->virt_fault_addr = 0ULL;
>> +
>> +        if (err->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR)
>> +            __entry->physical_fault_addr = err->physical_fault_addr;
>> +        else
>> +            __entry->physical_fault_addr = 0ULL;
>> +    ),
>> +
>> +    TP_printk("ARM Processor Error: type %s; count: %u; flags: %s;"
> I think the "ARM Processor Error:" part of this should just be removed. Here's the output with this removed and the trace event renamed to arm_err_info_event. I think this looks much cleaner and matches the style used with the arm_event.
> 
>           <idle>-0     [020] .ns.   366.592434: arm_event: affinity level: 2; MPIDR: 0000000000000000; MIDR: 00000000510f8000; running state: 1; PSCI state: 0
>           <idle>-0     [020] .ns.   366.592437: arm_err_info_event: type cache error; count: 0; flags: 0x3; error info: 0000000000c20058; virtual address: 0000000000000000; physical address: 0000000000000000

I agree. It looks much better.

> 
> Thanks,
> Tyler
>
Xie XiuQi April 17, 2017, 3:16 a.m. UTC | #3
Hi Tyler,

On 2017/4/17 11:08, Xie XiuQi wrote:
> Hi Tyler,
> 
> Thanks for your comments and testing.
> 
> On 2017/4/15 4:36, Baicar, Tyler wrote:
>> On 3/30/2017 4:31 AM, Xie XiuQi wrote:
>>> Add a new trace event for ARM processor error information, so that
>>> the user will know what error occurred. With this information the
>>> user may take appropriate action.
>>>
>>> These trace events are consistent with the ARM processor error
>>> information table which defined in UEFI 2.6 spec section N.2.4.4.1.
>>>
>>> ---
>>> v2: add trace enabled condition as Steven's suggestion.
>>>      fix a typo.
>>> ---
>>>
>>> Cc: Steven Rostedt <rostedt@goodmis.org>
>>> Cc: Tyler Baicar <tbaicar@codeaurora.org>
>>> Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
>>> ---
>> ...
>>>   +#define ARM_PROC_ERR_TYPE    \
>>> +    EM ( CPER_ARM_INFO_TYPE_CACHE, "cache error" )    \
>>> +    EM ( CPER_ARM_INFO_TYPE_TLB,  "TLB error" )    \
>>> +    EM ( CPER_ARM_INFO_TYPE_BUS, "bus error" )    \
>>> +    EMe ( CPER_ARM_INFO_TYPE_UARCH, "micro-architectural error" )
>>> +
>>> +#define ARM_PROC_ERR_FLAGS    \
>>> +    EM ( CPER_ARM_INFO_FLAGS_FIRST, "First error captured" )    \
>>> +    EM ( CPER_ARM_INFO_FLAGS_LAST,  "Last error captured" )    \
>>> +    EM ( CPER_ARM_INFO_FLAGS_PROPAGATED, "Propagated" )    \
>>> +    EMe ( CPER_ARM_INFO_FLAGS_OVERFLOW, "Overflow" )
>>> +
>> Hello Xie XiuQi,
>>
>> This isn't compiling for me because of these definitions. Here you are using ARM_*, but below in the TP_printk you are using ARCH_*. The compiler complains the ARCH_* ones are undefined:
>>
>> ./include/trace/../../include/ras/ras_event.h:278:37: error: 'ARCH_PROC_ERR_TYPE' undeclared (first use in this function)
>>      __print_symbolic(__entry->type, ARCH_PROC_ERR_TYPE),
>> ./include/trace/../../include/ras/ras_event.h:280:38: error: 'ARCH_PROC_ERR_FLAGS' undeclared (first use in this function)
>>      __print_symbolic(__entry->flags, ARCH_PROC_ERR_FLAGS),
> 
> Sorry, it's a typo. It should be ARM_xxx.
> 
>>
>>> +/*
>>> + * First define the enums in MM_ACTION_RESULT to be exported to userspace
>>> + * via TRACE_DEFINE_ENUM().
>>> + */
>>> +#undef EM
>>> +#undef EMe
>>> +#define EM(a, b) TRACE_DEFINE_ENUM(a);
>>> +#define EMe(a, b)    TRACE_DEFINE_ENUM(a);
>>> +
>>> +ARM_PROC_ERR_TYPE
>>> +ARM_PROC_ERR_FLAGS
>> Are the above two lines supposed to be here?
>>> +
>>> +/*
>>> + * Now redefine the EM() and EMe() macros to map the enums to the strings
>>> + * that will be printed in the output.
>>> + */
>>> +#undef EM
>>> +#undef EMe
>>> +#define EM(a, b)        { a, b },
>>> +#define EMe(a, b)    { a, b }
>>> +
>>> +TRACE_EVENT(arm_proc_err,
>> I think it would be better to keep this similar to the naming of the current RAS trace events (right now we have mc_event, arm_event, aer_event, etc.). I would suggest using "arm_err_info_event" since this is handling the error information structures of the arm errors.
>>> +
>>> +    TP_PROTO(const struct cper_arm_err_info *err),
>>> +
>>> +    TP_ARGS(err),
>>> +
>>> +    TP_STRUCT__entry(
>>> +        __field(u8, type)
>>> +        __field(u16, multiple_error)
>>> +        __field(u8, flags)
>>> +        __field(u64, error_info)
>>> +        __field(u64, virt_fault_addr)
>>> +        __field(u64, physical_fault_addr)
>> Validation bits should also be a part of this structure that way user space tools will know which of these fields are valid.
> 
> Could we use the default value to check the validation which we have checked in TP_fast_assign?
> 
>>> +    ),
>>> +
>>> +    TP_fast_assign(
>>> +        __entry->type = err->type;
>>> +
>>> +        if (err->validation_bits & CPER_ARM_INFO_VALID_MULTI_ERR)
>>> +            __entry->multiple_error = err->multiple_error;
>>> +        else
>>> +            __entry->multiple_error = ~0;
>>> +
>>> +        if (err->validation_bits & CPER_ARM_INFO_VALID_FLAGS)
>>> +            __entry->flags = err->flags;
>>> +        else
>>> +            __entry->flags = ~0;
>>> +
>>> +        if (err->validation_bits & CPER_ARM_INFO_VALID_ERR_INFO)
>>> +            __entry->error_info = err->error_info;
>>> +        else
>>> +            __entry->error_info = 0ULL;
>>> +
>>> +        if (err->validation_bits & CPER_ARM_INFO_VALID_VIRT_ADDR)
>>> +            __entry->virt_fault_addr = err->virt_fault_addr;
>>> +        else
>>> +            __entry->virt_fault_addr = 0ULL;
>>> +
>>> +        if (err->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR)
>>> +            __entry->physical_fault_addr = err->physical_fault_addr;
>>> +        else
>>> +            __entry->physical_fault_addr = 0ULL;
>>> +    ),
>>> +
>>> +    TP_printk("ARM Processor Error: type %s; count: %u; flags: %s;"
>> I think the "ARM Processor Error:" part of this should just be removed. Here's the output with this removed and the trace event renamed to arm_err_info_event. I think this looks much cleaner and matches the style used with the arm_event.
>>
>>           <idle>-0     [020] .ns.   366.592434: arm_event: affinity level: 2; MPIDR: 0000000000000000; MIDR: 00000000510f8000; running state: 1; PSCI state: 0
>>           <idle>-0     [020] .ns.   366.592437: arm_err_info_event: type cache error; count: 0; flags: 0x3; error info: 0000000000c20058; virtual address: 0000000000000000; physical address: 0000000000000000
> 

As this section is the ARM Processor Error Section, how about using arm_proc_err_event?

> I agree. It looks much better.
> 
>>
>> Thanks,
>> Tyler
>>
>
Tyler Baicar April 17, 2017, 5:18 p.m. UTC | #4
On 4/16/2017 9:16 PM, Xie XiuQi wrote:
> On 2017/4/17 11:08, Xie XiuQi wrote:
>>> On 3/30/2017 4:31 AM, Xie XiuQi wrote:
>>>> Add a new trace event for ARM processor error information, so that
>>>> the user will know what error occurred. With this information the
>>>> user may take appropriate action.
>>>>
>>>> These trace events are consistent with the ARM processor error
>>>> information table which defined in UEFI 2.6 spec section N.2.4.4.1.
>>>>
>>>> ---
>>>> v2: add trace enabled condition as Steven's suggestion.
>>>>       fix a typo.
>>>> ---
>>>>
>>>> Cc: Steven Rostedt <rostedt@goodmis.org>
>>>> Cc: Tyler Baicar <tbaicar@codeaurora.org>
>>>> Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
>>>> ---
>>>
...
>>>> +/*
>>>> + * First define the enums in MM_ACTION_RESULT to be exported to userspace
>>>> + * via TRACE_DEFINE_ENUM().
>>>> + */
>>>> +#undef EM
>>>> +#undef EMe
>>>> +#define EM(a, b) TRACE_DEFINE_ENUM(a);
>>>> +#define EMe(a, b)    TRACE_DEFINE_ENUM(a);
>>>> +
>>>> +ARM_PROC_ERR_TYPE
>>>> +ARM_PROC_ERR_FLAGS
>>> Are the above two lines supposed to be here?
>>>> +
>>>> +/*
>>>> + * Now redefine the EM() and EMe() macros to map the enums to the strings
>>>> + * that will be printed in the output.
>>>> + */
>>>> +#undef EM
>>>> +#undef EMe
>>>> +#define EM(a, b)        { a, b },
>>>> +#define EMe(a, b)    { a, b }
>>>> +
>>>> +TRACE_EVENT(arm_proc_err,
>>> I think it would be better to keep this similar to the naming of the current RAS trace events (right now we have mc_event, arm_event, aer_event, etc.). I would suggest using "arm_err_info_event" since this is handling the error information structures of the arm errors.
>>>> +
>>>> +    TP_PROTO(const struct cper_arm_err_info *err),
>>>> +
>>>> +    TP_ARGS(err),
>>>> +
>>>> +    TP_STRUCT__entry(
>>>> +        __field(u8, type)
>>>> +        __field(u16, multiple_error)
>>>> +        __field(u8, flags)
>>>> +        __field(u64, error_info)
>>>> +        __field(u64, virt_fault_addr)
>>>> +        __field(u64, physical_fault_addr)
>>> Validation bits should also be a part of this structure that way user space tools will know which of these fields are valid.
>> Could we use the default value to check the validation which we have checked in TP_fast_assign?
Yes, true...I guess we really don't need the validation bits then.
>>>> +    ),
>>>> +
>>>> +    TP_fast_assign(
>>>> +        __entry->type = err->type;
>>>> +
>>>> +        if (err->validation_bits & CPER_ARM_INFO_VALID_MULTI_ERR)
>>>> +            __entry->multiple_error = err->multiple_error;
>>>> +        else
>>>> +            __entry->multiple_error = ~0;
>>>> +
>>>> +        if (err->validation_bits & CPER_ARM_INFO_VALID_FLAGS)
>>>> +            __entry->flags = err->flags;
>>>> +        else
>>>> +            __entry->flags = ~0;
>>>> +
>>>> +        if (err->validation_bits & CPER_ARM_INFO_VALID_ERR_INFO)
>>>> +            __entry->error_info = err->error_info;
>>>> +        else
>>>> +            __entry->error_info = 0ULL;
>>>> +
>>>> +        if (err->validation_bits & CPER_ARM_INFO_VALID_VIRT_ADDR)
>>>> +            __entry->virt_fault_addr = err->virt_fault_addr;
>>>> +        else
>>>> +            __entry->virt_fault_addr = 0ULL;
>>>> +
>>>> +        if (err->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR)
>>>> +            __entry->physical_fault_addr = err->physical_fault_addr;
>>>> +        else
>>>> +            __entry->physical_fault_addr = 0ULL;
>>>> +    ),
>>>> +
>>>> +    TP_printk("ARM Processor Error: type %s; count: %u; flags: %s;"
>>> I think the "ARM Processor Error:" part of this should just be removed. Here's the output with this removed and the trace event renamed to arm_err_info_event. I think this looks much cleaner and matches the style used with the arm_event.
>>>
>>>            <idle>-0     [020] .ns.   366.592434: arm_event: affinity level: 2; MPIDR: 0000000000000000; MIDR: 00000000510f8000; running state: 1; PSCI state: 0
>>>            <idle>-0     [020] .ns.   366.592437: arm_err_info_event: type cache error; count: 0; flags: 0x3; error info: 0000000000c20058; virtual address: 0000000000000000; physical address: 0000000000000000
> As this section is ARM Processor Error Section, how about use arm_proc_err_event?
This is not for the ARM Processor Error Section, that is what the 
arm_event is handling. What you are adding this trace support for here 
is called the ARM Processor Error Information (UEFI 2.6 spec section 
N.2.4.4.1). So I think your trace event here should be called 
arm_err_info_event. This will also be consistent with the other two 
trace events that I'm planning on adding:

arm_ctx_info_event: ARM Processor Context Information (UEFI 2.6 section 
N.2.4.4.2)
arm_vendor_info_event: This is the "Vendor Specific Error Information" 
in the ARM Processor Error Section (Table 260). It's possible I may just 
add this into the arm_event trace event, but I haven't looked into it 
enough yet.

Thanks,
Tyler
Xie XiuQi April 18, 2017, 2:22 a.m. UTC | #5
Hi Tyler,

On 2017/4/18 1:18, Baicar, Tyler wrote:
> On 4/16/2017 9:16 PM, Xie XiuQi wrote:
>> On 2017/4/17 11:08, Xie XiuQi wrote:
>>>> On 3/30/2017 4:31 AM, Xie XiuQi wrote:
>>>>> Add a new trace event for ARM processor error information, so that
>>>>> the user will know what error occurred. With this information the
>>>>> user may take appropriate action.
>>>>>
>>>>> These trace events are consistent with the ARM processor error
>>>>> information table which defined in UEFI 2.6 spec section N.2.4.4.1.
>>>>>
>>>>> ---
>>>>> v2: add trace enabled condition as Steven's suggestion.
>>>>>       fix a typo.
>>>>> ---
>>>>>
>>>>> Cc: Steven Rostedt <rostedt@goodmis.org>
>>>>> Cc: Tyler Baicar <tbaicar@codeaurora.org>
>>>>> Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
>>>>> ---
>>>>
> ...
>>>>> +/*
>>>>> + * First define the enums in MM_ACTION_RESULT to be exported to userspace
>>>>> + * via TRACE_DEFINE_ENUM().
>>>>> + */
>>>>> +#undef EM
>>>>> +#undef EMe
>>>>> +#define EM(a, b) TRACE_DEFINE_ENUM(a);
>>>>> +#define EMe(a, b)    TRACE_DEFINE_ENUM(a);
>>>>> +
>>>>> +ARM_PROC_ERR_TYPE
>>>>> +ARM_PROC_ERR_FLAGS
>>>> Are the above two lines supposed to be here?
>>>>> +
>>>>> +/*
>>>>> + * Now redefine the EM() and EMe() macros to map the enums to the strings
>>>>> + * that will be printed in the output.
>>>>> + */
>>>>> +#undef EM
>>>>> +#undef EMe
>>>>> +#define EM(a, b)        { a, b },
>>>>> +#define EMe(a, b)    { a, b }
>>>>> +
>>>>> +TRACE_EVENT(arm_proc_err,
>>>> I think it would be better to keep this similar to the naming of the current RAS trace events (right now we have mc_event, arm_event, aer_event, etc.). I would suggest using "arm_err_info_event" since this is handling the error information structures of the arm errors.
>>>>> +
>>>>> +    TP_PROTO(const struct cper_arm_err_info *err),
>>>>> +
>>>>> +    TP_ARGS(err),
>>>>> +
>>>>> +    TP_STRUCT__entry(
>>>>> +        __field(u8, type)
>>>>> +        __field(u16, multiple_error)
>>>>> +        __field(u8, flags)
>>>>> +        __field(u64, error_info)
>>>>> +        __field(u64, virt_fault_addr)
>>>>> +        __field(u64, physical_fault_addr)
>>>> Validation bits should also be a part of this structure that way user space tools will know which of these fields are valid.
>>> Could we use the default value to check the validation which we have checked in TP_fast_assign?
> Yes, true...I guess we really don't need the validation bits then.
>>>>> +    ),
>>>>> +
>>>>> +    TP_fast_assign(
>>>>> +        __entry->type = err->type;
>>>>> +
>>>>> +        if (err->validation_bits & CPER_ARM_INFO_VALID_MULTI_ERR)
>>>>> +            __entry->multiple_error = err->multiple_error;
>>>>> +        else
>>>>> +            __entry->multiple_error = ~0;
>>>>> +
>>>>> +        if (err->validation_bits & CPER_ARM_INFO_VALID_FLAGS)
>>>>> +            __entry->flags = err->flags;
>>>>> +        else
>>>>> +            __entry->flags = ~0;
>>>>> +
>>>>> +        if (err->validation_bits & CPER_ARM_INFO_VALID_ERR_INFO)
>>>>> +            __entry->error_info = err->error_info;
>>>>> +        else
>>>>> +            __entry->error_info = 0ULL;
>>>>> +
>>>>> +        if (err->validation_bits & CPER_ARM_INFO_VALID_VIRT_ADDR)
>>>>> +            __entry->virt_fault_addr = err->virt_fault_addr;
>>>>> +        else
>>>>> +            __entry->virt_fault_addr = 0ULL;
>>>>> +
>>>>> +        if (err->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR)
>>>>> +            __entry->physical_fault_addr = err->physical_fault_addr;
>>>>> +        else
>>>>> +            __entry->physical_fault_addr = 0ULL;
>>>>> +    ),
>>>>> +
>>>>> +    TP_printk("ARM Processor Error: type %s; count: %u; flags: %s;"
>>>> I think the "ARM Processor Error:" part of this should just be removed. Here's the output with this removed and the trace event renamed to arm_err_info_event. I think this looks much cleaner and matches the style used with the arm_event.
>>>>
>>>>            <idle>-0     [020] .ns.   366.592434: arm_event: affinity level: 2; MPIDR: 0000000000000000; MIDR: 00000000510f8000; running state: 1; PSCI state: 0
>>>>            <idle>-0     [020] .ns.   366.592437: arm_err_info_event: type cache error; count: 0; flags: 0x3; error info: 0000000000c20058; virtual address: 0000000000000000; physical address: 0000000000000000
>> As this section is ARM Processor Error Section, how about use arm_proc_err_event?
> This is not for the ARM Processor Error Section, that is what the arm_event is handling. What you are adding this trace support for here is called the ARM Processor Error Information (UEFI 2.6 spec section N.2.4.4.1). So I think your trace event here should be called arm_err_info_event. This will also be consistent with the other two trace events that I'm planning on adding:
> 
> arm_ctx_info_event: ARM Processor Context Information (UEFI 2.6 section N.2.4.4.2)
> arm_vendor_info_event: This is the "Vendor Specific Error Information" in the ARM Processor Error Section (Table 260). It's possible I may just add this into the arm_event trace event, but I haven't looked into it enough yet.
> 

OK, I see. Thanks for your explanation.

> Thanks,
> Tyler
>
diff mbox

Patch

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 81eabc6..6be0333 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -518,9 +518,19 @@  static void ghes_do_proc(struct ghes *ghes,
 		else if (!uuid_le_cmp(sec_type, CPER_SEC_PROC_ARM) &&
 			 trace_arm_event_enabled()) {
 			struct cper_sec_proc_arm *arm_err;
+			struct cper_arm_err_info *err_info;
+			int i;
 
 			arm_err = acpi_hest_generic_data_payload(gdata);
 			trace_arm_event(arm_err);
+
+			if (trace_arm_proc_err_enabled()) {
+				err_info = (struct cper_arm_err_info *)(arm_err + 1);
+				for (i = 0; i < arm_err->err_info_num; i++) {
+					trace_arm_proc_err(err_info);
+					err_info += 1;
+				}
+			}
 		} else if (trace_unknown_sec_event_enabled()) {
 			void *unknown_err = acpi_hest_generic_data_payload(gdata);
 			trace_unknown_sec_event(&sec_type,
diff --git a/include/linux/cper.h b/include/linux/cper.h
index 85450f3..0cae900 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -270,6 +270,11 @@  enum {
 #define CPER_ARM_INFO_VALID_VIRT_ADDR		0x0008
 #define CPER_ARM_INFO_VALID_PHYSICAL_ADDR	0x0010
 
+#define CPER_ARM_INFO_TYPE_CACHE		0
+#define CPER_ARM_INFO_TYPE_TLB			1
+#define CPER_ARM_INFO_TYPE_BUS			2
+#define CPER_ARM_INFO_TYPE_UARCH		3
+
 #define CPER_ARM_INFO_FLAGS_FIRST		0x0001
 #define CPER_ARM_INFO_FLAGS_LAST		0x0002
 #define CPER_ARM_INFO_FLAGS_PROPAGATED		0x0004
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index 13befad..026b094 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -206,6 +206,93 @@ 
 		  __entry->running_state, __entry->psci_state)
 );
 
+#define ARM_PROC_ERR_TYPE	\
+	EM ( CPER_ARM_INFO_TYPE_CACHE, "cache error" )	\
+	EM ( CPER_ARM_INFO_TYPE_TLB,  "TLB error" )	\
+	EM ( CPER_ARM_INFO_TYPE_BUS, "bus error" )	\
+	EMe ( CPER_ARM_INFO_TYPE_UARCH, "micro-architectural error" )
+
+#define ARM_PROC_ERR_FLAGS	\
+	EM ( CPER_ARM_INFO_FLAGS_FIRST, "First error captured" )	\
+	EM ( CPER_ARM_INFO_FLAGS_LAST,  "Last error captured" )	\
+	EM ( CPER_ARM_INFO_FLAGS_PROPAGATED, "Propagated" )	\
+	EMe ( CPER_ARM_INFO_FLAGS_OVERFLOW, "Overflow" )
+
+/*
+ * First define the enums in ARM_PROC_ERR_TYPE and ARM_PROC_ERR_FLAGS to be
+ * exported to userspace via TRACE_DEFINE_ENUM().
+ */
+#undef EM
+#undef EMe
+#define EM(a, b) TRACE_DEFINE_ENUM(a);
+#define EMe(a, b)	TRACE_DEFINE_ENUM(a);
+
+ARM_PROC_ERR_TYPE
+ARM_PROC_ERR_FLAGS
+
+/*
+ * Now redefine the EM() and EMe() macros to map the enums to the strings
+ * that will be printed in the output.
+ */
+#undef EM
+#undef EMe
+#define EM(a, b)		{ a, b },
+#define EMe(a, b)	{ a, b }
+
+TRACE_EVENT(arm_proc_err,
+	/* Records one ARM processor error information entry (*err). */
+	TP_PROTO(const struct cper_arm_err_info *err),
+
+	TP_ARGS(err),
+
+	TP_STRUCT__entry(
+		__field(u8, type)
+		__field(u16, multiple_error)
+		__field(u8, flags)
+		__field(u64, error_info)
+		__field(u64, virt_fault_addr)
+		__field(u64, physical_fault_addr)
+	),
+
+	TP_fast_assign(
+		__entry->type = err->type;
+		/* Fields whose validation bit is clear get a fixed default. */
+		if (err->validation_bits & CPER_ARM_INFO_VALID_MULTI_ERR)
+			__entry->multiple_error = err->multiple_error;
+		else
+			__entry->multiple_error = ~0;
+
+		if (err->validation_bits & CPER_ARM_INFO_VALID_FLAGS)
+			__entry->flags = err->flags;
+		else
+			__entry->flags = ~0;
+
+		if (err->validation_bits & CPER_ARM_INFO_VALID_ERR_INFO)
+			__entry->error_info = err->error_info;
+		else
+			__entry->error_info = 0ULL;
+
+		if (err->validation_bits & CPER_ARM_INFO_VALID_VIRT_ADDR)
+			__entry->virt_fault_addr = err->virt_fault_addr;
+		else
+			__entry->virt_fault_addr = 0ULL;
+
+		if (err->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR)
+			__entry->physical_fault_addr = err->physical_fault_addr;
+		else
+			__entry->physical_fault_addr = 0ULL;
+	),
+	/* NOTE(review): flags is a bitmask; __print_flags() may fit better. */
+	TP_printk("ARM Processor Error: type %s; count: %u; flags: %s;"
+		  " error info: %016llx; virtual address: %016llx;"
+		  " physical address: %016llx",
+		  __print_symbolic(__entry->type, ARM_PROC_ERR_TYPE),
+		  __entry->multiple_error,
+		  __print_symbolic(__entry->flags, ARM_PROC_ERR_FLAGS),
+		  __entry->error_info, __entry->virt_fault_addr,
+		  __entry->physical_fault_addr)
+);
+
 /*
  * Unknown Section Report
  *