diff mbox series

arm: perf: fix sw event kernel backtrace with unwind

Message ID 20210511130912.118504-1-shaolexi@huawei.com (mailing list archive)
State New, archived
Headers show
Series arm: perf: fix sw event kernel backtrace with unwind | expand

Commit Message

Lexi Shao May 11, 2021, 1:09 p.m. UTC
For perf sample not generated with interrupt/exception, there is no
pt_regs information, so perf_fetch_caller_regs() is used to fill
necessary information for sampling and backtracing.
perf_fetch_caller_regs() puts LR as PC to skip unwanted frame.

For ARM images with CONFIG_ARM_UNWIND=y, this does not work because
backtracing with unwind requires PC to match FP and SP, but we have the
previous PC with current FP and SP here. Therefore backtrace fails and
stack in kernel only has one entry.

Before this patch:
perf record -e sched:sched_switch -ag sleep 2
perf script
perf  1512 [000]    51.976113: sched:sched_switch: perf:1512 [120] S
	==> swapper/0:0 [120]
                c048d9e8 __schedule ([kernel.kallsyms])
                   d0618 __poll (/lib/libc-2.31.so)
                   6b678 [unknown] (/usr/bin/perf)
                    b0b0 [unknown] (/usr/bin/perf)
                   17700 __libc_start_main (/lib/libc-2.31.so)

Notice there is only one frame in kernel text.

After this patch, the kernel callchain is fully displayed:
perf  1545 [000] 10403.005915: sched:sched_switch: perf:1545 [120] S
	==> swapper/0:0 [120]
                c014bff0 perf_trace_sched_switch ([kernel.kallsyms])
                c048d9d8 __schedule ([kernel.kallsyms])
                c048df4c schedule ([kernel.kallsyms])
                c0491940 schedule_hrtimeout_range_clock ([kernel.kallsyms])
                c0255164 poll_schedule_timeout ([kernel.kallsyms])
                c02567dc do_sys_poll ([kernel.kallsyms])
                c02568f8 sys_poll ([kernel.kallsyms])
                c01084b0 __sys_trace_return ([kernel.kallsyms])
                   d0618 __poll (/lib/libc-2.31.so)
                   6b678 [unknown] (/usr/bin/perf)
                    b0b0 [unknown] (/usr/bin/perf)
                   17700 __libc_start_main (/lib/libc-2.31.so)

Fixes: b3eac0265bf62 ("arm: perf: Fix callchain parse error with kernel
tracepoint events") #v4.2+

Signed-off-by: Lexi Shao <shaolexi@huawei.com>
---
 arch/arm/include/asm/perf_event.h | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

Comments

Will Deacon June 1, 2021, 12:42 p.m. UTC | #1
On Tue, May 11, 2021 at 09:09:12PM +0800, Lexi Shao wrote:
> For perf sample not generated with interrupt/exception, there is no
> pt_regs information, so perf_fetch_caller_regs() is used to fill
> necessary information for sampling and backtracing.
> perf_fetch_caller_regs() puts LR as PC to skip unwanted frame.
> 
> For ARM images with CONFIG_ARM_UNWIND=y, this does not work because
> backtracing with unwind requires PC to match FP and SP, but we have the
> previous PC with current FP and SP here. Therefore backtrace fails and
> stack in kernel only has one entry.
> 
> Before this patch:
> perf record -e sched:sched_switch -ag sleep 2
> perf script
> perf  1512 [000]    51.976113: sched:sched_switch: perf:1512 [120] S
> 	==> swapper/0:0 [120]
>                 c048d9e8 __schedule ([kernel.kallsyms])
>                    d0618 __poll (/lib/libc-2.31.so)
>                    6b678 [unknown] (/usr/bin/perf)
>                     b0b0 [unknown] (/usr/bin/perf)
>                    17700 __libc_start_main (/lib/libc-2.31.so)
> 
> Notice there is only one frame in kernel text.
> 
> After this patch, the kernel callchain is fully displayed:
> perf  1545 [000] 10403.005915: sched:sched_switch: perf:1545 [120] S
> 	==> swapper/0:0 [120]
>                 c014bff0 perf_trace_sched_switch ([kernel.kallsyms])
>                 c048d9d8 __schedule ([kernel.kallsyms])
>                 c048df4c schedule ([kernel.kallsyms])
>                 c0491940 schedule_hrtimeout_range_clock ([kernel.kallsyms])
>                 c0255164 poll_schedule_timeout ([kernel.kallsyms])
>                 c02567dc do_sys_poll ([kernel.kallsyms])
>                 c02568f8 sys_poll ([kernel.kallsyms])
>                 c01084b0 __sys_trace_return ([kernel.kallsyms])
>                    d0618 __poll (/lib/libc-2.31.so)
>                    6b678 [unknown] (/usr/bin/perf)
>                     b0b0 [unknown] (/usr/bin/perf)
>                    17700 __libc_start_main (/lib/libc-2.31.so)
> 
> Fixes: b3eac0265bf62 ("arm: perf: Fix callchain parse error with kernel
> tracepoint events") #v4.2+
> 
> Signed-off-by: Lexi Shao <shaolexi@huawei.com>
> ---
>  arch/arm/include/asm/perf_event.h | 14 +++++++++++++-
>  1 file changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm/include/asm/perf_event.h b/arch/arm/include/asm/perf_event.h
> index fe87397c3d8c..e326520f9386 100644
> --- a/arch/arm/include/asm/perf_event.h
> +++ b/arch/arm/include/asm/perf_event.h
> @@ -15,8 +15,20 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
>  #define perf_misc_flags(regs)	perf_misc_flags(regs)
>  #endif
>  
> +#ifdef CONFIG_ARM_UNWIND
> +/*
> + * With ARM unwind, the pc must match with fp and sp, otherwise
> + * backtrace method unwind_frame() does not work properly.
> + * Get pc with assembly instead of using __ip.
> + */
> +#define perf_get_arm_pc(regs, __ip) \
> +	__asm__ __volatile__ ("mov %0, pc" : "=r"((regs)->ARM_pc)::)
> +#else
> +#define perf_get_arm_pc(regs, __ip)	((regs)->ARM_pc = (__ip))
> +#endif
> +
>  #define perf_arch_fetch_caller_regs(regs, __ip) { \
> -	(regs)->ARM_pc = (__ip); \
> +	perf_get_arm_pc(regs, __ip); \
>  	(regs)->ARM_fp = (unsigned long) __builtin_frame_address(0); \
>  	(regs)->ARM_sp = current_stack_pointer; \
>  	(regs)->ARM_cpsr = SVC_MODE; \

This doesn't feel like the right solution to me: the caller is passing us
'__ip' with the expectation that it corresponds to the PC, so that is what
we should be using, not recreating our own value like this.

In other words, if unwinding doesn't work using CALLER_ADDR0, then this
seems like an issue that would affect more than just perf.

Will
Lexi Shao June 3, 2021, 1:43 p.m. UTC | #2
>On Tue, May 11, 2021 at 09:09:12PM +0800, Lexi Shao wrote:
>> For perf sample not generated with interrupt/exception, there is no 
>> pt_regs information, so perf_fetch_caller_regs() is used to fill 
>> necessary information for sampling and backtracing.
>> perf_fetch_caller_regs() puts LR as PC to skip unwanted frame.
>> 
>> For ARM images with CONFIG_ARM_UNWIND=y, this does not work because 
>> backtracing with unwind requires PC to match FP and SP, but we have 
>> the previous PC with current FP and SP here. Therefore backtrace fails 
>> and stack in kernel only has one entry.
>> 
>> Before this patch:
>> perf record -e sched:sched_switch -ag sleep 2 perf script
>> perf  1512 [000]    51.976113: sched:sched_switch: perf:1512 [120] S
>> 	==> swapper/0:0 [120]
>>                 c048d9e8 __schedule ([kernel.kallsyms])
>>                    d0618 __poll (/lib/libc-2.31.so)
>>                    6b678 [unknown] (/usr/bin/perf)
>>                     b0b0 [unknown] (/usr/bin/perf)
>>                    17700 __libc_start_main (/lib/libc-2.31.so)
>> 
>> Notice there is only one frame in kernel text.
>> 
>> After this patch, the kernel callchain is fully displayed:
>> perf  1545 [000] 10403.005915: sched:sched_switch: perf:1545 [120] S
>> 	==> swapper/0:0 [120]
>>                 c014bff0 perf_trace_sched_switch ([kernel.kallsyms])
>>                 c048d9d8 __schedule ([kernel.kallsyms])
>>                 c048df4c schedule ([kernel.kallsyms])
>>                 c0491940 schedule_hrtimeout_range_clock ([kernel.kallsyms])
>>                 c0255164 poll_schedule_timeout ([kernel.kallsyms])
>>                 c02567dc do_sys_poll ([kernel.kallsyms])
>>                 c02568f8 sys_poll ([kernel.kallsyms])
>>                 c01084b0 __sys_trace_return ([kernel.kallsyms])
>>                    d0618 __poll (/lib/libc-2.31.so)
>>                    6b678 [unknown] (/usr/bin/perf)
>>                     b0b0 [unknown] (/usr/bin/perf)
>>                    17700 __libc_start_main (/lib/libc-2.31.so)
>> 
>> Fixes: b3eac0265bf62 ("arm: perf: Fix callchain parse error with 
>> kernel tracepoint events") #v4.2+
>> 
>> Signed-off-by: Lexi Shao <shaolexi@huawei.com>
>> ---
>>  arch/arm/include/asm/perf_event.h | 14 +++++++++++++-
>>  1 file changed, 13 insertions(+), 1 deletion(-)
>> 
>> diff --git a/arch/arm/include/asm/perf_event.h 
>> b/arch/arm/include/asm/perf_event.h
>> index fe87397c3d8c..e326520f9386 100644
>> --- a/arch/arm/include/asm/perf_event.h
>> +++ b/arch/arm/include/asm/perf_event.h
>> @@ -15,8 +15,20 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
>>  #define perf_misc_flags(regs)	perf_misc_flags(regs)
>>  #endif
>>  
>> +#ifdef CONFIG_ARM_UNWIND
>> +/*
>> + * With ARM unwind, the pc must match with fp and sp, otherwise
>> + * backtrace method unwind_frame() does not work properly.
>> + * Get pc with assembly instead of using __ip.
>> + */
>> +#define perf_get_arm_pc(regs, __ip) \
>> +	__asm__ __volatile__ ("mov %0, pc" : "=r"((regs)->ARM_pc)::) #else
>> +#define perf_get_arm_pc(regs, __ip)	((regs)->ARM_pc = (__ip))
>> +#endif
>> +
>>  #define perf_arch_fetch_caller_regs(regs, __ip) { \
>> -	(regs)->ARM_pc = (__ip); \
>> +	perf_get_arm_pc(regs, __ip); \
>>  	(regs)->ARM_fp = (unsigned long) __builtin_frame_address(0); \
>>  	(regs)->ARM_sp = current_stack_pointer; \
>>  	(regs)->ARM_cpsr = SVC_MODE; \
>
>This doesn't feel like the right solution to me: the caller is passing us '__ip' with the expectation that it corresponds to the PC, so that is what we should be using, not recreating our own value like this.
>
>In other words, if unwinding doesn't work using CALLER_ADDR0, then this seems like an issue that would affect more than just perf.
>
>Will

CALLER_ADDR0 isn't used as PC for backtracing in other places. For arm backtracing function such as __save_stack_trace, function is marked as noinline and the address is used as PC. So it seems that only perf is affected.

I understand that arg __ip of macro perf_arch_fetch_caller_regs can cause confusion. Since perf_arch_fetch_caller_regs is only called in the wrapper function perf_fetch_caller_regs(struct pt_regs *regs), is it better to remove the __ip argument and let perf_arch_fetch_caller_regs to decide the pc value it uses ?

e.g.:
static inline void perf_fetch_caller_regs(struct pt_regs *regs)
{
        perf_arch_fetch_caller_regs(regs);
}

#define perf_arch_fetch_caller_regs(regs) { \
        (regs)->ARM_pc = CALLER_ADDR0; \
        (regs)->ARM_fp = (unsigned long) __builtin_frame_address(0); \
        (regs)->ARM_sp = current_stack_pointer; \
        (regs)->ARM_cpsr = SVC_MODE; \
}

Lexi
diff mbox series

Patch

diff --git a/arch/arm/include/asm/perf_event.h b/arch/arm/include/asm/perf_event.h
index fe87397c3d8c..e326520f9386 100644
--- a/arch/arm/include/asm/perf_event.h
+++ b/arch/arm/include/asm/perf_event.h
@@ -15,8 +15,20 @@  extern unsigned long perf_misc_flags(struct pt_regs *regs);
 #define perf_misc_flags(regs)	perf_misc_flags(regs)
 #endif
 
+#ifdef CONFIG_ARM_UNWIND
+/*
+ * With ARM unwind, the pc must match with fp and sp, otherwise
+ * backtrace method unwind_frame() does not work properly.
+ * Get pc with assembly instead of using __ip.
+ */
+#define perf_get_arm_pc(regs, __ip) \
+	__asm__ __volatile__ ("mov %0, pc" : "=r"((regs)->ARM_pc)::)
+#else
+#define perf_get_arm_pc(regs, __ip)	((regs)->ARM_pc = (__ip))
+#endif
+
 #define perf_arch_fetch_caller_regs(regs, __ip) { \
-	(regs)->ARM_pc = (__ip); \
+	perf_get_arm_pc(regs, __ip); \
 	(regs)->ARM_fp = (unsigned long) __builtin_frame_address(0); \
 	(regs)->ARM_sp = current_stack_pointer; \
 	(regs)->ARM_cpsr = SVC_MODE; \