diff mbox

[RFC,2/3] arm64: refactor save_stack_trace()

Message ID 1436765375-7119-3-git-send-email-takahiro.akashi@linaro.org (mailing list archive)
State New, archived
Headers show

Commit Message

AKASHI Takahiro July 13, 2015, 5:29 a.m. UTC
Ftrace's stack tracer on arm64 returns wrong information about call stacks:

        Depth    Size   Location    (50 entries)
        -----    ----   --------
  0)     5256       0   notifier_call_chain+0x30/0x94
  1)     5256       0   ftrace_call+0x0/0x4
  2)     5256       0   notifier_call_chain+0x2c/0x94
  3)     5256       0   raw_notifier_call_chain+0x34/0x44
  4)     5256       0   timekeeping_update.constprop.9+0xb8/0x114
  5)     5256       0   update_wall_time+0x408/0x6dc

One of tracer functions, ftrace_call (or mcount), is unexpectedly listed.
The *bare* stack dump returned by save_stack_trace() is:
	save_stack_trace_tsk()
	save_stack_trace()
	stack_trace_call()
	ftrace_ops_no_ops()
	ftrace_call()
	notifier_call_chain()
	raw_notifier_call_chain()
	...

On arm64, save_stack_trace() calls save_stack_trace_tsk() and this will
result in putting additional stack frame in the returned list.
This behavior, however, conflicts with  stack stracer's assumption
that the number of functions to be skiped as part of tracer is 4, from
save_stack_trace() to mcount(), if ftrace_ops_list_func() is used.
The value is hard coded in check_patch().

This patch refactors save_stack_trace() and save_stack_trace_tsk() in order
to reduce the stack depth by making the common code inlined.

Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
---
 arch/arm64/kernel/stacktrace.c |   31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

Comments

Jungseok Lee July 14, 2015, 12:47 p.m. UTC | #1
On Jul 13, 2015, at 2:29 PM, AKASHI Takahiro wrote:

Hi, AKASHI

> Ftrace's stack tracer on arm64 returns wrong information about call stacks:
> 
>        Depth    Size   Location    (50 entries)
>        -----    ----   --------
>  0)     5256       0   notifier_call_chain+0x30/0x94
>  1)     5256       0   ftrace_call+0x0/0x4
>  2)     5256       0   notifier_call_chain+0x2c/0x94
>  3)     5256       0   raw_notifier_call_chain+0x34/0x44
>  4)     5256       0   timekeeping_update.constprop.9+0xb8/0x114
>  5)     5256       0   update_wall_time+0x408/0x6dc
> 
> One of tracer functions, ftrace_call (or mcount), is unexpectedly listed.
> The *bare* stack dump returned by save_stack_trace() is:
> 	save_stack_trace_tsk()
> 	save_stack_trace()
> 	stack_trace_call()
> 	ftrace_ops_no_ops()
> 	ftrace_call()
> 	notifier_call_chain()
> 	raw_notifier_call_chain()
> 	...

Is the below example an unexpected result?
Entry 17 and 18 are ftrace_call and ftrace_ops_no_ops, respectively.

        Depth    Size   Location    (51 entries)
        -----    ----   --------
  0)     5352      96   _raw_spin_unlock_irqrestore+0x1c/0x60
  1)     5256      48   gic_raise_softirq+0xa0/0xbc
  2)     5208      80   smp_cross_call+0x40/0xbc
  3)     5128      48   smp_send_reschedule+0x38/0x48
  4)     5080      32   trigger_load_balance+0x184/0x29c
  5)     5048     112   scheduler_tick+0xac/0x104
  6)     4936      64   update_process_times+0x5c/0x74
  7)     4872      32   tick_sched_handle.isra.15+0x38/0x7c
  8)     4840      48   tick_sched_timer+0x48/0x90
  9)     4792      48   __run_hrtimer+0x60/0x258
 10)     4744      64   hrtimer_interrupt+0xe8/0x260
 11)     4680     128   arch_timer_handler_virt+0x38/0x48
 12)     4552      32   handle_percpu_devid_irq+0x84/0x188
 13)     4520      64   generic_handle_irq+0x38/0x54
 14)     4456      32   __handle_domain_irq+0x68/0xbc
 15)     4424      64   gic_handle_irq+0x38/0x88
 16)     4360     280   el1_irq+0x64/0xd8
 17)     4080     168   ftrace_ops_no_ops+0xb4/0x16c
 18)     3912      32   ftrace_call+0x0/0x4
 19)     3880     144   __alloc_skb+0x48/0x180
 20)     3736      96   alloc_skb_with_frags+0x74/0x234
 21)     3640     112   sock_alloc_send_pskb+0x1d0/0x294
 22)     3528     160   sock_alloc_send_skb+0x44/0x54
 23)     3368      64   __ip_append_data.isra.40+0x78c/0xb48
 24)     3304     224   ip_append_data.part.42+0x98/0xe8
 25)     3080     112   ip_append_data+0x68/0x7c
 26)     2968      96   icmp_push_reply+0x7c/0x144
 27)     2872      96   icmp_send+0x3c0/0x3c8
 28)     2776     192   __udp4_lib_rcv+0x5b8/0x684
 29)     2584      96   udp_rcv+0x2c/0x3c
 30)     2488      32   ip_local_deliver+0xa0/0x224
 31)     2456      48   ip_rcv+0x360/0x57c
 32)     2408      64   __netif_receive_skb_core+0x4d0/0x80c
 33)     2344     128   __netif_receive_skb+0x24/0x84
 34)     2216      32   process_backlog+0x9c/0x15c
 35)     2184      80   net_rx_action+0x1ec/0x32c
 36)     2104     160   __do_softirq+0x114/0x2f0
 37)     1944     128   do_softirq+0x60/0x68
 38)     1816      32   __local_bh_enable_ip+0xb0/0xd4
 39)     1784      32   ip_finish_output+0x1f4/0xabc
 40)     1752      96   ip_output+0xf0/0x120
 41)     1656      64   ip_local_out_sk+0x44/0x54
 42)     1592      32   ip_send_skb+0x24/0xbc
 43)     1560      48   udp_send_skb+0x1b4/0x2f4
 44)     1512      80   udp_sendmsg+0x2a8/0x7a0
 45)     1432     272   inet_sendmsg+0xa0/0xd0
 46)     1160      48   sock_sendmsg+0x30/0x78
 47)     1112      32   ___sys_sendmsg+0x15c/0x26c
 48)     1080     400   __sys_sendmmsg+0x94/0x180
 49)      680     320   SyS_sendmmsg+0x38/0x54
 50)      360     360   el0_svc_naked+0x20/0x28

Best Regards
Jungseok Lee
Steven Rostedt July 14, 2015, 1:31 p.m. UTC | #2
On Tue, 14 Jul 2015 21:47:10 +0900
Jungseok Lee <jungseoklee85@gmail.com> wrote:

> Is the below example an unexpected result?
> Entry 17 and 18 are ftrace_call and ftrace_ops_no_ops, respectively.
> 
>         Depth    Size   Location    (51 entries)
>         -----    ----   --------
>   0)     5352      96   _raw_spin_unlock_irqrestore+0x1c/0x60
>   1)     5256      48   gic_raise_softirq+0xa0/0xbc
>   2)     5208      80   smp_cross_call+0x40/0xbc
>   3)     5128      48   smp_send_reschedule+0x38/0x48
>   4)     5080      32   trigger_load_balance+0x184/0x29c
>   5)     5048     112   scheduler_tick+0xac/0x104
>   6)     4936      64   update_process_times+0x5c/0x74
>   7)     4872      32   tick_sched_handle.isra.15+0x38/0x7c
>   8)     4840      48   tick_sched_timer+0x48/0x90
>   9)     4792      48   __run_hrtimer+0x60/0x258
>  10)     4744      64   hrtimer_interrupt+0xe8/0x260
>  11)     4680     128   arch_timer_handler_virt+0x38/0x48
>  12)     4552      32   handle_percpu_devid_irq+0x84/0x188
>  13)     4520      64   generic_handle_irq+0x38/0x54
>  14)     4456      32   __handle_domain_irq+0x68/0xbc
>  15)     4424      64   gic_handle_irq+0x38/0x88
>  16)     4360     280   el1_irq+0x64/0xd8

Note, function tracing does not disable interrupts. This looks to be
that an interrupt came in while __aloc_skb() was being traced.

-- Steve

>  17)     4080     168   ftrace_ops_no_ops+0xb4/0x16c
>  18)     3912      32   ftrace_call+0x0/0x4
>  19)     3880     144   __alloc_skb+0x48/0x180
>  20)     3736      96   alloc_skb_with_frags+0x74/0x234
>  21)     3640     112   sock_alloc_send_pskb+0x1d0/0x294
>  22)     3528     160   sock_alloc_send_skb+0x44/0x54
>  23)     3368      64   __ip_append_data.isra.40+0x78c/0xb48
>  24)     3304     224   ip_append_data.part.42+0x98/0xe8
>  25)     3080     112   ip_append_data+0x68/0x7c
>  26)     2968      96   icmp_push_reply+0x7c/0x144
>  27)     2872      96   icmp_send+0x3c0/0x3c8
>  28)     2776     192   __udp4_lib_rcv+0x5b8/0x684
>  29)     2584      96   udp_rcv+0x2c/0x3c
>  30)     2488      32   ip_local_deliver+0xa0/0x224
>  31)     2456      48   ip_rcv+0x360/0x57c
>  32)     2408      64   __netif_receive_skb_core+0x4d0/0x80c
>  33)     2344     128   __netif_receive_skb+0x24/0x84
>  34)     2216      32   process_backlog+0x9c/0x15c
>  35)     2184      80   net_rx_action+0x1ec/0x32c
>  36)     2104     160   __do_softirq+0x114/0x2f0
>  37)     1944     128   do_softirq+0x60/0x68
>  38)     1816      32   __local_bh_enable_ip+0xb0/0xd4
>  39)     1784      32   ip_finish_output+0x1f4/0xabc
>  40)     1752      96   ip_output+0xf0/0x120
>  41)     1656      64   ip_local_out_sk+0x44/0x54
>  42)     1592      32   ip_send_skb+0x24/0xbc
>  43)     1560      48   udp_send_skb+0x1b4/0x2f4
>  44)     1512      80   udp_sendmsg+0x2a8/0x7a0
>  45)     1432     272   inet_sendmsg+0xa0/0xd0
>  46)     1160      48   sock_sendmsg+0x30/0x78
>  47)     1112      32   ___sys_sendmsg+0x15c/0x26c
>  48)     1080     400   __sys_sendmmsg+0x94/0x180
>  49)      680     320   SyS_sendmmsg+0x38/0x54
>  50)      360     360   el0_svc_naked+0x20/0x28
> 
> Best Regards
> Jungseok Lee
AKASHI Takahiro July 15, 2015, 12:20 a.m. UTC | #3
On 07/14/2015 10:31 PM, Steven Rostedt wrote:
> On Tue, 14 Jul 2015 21:47:10 +0900
> Jungseok Lee <jungseoklee85@gmail.com> wrote:
>
>> Is the below example an unexpected result?
>> Entry 17 and 18 are ftrace_call and ftrace_ops_no_ops, respectively.

[snip]

> Note, function tracing does not disable interrupts. This looks to be
> that an interrupt came in while __aloc_skb() was being traced.

Yeah, I think so, too. But if my insight is correct, it's not __alloc_skb()
but one of functions that it calls. As I said in the commit log message
of patch[1/3], the exact traced function will not be listed by
save_stack_trace() because we don't create a stack frame at mcount().
I think this is a flaw in the current implementation (on x86).

what do you think, Steve?

-Takahiro AKASHI

>
> -- Steve
>
>>   17)     4080     168   ftrace_ops_no_ops+0xb4/0x16c
>>   18)     3912      32   ftrace_call+0x0/0x4
>>   19)     3880     144   __alloc_skb+0x48/0x180
>>   20)     3736      96   alloc_skb_with_frags+0x74/0x234
>>   21)     3640     112   sock_alloc_send_pskb+0x1d0/0x294
>>   22)     3528     160   sock_alloc_send_skb+0x44/0x54
>>   23)     3368      64   __ip_append_data.isra.40+0x78c/0xb48
>>   24)     3304     224   ip_append_data.part.42+0x98/0xe8
>>   25)     3080     112   ip_append_data+0x68/0x7c
>>   26)     2968      96   icmp_push_reply+0x7c/0x144
>>   27)     2872      96   icmp_send+0x3c0/0x3c8
>>   28)     2776     192   __udp4_lib_rcv+0x5b8/0x684
>>   29)     2584      96   udp_rcv+0x2c/0x3c
>>   30)     2488      32   ip_local_deliver+0xa0/0x224
>>   31)     2456      48   ip_rcv+0x360/0x57c
>>   32)     2408      64   __netif_receive_skb_core+0x4d0/0x80c
>>   33)     2344     128   __netif_receive_skb+0x24/0x84
>>   34)     2216      32   process_backlog+0x9c/0x15c
>>   35)     2184      80   net_rx_action+0x1ec/0x32c
>>   36)     2104     160   __do_softirq+0x114/0x2f0
>>   37)     1944     128   do_softirq+0x60/0x68
>>   38)     1816      32   __local_bh_enable_ip+0xb0/0xd4
>>   39)     1784      32   ip_finish_output+0x1f4/0xabc
>>   40)     1752      96   ip_output+0xf0/0x120
>>   41)     1656      64   ip_local_out_sk+0x44/0x54
>>   42)     1592      32   ip_send_skb+0x24/0xbc
>>   43)     1560      48   udp_send_skb+0x1b4/0x2f4
>>   44)     1512      80   udp_sendmsg+0x2a8/0x7a0
>>   45)     1432     272   inet_sendmsg+0xa0/0xd0
>>   46)     1160      48   sock_sendmsg+0x30/0x78
>>   47)     1112      32   ___sys_sendmsg+0x15c/0x26c
>>   48)     1080     400   __sys_sendmmsg+0x94/0x180
>>   49)      680     320   SyS_sendmmsg+0x38/0x54
>>   50)      360     360   el0_svc_naked+0x20/0x28
>>
>> Best Regards
>> Jungseok Lee
>
Steven Rostedt July 15, 2015, 2:51 a.m. UTC | #4
On Wed, 15 Jul 2015 09:20:42 +0900
AKASHI Takahiro <takahiro.akashi@linaro.org> wrote:

> On 07/14/2015 10:31 PM, Steven Rostedt wrote:
> > On Tue, 14 Jul 2015 21:47:10 +0900
> > Jungseok Lee <jungseoklee85@gmail.com> wrote:
> >
> >> Is the below example an unexpected result?
> >> Entry 17 and 18 are ftrace_call and ftrace_ops_no_ops, respectively.
> 
> [snip]
> 
> > Note, function tracing does not disable interrupts. This looks to be
> > that an interrupt came in while __aloc_skb() was being traced.
> 
> Yeah, I think so, too. But if my insight is correct, it's not __alloc_skb()
> but one of functions that it calls. As I said in the commit log message
> of patch[1/3], the exact traced function will not be listed by
> save_stack_trace() because we don't create a stack frame at mcount().
> I think this is a flaw in the current implementation (on x86).
> 
> what do you think, Steve?
> 

mcount (well ftrace_call actually) does indeed create a stack frame for
itself *and* for what called it. At least on x86_64. See mcount_64.S.

With -pg -mfentry, it creates a stack frame. Without -mfentry, mcount
is called after the current function's frame is made so we don't need
to do much.

Here's what the -mfentry version does:

	pushq %rbp
	pushq 8*2(%rsp)  /* this is the parent pointer */
	pushq %rbp
	movq %rsp, %rbp
	pushq 8*3(%rsp)   /* Return address to ftrace_call */
	pushq %rbp
	movq %rsp, %rbp


Thus the stack looks like this:

                                         <---+
	|                              |     |
	+------------------------------+     |
	| return address for func      |     |
	| return address for func_call |     |
	| original %rbp                |     |
	+------------------------------+     |
	| return address for func      |     |
	| ptr to parent frame (%rbp)   | ----+
 	+------------------------------| <-----+
	| return address for func_call |       |
        | ptr to next frame (%rbp)     | ------+
	+------------------------------+ <---+
                                             |
                                             |
 Current %rbp points to func_call frame -----+

 The first box isn't used as a frame, but is used by ftrace_call to save
 information to restore everything properly.

Thus, __alloc_skb() is what is currently being traced.


-- Steve
AKASHI Takahiro July 15, 2015, 11:41 a.m. UTC | #5
Steve,

On 07/15/2015 11:51 AM, Steven Rostedt wrote:
> On Wed, 15 Jul 2015 09:20:42 +0900
> AKASHI Takahiro <takahiro.akashi@linaro.org> wrote:
>
>> On 07/14/2015 10:31 PM, Steven Rostedt wrote:
>>> On Tue, 14 Jul 2015 21:47:10 +0900
>>> Jungseok Lee <jungseoklee85@gmail.com> wrote:
>>>
>>>> Is the below example an unexpected result?
>>>> Entry 17 and 18 are ftrace_call and ftrace_ops_no_ops, respectively.
>>
>> [snip]
>>
>>> Note, function tracing does not disable interrupts. This looks to be
>>> that an interrupt came in while __aloc_skb() was being traced.
>>
>> Yeah, I think so, too. But if my insight is correct, it's not __alloc_skb()
>> but one of functions that it calls. As I said in the commit log message
>> of patch[1/3], the exact traced function will not be listed by

not patch[1/3], but patch[3/3]

>> save_stack_trace() because we don't create a stack frame at mcount().
>> I think this is a flaw in the current implementation (on x86).
>>
>> what do you think, Steve?
>>
>
> mcount (well ftrace_call actually) does indeed create a stack frame for
> itself *and* for what called it. At least on x86_64. See mcount_64.S.
>
> With -pg -mfentry, it creates a stack frame. Without -mfentry, mcount
> is called after the current function's frame is made so we don't need
> to do much.

Thank you for the explanation. But what I don't really understand here
is why we need to add the "current function" to the stack dump list
returned by save_stack_trace():

In check_stack(),
 >        /*
 >         * Add the passed in ip from the function tracer.
 >         * Searching for this on the stack will skip over
 >         * most of the overhead from the stack tracer itself.
 >         */
 >        stack_dump_trace[0] = ip;
 >        max_stack_trace.nr_entries++;

I think that "ip" here is the "return address for func" in your
ascii art, and it should be already in the list if a frame is made
by mcount (or func_call).

In fact, stack tracer on arm64 works OK even without the patch[3/3]
if the code quoted above is commented out.
Even on x86, the code is conditional and not activated if the kernel
is compiled without -mfentry before the following commit:
     commit 4df297129f62 ("tracing: Remove most or all of stack tracer stack size from stack_max_size")

So what do I misunderstand here?

Thanks,
-Takahiro AKASHI

> Here's what the -mfentry version does:
>
> 	pushq %rbp
> 	pushq 8*2(%rsp)  /* this is the parent pointer */
> 	pushq %rbp
> 	movq %rsp, %rbp
> 	pushq 8*3(%rsp)   /* Return address to ftrace_call */
> 	pushq %rbp
> 	movq %rsp, %rbp
>
>
> Thus the stack looks like this:
>
>                                           <---+
> 	|                              |     |
> 	+------------------------------+     |
> 	| return address for func      |     |
> 	| return address for func_call |     |
> 	| original %rbp                |     |
> 	+------------------------------+     |
> 	| return address for func      |     |
> 	| ptr to parent frame (%rbp)   | ----+
>   	+------------------------------| <-----+
> 	| return address for func_call |       |
>          | ptr to next frame (%rbp)     | ------+
> 	+------------------------------+ <---+
>                                               |
>                                               |
>   Current %rbp points to func_call frame -----+
>
>   The first box isn't used as a frame, but is used by ftrace_call to save
>   information to restore everything properly.
>
> Thus, __alloc_skb() is what is currently being traced.
>
>
> -- Steve
>
Steven Rostedt July 15, 2015, 2:55 p.m. UTC | #6
On Wed, 15 Jul 2015 20:41:34 +0900
AKASHI Takahiro <takahiro.akashi@linaro.org> wrote:


> Thank you for the explanation. But what I don't really understand here
> is why we need to add the "current function" to the stack dump list
> returned by save_stack_trace():
> 
> In check_stack(),
>  >        /*
>  >         * Add the passed in ip from the function tracer.
>  >         * Searching for this on the stack will skip over
>  >         * most of the overhead from the stack tracer itself.
>  >         */
>  >        stack_dump_trace[0] = ip;
>  >        max_stack_trace.nr_entries++;
> 
> I think that "ip" here is the "return address for func" in your

Ah, you are correct (for fentry).

> ascii art, and it should be already in the list if a frame is made
> by mcount (or func_call).
> 
> In fact, stack tracer on arm64 works OK even without the patch[3/3]
> if the code quoted above is commented out.
> Even on x86, the code is conditional and not activated if the kernel
> is compiled without -mfentry before the following commit:
>      commit 4df297129f62 ("tracing: Remove most or all of stack tracer stack size from stack_max_size")
> 
> So what do I misunderstand here?
> 

Hmm, I haven't touched the stack trace code in a while. With the added
stack frame for fentry, the hacks there are probably not needed.

I'll take a look at it and try to clean up the code.

Thanks,

-- Steve
diff mbox

Patch

diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 407991b..978c923 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -97,34 +97,49 @@  static int save_trace(struct stackframe *frame, void *d)
 	return trace->nr_entries >= trace->max_entries;
 }
 
-void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
+static inline void __save_stack_trace(struct stackframe *frame,
+			struct stack_trace *trace, int no_sched)
 {
 	struct stack_trace_data data;
-	struct stackframe frame;
 
 	data.trace = trace;
 	data.skip = trace->skip;
+	data.no_sched_functions = no_sched;
+
+	walk_stackframe(frame, save_trace, &data);
+	if (trace->nr_entries < trace->max_entries)
+		trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+
+void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
+{
+	struct stackframe frame;
+	int no_sched;
 
 	if (tsk != current) {
-		data.no_sched_functions = 1;
+		no_sched = 1;
 		frame.fp = thread_saved_fp(tsk);
 		frame.sp = thread_saved_sp(tsk);
 		frame.pc = thread_saved_pc(tsk);
 	} else {
-		data.no_sched_functions = 0;
+		no_sched = 0;
 		frame.fp = (unsigned long)__builtin_frame_address(0);
 		frame.sp = current_stack_pointer;
 		frame.pc = (unsigned long)save_stack_trace_tsk;
 	}
 
-	walk_stackframe(&frame, save_trace, &data);
-	if (trace->nr_entries < trace->max_entries)
-		trace->entries[trace->nr_entries++] = ULONG_MAX;
+	__save_stack_trace(&frame, trace, no_sched);
 }
 
 void save_stack_trace(struct stack_trace *trace)
 {
-	save_stack_trace_tsk(current, trace);
+	struct stackframe frame;
+
+	frame.fp = (unsigned long)__builtin_frame_address(0);
+	frame.sp = current_stack_pointer;
+	frame.pc = (unsigned long)save_stack_trace_tsk;
+
+	__save_stack_trace(&frame, trace, 0);
 }
 EXPORT_SYMBOL_GPL(save_stack_trace);
 #endif