diff mbox series

[v2] ARM: unwind: improve unwinders for noreturn case

Message ID 1710906278-23851-1-git-send-email-xiaojiangfeng@huawei.com (mailing list archive)
State Superseded
Headers show
Series [v2] ARM: unwind: improve unwinders for noreturn case | expand

Commit Message

Jiangfeng Xiao March 20, 2024, 3:44 a.m. UTC
This is an off-by-one bug which is common in unwinders,
due to the fact that the address on the stack points
to the return address rather than the call address.

So, for example, when the last instruction of a function
is a function call (e.g., to a noreturn function), it can
cause the unwinder to incorrectly try to unwind from
the function after the callee.

foo:
...
	bl	bar
... end of function and thus next function ...

which results in LR pointing into the next function.

Fixed this by subtracting 1 from frmae->pc in the call frame
(but not exception frames) like ORC on x86 does.

Refer to the unwind_next_frame function in the unwind_orc.c

Suggested-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://lkml.kernel.org/lkml/20240305175846.qnyiru7uaa7itqba@treble/
Signed-off-by: Jiangfeng Xiao <xiaojiangfeng@huawei.com>
---
ChangeLog v1->v2
- stay printk("%s...", loglvl, ...)
---
 arch/arm/include/asm/stacktrace.h |  4 ----
 arch/arm/kernel/stacktrace.c      |  2 --
 arch/arm/kernel/traps.c           |  4 ++--
 arch/arm/kernel/unwind.c          | 18 +++++++++++++++---
 4 files changed, 17 insertions(+), 11 deletions(-)

Comments

Russell King (Oracle) March 20, 2024, 8:45 a.m. UTC | #1
On Wed, Mar 20, 2024 at 11:44:38AM +0800, Jiangfeng Xiao wrote:
> This is an off-by-one bug which is common in unwinders,
> due to the fact that the address on the stack points
> to the return address rather than the call address.
> 
> So, for example, when the last instruction of a function
> is a function call (e.g., to a noreturn function), it can
> cause the unwinder to incorrectly try to unwind from
> the function after the callee.
> 
> foo:
> ...
> 	bl	bar
> ... end of function and thus next function ...
> 
> which results in LR pointing into the next function.
> 
> Fixed this by subtracting 1 from frmae->pc in the call frame
> (but not exception frames) like ORC on x86 does.

The reason that I'm not accepting this patch is because the above says
that it fixes it by subtracting 1 from the PC value, but the patch is
*way* more complicated than that and there's no explanation why.

For example, the following are unexplained:

- Why do we always need ex_frame
- What is the purpose of the change in format string for the display of
  backtraces

> 
> Refer to the unwind_next_frame function in the unwind_orc.c
> 
> Suggested-by: Josh Poimboeuf <jpoimboe@kernel.org>
> Link: https://lkml.kernel.org/lkml/20240305175846.qnyiru7uaa7itqba@treble/
> Signed-off-by: Jiangfeng Xiao <xiaojiangfeng@huawei.com>
> ---
> ChangeLog v1->v2
> - stay printk("%s...", loglvl, ...)
> ---
>  arch/arm/include/asm/stacktrace.h |  4 ----
>  arch/arm/kernel/stacktrace.c      |  2 --
>  arch/arm/kernel/traps.c           |  4 ++--
>  arch/arm/kernel/unwind.c          | 18 +++++++++++++++---
>  4 files changed, 17 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/arm/include/asm/stacktrace.h b/arch/arm/include/asm/stacktrace.h
> index 360f0d2..07e4c16 100644
> --- a/arch/arm/include/asm/stacktrace.h
> +++ b/arch/arm/include/asm/stacktrace.h
> @@ -21,9 +21,7 @@ struct stackframe {
>  	struct llist_node *kr_cur;
>  	struct task_struct *tsk;
>  #endif
> -#ifdef CONFIG_UNWINDER_FRAME_POINTER
>  	bool ex_frame;
> -#endif
>  };
>  
>  static __always_inline
> @@ -37,9 +35,7 @@ void arm_get_current_stackframe(struct pt_regs *regs, struct stackframe *frame)
>  		frame->kr_cur = NULL;
>  		frame->tsk = current;
>  #endif
> -#ifdef CONFIG_UNWINDER_FRAME_POINTER
>  		frame->ex_frame = in_entry_text(frame->pc);
> -#endif
>  }
>  
>  extern int unwind_frame(struct stackframe *frame);
> diff --git a/arch/arm/kernel/stacktrace.c b/arch/arm/kernel/stacktrace.c
> index 620aa82..1abd4f9 100644
> --- a/arch/arm/kernel/stacktrace.c
> +++ b/arch/arm/kernel/stacktrace.c
> @@ -154,9 +154,7 @@ static void start_stack_trace(struct stackframe *frame, struct task_struct *task
>  	frame->kr_cur = NULL;
>  	frame->tsk = task;
>  #endif
> -#ifdef CONFIG_UNWINDER_FRAME_POINTER
>  	frame->ex_frame = in_entry_text(frame->pc);
> -#endif
>  }
>  
>  void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
> diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
> index 3bad79d..46a5b1e 100644
> --- a/arch/arm/kernel/traps.c
> +++ b/arch/arm/kernel/traps.c
> @@ -84,10 +84,10 @@ void dump_backtrace_entry(unsigned long where, unsigned long from,
>  	printk("%sFunction entered at [<%08lx>] from [<%08lx>]\n",
>  		loglvl, where, from);
>  #elif defined CONFIG_BACKTRACE_VERBOSE
> -	printk("%s[<%08lx>] (%ps) from [<%08lx>] (%pS)\n",
> +	printk("%s[<%08lx>] (%ps) from [<%08lx>] (%pB)\n",
>  		loglvl, where, (void *)where, from, (void *)from);
>  #else
> -	printk("%s %ps from %pS\n", loglvl, (void *)where, (void *)from);
> +	printk("%s %ps from %pB\n", loglvl, (void *)where, (void *)from);
>  #endif
>  
>  	if (in_entry_text(from) && end <= ALIGN(frame, THREAD_SIZE))
> diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
> index 9d21921..f2baf92 100644
> --- a/arch/arm/kernel/unwind.c
> +++ b/arch/arm/kernel/unwind.c
> @@ -30,6 +30,7 @@
>  #include <linux/list.h>
>  #include <linux/module.h>
>  
> +#include <asm/sections.h>
>  #include <asm/stacktrace.h>
>  #include <asm/traps.h>
>  #include <asm/unwind.h>
> @@ -416,8 +417,14 @@ int unwind_frame(struct stackframe *frame)
>  
>  	pr_debug("%s(pc = %08lx lr = %08lx sp = %08lx)\n", __func__,
>  		 frame->pc, frame->lr, frame->sp);
> -
> -	idx = unwind_find_idx(frame->pc);
> +	/*
> +	 * For a call frame (as opposed to a exception frame), when the last
> +	 * instruction of a function is a function call (e.g., to a noreturn
> +	 * function), it can cause the unwinder incorrectly try to unwind
> +	 * from the function after the callee, fixed this by subtracting 1
> +	 * from frame->pc in the call frame like ORC on x86 does.
> +	 */
> +	idx = unwind_find_idx(frame->ex_frame ? frame->pc : frame->pc - 1);
>  	if (!idx) {
>  		if (frame->pc && kernel_text_address(frame->pc)) {
>  			if (in_module_plt(frame->pc) && frame->pc != frame->lr) {
> @@ -427,6 +434,7 @@ int unwind_frame(struct stackframe *frame)
>  				 * the state of the stack or the register file
>  				 */
>  				frame->pc = frame->lr;
> +				frame->ex_frame = in_entry_text(frame->pc);
>  				return URC_OK;
>  			}
>  			pr_warn("unwind: Index not found %08lx\n", frame->pc);
> @@ -454,6 +462,7 @@ int unwind_frame(struct stackframe *frame)
>  		if (frame->pc == frame->lr)
>  			return -URC_FAILURE;
>  		frame->pc = frame->lr;
> +		frame->ex_frame = in_entry_text(frame->pc);
>  		return URC_OK;
>  	} else if ((idx->insn & 0x80000000) == 0)
>  		/* prel31 to the unwind table */
> @@ -515,6 +524,7 @@ int unwind_frame(struct stackframe *frame)
>  	frame->lr = ctrl.vrs[LR];
>  	frame->pc = ctrl.vrs[PC];
>  	frame->lr_addr = ctrl.lr_addr;
> +	frame->ex_frame = in_entry_text(frame->pc);
>  
>  	return URC_OK;
>  }
> @@ -544,6 +554,7 @@ void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk,
>  		 */
>  here:
>  		frame.pc = (unsigned long)&&here;
> +		frame.ex_frame = false;
>  	} else {
>  		/* task blocked in __switch_to */
>  		frame.fp = thread_saved_fp(tsk);
> @@ -554,11 +565,12 @@ void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk,
>  		 */
>  		frame.lr = 0;
>  		frame.pc = thread_saved_pc(tsk);
> +		frame.ex_frame = false;
>  	}
>  
>  	while (1) {
>  		int urc;
> -		unsigned long where = frame.pc;
> +		unsigned long where = frame.ex_frame ? frame.pc : frame.pc - 1;
>  
>  		urc = unwind_frame(&frame);
>  		if (urc < 0)
> -- 
> 1.8.5.6
> 
>
Jiangfeng Xiao March 20, 2024, 3:30 p.m. UTC | #2
On 2024/3/20 16:45, Russell King (Oracle) wrote:
> On Wed, Mar 20, 2024 at 11:44:38AM +0800, Jiangfeng Xiao wrote:
>> This is an off-by-one bug which is common in unwinders,
>> due to the fact that the address on the stack points
>> to the return address rather than the call address.
>>
>> So, for example, when the last instruction of a function
>> is a function call (e.g., to a noreturn function), it can
>> cause the unwinder to incorrectly try to unwind from
>> the function after the callee.
>>
>> foo:
>> ...
>> 	bl	bar
>> ... end of function and thus next function ...
>>
>> which results in LR pointing into the next function.
>>
>> Fixed this by subtracting 1 from frmae->pc in the call frame
>> (but not exception frames) like ORC on x86 does.
> 
> The reason that I'm not accepting this patch is because the above says
> that it fixes it by subtracting 1 from the PC value, but the patch is
> *way* more complicated than that and there's no explanation why.
> 
> For example, the following are unexplained:
> 
> - Why do we always need ex_frame

```
bar:
...
... end of function bar ...

foo:
    BUG();
... end of function foo ...
```

For example, when the first instruction of function 'foo'
is a undefined instruction, after function 'foo' is executed
to trigger an exception, 'arm_get_current_stackframe' assigns
'regs->ARM_pc' to 'frame.pc'.

If we always decrement frame.pc by 1, unwinder will incorrectly
try to unwind from the function 'bar' before the function 'foo'.

So we need to 'ext_frame' to distinguish this case
where we don't need to subtract 1.


> - What is the purpose of the change in format string for the display of
>   backtraces
```
unwind_frame(&frame);
dump_backtrace_entry(...from) //from = frame.pc
	printk("...%pS\n", ...(void *)from);
```
%pB will do sprint_backtrace and print the symbol at (from - 1) address
%pS will do sprint_symbol_build_id and print the symbol at (from) address

In unwind_frame, although we use 'frame->pc - 1' to execute unwind_find_idx,
but frame->pc itself does not change, in the noreturn case, frame->pc still
pointing into the next function. So this is going to be replaced by %pB.

> 
>>
>> Refer to the unwind_next_frame function in the unwind_orc.c
>>
>> Suggested-by: Josh Poimboeuf <jpoimboe@kernel.org>
>> Link: https://lkml.kernel.org/lkml/20240305175846.qnyiru7uaa7itqba@treble/
>> Signed-off-by: Jiangfeng Xiao <xiaojiangfeng@huawei.com>
>> ---
>> ChangeLog v1->v2
>> - stay printk("%s...", loglvl, ...)


Thank you for your suggestion.
I'll change the code to be more concise in my [patch v3].


>> -- 
>> 1.8.5.6
>>
>>
>
Russell King (Oracle) March 20, 2024, 7:40 p.m. UTC | #3
On Wed, Mar 20, 2024 at 11:30:05PM +0800, Jiangfeng Xiao wrote:
> 
> 
> On 2024/3/20 16:45, Russell King (Oracle) wrote:
> > On Wed, Mar 20, 2024 at 11:44:38AM +0800, Jiangfeng Xiao wrote:
> >> This is an off-by-one bug which is common in unwinders,
> >> due to the fact that the address on the stack points
> >> to the return address rather than the call address.
> >>
> >> So, for example, when the last instruction of a function
> >> is a function call (e.g., to a noreturn function), it can
> >> cause the unwinder to incorrectly try to unwind from
> >> the function after the callee.
> >>
> >> foo:
> >> ...
> >> 	bl	bar
> >> ... end of function and thus next function ...
> >>
> >> which results in LR pointing into the next function.
> >>
> >> Fixed this by subtracting 1 from frmae->pc in the call frame
> >> (but not exception frames) like ORC on x86 does.
> > 
> > The reason that I'm not accepting this patch is because the above says
> > that it fixes it by subtracting 1 from the PC value, but the patch is
> > *way* more complicated than that and there's no explanation why.
> > 
> > For example, the following are unexplained:
> > 
> > - Why do we always need ex_frame
> 
> ```
> bar:
> ...
> ... end of function bar ...
> 
> foo:
>     BUG();
> ... end of function foo ...
> ```
> 
> For example, when the first instruction of function 'foo'
> is a undefined instruction, after function 'foo' is executed
> to trigger an exception, 'arm_get_current_stackframe' assigns
> 'regs->ARM_pc' to 'frame.pc'.
> 
> If we always decrement frame.pc by 1, unwinder will incorrectly
> try to unwind from the function 'bar' before the function 'foo'.
> 
> So we need to 'ext_frame' to distinguish this case
> where we don't need to subtract 1.

This just sounds wrong to me. We have two different cases:

1) we're unwinding a frame where PC points at the offending instruction.
   This may or may not be in the exception text.
2) we're unwinding a frame that has been created because of a branch,
   where the PC points at the next instruction _after_ that callsite.

While we're unwinding, we will mostly hit the second type of frames, but
we'll only hit the first type on the initial frame. Some exception
entries will have the PC pointing at the next instruction to be
executed, others will have it pointing at the offending instruction
(e.g. if it needs to be retried.)

So, I don't see what being in the exception/entry text really has much
to do with any decision making here. I think you've found that it works
for your case, but it won't _always_ work and you're just shifting the
"bug" with how these traces work from one issue to a different but
similar issue.

> > - What is the purpose of the change in format string for the display of
> >   backtraces
> ```
> unwind_frame(&frame);
> dump_backtrace_entry(...from) //from = frame.pc
> 	printk("...%pS\n", ...(void *)from);
> ```
> %pB will do sprint_backtrace and print the symbol at (from - 1) address
> %pS will do sprint_symbol_build_id and print the symbol at (from) address

The quote in printk-formats.rst states:

        %pS     versatile_init+0x0/0x110
	%pB     prev_fn_of_versatile_init+0x88/0x88

This is rather ambiguous on its own, since the definition of "previous
function" is ambiguous. Given the offset and size stated there, it's
also not obvious what pointer was passed. It would be nice if these
examples actually said what the pointer passed in actually was.

I had been interpreting "prev_fn_of_" to mean the caller of
versatile_init() but it could also be the preceeding function in the
kernel text to versatile_init() - that is where the ambiguity comes
from.

So, the question I now have is... if %pB prints the symbol corresponding
with "from - 1", then

- with the frame pointer walker, from will always be the return address
  found on the stack for the function we are currently in.
- with the unwinder it will be whatever the unwinder computes as the LR
  register unless the unwind instructions place a non-zero value in PC.

Is there a case where the unwinder gets this wrong?

I think what would help is if you split this patch up, and addressed
each part separately, describing the issue that each part is addressing
giving an example that clearly explains what the patch is doing.
However, please note my comments above that using the fact that we're
in an exception frame doesn't actually tell you anything about whether
you need to correct the PC value or not.
Jiangfeng Xiao March 21, 2024, 9:44 a.m. UTC | #4
On 2024/3/21 3:40, Russell King (Oracle) wrote:
> On Wed, Mar 20, 2024 at 11:30:05PM +0800, Jiangfeng Xiao wrote:
>>
>>
>> On 2024/3/20 16:45, Russell King (Oracle) wrote:
>>> On Wed, Mar 20, 2024 at 11:44:38AM +0800, Jiangfeng Xiao wrote:
>>>> This is an off-by-one bug which is common in unwinders,
>>>> due to the fact that the address on the stack points
>>>> to the return address rather than the call address.
>>>>
>>>> So, for example, when the last instruction of a function
>>>> is a function call (e.g., to a noreturn function), it can
>>>> cause the unwinder to incorrectly try to unwind from
>>>> the function after the callee.
>>>>
>>>> foo:
>>>> ...
>>>> 	bl	bar
>>>> ... end of function and thus next function ...
>>>>
>>>> which results in LR pointing into the next function.
>>>>
>>>> Fixed this by subtracting 1 from frmae->pc in the call frame
>>>> (but not exception frames) like ORC on x86 does.
>>>
>>> The reason that I'm not accepting this patch is because the above says
>>> that it fixes it by subtracting 1 from the PC value, but the patch is
>>> *way* more complicated than that and there's no explanation why.
>>>
>>> For example, the following are unexplained:
>>>
>>> - Why do we always need ex_frame
>>
>> ```
>> bar:
>> ...
>> ... end of function bar ...
>>
>> foo:
>>     BUG();
>> ... end of function foo ...
>> ```
>>
>> For example, when the first instruction of function 'foo'
>> is a undefined instruction, after function 'foo' is executed
>> to trigger an exception, 'arm_get_current_stackframe' assigns
>> 'regs->ARM_pc' to 'frame.pc'.
>>
>> If we always decrement frame.pc by 1, unwinder will incorrectly
>> try to unwind from the function 'bar' before the function 'foo'.
>>
>> So we need to 'ext_frame' to distinguish this case
>> where we don't need to subtract 1.
> 
> This just sounds wrong to me. We have two different cases:
> 
> 1) we're unwinding a frame where PC points at the offending instruction.
>    This may or may not be in the exception text.
> 2) we're unwinding a frame that has been created because of a branch,
>    where the PC points at the next instruction _after_ that callsite.
> 
> While we're unwinding, we will mostly hit the second type of frames, but
> we'll only hit the first type on the initial frame. Some exception
> entries will have the PC pointing at the next instruction to be
> executed, others will have it pointing at the offending instruction
> (e.g. if it needs to be retried.)


Thank you for your summary.

Now we try to enumerate all cases:

1) When we hit the first type of frames

1.1) If the pc pointing at the next instruction
     of the offending instruction

1.1.1) If the offending instruction is the first instruction
       of the function
       pc:   cause to unwind from current function
       pc-1: casue to unwind from current function

1.1.2) If the offending instruction is neither the first
       instruction nor the last instruction of the function
       pc:   cause to unwind from current function
       pc-1: casue to unwind from current function

1.1.3) If the offending instruction is the last instruction
       of the function
       pc:   cause to unwind from next    function
       pc-1: casue to unwind from current function

1.2) If the pc pointing at the offending instruction

1.2.1) If the offending instruction is the first instruction
       of the function
       pc:   cause to unwind from current  function
       pc-1: casue to unwind from previous function

1.2.2) If the offending instruction is neither the first
       instruction nor the last instruction of the function
       pc:   cause to unwind from current function
       pc-1: casue to unwind from current function

1.2.3) If the offending instruction is the last instruction
       of the function
       pc:   cause to unwind from current function
       pc-1: casue to unwind from current function

2) When we hit the second type of frames
2.1) pc always pointing at the next instruction after that callsite
2.1.1) If the callsite is the first instruction
       pc:   cause to unwind from current function
       pc-1: casue to unwind from current function

2.1.2) If the callsite is neither the first nor last instruction
       pc:   cause to unwind from current function
       pc-1: casue to unwind from current function

2.1.3) If the callsite is the last instruction
       pc:   cause to unwind from next    function
       pc-1: casue to unwind from current function


All in all, I think you are right.

In case 2), We can always unwind by 'pc-1'.

In case 1), If we unwind by 'pc', case 1.1.3) is problematic.
If we unwind by 'pc-1', 1.2.1) is problematic.


> 
> So, I don't see what being in the exception/entry text really has much
> to do with any decision making here. I think you've found that it works
> for your case, but it won't _always_ work and you're just shifting the
> "bug" with how these traces work from one issue to a different but
> similar issue.
David Laight March 21, 2024, 10:22 a.m. UTC | #5
How aggressively does the compiler optimise 'noreturn' functions?
Consider:
void f(...)
{
	...
	if () {
		...
		noreturn(...);
	}
}

Without the noreturn() call it is a leaf function.
So the compiler doesn't need to save 'lr' on stack
(or the save could be deferred to inside the conditional).
Since noreturn() doesn't return it can be jumped to.
Additionally 'lr' can be used as a scratch register prior to
the noreturn() call.

So it is likely that inside noreturn() (and anything it
might call) you don't have a valid 'lr' chain at all.
No amount of picking between 'pc' and 'pc-1' is going to fix that.
The only way you can find a return address is by searching the
stack and hoping to find something that works.

So you need the compiler to 'not believe' the 'noreturn' attribute.
Setup a normal call frame and put a faulting instruction after the
call in case it returns.
That would give you half a chance of generating a backtrace.

Without that I suspect you are playing whack-a-mole.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Russell King (Oracle) March 21, 2024, 11:23 a.m. UTC | #6
On Thu, Mar 21, 2024 at 10:22:30AM +0000, David Laight wrote:
> How aggressively does the compiler optimise 'noreturn' functions?

I've seen cases where the compiler emits a BL instruction as the very
last thing in the function, and nothing after it.

This is where the problem lies - because the link register value
created by the BL instruction will point to the instruction after the
BL which will _not_ part of the function that invoked the BL. That
will probably cause issues for the ELF unwinder, which means this
issue probably goes beyond _just_ printing the function name.

I have vague memories that Ard has been involved in the unwinder,
maybe he could comment on this problem? Maybe we need the unwinder
itself to do the correction? I also wonder whether we should only
do the correction if we detect that we're pointing at the first
instruction of a function, and the previous instruction in the
text segment was a BL.
David Laight March 21, 2024, 12:07 p.m. UTC | #7
From: Russell King
> Sent: 21 March 2024 11:24
> 
> On Thu, Mar 21, 2024 at 10:22:30AM +0000, David Laight wrote:
> > How aggressively does the compiler optimise 'noreturn' functions?
> 
> I've seen cases where the compiler emits a BL instruction as the very
> last thing in the function, and nothing after it.

I've also seen the compiler defer generating a stack frame until
after an initial conditional.
That might mean you can get the BL in the middle of a function
but where the following instruction is for the 'no stack frame'
side of the branch.
That is very likely to break any stack offset calculations. 

> This is where the problem lies - because the link register value
> created by the BL instruction will point to the instruction after the
> BL which will _not_ part of the function that invoked the BL. That
> will probably cause issues for the ELF unwinder, which means this
> issue probably goes beyond _just_ printing the function name.

Isn't this already in the unwinder?
A BL itself isn't going to fault with PC = next-instruction.

For pretty much all code isn't *(LR-4) going to be BL?
On arm that is probably testable.
(It is pretty much impossible to detect a ACLL on x86.)
If it is a direct BL then you'd normally expect to the be
a call the function containing the current 'PC'.
The obvious exception is if there was a tail call, and printing
the called address would then be useful.
(It might help with leaf functions that don't generate a stack frame.)

I remember issues with the solaris sparc backtrace that used to
get confused by leaf functions...

> I have vague memories that Ard has been involved in the unwinder,
> maybe he could comment on this problem? Maybe we need the unwinder
> itself to do the correction? I also wonder whether we should only
> do the correction if we detect that we're pointing at the first
> instruction of a function, and the previous instruction in the
> text segment was a BL.

It might be enough to depend on whether the address is that of a
fault (where the instruction could be retried) or from a call/trap
instruction where it will be the following instruction.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Russell King (Oracle) March 21, 2024, 12:22 p.m. UTC | #8
On Thu, Mar 21, 2024 at 12:07:51PM +0000, David Laight wrote:
> From: Russell King
> > Sent: 21 March 2024 11:24
> > 
> > On Thu, Mar 21, 2024 at 10:22:30AM +0000, David Laight wrote:
> > > How aggressively does the compiler optimise 'noreturn' functions?
> > 
> > I've seen cases where the compiler emits a BL instruction as the very
> > last thing in the function, and nothing after it.
> 
> I've also seen the compiler defer generating a stack frame until
> after an initial conditional.

.. which is why we pass -mno-sched-prolog to GCC.

> That might mean you can get the BL in the middle of a function
> but where the following instruction is for the 'no stack frame'
> side of the branch.
> That is very likely to break any stack offset calculations. 

No it can't. At any one point in the function, the stack has to be in
a well defined state, so that access to local variables can work, and
also the stack can be correctly unwound. If there exists a point in
the function body which can be reached where the stack could be in two
different states, then the stack can't be restored to the parent
context.

> > This is where the problem lies - because the link register value
> > created by the BL instruction will point to the instruction after the
> > BL which will _not_ part of the function that invoked the BL. That
> > will probably cause issues for the ELF unwinder, which means this
> > issue probably goes beyond _just_ printing the function name.
> 
> Isn't this already in the unwinder?
> A BL itself isn't going to fault with PC = next-instruction.

You are missing the fact that the PC can be the saved LR, and thus
can very well be the next instruction.
David Laight March 21, 2024, 12:57 p.m. UTC | #9
From: Russell King
> Sent: 21 March 2024 12:23
...
> > That might mean you can get the BL in the middle of a function
> > but where the following instruction is for the 'no stack frame'
> > side of the branch.
> > That is very likely to break any stack offset calculations.
> 
> No it can't. At any one point in the function, the stack has to be in
> a well defined state, so that access to local variables can work, and
> also the stack can be correctly unwound. If there exists a point in
> the function body which can be reached where the stack could be in two
> different states, then the stack can't be restored to the parent
> context.

Actually you can get there with a function that has a lot of args.
So you can have:
	if (...) {
		push x
		bl func
		add %sp, #8
	}
	code;
which is fine.
But if 'func' is 'noreturn' then the 'add %sp, #8' can be discarded
and then the saved LR is that of 'code' - but the stack offset is wrong.

> > > This is where the problem lies - because the link register value
> > > created by the BL instruction will point to the instruction after the
> > > BL which will _not_ part of the function that invoked the BL. That
> > > will probably cause issues for the ELF unwinder, which means this
> > > issue probably goes beyond _just_ printing the function name.
> >
> > Isn't this already in the unwinder?
> > A BL itself isn't going to fault with PC = next-instruction.
> 
> You are missing the fact that the PC can be the saved LR, and thus
> can very well be the next instruction.

A PC from LR will always be the next instruction.
It is only the PC from a fault frame that is the current one.
The unwinder probably need to be told which one it has.
(Or add 4 the fault frame PC so that the unwinder can subtract
4 from it.)

At least (I don't think) there are any functions where the
called code is responsible for removing arguments.
That is a whole different bag of worms.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Russell King (Oracle) March 21, 2024, 1:08 p.m. UTC | #10
On Thu, Mar 21, 2024 at 12:57:07PM +0000, David Laight wrote:
> From: Russell King
> > Sent: 21 March 2024 12:23
> ...
> > > That might mean you can get the BL in the middle of a function
> > > but where the following instruction is for the 'no stack frame'
> > > side of the branch.
> > > That is very likely to break any stack offset calculations.
> > 
> > No it can't. At any one point in the function, the stack has to be in
> > a well defined state, so that access to local variables can work, and
> > also the stack can be correctly unwound. If there exists a point in
> > the function body which can be reached where the stack could be in two
> > different states, then the stack can't be restored to the parent
> > context.
> 
> Actually you can get there with a function that has a lot of args.
> So you can have:
> 	if (...) {
> 		push x
> 		bl func
> 		add %sp, #8
> 	}
> 	code;
> which is fine.

No you can't.... and that isn't even Arm code. Arm doesn't use %sp.
Moreover, that "bl" will stomp over the link register, meaning this
function can not return.

> But if 'func' is 'noreturn' then the 'add %sp, #8' can be discarded
> and then the saved LR is that of 'code' - but the stack offset is wrong.

If func is noreturn, then the remainder of that path isn't expected
to be executed, so anything that happens after the "bl" is irrelevant.

> A PC from LR will always be the next instruction.
> It is only the PC from a fault frame that is the current one.

That sentence makes no sense to me, as I don't think it's even proper
English, so I can't parse it.

> The unwinder probably need to be told which one it has.
> (Or add 4 the fault frame PC so that the unwinder can subtract
> 4 from it.)

That's basically what I said.
David Laight March 21, 2024, 2:37 p.m. UTC | #11
From: Russell King
> Sent: 21 March 2024 13:08
> 
> On Thu, Mar 21, 2024 at 12:57:07PM +0000, David Laight wrote:
> > From: Russell King
> > > Sent: 21 March 2024 12:23
> > ...
> > > > That might mean you can get the BL in the middle of a function
> > > > but where the following instruction is for the 'no stack frame'
> > > > side of the branch.
> > > > That is very likely to break any stack offset calculations.
> > >
> > > No it can't. At any one point in the function, the stack has to be in
> > > a well defined state, so that access to local variables can work, and
> > > also the stack can be correctly unwound. If there exists a point in
> > > the function body which can be reached where the stack could be in two
> > > different states, then the stack can't be restored to the parent
> > > context.
> >
> > Actually you can get there with a function that has a lot of args.
> > So you can have:
> > 	if (...) {
> > 		push x
> > 		bl func
> > 		add %sp, #8
> > 	}
> > 	code;
> > which is fine.
> 
> No you can't.... and that isn't even Arm code. Arm doesn't use %sp.
> Moreover, that "bl" will stomp over the link register, meaning this
> function can not return.

With 9+ arguments they spill to see https://godbolt.org/z/Yj3ovd8bY

Where the compiler generates:
f9:
        cmp     w0, 0
        ble     .L2
        sub     sp, sp, #32
        mov     w7, w0
        mov     w6, w0
        mov     w5, w0
        mov     w4, w0
        mov     w3, w0
        stp     x29, x30, [sp, 16]
        add     x29, sp, 16
        mov     w2, w0
        mov     w1, w0
        str     w0, [sp]
        bl      f
.L2:
        ret


A traceback from inside f() definitely needs to use LR-4
for the stack offset.

(arm64 doesn't seem to support -mno-sched-prolog).

I've failed to get different sized stack frames for the true/false
sides of the branch.
The compiler seems to pre-allocate the space for extra args rather
than using 'push' type instructions.
This was certainly better for some x86 cpu (p-pro?) but has now
gone out of fashion.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Russell King (Oracle) March 21, 2024, 2:56 p.m. UTC | #12
On Thu, Mar 21, 2024 at 02:37:28PM +0000, David Laight wrote:
> From: Russell King
> > Sent: 21 March 2024 13:08
> > 
> > On Thu, Mar 21, 2024 at 12:57:07PM +0000, David Laight wrote:
> > > From: Russell King
> > > > Sent: 21 March 2024 12:23
> > > ...
> > > > > That might mean you can get the BL in the middle of a function
> > > > > but where the following instruction is for the 'no stack frame'
> > > > > side of the branch.
> > > > > That is very likely to break any stack offset calculations.
> > > >
> > > > No it can't. At any one point in the function, the stack has to be in
> > > > a well defined state, so that access to local variables can work, and
> > > > also the stack can be correctly unwound. If there exists a point in
> > > > the function body which can be reached where the stack could be in two
> > > > different states, then the stack can't be restored to the parent
> > > > context.
> > >
> > > Actually you can get there with a function that has a lot of args.
> > > So you can have:
> > > 	if (...) {
> > > 		push x
> > > 		bl func
> > > 		add %sp, #8
> > > 	}
> > > 	code;
> > > which is fine.
> > 
> > No you can't.... and that isn't even Arm code. Arm doesn't use %sp.
> > Moreover, that "bl" will stomp over the link register, meaning this
> > function can not return.
> 
> With 9+ arguments they spill to see https://godbolt.org/z/Yj3ovd8bY
> 
> Where the compiler generates:
> f9:
>         cmp     w0, 0
>         ble     .L2
>         sub     sp, sp, #32
>         mov     w7, w0
>         mov     w6, w0
>         mov     w5, w0
>         mov     w4, w0
>         mov     w3, w0
>         stp     x29, x30, [sp, 16]
>         add     x29, sp, 16
>         mov     w2, w0
>         mov     w1, w0
>         str     w0, [sp]
>         bl      f
> .L2:
>         ret

Don't show me Arm64 assembly when we're discussing Arm32.
David Laight March 21, 2024, 3:20 p.m. UTC | #13
From: Russell King
> Sent: 21 March 2024 14:56
> 
> On Thu, Mar 21, 2024 at 02:37:28PM +0000, David Laight wrote:
> > From: Russell King
> > > Sent: 21 March 2024 13:08
> > >
> > > On Thu, Mar 21, 2024 at 12:57:07PM +0000, David Laight wrote:
> > > > From: Russell King
> > > > > Sent: 21 March 2024 12:23
> > > > ...
> > > > > > That might mean you can get the BL in the middle of a function
> > > > > > but where the following instruction is for the 'no stack frame'
> > > > > > side of the branch.
> > > > > > That is very likely to break any stack offset calculations.
> > > > >
> > > > > No it can't. At any one point in the function, the stack has to be in
> > > > > a well defined state, so that access to local variables can work, and
> > > > > also the stack can be correctly unwound. If there exists a point in
> > > > > the function body which can be reached where the stack could be in two
> > > > > different states, then the stack can't be restored to the parent
> > > > > context.
> > > >
> > > > Actually you can get there with a function that has a lot of args.
> > > > So you can have:
> > > > 	if (...) {
> > > > 		push x
> > > > 		bl func
> > > > 		add %sp, #8
> > > > 	}
> > > > 	code;
> > > > which is fine.
> > >
> > > No you can't.... and that isn't even Arm code. Arm doesn't use %sp.
> > > Moreover, that "bl" will stomp over the link register, meaning this
> > > function can not return.
> >
...
> 
> Don't show me Arm64 assembly when we're discussing Arm32.

Oops - I'd assumed no one did 32bit :-)
In any case it is much the same, see https://godbolt.org/z/7dcbKrs76

f4:
        push    {r3, lr}
        subs    r3, r0, #0
        ble     .L2
        mov     r2, r3
        mov     r1, r3
        bl      f
.L2:
        pop     {r3, pc}

f5:
        subs    r3, r0, #0
        ble     .L6
        push    {lr}
        sub     sp, sp, #12
        mov     r2, r3
        mov     r1, r3
        str     r3, [sp]
        bl      f
.L6:
        bx      lr

That is with -mno-sched-prolog but with 5+ args they spill to stack
and the %sp change is pulled into the conditional.

It does look like %lr is being saved (and for arm64 I think).

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Russell King (Oracle) March 21, 2024, 3:33 p.m. UTC | #14
On Thu, Mar 21, 2024 at 03:20:57PM +0000, David Laight wrote:
> From: Russell King
> > Sent: 21 March 2024 14:56
> > 
> > On Thu, Mar 21, 2024 at 02:37:28PM +0000, David Laight wrote:
> > > From: Russell King
> > > > Sent: 21 March 2024 13:08
> > > >
> > > > On Thu, Mar 21, 2024 at 12:57:07PM +0000, David Laight wrote:
> > > > > From: Russell King
> > > > > > Sent: 21 March 2024 12:23
> > > > > ...
> > > > > > > That might mean you can get the BL in the middle of a function
> > > > > > > but where the following instruction is for the 'no stack frame'
> > > > > > > side of the branch.
> > > > > > > That is very likely to break any stack offset calculations.
> > > > > >
> > > > > > No it can't. At any one point in the function, the stack has to be in
> > > > > > a well defined state, so that access to local variables can work, and
> > > > > > also the stack can be correctly unwound. If there exists a point in
> > > > > > the function body which can be reached where the stack could be in two
> > > > > > different states, then the stack can't be restored to the parent
> > > > > > context.
> > > > >
> > > > > Actually you can get there with a function that has a lot of args.
> > > > > So you can have:
> > > > > 	if (...) {
> > > > > 		push x
> > > > > 		bl func
> > > > > 		add %sp, #8
> > > > > 	}
> > > > > 	code;
> > > > > which is fine.
> > > >
> > > > No you can't.... and that isn't even Arm code. Arm doesn't use %sp.
> > > > Moreover, that "bl" will stomp over the link register, meaning this
> > > > function can not return.
> > >
> ...
> > 
> > Don't show me Arm64 assembly when we're discussing Arm32.
> 
> Oops - I'd assumed no one did 32bit :-)
> In any case it is much the same, see https://godbolt.org/z/7dcbKrs76
> 
> f4:
>         push    {r3, lr}
>         subs    r3, r0, #0
>         ble     .L2
>         mov     r2, r3
>         mov     r1, r3
>         bl      f
> .L2:
>         pop     {r3, pc}
> 
> f5:
>         subs    r3, r0, #0
>         ble     .L6
>         push    {lr}
>         sub     sp, sp, #12
>         mov     r2, r3
>         mov     r1, r3
>         str     r3, [sp]
>         bl      f
> .L6:
>         bx      lr
> 
> That is with -mno-sched-prolog but with 5+ args they spill to stack
> and the %sp change is pulled into the conditional.
> 
> It does look like %lr is being saved (and for arm64 I think).

I see nothing that contradicts anything I've said in your example
output.

You have been previously refering to a "bl" in the prologue, which is
what I thought you were going to give an example of. There is no "bl"
in the prologue of f5, the "ble" instruction is a normal branch for
less-than-or-equal. It's b + le not bl + e.

At .L6, there will be a difference in stack, but as f() is declared
as no-return, anything that comes after it is utterly irrelevant as
control is not expected to reach any following instruction via that
path. If it _were_ to, then in the example you give above, because
"lr" points at the bx lr instruction, the result would be to endlessly
spin executing bx lr instructions.
Ard Biesheuvel March 21, 2024, 10:43 p.m. UTC | #15
On Thu, 21 Mar 2024 at 12:24, Russell King (Oracle)
<linux@armlinux.org.uk> wrote:
>
> On Thu, Mar 21, 2024 at 10:22:30AM +0000, David Laight wrote:
> > How aggressively does the compiler optimise 'noreturn' functions?
>
> I've seen cases where the compiler emits a BL instruction as the very
> last thing in the function, and nothing after it.
>
> This is where the problem lies - because the link register value
> created by the BL instruction will point to the instruction after the
> BL which will _not_ part of the function that invoked the BL. That
> will probably cause issues for the ELF unwinder, which means this
> issue probably goes beyond _just_ printing the function name.
>
> I have vague memories that Ard has been involved in the unwinder,
> maybe he could comment on this problem? Maybe we need the unwinder
> itself to do the correction? I also wonder whether we should only
> do the correction if we detect that we're pointing at the first
> instruction of a function, and the previous instruction in the
> text segment was a BL.
>

First of all, I consider this to be a defect in the toolchain. Given
that the compiler knows that the BL is not going to return, it should
simply emit a BRK instruction or similar after it. This would catch
inadvertent returns as well as eliminate this kind of confusion.

The ARM unwind format is not really suitable for asynchronous
unwinding, so unwinding across exceptions is always going to be hit
and miss. Also, we should consider what the unwind information
actually tells us: for debugging, it is useful to understand where we
came from, i.e., how we ended up in the situation where the backtrace
was triggered, but what it actually tells us is where we would have
gone next had execution not been interrupted. The latter notion is
important for things like reliable stacktrace and live patch, which
need to know where a task might be returning to.

Returning to the first instruction of a function is unusual, but in
the light of the above, I don't think we can assume that it means that
the call originated from the preceding function, even when it ends in
a BL instruction (although it would be highly likely, of course). The
tracers and other instrumentation might insert probes in the return
path, and I suspect that this may result in a similar situation in
some cases.

Given that this particular issue would just disappear if the compiler
would just insert a BRK after the BL, I'd prefer to explore first
whether we can get this fixed on the compiler side.
Russell King (Oracle) March 22, 2024, 12:08 a.m. UTC | #16
On Thu, Mar 21, 2024 at 11:43:41PM +0100, Ard Biesheuvel wrote:
> Given that this particular issue would just disappear if the compiler
> would just insert a BRK after the BL, I'd prefer to explore first
> whether we can get this fixed on the compiler side.

Arm32 doesn't have a BRK instruction. What would be appropriate after
the no-return BL would be OS specific.

Maybe a branch to "abort" would be a good idea though.
David Laight March 22, 2024, 9:24 a.m. UTC | #17
From: Russell King
> Sent: 22 March 2024 00:09
> 
> On Thu, Mar 21, 2024 at 11:43:41PM +0100, Ard Biesheuvel wrote:
> > Given that this particular issue would just disappear if the compiler
> > would just insert a BRK after the BL, I'd prefer to explore first
> > whether we can get this fixed on the compiler side.
> 
> Arm32 doesn't have a BRK instruction. What would be appropriate after
> the no-return BL would be OS specific.

It would need to depend on what was being compiled.
For the kernel it could be much the same as BUG().
(Probably without any extra data.)
I suspect that arm32 could use 'swi' in kernel space,
but you wouldn't want to use that in userspace.

Looks like armv5 has a bkpt instruction - could that be used?
Or does the kernel need to support armv4?

The last arm I wrote anything for was a strongarm.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Russell King (Oracle) March 22, 2024, 9:52 a.m. UTC | #18
On Fri, Mar 22, 2024 at 09:24:20AM +0000, David Laight wrote:
> From: Russell King
> > Sent: 22 March 2024 00:09
> > 
> > On Thu, Mar 21, 2024 at 11:43:41PM +0100, Ard Biesheuvel wrote:
> > > Given that this particular issue would just disappear if the compiler
> > > would just insert a BRK after the BL, I'd prefer to explore first
> > > whether we can get this fixed on the compiler side.
> > 
> > Arm32 doesn't have a BRK instruction. What would be appropriate after
> > the no-return BL would be OS specific.
> 
> It would need to depend on what was being compiled.

Yes, but as for the rest...

> For the kernel it could be much the same as BUG().
> (Probably without any extra data.)
> I suspect that arm32 could use 'swi' in kernel space,
> but you wouldn't want to use that in userspace.
> 
> Looks like armv5 has a bkpt instruction - could that be used?
> Or does the kernel need to support armv4?
> 
> The last arm I wrote anything for was a strongarm.

Thank you David, but remember - I have programmed 32-bit Arm since 1992,
and wrote the majority of the 32-bit Arm kernel support. I think I know
what I'm walking about by now.

The compiler can't do the same as BUG() - that is a kernel specific
construct and not an architecture one. It is an undefined instruction
specifically chosen to be undefined on both 32-bit and 16-bit Arm ISAs.

As for your idea of using "swi" in kernel space, no that's never going
to happen - to shoe-horn that into the SWI exception path for the sake
of the compiler would be totally idiotic - it would cause userspace
performance regressions for something that never happens. Moreover,
with EABI the "comment" field in the "swi" instruction is ignored so
all SWIs under EABI are treated the same. So no, that's not going to
work without causing inefficiencies - again - for a case that will
likely never happen.

Whereas we already provide an abort() function because iirc the
compiler used to emit branches to that due to noreturn functions. If
correct, there's previous convention for doing this - and abort() is
still exists in the kernel and in userspace since it's part of ANSI
C. This would be a more reliable and portable solution, but probably
not for embedded platforms - and that's probably why it got removed.

There isn't going to be a single solution to this which satisfies
everyone, and I don't blame the compiler people for deciding to
basically give up with putting any instruction after a call to a
no-return function - because there isn't an instruction defined in
the architecture that _could_ be put there that would work everywhere.
Jiangfeng Xiao March 22, 2024, 12:54 p.m. UTC | #19
On 2024/3/22 17:52, Russell King (Oracle) wrote:
> On Fri, Mar 22, 2024 at 09:24:20AM +0000, David Laight wrote:
>> From: Russell King
>>> Sent: 22 March 2024 00:09
>>>
>>> On Thu, Mar 21, 2024 at 11:43:41PM +0100, Ard Biesheuvel wrote:
>>>> Given that this particular issue would just disappear if the compiler
>>>> would just insert a BRK after the BL, I'd prefer to explore first
>>>> whether we can get this fixed on the compiler side.
>>>
>>> Arm32 doesn't have a BRK instruction. What would be appropriate after
>>> the no-return BL would be OS specific.
>>
>> It would need to depend on what was being compiled.
> 
> Yes, but as for the rest...
> 
>> For the kernel it could be much the same as BUG().
>> (Probably without any extra data.)
>> I suspect that arm32 could use 'swi' in kernel space,
>> but you wouldn't want to use that in userspace.
>>
>> Looks like armv5 has a bkpt instruction - could that be used?
>> Or does the kernel need to support armv4?
>>
>> The last arm I wrote anything for was a strongarm.
> 
> Thank you David, but remember - I have programmed 32-bit Arm since 1992,
> and wrote the majority of the 32-bit Arm kernel support. I think I know
> what I'm walking about by now.
> 
> The compiler can't do the same as BUG() - that is a kernel specific
> construct and not an architecture one. It is an undefined instruction
> specifically chosen to be undefined on both 32-bit and 16-bit Arm ISAs.
> 
> As for your idea of using "swi" in kernel space, no that's never going
> to happen - to shoe-horn that into the SWI exception path for the sake
> of the compiler would be totally idiotic - it would cause userspace
> performance regressions for something that never happens. Moreover,
> with EABI the "comment" field in the "swi" instruction is ignored so
> all SWIs under EABI are treated the same. So no, that's not going to
> work without causing inefficiencies - again - for a case that will
> likely never happen.
> 
> Whereas we already provide an abort() function because iirc the
> compiler used to emit branches to that due to noreturn functions. If
> correct, there's previous convention for doing this - and abort() is
> still exists in the kernel and in userspace since it's part of ANSI
> C. This would be a more reliable and portable solution, but probably
> not for embedded platforms - and that's probably why it got removed.
> 
> There isn't going to be a single solution to this which satisfies
> everyone, and I don't blame the compiler people for deciding to
> basically give up with putting any instruction after a call to a
> no-return function - because there isn't an instruction defined in
> the architecture that _could_ be put there that would work everywhere.
> 


If the compiler inserts (a branch to 'abort') behind (no-return BL)
that does not apply to ARM32 embedded platforms, do you think the
 "[PATCH v3] ARM: unwind: improve unwinders for noreturn case"
submitted the day before yesterday can be used as a
complementary solution?

2) we're unwinding a frame that has been created because of a branch,
   where the PC points at the next instruction _after_ that callsite.

When we hit the second type of frame, "pc-1" is closer to callsite,
and no problem is introduced. In addition, the issue of the 'noreturn'
can be solved.
David Laight March 22, 2024, 2:16 p.m. UTC | #20
...
> Whereas we already provide an abort() function because iirc the
> compiler used to emit branches to that due to noreturn functions. If
> correct, there's previous convention for doing this - and abort() is
> still exists in the kernel and in userspace since it's part of ANSI
> C. This would be a more reliable and portable solution, but probably
> not for embedded platforms - and that's probably why it got removed.

Wouldn't you want it to do a 'bl abort' so that you could do a backtrace
to find out which 'noreturn' function had returned?

But that leaves you with another 'noreturn' function that you have
difficulty generating a backtrace from.

So you'd need a compiler option to specify what to put there.
I'd suspect linux would like something that generates an 'undefined
instruction' trap - since that would be expected to fault with the
saved PC pointing to the instruction itself (but architectures may vary).

'One size' definitely doesn't 'fit all' :-)

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
diff mbox series

Patch

diff --git a/arch/arm/include/asm/stacktrace.h b/arch/arm/include/asm/stacktrace.h
index 360f0d2..07e4c16 100644
--- a/arch/arm/include/asm/stacktrace.h
+++ b/arch/arm/include/asm/stacktrace.h
@@ -21,9 +21,7 @@  struct stackframe {
 	struct llist_node *kr_cur;
 	struct task_struct *tsk;
 #endif
-#ifdef CONFIG_UNWINDER_FRAME_POINTER
 	bool ex_frame;
-#endif
 };
 
 static __always_inline
@@ -37,9 +35,7 @@  void arm_get_current_stackframe(struct pt_regs *regs, struct stackframe *frame)
 		frame->kr_cur = NULL;
 		frame->tsk = current;
 #endif
-#ifdef CONFIG_UNWINDER_FRAME_POINTER
 		frame->ex_frame = in_entry_text(frame->pc);
-#endif
 }
 
 extern int unwind_frame(struct stackframe *frame);
diff --git a/arch/arm/kernel/stacktrace.c b/arch/arm/kernel/stacktrace.c
index 620aa82..1abd4f9 100644
--- a/arch/arm/kernel/stacktrace.c
+++ b/arch/arm/kernel/stacktrace.c
@@ -154,9 +154,7 @@  static void start_stack_trace(struct stackframe *frame, struct task_struct *task
 	frame->kr_cur = NULL;
 	frame->tsk = task;
 #endif
-#ifdef CONFIG_UNWINDER_FRAME_POINTER
 	frame->ex_frame = in_entry_text(frame->pc);
-#endif
 }
 
 void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 3bad79d..46a5b1e 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -84,10 +84,10 @@  void dump_backtrace_entry(unsigned long where, unsigned long from,
 	printk("%sFunction entered at [<%08lx>] from [<%08lx>]\n",
 		loglvl, where, from);
 #elif defined CONFIG_BACKTRACE_VERBOSE
-	printk("%s[<%08lx>] (%ps) from [<%08lx>] (%pS)\n",
+	printk("%s[<%08lx>] (%ps) from [<%08lx>] (%pB)\n",
 		loglvl, where, (void *)where, from, (void *)from);
 #else
-	printk("%s %ps from %pS\n", loglvl, (void *)where, (void *)from);
+	printk("%s %ps from %pB\n", loglvl, (void *)where, (void *)from);
 #endif
 
 	if (in_entry_text(from) && end <= ALIGN(frame, THREAD_SIZE))
diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
index 9d21921..f2baf92 100644
--- a/arch/arm/kernel/unwind.c
+++ b/arch/arm/kernel/unwind.c
@@ -30,6 +30,7 @@ 
 #include <linux/list.h>
 #include <linux/module.h>
 
+#include <asm/sections.h>
 #include <asm/stacktrace.h>
 #include <asm/traps.h>
 #include <asm/unwind.h>
@@ -416,8 +417,14 @@  int unwind_frame(struct stackframe *frame)
 
 	pr_debug("%s(pc = %08lx lr = %08lx sp = %08lx)\n", __func__,
 		 frame->pc, frame->lr, frame->sp);
-
-	idx = unwind_find_idx(frame->pc);
+	/*
+	 * For a call frame (as opposed to a exception frame), when the last
+	 * instruction of a function is a function call (e.g., to a noreturn
+	 * function), it can cause the unwinder incorrectly try to unwind
+	 * from the function after the callee, fixed this by subtracting 1
+	 * from frame->pc in the call frame like ORC on x86 does.
+	 */
+	idx = unwind_find_idx(frame->ex_frame ? frame->pc : frame->pc - 1);
 	if (!idx) {
 		if (frame->pc && kernel_text_address(frame->pc)) {
 			if (in_module_plt(frame->pc) && frame->pc != frame->lr) {
@@ -427,6 +434,7 @@  int unwind_frame(struct stackframe *frame)
 				 * the state of the stack or the register file
 				 */
 				frame->pc = frame->lr;
+				frame->ex_frame = in_entry_text(frame->pc);
 				return URC_OK;
 			}
 			pr_warn("unwind: Index not found %08lx\n", frame->pc);
@@ -454,6 +462,7 @@  int unwind_frame(struct stackframe *frame)
 		if (frame->pc == frame->lr)
 			return -URC_FAILURE;
 		frame->pc = frame->lr;
+		frame->ex_frame = in_entry_text(frame->pc);
 		return URC_OK;
 	} else if ((idx->insn & 0x80000000) == 0)
 		/* prel31 to the unwind table */
@@ -515,6 +524,7 @@  int unwind_frame(struct stackframe *frame)
 	frame->lr = ctrl.vrs[LR];
 	frame->pc = ctrl.vrs[PC];
 	frame->lr_addr = ctrl.lr_addr;
+	frame->ex_frame = in_entry_text(frame->pc);
 
 	return URC_OK;
 }
@@ -544,6 +554,7 @@  void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk,
 		 */
 here:
 		frame.pc = (unsigned long)&&here;
+		frame.ex_frame = false;
 	} else {
 		/* task blocked in __switch_to */
 		frame.fp = thread_saved_fp(tsk);
@@ -554,11 +565,12 @@  void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk,
 		 */
 		frame.lr = 0;
 		frame.pc = thread_saved_pc(tsk);
+		frame.ex_frame = false;
 	}
 
 	while (1) {
 		int urc;
-		unsigned long where = frame.pc;
+		unsigned long where = frame.ex_frame ? frame.pc : frame.pc - 1;
 
 		urc = unwind_frame(&frame);
 		if (urc < 0)