diff mbox series

arm64: stacktrace: skip reporting LR at exception boundaries

Message ID 20241209110351.1876804-1-mark.rutland@arm.com (mailing list archive)
State New
Headers show
Series arm64: stacktrace: skip reporting LR at exception boundaries | expand

Commit Message

Mark Rutland Dec. 9, 2024, 11:03 a.m. UTC
Recently the arm64 stacktrace code was modified to report the LR at
exception boundaries, which interacts poorly with fgraph tracing. It is
possible for the LR to contain the start address of return_to_handler()
even when the LR is not live, and in such cases attempts to recover the
return address via ftrace_graph_ret_addr() may fail, triggering a
WARN_ON_ONCE() in kunwind_recover_return_address() and aborting the
unwind. This has resulted in test failures and unexpected warnings, as
reported by Aishwarya and Kent.

Handling unreliable LR values in these cases is likely to require some
larger rework, so for the moment avoid this problem by restoring the old
behaviour of skipping the LR at exception boundaries, as we did prior to
commit:

  c2c6b27b5aa14fa2 ("arm64: stacktrace: unwind exception boundaries")

This commit is effectively a partial revert, keeping the structures and
logic to explicitly identify exception boundaries while still skipping
reporting of the LR. The logic to explicitly identify exception
boundaries is still useful for general robustness and as a building
block for future support for reliable stacktracing.

Fixes: c2c6b27b5aa14fa2 ("arm64: stacktrace: unwind exception boundaries")
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reported-by: Aishwarya TCV <aishwarya.tcv@arm.com>
Reported-by: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/stacktrace.c | 24 ++----------------------
 1 file changed, 2 insertions(+), 22 deletions(-)

Comments

Mark Rutland Dec. 9, 2024, 11:54 a.m. UTC | #1
On Mon, Dec 09, 2024 at 11:03:51AM +0000, Mark Rutland wrote:
> Recently the arm64 stacktrace code was modified to report the LR at
> exception boundaries, which interacts poorly with fgraph tracing. It is
> possible for the LR to contain the start address of return_to_handler()
> even when the LR is not live, and in such cases attempts to recover the
> return address via ftrace_graph_ret_addr() may fail, triggering a
> WARN_ON_ONCE() in kunwind_recover_return_address() and aborting the
> unwind. This has resulted in test failures and unexpected warnings, as
> reported by Aishwarya and Kent.

To clarify, the issue reported by Kent at:

  http://lore.kernel.org/linux-arm-kernel/zbwbgkuvvciezpmigcp6gaahfxwm7cwhpzus7gtbfnbzsjb2n3@kfbdppbd74o4

... seems to be a distinct issue, and I had misunderstood the report
while writing up this commit message.

Regardless of that, I think this patch is still justified, as it does
address the issue that Aishwarya reported.

Mark.

> 
> Handling unreliable LR values in these cases is likely to require some
> larger rework, so for the moment avoid this problem by restoring the old
> behaviour of skipping the LR at exception boundaries, as we did prior to
> commit:
> 
>   c2c6b27b5aa14fa2 ("arm64: stacktrace: unwind exception boundaries")
> 
> This commit is effectively a partial revert, keeping the structures and
> logic to explicitly identify exception boundaries while still skipping
> reporting of the LR. The logic to explicitly identify exception
> boundaries is still useful for general robustness and as a building
> block for future support for reliably stacktracing.
> 
> Fixes: c2c6b27b5aa14fa2 ("arm64: stacktrace: unwind exception boundaries")
> Signed-off-by: Mark Rutland <mark.rutland@arm.com>
> Reported-by: Aishwarya TCV <aishwarya.tcv@arm.com>
> Reported-by: Kent Overstreet <kent.overstreet@linux.dev>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> ---
>  arch/arm64/kernel/stacktrace.c | 24 ++----------------------
>  1 file changed, 2 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
> index caef85462acb6..4a08ad8158380 100644
> --- a/arch/arm64/kernel/stacktrace.c
> +++ b/arch/arm64/kernel/stacktrace.c
> @@ -26,7 +26,6 @@ enum kunwind_source {
>  	KUNWIND_SOURCE_CALLER,
>  	KUNWIND_SOURCE_TASK,
>  	KUNWIND_SOURCE_REGS_PC,
> -	KUNWIND_SOURCE_REGS_LR,
>  };
>  
>  union unwind_flags {
> @@ -178,23 +177,8 @@ int kunwind_next_regs_pc(struct kunwind_state *state)
>  	state->regs = regs;
>  	state->common.pc = regs->pc;
>  	state->common.fp = regs->regs[29];
> -	state->source = KUNWIND_SOURCE_REGS_PC;
> -	return 0;
> -}
> -
> -static __always_inline int
> -kunwind_next_regs_lr(struct kunwind_state *state)
> -{
> -	/*
> -	 * The stack for the regs was consumed by kunwind_next_regs_pc(), so we
> -	 * cannot consume that again here, but we know the regs are safe to
> -	 * access.
> -	 */
> -	state->common.pc = state->regs->regs[30];
> -	state->common.fp = state->regs->regs[29];
>  	state->regs = NULL;
> -	state->source = KUNWIND_SOURCE_REGS_LR;
> -
> +	state->source = KUNWIND_SOURCE_REGS_PC;
>  	return 0;
>  }
>  
> @@ -274,11 +258,8 @@ kunwind_next(struct kunwind_state *state)
>  	case KUNWIND_SOURCE_FRAME:
>  	case KUNWIND_SOURCE_CALLER:
>  	case KUNWIND_SOURCE_TASK:
> -	case KUNWIND_SOURCE_REGS_LR:
> -		err = kunwind_next_frame_record(state);
> -		break;
>  	case KUNWIND_SOURCE_REGS_PC:
> -		err = kunwind_next_regs_lr(state);
> +		err = kunwind_next_frame_record(state);
>  		break;
>  	default:
>  		err = -EINVAL;
> @@ -436,7 +417,6 @@ static const char *state_source_string(const struct kunwind_state *state)
>  	case KUNWIND_SOURCE_CALLER:	return "C";
>  	case KUNWIND_SOURCE_TASK:	return "T";
>  	case KUNWIND_SOURCE_REGS_PC:	return "P";
> -	case KUNWIND_SOURCE_REGS_LR:	return "L";
>  	default:			return "U";
>  	}
>  }
> -- 
> 2.30.2
>
Mark Rutland Dec. 10, 2024, 1:33 p.m. UTC | #2
Hi Catalin, Will,

Please disregard this patch for now -- there are some related issues
with unwinding distinct tasks, and I will send a new patch/series
shortly which will supersede this.

Mark.

On Mon, Dec 09, 2024 at 11:03:51AM +0000, Mark Rutland wrote:
> Recently the arm64 stacktrace code was modified to report the LR at
> exception boundaries, which interacts poorly with fgraph tracing. It is
> possible for the LR to contain the start address of return_to_handler()
> even when the LR is not live, and in such cases attempts to recover the
> return address via ftrace_graph_ret_addr() may fail, triggering a
> WARN_ON_ONCE() in kunwind_recover_return_address() and aborting the
> unwind. This has resulted in test failures and unexpected warnings, as
> reported by Aishwarya and Kent.
> 
> Handling unreliable LR values in these cases is likely to require some
> larger rework, so for the moment avoid this problem by restoring the old
> behaviour of skipping the LR at exception boundaries, as we did prior to
> commit:
> 
>   c2c6b27b5aa14fa2 ("arm64: stacktrace: unwind exception boundaries")
> 
> This commit is effectively a partial revert, keeping the structures and
> logic to explicitly identify exception boundaries while still skipping
> reporting of the LR. The logic to explicitly identify exception
> boundaries is still useful for general robustness and as a building
> block for future support for reliably stacktracing.
> 
> Fixes: c2c6b27b5aa14fa2 ("arm64: stacktrace: unwind exception boundaries")
> Signed-off-by: Mark Rutland <mark.rutland@arm.com>
> Reported-by: Aishwarya TCV <aishwarya.tcv@arm.com>
> Reported-by: Kent Overstreet <kent.overstreet@linux.dev>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> ---
>  arch/arm64/kernel/stacktrace.c | 24 ++----------------------
>  1 file changed, 2 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
> index caef85462acb6..4a08ad8158380 100644
> --- a/arch/arm64/kernel/stacktrace.c
> +++ b/arch/arm64/kernel/stacktrace.c
> @@ -26,7 +26,6 @@ enum kunwind_source {
>  	KUNWIND_SOURCE_CALLER,
>  	KUNWIND_SOURCE_TASK,
>  	KUNWIND_SOURCE_REGS_PC,
> -	KUNWIND_SOURCE_REGS_LR,
>  };
>  
>  union unwind_flags {
> @@ -178,23 +177,8 @@ int kunwind_next_regs_pc(struct kunwind_state *state)
>  	state->regs = regs;
>  	state->common.pc = regs->pc;
>  	state->common.fp = regs->regs[29];
> -	state->source = KUNWIND_SOURCE_REGS_PC;
> -	return 0;
> -}
> -
> -static __always_inline int
> -kunwind_next_regs_lr(struct kunwind_state *state)
> -{
> -	/*
> -	 * The stack for the regs was consumed by kunwind_next_regs_pc(), so we
> -	 * cannot consume that again here, but we know the regs are safe to
> -	 * access.
> -	 */
> -	state->common.pc = state->regs->regs[30];
> -	state->common.fp = state->regs->regs[29];
>  	state->regs = NULL;
> -	state->source = KUNWIND_SOURCE_REGS_LR;
> -
> +	state->source = KUNWIND_SOURCE_REGS_PC;
>  	return 0;
>  }
>  
> @@ -274,11 +258,8 @@ kunwind_next(struct kunwind_state *state)
>  	case KUNWIND_SOURCE_FRAME:
>  	case KUNWIND_SOURCE_CALLER:
>  	case KUNWIND_SOURCE_TASK:
> -	case KUNWIND_SOURCE_REGS_LR:
> -		err = kunwind_next_frame_record(state);
> -		break;
>  	case KUNWIND_SOURCE_REGS_PC:
> -		err = kunwind_next_regs_lr(state);
> +		err = kunwind_next_frame_record(state);
>  		break;
>  	default:
>  		err = -EINVAL;
> @@ -436,7 +417,6 @@ static const char *state_source_string(const struct kunwind_state *state)
>  	case KUNWIND_SOURCE_CALLER:	return "C";
>  	case KUNWIND_SOURCE_TASK:	return "T";
>  	case KUNWIND_SOURCE_REGS_PC:	return "P";
> -	case KUNWIND_SOURCE_REGS_LR:	return "L";
>  	default:			return "U";
>  	}
>  }
> -- 
> 2.30.2
>
diff mbox series

Patch

diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index caef85462acb6..4a08ad8158380 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -26,7 +26,6 @@  enum kunwind_source {
 	KUNWIND_SOURCE_CALLER,
 	KUNWIND_SOURCE_TASK,
 	KUNWIND_SOURCE_REGS_PC,
-	KUNWIND_SOURCE_REGS_LR,
 };
 
 union unwind_flags {
@@ -178,23 +177,8 @@  int kunwind_next_regs_pc(struct kunwind_state *state)
 	state->regs = regs;
 	state->common.pc = regs->pc;
 	state->common.fp = regs->regs[29];
-	state->source = KUNWIND_SOURCE_REGS_PC;
-	return 0;
-}
-
-static __always_inline int
-kunwind_next_regs_lr(struct kunwind_state *state)
-{
-	/*
-	 * The stack for the regs was consumed by kunwind_next_regs_pc(), so we
-	 * cannot consume that again here, but we know the regs are safe to
-	 * access.
-	 */
-	state->common.pc = state->regs->regs[30];
-	state->common.fp = state->regs->regs[29];
 	state->regs = NULL;
-	state->source = KUNWIND_SOURCE_REGS_LR;
-
+	state->source = KUNWIND_SOURCE_REGS_PC;
 	return 0;
 }
 
@@ -274,11 +258,8 @@  kunwind_next(struct kunwind_state *state)
 	case KUNWIND_SOURCE_FRAME:
 	case KUNWIND_SOURCE_CALLER:
 	case KUNWIND_SOURCE_TASK:
-	case KUNWIND_SOURCE_REGS_LR:
-		err = kunwind_next_frame_record(state);
-		break;
 	case KUNWIND_SOURCE_REGS_PC:
-		err = kunwind_next_regs_lr(state);
+		err = kunwind_next_frame_record(state);
 		break;
 	default:
 		err = -EINVAL;
@@ -436,7 +417,6 @@  static const char *state_source_string(const struct kunwind_state *state)
 	case KUNWIND_SOURCE_CALLER:	return "C";
 	case KUNWIND_SOURCE_TASK:	return "T";
 	case KUNWIND_SOURCE_REGS_PC:	return "P";
-	case KUNWIND_SOURCE_REGS_LR:	return "L";
 	default:			return "U";
 	}
 }