diff mbox

[RFC,10/10] arm64: kernel: add support for virtually mapped stacks

Message ID 20170712144424.19528-11-ard.biesheuvel@linaro.org (mailing list archive)
State New, archived
Headers show

Commit Message

Ard Biesheuvel July 12, 2017, 2:44 p.m. UTC
Add code that checks whether an exception taken from EL1 was caused
by a faulting stack access before proceeding to save the interrupted
context to the stack.

This involves checking whether the faulting address coincides with the
guard page below the task stack of 'current'. This uses tpidrro_el0 and
sp_el0 as scratch registers, so we can free up a couple of general
purpose registers for use in the code that performs this check. If it
turns out we are dealing with a stack overflow, switch to a special
per-CPU overflow stack so we can at least call panic().

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/Kconfig                   |  1 +
 arch/arm64/include/asm/thread_info.h |  2 +
 arch/arm64/kernel/entry.S            | 49 ++++++++++++++++++++
 arch/arm64/mm/fault.c                |  9 ++++
 4 files changed, 61 insertions(+)

Comments

Mark Rutland July 12, 2017, 10:59 p.m. UTC | #1
On Wed, Jul 12, 2017 at 03:44:23PM +0100, Ard Biesheuvel wrote:
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index 2ba3185b1c78..4c3e82d6e2f2 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -392,6 +392,20 @@ ENDPROC(el1_error_invalid)
>   */
>  	.align	6
>  el1_sync:
> +#ifdef CONFIG_VMAP_STACK
> +	/*
> +	 * When taking an exception from EL1, we need to check whether it is
> +	 * caused by a faulting out-of-bounds access to the virtually mapped
> +	 * stack before we can attempt to preserve the interrupted context.
> +	 */
> +	msr	tpidrro_el0, x0			// stash x0
> +	mrs	x0, far_el1			// get faulting address
> +	tbnz	x0, #63, .Lcheck_stack_ovf	// check if not user address

One thing I don't like about this is that we're reading FAR_EL1 even
though we don't know whether it's valid. It could contain junk, and we
could spuriously jump into the slow path. That's liable to make
performance erratic.

Given we have to stash a GPR anyway, we can check whether the SP is
valid in the fast-path, and that should be the only check we require. I
think we can also get rid of the overflow heuristic by checking that the
SP is strictly in the THREAD_SIZE stack area.

[...]

> @@ -234,6 +239,10 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
>  	 */
>  	if (addr >= (u64)current->stack - PAGE_SIZE &&
>  	    addr < (u64)current->stack) {
> +
> +		/* fix up regs->sp, we stashed the faulting value in sp_el0 */
> +		regs->sp = read_sysreg(sp_el0);
> +

This also seems a bit "magic", as we're relying on what the entry asm
will have done in a very specific case.

Thanks,
Mark.
Mark Rutland July 13, 2017, 9:12 a.m. UTC | #2
On Wed, Jul 12, 2017 at 11:59:38PM +0100, Mark Rutland wrote:
> On Wed, Jul 12, 2017 at 03:44:23PM +0100, Ard Biesheuvel wrote:
> > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> > index 2ba3185b1c78..4c3e82d6e2f2 100644
> > --- a/arch/arm64/kernel/entry.S
> > +++ b/arch/arm64/kernel/entry.S
> > @@ -392,6 +392,20 @@ ENDPROC(el1_error_invalid)
> >   */
> >  	.align	6
> >  el1_sync:
> > +#ifdef CONFIG_VMAP_STACK
> > +	/*
> > +	 * When taking an exception from EL1, we need to check whether it is
> > +	 * caused by a faulting out-of-bounds access to the virtually mapped
> > +	 * stack before we can attempt to preserve the interrupted context.
> > +	 */
> > +	msr	tpidrro_el0, x0			// stash x0
> > +	mrs	x0, far_el1			// get faulting address
> > +	tbnz	x0, #63, .Lcheck_stack_ovf	// check if not user address
> 
> One thing I don't like about this is that we're reading FAR_EL1 even
> though we don't know whether it's valid. It could contain junk, and we
> could spuriously jump into the slow path. That's liable to make
> performance erratic.

Once I sent this, I realised what I said wasn't quite right.

It's true that FAR_EL1 is UNKNOWN in some cases, but today those are all
fatal (e.g. if we take an UNDEFINED exception due to a bad instruction
at EL1).

So my performance concern was spurious; sorry for the noise.

However, so as to not mis-report those fatal cases, we either need to
inspect the SP or ESR_EL1 to determine that FAR_EL1 is valid and/or this
was a stack-overflow

If we follow the approach of this series, we can move those checks into
the check_stack_ovf slow path.

Thanks,
Mark.
Dave Martin July 13, 2017, 10:35 a.m. UTC | #3
On Wed, Jul 12, 2017 at 03:44:23PM +0100, Ard Biesheuvel wrote:
> Add code that checks whether an exception taken from EL1 was caused
> by a faulting stack access before proceeding to save the interrupted
> context to the stack.
> 
> This involves checking whether the faulting address coincides with the
> guard page below the task stack of 'current'. This uses tpidrro_el0 and
> sp_el0 as scratch registers, so we can free up a couple of general
> purpose registers for use in the code that performs this check. If it
> turns out we are dealing with a stack overflow, switch to a special
> per-CPU overflow stack so we can at least call panic().
> 
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> ---
>  arch/arm64/Kconfig                   |  1 +
>  arch/arm64/include/asm/thread_info.h |  2 +
>  arch/arm64/kernel/entry.S            | 49 ++++++++++++++++++++
>  arch/arm64/mm/fault.c                |  9 ++++
>  4 files changed, 61 insertions(+)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index b52db8bb1270..50caf63099c8 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -73,6 +73,7 @@ config ARM64
>  	select HAVE_ARCH_SECCOMP_FILTER
>  	select HAVE_ARCH_TRACEHOOK
>  	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
> +	select HAVE_ARCH_VMAP_STACK
>  	select HAVE_ARM_SMCCC
>  	select HAVE_EBPF_JIT
>  	select HAVE_C_RECORDMCOUNT
> diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
> index 46c3b93cf865..1c3e0a3bf87a 100644
> --- a/arch/arm64/include/asm/thread_info.h
> +++ b/arch/arm64/include/asm/thread_info.h
> @@ -32,6 +32,8 @@
>  #define THREAD_SIZE		16384
>  #define THREAD_START_SP		(THREAD_SIZE - 16)
>  
> +#define OVERFLOW_STACK_SIZE	1024
> +

How was this number chosen?

Kernel stack overflow is not going to get routinely tested -- after all
it should only get exercised when things go horribly wrong.

So, if the panic code (or compiler-generated stack frames) become
bloated enough, we will silently overrun this limit.

Maybe it doesn't matter if we trash a load of unrelated percpu data when
we panic.


>  #ifndef __ASSEMBLY__
>  
>  struct task_struct;
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index 2ba3185b1c78..4c3e82d6e2f2 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -392,6 +392,20 @@ ENDPROC(el1_error_invalid)
>   */
>  	.align	6
>  el1_sync:
> +#ifdef CONFIG_VMAP_STACK
> +	/*
> +	 * When taking an exception from EL1, we need to check whether it is
> +	 * caused by a faulting out-of-bounds access to the virtually mapped
> +	 * stack before we can attempt to preserve the interrupted context.
> +	 */
> +	msr	tpidrro_el0, x0			// stash x0
> +	mrs	x0, far_el1			// get faulting address
> +	tbnz	x0, #63, .Lcheck_stack_ovf	// check if not user address
> +
> +.Lcheck_stack_resume:
> +	mrs	x0, tpidrro_el0			// restore x0
> +#endif
> +

Unless I'm missing something, this leaves tpidrr0_el0 trashed (and
containing kernel data that userspace can subsequently read).

This could be solved by moving the tpidrro_el0 load out of context
switch and into ret_to_user.

[...]

Cheers
---Dave
diff mbox

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b52db8bb1270..50caf63099c8 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -73,6 +73,7 @@  config ARM64
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+	select HAVE_ARCH_VMAP_STACK
 	select HAVE_ARM_SMCCC
 	select HAVE_EBPF_JIT
 	select HAVE_C_RECORDMCOUNT
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 46c3b93cf865..1c3e0a3bf87a 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -32,6 +32,8 @@ 
 #define THREAD_SIZE		16384
 #define THREAD_START_SP		(THREAD_SIZE - 16)
 
+#define OVERFLOW_STACK_SIZE	1024
+
 #ifndef __ASSEMBLY__
 
 struct task_struct;
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 2ba3185b1c78..4c3e82d6e2f2 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -392,6 +392,20 @@  ENDPROC(el1_error_invalid)
  */
 	.align	6
 el1_sync:
+#ifdef CONFIG_VMAP_STACK
+	/*
+	 * When taking an exception from EL1, we need to check whether it is
+	 * caused by a faulting out-of-bounds access to the virtually mapped
+	 * stack before we can attempt to preserve the interrupted context.
+	 */
+	msr	tpidrro_el0, x0			// stash x0
+	mrs	x0, far_el1			// get faulting address
+	tbnz	x0, #63, .Lcheck_stack_ovf	// check if not user address
+
+.Lcheck_stack_resume:
+	mrs	x0, tpidrro_el0			// restore x0
+#endif
+
 	kernel_entry 1
 	mrs	x1, esr_el1			// read the syndrome register
 	lsr	x24, x1, #ESR_ELx_EC_SHIFT	// exception class
@@ -411,6 +425,41 @@  el1_sync:
 	b.ge	el1_dbg
 	b	el1_inv
 
+#ifdef CONFIG_VMAP_STACK
+.Lcheck_stack_ovf:
+	/*
+	 * Check if the faulting address is above PAGE_OFFSET, which rules out
+	 * the vmapped stacks living in the VMALLOC region.
+	 */
+	tbnz	x0, #(VA_BITS - 2), .Lcheck_stack_resume
+
+	/*
+	 * Check whether the faulting address hit a guard page below our
+	 * virtually mapped stack. This is a strong hint that we may be
+	 * dealing with a stack overflow.
+	 */
+	msr	sp_el0, x1			// stash x1
+	ldr	x1, [tsk, #TSK_STACK]		// get task's stack base
+	sub	x1, x1, x0			// subtract FAR from stack base
+	tst	x1, #~(PAGE_SIZE - 1)		// disregard bits within page
+	mrs	x1, sp_el0			// restore x1
+	b.ne	.Lcheck_stack_resume		// proceed if no stack overflow
+
+	/*
+	 * We are not going to recover from a stack overflow in kernel mode,
+	 * but we would like to report this condition to the user, which means
+	 * we need another stack.
+	 */
+	mov	x0, sp
+	msr	sp_el0, x0			// stash the faulting sp
+
+	adr_l	x0, overflow_stack + OVERFLOW_STACK_SIZE
+	sub	sp, x0, #S_FRAME_SIZE
+	mrs	x0, tpidr_el1
+	add	sp, sp, x0
+	b	.Lcheck_stack_resume
+#endif
+
 el1_ia:
 	/*
 	 * Fall through to the Data abort case
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index b3317e5ff5dd..9ecd47572656 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -45,6 +45,11 @@ 
 
 #include <acpi/ghes.h>
 
+#ifdef CONFIG_VMAP_STACK
+DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack)
+	 __aligned(16);
+#endif
+
 struct fault_info {
 	int	(*fn)(unsigned long addr, unsigned int esr,
 		      struct pt_regs *regs);
@@ -234,6 +239,10 @@  static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 	 */
 	if (addr >= (u64)current->stack - PAGE_SIZE &&
 	    addr < (u64)current->stack) {
+
+		/* fix up regs->sp, we stashed the faulting value in sp_el0 */
+		regs->sp = read_sysreg(sp_el0);
+
 		printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
 			 (void *)addr, current->stack,
 			 (char *)current->stack + THREAD_SIZE - 1);