
[v11,01/12] add support for Clang's Shadow Call Stack (SCS)

Message ID 20200416161245.148813-2-samitolvanen@google.com (mailing list archive)
State New, archived
Series add support for Clang's Shadow Call Stack

Commit Message

Sami Tolvanen April 16, 2020, 4:12 p.m. UTC
This change adds generic support for Clang's Shadow Call Stack,
which uses a shadow stack to protect return addresses from being
overwritten by an attacker. Details are available here:

  https://clang.llvm.org/docs/ShadowCallStack.html

Note that security guarantees in the kernel differ from the
ones documented for user space. The kernel must store addresses
of shadow stacks used by other tasks and interrupt handlers in
memory, which means an attacker capable of reading and writing
arbitrary memory may be able to locate them and hijack control
flow by modifying shadow stacks that are not currently in use.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Miguel Ojeda <miguel.ojeda.sandonis@gmail.com>
---
 Makefile                       |   6 ++
 arch/Kconfig                   |  34 ++++++
 include/linux/compiler-clang.h |   6 ++
 include/linux/compiler_types.h |   4 +
 include/linux/scs.h            |  57 ++++++++++
 init/init_task.c               |   8 ++
 kernel/Makefile                |   1 +
 kernel/fork.c                  |   9 ++
 kernel/sched/core.c            |   2 +
 kernel/scs.c                   | 187 +++++++++++++++++++++++++++++++++
 10 files changed, 314 insertions(+)
 create mode 100644 include/linux/scs.h
 create mode 100644 kernel/scs.c

Comments

Will Deacon April 20, 2020, 5:17 p.m. UTC | #1
Hi Sami,

Comments inline.

On Thu, Apr 16, 2020 at 09:12:34AM -0700, Sami Tolvanen wrote:
> This change adds generic support for Clang's Shadow Call Stack,
> which uses a shadow stack to protect return addresses from being
> overwritten by an attacker. Details are available here:
> 
>   https://clang.llvm.org/docs/ShadowCallStack.html
> 
> Note that security guarantees in the kernel differ from the
> ones documented for user space. The kernel must store addresses
> of shadow stacks used by other tasks and interrupt handlers in
> memory, which means an attacker capable of reading and writing
> arbitrary memory may be able to locate them and hijack control
> flow by modifying shadow stacks that are not currently in use.
> 
> Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
> Reviewed-by: Kees Cook <keescook@chromium.org>
> Reviewed-by: Miguel Ojeda <miguel.ojeda.sandonis@gmail.com>
> ---
>  Makefile                       |   6 ++
>  arch/Kconfig                   |  34 ++++++
>  include/linux/compiler-clang.h |   6 ++
>  include/linux/compiler_types.h |   4 +
>  include/linux/scs.h            |  57 ++++++++++
>  init/init_task.c               |   8 ++
>  kernel/Makefile                |   1 +
>  kernel/fork.c                  |   9 ++
>  kernel/sched/core.c            |   2 +
>  kernel/scs.c                   | 187 +++++++++++++++++++++++++++++++++
>  10 files changed, 314 insertions(+)
>  create mode 100644 include/linux/scs.h
>  create mode 100644 kernel/scs.c
> 
> diff --git a/Makefile b/Makefile
> index 70def4907036..baea6024b409 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -866,6 +866,12 @@ ifdef CONFIG_LIVEPATCH
>  KBUILD_CFLAGS += $(call cc-option, -flive-patching=inline-clone)
>  endif
>  
> +ifdef CONFIG_SHADOW_CALL_STACK
> +CC_FLAGS_SCS	:= -fsanitize=shadow-call-stack
> +KBUILD_CFLAGS	+= $(CC_FLAGS_SCS)
> +export CC_FLAGS_SCS
> +endif

CFLAGS_SCS would seem more natural to me, although I see ftrace does it this
way.

>  # arch Makefile may override CC so keep this after arch Makefile is included
>  NOSTDINC_FLAGS += -nostdinc -isystem $(shell $(CC) -print-file-name=include)
>  
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 786a85d4ad40..691a552c2cc3 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -533,6 +533,40 @@ config STACKPROTECTOR_STRONG
>  	  about 20% of all kernel functions, which increases the kernel code
>  	  size by about 2%.
>  
> +config ARCH_SUPPORTS_SHADOW_CALL_STACK
> +	bool
> +	help
> +	  An architecture should select this if it supports Clang's Shadow
> +	  Call Stack, has asm/scs.h, and implements runtime support for shadow
> +	  stack switching.
> +
> +config SHADOW_CALL_STACK
> +	bool "Clang Shadow Call Stack"
> +	depends on ARCH_SUPPORTS_SHADOW_CALL_STACK
> +	help
> +	  This option enables Clang's Shadow Call Stack, which uses a
> +	  shadow stack to protect function return addresses from being
> +	  overwritten by an attacker. More information can be found in
> +	  Clang's documentation:
> +
> +	    https://clang.llvm.org/docs/ShadowCallStack.html
> +
> +	  Note that security guarantees in the kernel differ from the ones
> +	  documented for user space. The kernel must store addresses of shadow
> +	  stacks used by other tasks and interrupt handlers in memory, which
> +	  means an attacker capable of reading and writing arbitrary memory
> +	  may be able to locate them and hijack control flow by modifying
> +	  shadow stacks that are not currently in use.

Shouldn't some of this depend on CC_IS_CLANG?

> +config SHADOW_CALL_STACK_VMAP
> +	bool "Use virtually mapped shadow call stacks"
> +	depends on SHADOW_CALL_STACK
> +	help
> +	  Use virtually mapped shadow call stacks. Selecting this option
> +	  provides better stack exhaustion protection, but increases per-thread
> +	  memory consumption as a full page is allocated for each shadow stack.

Given that this feature applies only to arm64 kernels built with clang, it
feels weird to further segment that userbase with another config option.
Does Android enable SHADOW_CALL_STACK_VMAP? If not, maybe we should ditch
it for now and add it when we have a user.

> diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
> index e970f97a7fcb..97b62f47a80d 100644
> --- a/include/linux/compiler_types.h
> +++ b/include/linux/compiler_types.h
> @@ -193,6 +193,10 @@ struct ftrace_likely_data {
>  # define randomized_struct_fields_end
>  #endif
>  
> +#ifndef __noscs
> +# define __noscs
> +#endif
> +
>  #ifndef asm_volatile_goto
>  #define asm_volatile_goto(x...) asm goto(x)
>  #endif
> diff --git a/include/linux/scs.h b/include/linux/scs.h
> new file mode 100644
> index 000000000000..c5572fd770b0
> --- /dev/null
> +++ b/include/linux/scs.h
> @@ -0,0 +1,57 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Shadow Call Stack support.
> + *
> + * Copyright (C) 2019 Google LLC
> + */
> +
> +#ifndef _LINUX_SCS_H
> +#define _LINUX_SCS_H
> +
> +#include <linux/gfp.h>
> +#include <linux/sched.h>
> +#include <asm/page.h>
> +
> +#ifdef CONFIG_SHADOW_CALL_STACK
> +
> +/*
> + * In testing, 1 KiB shadow stack size (i.e. 128 stack frames on a 64-bit
> + * architecture) provided ~40% safety margin on stack usage while keeping
> + * memory allocation overhead reasonable.
> + */
> +#define SCS_SIZE	1024UL
> +#define GFP_SCS		(GFP_KERNEL | __GFP_ZERO)
> +
> +/*
> + * A random number outside the kernel's virtual address space to mark the
> + * end of the shadow stack.
> + */
> +#define SCS_END_MAGIC	0xaf0194819b1635f6UL

This seems like it might be arm64-specific. Why not choose something based
off CONFIG_ILLEGAL_POINTER_VALUE (see linux/poison.h)?

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 3a61a3b8eaa9..c99620c1ec20 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -11,6 +11,7 @@
>  #include <linux/nospec.h>
>  
>  #include <linux/kcov.h>
> +#include <linux/scs.h>
>  
>  #include <asm/switch_to.h>
>  #include <asm/tlb.h>
> @@ -6045,6 +6046,7 @@ void init_idle(struct task_struct *idle, int cpu)
>  	idle->se.exec_start = sched_clock();
>  	idle->flags |= PF_IDLE;
>  
> +	scs_task_reset(idle);
>  	kasan_unpoison_task_stack(idle);
>  
>  #ifdef CONFIG_SMP
> diff --git a/kernel/scs.c b/kernel/scs.c
> new file mode 100644
> index 000000000000..28abed21950c
> --- /dev/null
> +++ b/kernel/scs.c
> @@ -0,0 +1,187 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Shadow Call Stack support.
> + *
> + * Copyright (C) 2019 Google LLC
> + */
> +
> +#include <linux/cpuhotplug.h>
> +#include <linux/kasan.h>
> +#include <linux/mm.h>
> +#include <linux/mmzone.h>
> +#include <linux/scs.h>
> +#include <linux/slab.h>
> +#include <linux/vmalloc.h>
> +#include <asm/scs.h>
> +
> +static inline void *__scs_base(struct task_struct *tsk)

Please avoid using 'inline' in C files unless there's a good reason not
to let the compiler figure it out.

> +{
> +	/*
> +	 * To minimize risk the of exposure, architectures may clear a

Should be "the risk of exposure".

> +	 * task's thread_info::shadow_call_stack while that task is
> +	 * running, and only save/restore the active shadow call stack
> +	 * pointer when the usual register may be clobbered (e.g. across
> +	 * context switches).
> +	 *
> +	 * The shadow call stack is aligned to SCS_SIZE, and grows
> +	 * upwards, so we can mask out the low bits to extract the base
> +	 * when the task is not running.
> +	 */
> +	return (void *)((unsigned long)task_scs(tsk) & ~(SCS_SIZE - 1));

Could we avoid forcing this alignment if we stored the SCS pointer as a
(base,offset) pair instead? That might be friendlier on the allocations
later on.

> +}
> +
> +static inline unsigned long *scs_magic(void *s)
> +{
> +	return (unsigned long *)(s + SCS_SIZE) - 1;
> +}
> +
> +static inline void scs_set_magic(void *s)
> +{
> +	*scs_magic(s) = SCS_END_MAGIC;

You added task_set_scs() for this sort of thing, so I'm not convinced you
need this extra helper.

> +}
> +
> +#ifdef CONFIG_SHADOW_CALL_STACK_VMAP
> +
> +/* Matches NR_CACHED_STACKS for VMAP_STACK */
> +#define NR_CACHED_SCS 2

Then they should probably both be derived from the same thing?

> +static DEFINE_PER_CPU(void *, scs_cache[NR_CACHED_SCS]);
> +
> +static void *scs_alloc(int node)
> +{
> +	int i;
> +	void *s;
> +
> +	for (i = 0; i < NR_CACHED_SCS; i++) {
> +		s = this_cpu_xchg(scs_cache[i], NULL);

Might be worth a comment about the re-entrancy here.
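
Something along these lines, perhaps (the wording is only a guess at
the rationale, adjust as needed):

	/*
	 * scs_alloc() can be called with preemption enabled, so the
	 * per-CPU cache is accessed with this_cpu_xchg(), which is
	 * safe against both preemption and interrupts on the local
	 * CPU without taking a lock.
	 */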

> +		if (s) {
> +			memset(s, 0, SCS_SIZE);
> +			goto out;
> +		}
> +	}
> +
> +	/*
> +	 * We allocate a full page for the shadow stack, which should be
> +	 * more than we need. Check the assumption nevertheless.
> +	 */
> +	BUILD_BUG_ON(SCS_SIZE > PAGE_SIZE);
> +
> +	s = __vmalloc_node_range(PAGE_SIZE, SCS_SIZE,
> +				 VMALLOC_START, VMALLOC_END,
> +				 GFP_SCS, PAGE_KERNEL, 0,
> +				 node, __builtin_return_address(0));
> +
> +out:
> +	if (s)
> +		scs_set_magic(s);
> +	/* TODO: poison for KASAN, unpoison in scs_free */

We don't usually commit these. What's missing?

> +
> +	return s;
> +}
> +
> +static void scs_free(void *s)
> +{
> +	int i;
> +
> +	for (i = 0; i < NR_CACHED_SCS; i++)
> +		if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL)
> +			return;
> +
> +	vfree_atomic(s);
> +}
> +
> +static int scs_cleanup(unsigned int cpu)
> +{
> +	int i;
> +	void **cache = per_cpu_ptr(scs_cache, cpu);
> +
> +	for (i = 0; i < NR_CACHED_SCS; i++) {
> +		vfree(cache[i]);
> +		cache[i] = NULL;
> +	}

Hmm, can this run concurrently with another CPU doing a stack allocation
with this_cpu_cmpxchg()? It probably works out on arm64 thanks to the use
of atomics, but we shouldn't be relying on that in core code.

> +
> +	return 0;
> +}
> +
> +void __init scs_init(void)
> +{
> +	WARN_ON(cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "scs:scs_cache", NULL,
> +			scs_cleanup) < 0);
> +}
> +
> +#else /* !CONFIG_SHADOW_CALL_STACK_VMAP */
> +
> +static struct kmem_cache *scs_cache;
> +
> +static inline void *scs_alloc(int node)
> +{
> +	void *s;
> +
> +	s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node);
> +	if (s) {
> +		scs_set_magic(s);
> +		/*
> +		 * Poison the allocation to catch unintentional accesses to
> +		 * the shadow stack when KASAN is enabled.
> +		 */
> +		kasan_poison_object_data(scs_cache, s);
> +	}
> +
> +	return s;
> +}
> +
> +static inline void scs_free(void *s)
> +{
> +	kasan_unpoison_object_data(scs_cache, s);
> +	kmem_cache_free(scs_cache, s);
> +}
> +
> +void __init scs_init(void)
> +{
> +	scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, SCS_SIZE,
> +				0, NULL);
> +	WARN_ON(!scs_cache);

Memory allocation failure should be noisy enough without this.

> +}
> +
> +#endif /* CONFIG_SHADOW_CALL_STACK_VMAP */
> +
> +void scs_task_reset(struct task_struct *tsk)
> +{
> +	/*
> +	 * Reset the shadow stack to the base address in case the task
> +	 * is reused.
> +	 */
> +	task_set_scs(tsk, __scs_base(tsk));
> +}

Why isn't this in the header?

> +
> +int scs_prepare(struct task_struct *tsk, int node)
> +{
> +	void *s;
> +
> +	s = scs_alloc(node);
> +	if (!s)
> +		return -ENOMEM;
> +
> +	task_set_scs(tsk, s);
> +	return 0;
> +}
> +
> +bool scs_corrupted(struct task_struct *tsk)
> +{
> +	unsigned long *magic = scs_magic(__scs_base(tsk));
> +
> +	return READ_ONCE_NOCHECK(*magic) != SCS_END_MAGIC;
> +}

Same here.

> +
> +void scs_release(struct task_struct *tsk)
> +{
> +	void *s;
> +
> +	s = __scs_base(tsk);
> +	if (!s)
> +		return;
> +
> +	WARN_ON(scs_corrupted(tsk));
> +
> +	task_set_scs(tsk, NULL);

Aren't we about to free the task here? What does clearing the scs pointer
achieve?

Will
Sami Tolvanen April 20, 2020, 9:18 p.m. UTC | #2
On Mon, Apr 20, 2020 at 06:17:28PM +0100, Will Deacon wrote:
> > +ifdef CONFIG_SHADOW_CALL_STACK
> > +CC_FLAGS_SCS	:= -fsanitize=shadow-call-stack
> > +KBUILD_CFLAGS	+= $(CC_FLAGS_SCS)
> > +export CC_FLAGS_SCS
> > +endif
> 
> CFLAGS_SCS would seem more natural to me, although I see ftrace does it this
> way.

Right, I followed ftrace's example here.

> > +config SHADOW_CALL_STACK
> > +	bool "Clang Shadow Call Stack"
> > +	depends on ARCH_SUPPORTS_SHADOW_CALL_STACK
> > +	help
> > +	  This option enables Clang's Shadow Call Stack, which uses a
> > +	  shadow stack to protect function return addresses from being
> > +	  overwritten by an attacker. More information can be found in
> > +	  Clang's documentation:
> > +
> > +	    https://clang.llvm.org/docs/ShadowCallStack.html
> > +
> > +	  Note that security guarantees in the kernel differ from the ones
> > +	  documented for user space. The kernel must store addresses of shadow
> > +	  stacks used by other tasks and interrupt handlers in memory, which
> > +	  means an attacker capable of reading and writing arbitrary memory
> > +	  may be able to locate them and hijack control flow by modifying
> > +	  shadow stacks that are not currently in use.
> 
> Shouldn't some of this depend on CC_IS_CLANG?

Sure, I'll add CC_IS_CLANG here in the next version. Note that we do
check for compiler support before selecting ARCH_SUPPORTS_SHADOW_CALL_STACK.
The flags are architecture-specific, so the check is done in the arch Kconfig.

> > +config SHADOW_CALL_STACK_VMAP
> > +	bool "Use virtually mapped shadow call stacks"
> > +	depends on SHADOW_CALL_STACK
> > +	help
> > +	  Use virtually mapped shadow call stacks. Selecting this option
> > +	  provides better stack exhaustion protection, but increases per-thread
> > +	  memory consumption as a full page is allocated for each shadow stack.
> 
> Given that this feature applies only to arm64 kernels built with clang, it
> feels weird to further segment that userbase with another config option.
> Does Android enable SHADOW_CALL_STACK_VMAP? If not, maybe we should ditch
> it for now and add it when we have a user.

Android doesn't enable the VMAP option right now due to increased memory
overhead. I'll drop it from v12.

> > +/*
> > + * A random number outside the kernel's virtual address space to mark the
> > + * end of the shadow stack.
> > + */
> > +#define SCS_END_MAGIC	0xaf0194819b1635f6UL
> 
> This seems like it might be arm64-specific. Why not choose something based
> off CONFIG_ILLEGAL_POINTER_VALUE (see linux/poison.h)?

Sure, I'll use POISON_POINTER_DELTA here.
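
Roughly along these lines, I think (sketch only, the exact constant is
still to be decided):

	#include <linux/poison.h>

	/*
	 * An offset from POISON_POINTER_DELTA keeps the marker outside
	 * the kernel's virtual address space on all architectures.
	 */
	#define SCS_END_MAGIC	(0x5f6UL + POISON_POINTER_DELTA)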

> > +static inline void *__scs_base(struct task_struct *tsk)
> 
> Please avoid using 'inline' in C files unless there's a good reason not
> to let the compiler figure it out.

Ack.

> > +{
> > +	/*
> > +	 * To minimize risk the of exposure, architectures may clear a
> 
> Should be "the risk of exposure".

Thanks.

> > +	 * The shadow call stack is aligned to SCS_SIZE, and grows
> > +	 * upwards, so we can mask out the low bits to extract the base
> > +	 * when the task is not running.
> > +	 */
> > +	return (void *)((unsigned long)task_scs(tsk) & ~(SCS_SIZE - 1));
> 
> Could we avoid forcing this alignment if we stored the SCS pointer as a
> (base,offset) pair instead? That might be friendlier on the allocations
> later on.

The idea is to avoid storing the current task's shadow stack address in
memory, which is why I would rather not store the base address either.

> > +static inline void scs_set_magic(void *s)
> > +{
> > +	*scs_magic(s) = SCS_END_MAGIC;
> 
> You added task_set_scs() for this sort of thing, so I'm not convinced you
> need this extra helper.

Agreed, I'll drop this.

> > +	if (s)
> > +		scs_set_magic(s);
> > +	/* TODO: poison for KASAN, unpoison in scs_free */
> 
> We don't usually commit these. What's missing?

At the time, KASAN didn't support poisoning vmalloc'ed memory, but looks
like that was fixed a while back.
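
With CONFIG_KASAN_VMALLOC, the TODO could presumably be replaced with
something like this (untested sketch, mirroring the kmem_cache path):

	/* in scs_alloc(), once the allocation has succeeded */
	kasan_poison_vmalloc(s, SCS_SIZE);

	/* in scs_free(), before the stack is cached or freed */
	kasan_unpoison_vmalloc(s, SCS_SIZE);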

> > +static int scs_cleanup(unsigned int cpu)
> > +{
> > +	int i;
> > +	void **cache = per_cpu_ptr(scs_cache, cpu);
> > +
> > +	for (i = 0; i < NR_CACHED_SCS; i++) {
> > +		vfree(cache[i]);
> > +		cache[i] = NULL;
> > +	}
> 
> Hmm, can this run concurrently with another CPU doing a stack allocation
> with this_cpu_cmpxchg()? It probably works out on arm64 thanks to the use
> of atomics, but we shouldn't be relying on that in core code.

This is essentially identical to the code in kernel/fork.c. Anyway, all
of this code goes away with the VMAP option.

> > +void __init scs_init(void)
> > +{
> > +	scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, SCS_SIZE,
> > +				0, NULL);
> > +	WARN_ON(!scs_cache);
> 
> Memory allocation failure should be noisy enough without this.

Sure, I'll remove the warning.

> > +void scs_task_reset(struct task_struct *tsk)
> > +{
> > +	/*
> > +	 * Reset the shadow stack to the base address in case the task
> > +	 * is reused.
> > +	 */
> > +	task_set_scs(tsk, __scs_base(tsk));
> > +}
> 
> Why isn't this in the header?

> > +bool scs_corrupted(struct task_struct *tsk)
> > +{
> > +	unsigned long *magic = scs_magic(__scs_base(tsk));
> > +
> > +	return READ_ONCE_NOCHECK(*magic) != SCS_END_MAGIC;
> > +}
> 
> Same here.

I'll move both to the header file.

> > +void scs_release(struct task_struct *tsk)
> > +{
> > +	void *s;
> > +
> > +	s = __scs_base(tsk);
> > +	if (!s)
> > +		return;
> > +
> > +	WARN_ON(scs_corrupted(tsk));
> > +
> > +	task_set_scs(tsk, NULL);
> 
> Aren't we about to free the task here? What does clearing the scs pointer
> achieve?

True, it doesn't achieve much, only leaves one fewer shadow stack pointer
in memory. I'll drop this from the next version.

Sami
Steven Rostedt April 21, 2020, 1:12 a.m. UTC | #3
On Mon, 20 Apr 2020 18:17:28 +0100
Will Deacon <will@kernel.org> wrote:

> > +ifdef CONFIG_SHADOW_CALL_STACK
> > +CC_FLAGS_SCS	:= -fsanitize=shadow-call-stack
> > +KBUILD_CFLAGS	+= $(CC_FLAGS_SCS)
> > +export CC_FLAGS_SCS
> > +endif  
> 
> CFLAGS_SCS would seem more natural to me, although I see ftrace does it this
> way.

The CC_FLAGS_FTRACE was added by Heiko Carstens, and the "CC_FLAGS_"
appears to be a common usage in s390 :-)

That said, I like the CC_FLAGS_ notation, because the Linux build
system uses CFLAGS_* as commands:

  CFLAGS_foo.o = x
  CFLAGS_REMOVE_foo.o = y

And "CC_FLAGS_" is only for new flags and easy to search for.

-- Steve
Will Deacon April 22, 2020, 5:39 p.m. UTC | #4
On Mon, Apr 20, 2020 at 02:18:30PM -0700, Sami Tolvanen wrote:
> On Mon, Apr 20, 2020 at 06:17:28PM +0100, Will Deacon wrote:
> > > +	 * The shadow call stack is aligned to SCS_SIZE, and grows
> > > +	 * upwards, so we can mask out the low bits to extract the base
> > > +	 * when the task is not running.
> > > +	 */
> > > +	return (void *)((unsigned long)task_scs(tsk) & ~(SCS_SIZE - 1));
> > 
> > Could we avoid forcing this alignment if we stored the SCS pointer as a
> > (base,offset) pair instead? That might be friendlier on the allocations
> > later on.
> 
> The idea is to avoid storing the current task's shadow stack address in
> memory, which is why I would rather not store the base address either.

What I mean is that, instead of storing the current shadow stack pointer,
we instead store a base and an offset. We can still clear the base, as you
do with the pointer today, and I don't see that the offset is useful to
an attacker on its own.

But more generally, is it really worthwhile to do this clearing at all? Can
you (or Kees?) provide some justification for it, please? We don't do it
for anything else, e.g. the pointer authentication keys, so something
feels amiss here.

Thanks,

Will
Kees Cook April 22, 2020, 5:51 p.m. UTC | #5
On Wed, Apr 22, 2020 at 06:39:47PM +0100, Will Deacon wrote:
> On Mon, Apr 20, 2020 at 02:18:30PM -0700, Sami Tolvanen wrote:
> > On Mon, Apr 20, 2020 at 06:17:28PM +0100, Will Deacon wrote:
> > > > +	 * The shadow call stack is aligned to SCS_SIZE, and grows
> > > > +	 * upwards, so we can mask out the low bits to extract the base
> > > > +	 * when the task is not running.
> > > > +	 */
> > > > +	return (void *)((unsigned long)task_scs(tsk) & ~(SCS_SIZE - 1));
> > > 
> > > Could we avoid forcing this alignment if we stored the SCS pointer as a
> > > (base,offset) pair instead? That might be friendlier on the allocations
> > > later on.
> > 
> > The idea is to avoid storing the current task's shadow stack address in
> > memory, which is why I would rather not store the base address either.
> 
> What I mean is that, instead of storing the current shadow stack pointer,
> we instead store a base and an offset. We can still clear the base, as you
> do with the pointer today, and I don't see that the offset is useful to
> an attacker on its own.
> 
> But more generally, is it really worthwhile to do this clearing at all? Can
> you (or Kees?) provide some justification for it, please? We don't do it
> for anything else, e.g. the pointer authentication keys, so something
> feels amiss here.

It's a hardening step to just reduce the lifetime of a valid address
exposed in memory. In fact, since there is a cache, I think it should be
wiped even in scs_release().
Will Deacon April 22, 2020, 6:01 p.m. UTC | #6
On Wed, Apr 22, 2020 at 10:51:02AM -0700, Kees Cook wrote:
> On Wed, Apr 22, 2020 at 06:39:47PM +0100, Will Deacon wrote:
> > On Mon, Apr 20, 2020 at 02:18:30PM -0700, Sami Tolvanen wrote:
> > > On Mon, Apr 20, 2020 at 06:17:28PM +0100, Will Deacon wrote:
> > > > > +	 * The shadow call stack is aligned to SCS_SIZE, and grows
> > > > > +	 * upwards, so we can mask out the low bits to extract the base
> > > > > +	 * when the task is not running.
> > > > > +	 */
> > > > > +	return (void *)((unsigned long)task_scs(tsk) & ~(SCS_SIZE - 1));
> > > > 
> > > > Could we avoid forcing this alignment if we stored the SCS pointer as a
> > > > (base,offset) pair instead? That might be friendlier on the allocations
> > > > later on.
> > > 
> > > The idea is to avoid storing the current task's shadow stack address in
> > > memory, which is why I would rather not store the base address either.
> > 
> > What I mean is that, instead of storing the current shadow stack pointer,
> > we instead store a base and an offset. We can still clear the base, as you
> > do with the pointer today, and I don't see that the offset is useful to
> > an attacker on its own.
> > 
> > But more generally, is it really worthwhile to do this clearing at all? Can
> > you (or Kees?) provide some justification for it, please? We don't do it
> > for anything else, e.g. the pointer authentication keys, so something
> > feels amiss here.
> 
> It's a hardening step to just reduce the lifetime of a valid address
> exposed in memory. In fact, since there is a cache, I think it should be
> wiped even in scs_release().

But we don't do this for /anything/ else and it forces alignment
restrictions on the SCS allocation. Please either do it consistently, or
not at all.

Will
Sami Tolvanen April 22, 2020, 11:51 p.m. UTC | #7
On Wed, Apr 22, 2020 at 06:39:47PM +0100, Will Deacon wrote:
> On Mon, Apr 20, 2020 at 02:18:30PM -0700, Sami Tolvanen wrote:
> > On Mon, Apr 20, 2020 at 06:17:28PM +0100, Will Deacon wrote:
> > > > +	 * The shadow call stack is aligned to SCS_SIZE, and grows
> > > > +	 * upwards, so we can mask out the low bits to extract the base
> > > > +	 * when the task is not running.
> > > > +	 */
> > > > +	return (void *)((unsigned long)task_scs(tsk) & ~(SCS_SIZE - 1));
> > > 
> > > Could we avoid forcing this alignment if we stored the SCS pointer as a
> > > (base,offset) pair instead? That might be friendlier on the allocations
> > > later on.
> > 
> > The idea is to avoid storing the current task's shadow stack address in
> > memory, which is why I would rather not store the base address either.
> 
> What I mean is that, instead of storing the current shadow stack pointer,
> we instead store a base and an offset. We can still clear the base, as you
> do with the pointer today, and I don't see that the offset is useful to
> an attacker on its own.

I see what you mean. However, even if we store the base address +
the offset, we still need aligned allocation if we want to clear
the address. This would basically just move __scs_base() logic to
cpu_switch_to() / scs_save().

> But more generally, is it really worthwhile to do this clearing at all? Can
> you (or Kees?) provide some justification for it, please? We don't do it
> for anything else, e.g. the pointer authentication keys, so something
> feels amiss here.

Like Kees pointed out, this makes it slightly harder to locate the
current task's shadow stack pointer. I realize there are other useful
targets in thread_info, but we would rather not make this any easier
than necessary. Is your primary concern here the cost of doing this,
or just that it doesn't sufficiently improve security?

Sami
Kees Cook April 23, 2020, 6:28 p.m. UTC | #8
On Wed, Apr 22, 2020 at 04:51:34PM -0700, Sami Tolvanen wrote:
> On Wed, Apr 22, 2020 at 06:39:47PM +0100, Will Deacon wrote:
> > On Mon, Apr 20, 2020 at 02:18:30PM -0700, Sami Tolvanen wrote:
> > > On Mon, Apr 20, 2020 at 06:17:28PM +0100, Will Deacon wrote:
> > > > > +	 * The shadow call stack is aligned to SCS_SIZE, and grows
> > > > > +	 * upwards, so we can mask out the low bits to extract the base
> > > > > +	 * when the task is not running.
> > > > > +	 */
> > > > > +	return (void *)((unsigned long)task_scs(tsk) & ~(SCS_SIZE - 1));
> > > > 
> > > > Could we avoid forcing this alignment if we stored the SCS pointer as a
> > > > (base,offset) pair instead? That might be friendlier on the allocations
> > > > later on.
> > > 
> > > The idea is to avoid storing the current task's shadow stack address in
> > > memory, which is why I would rather not store the base address either.
> > 
> > What I mean is that, instead of storing the current shadow stack pointer,
> > we instead store a base and an offset. We can still clear the base, as you
> > do with the pointer today, and I don't see that the offset is useful to
> > an attacker on its own.
> 
> I see what you mean. However, even if we store the base address +
> the offset, we still need aligned allocation if we want to clear
> the address. This would basically just move __scs_base() logic to
> cpu_switch_to() / scs_save().

Okay, so, I feel like this has gotten off into the weeds, or I'm really
dense (or both). :) Going back to the original comment:

> > > > Could we avoid forcing this alignment if we stored the SCS
> > > > pointer as a (base,offset) pair instead? That might be friendlier
> > > > on the allocations later on.

I think there was some confusion about mixing the "we want to be able to
wipe the value" combined with the masking in __scs_base(). These are
unrelated, as was correctly observed with "We can still clear the base".

What I don't understand here is the suggestion to store two values:

Why is two better than storing one? With one, we only need a single access.

Why would storing the base be "friendlier on the allocations later on"?
This is coming out of a single kmem cache, in 1K chunks. They will be
naturally aligned to 1K (unless redzoning has been turned on for some
slab debugging reason). The base masking is a way to avoid needing to
store two values, and only happens at task death.

Storing two values eats memory for all tasks for seemingly no meaningful
common benefit. What am I missing here?
Will Deacon April 24, 2020, 11:21 a.m. UTC | #9
On Thu, Apr 23, 2020 at 11:28:40AM -0700, Kees Cook wrote:
> On Wed, Apr 22, 2020 at 04:51:34PM -0700, Sami Tolvanen wrote:
> > On Wed, Apr 22, 2020 at 06:39:47PM +0100, Will Deacon wrote:
> > > On Mon, Apr 20, 2020 at 02:18:30PM -0700, Sami Tolvanen wrote:
> > > > On Mon, Apr 20, 2020 at 06:17:28PM +0100, Will Deacon wrote:
> > > > > > +	 * The shadow call stack is aligned to SCS_SIZE, and grows
> > > > > > +	 * upwards, so we can mask out the low bits to extract the base
> > > > > > +	 * when the task is not running.
> > > > > > +	 */
> > > > > > +	return (void *)((unsigned long)task_scs(tsk) & ~(SCS_SIZE - 1));
> > > > > 
> > > > > Could we avoid forcing this alignment if we stored the SCS pointer as a
> > > > > (base,offset) pair instead? That might be friendlier on the allocations
> > > > > later on.
> > > > 
> > > > The idea is to avoid storing the current task's shadow stack address in
> > > > memory, which is why I would rather not store the base address either.
> > > 
> > > What I mean is that, instead of storing the current shadow stack pointer,
> > > we instead store a base and an offset. We can still clear the base, as you
> > > do with the pointer today, and I don't see that the offset is useful to
> > > an attacker on its own.
> > 
> > I see what you mean. However, even if we store the base address +
> > the offset, we still need aligned allocation if we want to clear
> > the address. This would basically just move __scs_base() logic to
> > cpu_switch_to() / scs_save().
> 
> Okay, so, I feel like this has gotten off into the weeds, or I'm really
> dense (or both). :) Going back to the original comment:
> 
> > > > > Could we avoid forcing this alignment if we stored the SCS
> > > > > pointer as a (base,offset) pair instead? That might be friendlier
> > > > > on the allocations later on.
> 
> I think there was some confusion about mixing the "we want to be able to
> wipe the value" combined with the masking in __scs_base(). These are
> unrelated, as was correctly observed with "We can still clear the base".

Having just tried to implement this, it turns out they *are* related
and we can't still clear the base, I was wrong about that :( See below.

> What I don't understand here is the suggestion to store two values:
> 
> Why is two better than storing one? With one, we only need a single access.
> 
> Why would storing the base be "friendlier on the allocations later on"?
> This is coming out of a single kmem cache, in 1K chunks. They will be
> naturally aligned to 1K (unless redzoning has been turned on for some
> slab debugging reason). The base masking is a way to avoid needing to
> store two values, and only happens at task death.

Fair enough about the kmem_cache, although I'm still worried about these
things getting bigger in future and the alignment having to increase at
the same time. We also have a bunch of static/percpu allocations that don't
use this cache.

Also, since you mentioned the lack of redzoning, isn't it a bit dodgy
allocating blindly out of the kmem_cache? It means we don't have a redzone
or a guard page, so if you can trigger something like a recursion bug then
could you scribble past the SCS before the main stack overflows? Would this
clobber somebody else's SCS? The vmap version that I asked Sami to drop
is at least better in this regard, although the guard page is at the wrong
end of the stack and we just hope that the allocation below us didn't pass
VM_NO_GUARD. Looks like the same story for vmap stack :/

> Storing two values eats memory for all tasks for seemingly no meaningful
> common benefit. What am I missing here?

I would like to remove the alignment requirements for the static and percpu
allocations. AFAICT, the only reason the alignment is needed is because you
want to convert an SCS pointer into the base pointer. The only reason *that*
is needed is because of the questionable wiping of the pointer in the
thread_info, but I really don't see the benefit of this. Unlike a crypto
secret (which was your analogy), the SCS pointer is stored in memory in
at least the following situations:

  * The task isn't running
  * The task is running in userspace
  * The task is running a vCPU in KVM
  * We're calling into EFI
  * On exception entry from EL1, as part of stacking x18
  * During CPU suspend

If we split the pointer in two (base, offset) then we could leave the
base live in the thread_info, not require alignment of the stacks (which
may allow for unconditional redzoning?) and then just update the offset
value on context switch, which could be trivially checked as part of the
existing stack overflow checking on kernel entry.
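
As a rough sketch (names invented, not what the patch currently does):

	struct scs_info {
		void		*base;		/* can stay live in thread_info */
		unsigned long	offset;		/* updated on context switch */
	};

	static inline void *scs_sp(struct scs_info *si)
	{
		return si->base + si->offset;
	}

	static inline void scs_save_offset(struct scs_info *si, void *sp)
	{
		/* no SCS_SIZE alignment needed to recover the base later */
		si->offset = sp - si->base;
	}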

The base and offset can live in the same cacheline and be loaded with ldp,
so I don't see there being an access cost compared to a single variable.

Am I missing something (modulo us not agreeing on the utility of wiping
the pointer)?

Will
Sami Tolvanen April 27, 2020, 8:45 p.m. UTC | #10
On Fri, Apr 24, 2020 at 12:21:14PM +0100, Will Deacon wrote:
> Also, since you mentioned the lack of redzoning, isn't it a bit dodgy
> allocating blindly out of the kmem_cache? It means we don't have a redzone
> or a guard page, so if you can trigger something like a recursion bug then
> could you scribble past the SCS before the main stack overflows? Would this
> clobber somebody else's SCS?

I agree that allocating from a kmem_cache isn't ideal for safety. It's a
compromise to reduce memory overhead.

> The vmap version that I asked Sami to drop
> is at least better in this regard, although the guard page is at the wrong
> end of the stack and we just hope that the allocation below us didn't pass
> VM_NO_GUARD. Looks like the same story for vmap stack :/

SCS grows up and the guard page is after the allocation, so how is it at
the wrong end? Am I missing something here?

> If we split the pointer in two (base, offset) then we could leave the
> base live in the thread_info, not require alignment of the stacks (which
> may allow for unconditional redzoning?) and then just update the offset
> value on context switch, which could be trivially checked as part of the
> existing stack overflow checking on kernel entry.

I sent out v13 with split pointers, but I'm not sure it's convenient to
add an overflow check to kernel_ventry where the VMAP_STACK check is
done. I suppose I could add a check to kernel_entry after we load x18
from tsk. Thoughts?

Sami
Will Deacon May 4, 2020, 4:52 p.m. UTC | #11
On Mon, Apr 27, 2020 at 01:45:46PM -0700, Sami Tolvanen wrote:
> On Fri, Apr 24, 2020 at 12:21:14PM +0100, Will Deacon wrote:
> > Also, since you mentioned the lack of redzoning, isn't it a bit dodgy
> > allocating blindly out of the kmem_cache? It means we don't have a redzone
> > or a guard page, so if you can trigger something like a recursion bug then
> > could you scribble past the SCS before the main stack overflows? Would this
> > clobber somebody else's SCS?
> 
> I agree that allocating from a kmem_cache isn't ideal for safety. It's a
> compromise to reduce memory overhead.

Do you think it would be a problem if we always allocated a page for the
SCS?

> > The vmap version that I asked Sami to drop
> > is at least better in this regard, although the guard page is at the wrong
> > end of the stack and we just hope that the allocation below us didn't pass
> > VM_NO_GUARD. Looks like the same story for vmap stack :/
> 
> SCS grows up and the guard page is after the allocation, so how is it at
> the wrong end? Am I missing something here?

Sorry, I'd got the SCS upside-down in my head (hey, that second 'S' stands
for 'Stack'!). But I think I'm right about vmap stack, which feels a
little fragile even though it seems to work out today with the very limited
uses of VM_NO_GUARD.

> > If we split the pointer in two (base, offset) then we could leave the
> > base live in the thread_info, not require alignment of the stacks (which
> > may allow for unconditional redzoning?) and then just update the offset
> > value on context switch, which could be trivially checked as part of the
> > existing stack overflow checking on kernel entry.
> 
> I sent out v13 with split pointers, but I'm not sure it's convenient to
> add an overflow check to kernel_ventry where the VMAP_STACK check is
> done. I suppose I could add a check to kernel_entry after we load x18
> from tsk. Thoughts?

I'll take a look at v13, since at this stage I'm keen to get something
queued up so that we can use it as a base for further improvements without
you having to repost the whole stack every time.

Cheers,

Will
Sami Tolvanen May 4, 2020, 5:33 p.m. UTC | #12
On Mon, May 04, 2020 at 05:52:28PM +0100, Will Deacon wrote:
> On Mon, Apr 27, 2020 at 01:45:46PM -0700, Sami Tolvanen wrote:
> > I agree that allocating from a kmem_cache isn't ideal for safety. It's a
> > compromise to reduce memory overhead.
> 
> Do you think it would be a problem if we always allocated a page for the
> SCS?

Yes, the memory overhead was deemed too large for Android devices, which
have thousands of threads running.

Sami
Jann Horn May 4, 2020, 6:03 p.m. UTC | #13
On Mon, May 4, 2020 at 6:52 PM Will Deacon <will@kernel.org> wrote:
> On Mon, Apr 27, 2020 at 01:45:46PM -0700, Sami Tolvanen wrote:
> > On Fri, Apr 24, 2020 at 12:21:14PM +0100, Will Deacon wrote:
> > > Also, since you mentioned the lack of redzoning, isn't it a bit dodgy
> > > allocating blindly out of the kmem_cache? It means we don't have a redzone
> > > or a guard page, so if you can trigger something like a recursion bug then
> > > could you scribble past the SCS before the main stack overflows? Would this
> > > clobber somebody else's SCS?
> >
> > I agree that allocating from a kmem_cache isn't ideal for safety. It's a
> > compromise to reduce memory overhead.
>
> Do you think it would be a problem if we always allocated a page for the
> SCS?

I guess doing this safely and without wasting a page per task would
only be possible in an elegant way once MTE lands on devices?

I wonder how bad context switch latency would be if the actual SCS was
percpu and vmapped (starting at an offset inside the page such that
the SCS can only grow up to something like 0x400 bytes before
panicking the CPU) and the context switch path saved/restored the used
part of the vmapped SCS into a smaller allocation from the slab
allocator... presumably the SCS will usually just be something like
one cacheline big? That probably only costs a moderate amount of time
to copy...
Or as an extension of that, if the SCS copying turns out to be too
costly, there could be a percpu LRU cache consisting of vmapped SCS
pages, and whenever a task gets scheduled that doesn't have a vmapped
SCS, it "swaps out" the contents of the least recently used vmapped
SCS into the corresponding task's slab SCS, and "swaps in" from its
own slab SCS into the vmapped SCS. And task migration would force
"swapping out".

Not sure if this is a good idea, or if I'm just making things worse by
suggesting extra complexity...
Kees Cook May 4, 2020, 6:06 p.m. UTC | #14
On Mon, May 04, 2020 at 05:52:28PM +0100, Will Deacon wrote:
> On Mon, Apr 27, 2020 at 01:45:46PM -0700, Sami Tolvanen wrote:
> > On Fri, Apr 24, 2020 at 12:21:14PM +0100, Will Deacon wrote:
> > > The vmap version that I asked Sami to drop
> > > is at least better in this regard, although the guard page is at the wrong
> > > end of the stack and we just hope that the allocation below us didn't pass
> > > VM_NO_GUARD. Looks like the same story for vmap stack :/
> > 
> > SCS grows up and the guard page is after the allocation, so how is it at
> > the wrong end? Am I missing something here?
> 
> Sorry, I'd got the SCS upside-down in my head (hey, that second 'S' stands
> for 'Stack'!). But I think I'm right about vmap stack, which feels a
> little fragile even though it seems to work out today with the very limited
> uses of VM_NO_GUARD.

Yeah, when VMAP_STACK was originally being developed, IIRC, there was
an effort made to eliminate all the users of VM_NO_GUARD, and it looks
like it's mostly there. Really the only use left is arm64's kernel image
mapping routines, and then it's not actually used in the traditional
sense -- it's just a boolean for whether to toss in a guard page at the
end of the data section, and the VMAs are built manually. I think that
code could actually be refactored to drop it too and then the only user
would be KASAN, which, IIUC, wants to build consecutive vmap areas.

Patch

diff --git a/Makefile b/Makefile
index 70def4907036..baea6024b409 100644
--- a/Makefile
+++ b/Makefile
@@ -866,6 +866,12 @@  ifdef CONFIG_LIVEPATCH
 KBUILD_CFLAGS += $(call cc-option, -flive-patching=inline-clone)
 endif
 
+ifdef CONFIG_SHADOW_CALL_STACK
+CC_FLAGS_SCS	:= -fsanitize=shadow-call-stack
+KBUILD_CFLAGS	+= $(CC_FLAGS_SCS)
+export CC_FLAGS_SCS
+endif
+
 # arch Makefile may override CC so keep this after arch Makefile is included
 NOSTDINC_FLAGS += -nostdinc -isystem $(shell $(CC) -print-file-name=include)
 
diff --git a/arch/Kconfig b/arch/Kconfig
index 786a85d4ad40..691a552c2cc3 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -533,6 +533,40 @@  config STACKPROTECTOR_STRONG
 	  about 20% of all kernel functions, which increases the kernel code
 	  size by about 2%.
 
+config ARCH_SUPPORTS_SHADOW_CALL_STACK
+	bool
+	help
+	  An architecture should select this if it supports Clang's Shadow
+	  Call Stack, has asm/scs.h, and implements runtime support for shadow
+	  stack switching.
+
+config SHADOW_CALL_STACK
+	bool "Clang Shadow Call Stack"
+	depends on ARCH_SUPPORTS_SHADOW_CALL_STACK
+	help
+	  This option enables Clang's Shadow Call Stack, which uses a
+	  shadow stack to protect function return addresses from being
+	  overwritten by an attacker. More information can be found in
+	  Clang's documentation:
+
+	    https://clang.llvm.org/docs/ShadowCallStack.html
+
+	  Note that security guarantees in the kernel differ from the ones
+	  documented for user space. The kernel must store addresses of shadow
+	  stacks used by other tasks and interrupt handlers in memory, which
+	  means an attacker capable of reading and writing arbitrary memory
+	  may be able to locate them and hijack control flow by modifying
+	  shadow stacks that are not currently in use.
+
+config SHADOW_CALL_STACK_VMAP
+	bool "Use virtually mapped shadow call stacks"
+	depends on SHADOW_CALL_STACK
+	help
+	  Use virtually mapped shadow call stacks. Selecting this option
+	  provides better stack exhaustion protection, but increases per-thread
+	  memory consumption as a full page is allocated for each shadow stack.
+
+
 config HAVE_ARCH_WITHIN_STACK_FRAMES
 	bool
 	help
diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index 333a6695a918..18fc4d29ef27 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -42,3 +42,9 @@ 
  * compilers, like ICC.
  */
 #define barrier() __asm__ __volatile__("" : : : "memory")
+
+#if __has_feature(shadow_call_stack)
+# define __noscs	__attribute__((__no_sanitize__("shadow-call-stack")))
+#else
+# define __noscs
+#endif
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index e970f97a7fcb..97b62f47a80d 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -193,6 +193,10 @@  struct ftrace_likely_data {
 # define randomized_struct_fields_end
 #endif
 
+#ifndef __noscs
+# define __noscs
+#endif
+
 #ifndef asm_volatile_goto
 #define asm_volatile_goto(x...) asm goto(x)
 #endif
diff --git a/include/linux/scs.h b/include/linux/scs.h
new file mode 100644
index 000000000000..c5572fd770b0
--- /dev/null
+++ b/include/linux/scs.h
@@ -0,0 +1,57 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shadow Call Stack support.
+ *
+ * Copyright (C) 2019 Google LLC
+ */
+
+#ifndef _LINUX_SCS_H
+#define _LINUX_SCS_H
+
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <asm/page.h>
+
+#ifdef CONFIG_SHADOW_CALL_STACK
+
+/*
+ * In testing, 1 KiB shadow stack size (i.e. 128 stack frames on a 64-bit
+ * architecture) provided ~40% safety margin on stack usage while keeping
+ * memory allocation overhead reasonable.
+ */
+#define SCS_SIZE	1024UL
+#define GFP_SCS		(GFP_KERNEL | __GFP_ZERO)
+
+/*
+ * A random number outside the kernel's virtual address space to mark the
+ * end of the shadow stack.
+ */
+#define SCS_END_MAGIC	0xaf0194819b1635f6UL
+
+#define task_scs(tsk)	(task_thread_info(tsk)->shadow_call_stack)
+
+static inline void task_set_scs(struct task_struct *tsk, void *s)
+{
+	task_scs(tsk) = s;
+}
+
+extern void scs_init(void);
+extern void scs_task_reset(struct task_struct *tsk);
+extern int scs_prepare(struct task_struct *tsk, int node);
+extern bool scs_corrupted(struct task_struct *tsk);
+extern void scs_release(struct task_struct *tsk);
+
+#else /* CONFIG_SHADOW_CALL_STACK */
+
+#define task_scs(tsk)	NULL
+
+static inline void task_set_scs(struct task_struct *tsk, void *s) {}
+static inline void scs_init(void) {}
+static inline void scs_task_reset(struct task_struct *tsk) {}
+static inline int scs_prepare(struct task_struct *tsk, int node) { return 0; }
+static inline bool scs_corrupted(struct task_struct *tsk) { return false; }
+static inline void scs_release(struct task_struct *tsk) {}
+
+#endif /* CONFIG_SHADOW_CALL_STACK */
+
+#endif /* _LINUX_SCS_H */
diff --git a/init/init_task.c b/init/init_task.c
index bd403ed3e418..aaa71366d162 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -11,6 +11,7 @@ 
 #include <linux/mm.h>
 #include <linux/audit.h>
 #include <linux/numa.h>
+#include <linux/scs.h>
 
 #include <asm/pgtable.h>
 #include <linux/uaccess.h>
@@ -185,6 +186,13 @@  struct task_struct init_task
 };
 EXPORT_SYMBOL(init_task);
 
+#ifdef CONFIG_SHADOW_CALL_STACK
+unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] __init_task_data
+		__aligned(SCS_SIZE) = {
+	[(SCS_SIZE / sizeof(long)) - 1] = SCS_END_MAGIC
+};
+#endif
+
 /*
  * Initial thread structure. Alignment of this is handled by a special
  * linker map entry.
diff --git a/kernel/Makefile b/kernel/Makefile
index 4cb4130ced32..c332eb9d4841 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -103,6 +103,7 @@  obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_CPU_PM) += cpu_pm.o
 obj-$(CONFIG_BPF) += bpf/
+obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
 
 obj-$(CONFIG_PERF_EVENTS) += events/
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 4385f3d639f2..c4c984d29573 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -94,6 +94,7 @@ 
 #include <linux/thread_info.h>
 #include <linux/stackleak.h>
 #include <linux/kasan.h>
+#include <linux/scs.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -456,6 +457,8 @@  void put_task_stack(struct task_struct *tsk)
 
 void free_task(struct task_struct *tsk)
 {
+	scs_release(tsk);
+
 #ifndef CONFIG_THREAD_INFO_IN_TASK
 	/*
 	 * The task is finally done with both the stack and thread_info,
@@ -840,6 +843,8 @@  void __init fork_init(void)
 			  NULL, free_vm_stack_cache);
 #endif
 
+	scs_init();
+
 	lockdep_init_task(&init_task);
 	uprobes_init();
 }
@@ -899,6 +904,10 @@  static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	if (err)
 		goto free_stack;
 
+	err = scs_prepare(tsk, node);
+	if (err)
+		goto free_stack;
+
 #ifdef CONFIG_SECCOMP
 	/*
 	 * We must handle setting up seccomp filters once we're under
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3a61a3b8eaa9..c99620c1ec20 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11,6 +11,7 @@ 
 #include <linux/nospec.h>
 
 #include <linux/kcov.h>
+#include <linux/scs.h>
 
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
@@ -6045,6 +6046,7 @@  void init_idle(struct task_struct *idle, int cpu)
 	idle->se.exec_start = sched_clock();
 	idle->flags |= PF_IDLE;
 
+	scs_task_reset(idle);
 	kasan_unpoison_task_stack(idle);
 
 #ifdef CONFIG_SMP
diff --git a/kernel/scs.c b/kernel/scs.c
new file mode 100644
index 000000000000..28abed21950c
--- /dev/null
+++ b/kernel/scs.c
@@ -0,0 +1,187 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shadow Call Stack support.
+ *
+ * Copyright (C) 2019 Google LLC
+ */
+
+#include <linux/cpuhotplug.h>
+#include <linux/kasan.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/scs.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <asm/scs.h>
+
+static inline void *__scs_base(struct task_struct *tsk)
+{
+	/*
+	 * To minimize risk the of exposure, architectures may clear a
+	 * task's thread_info::shadow_call_stack while that task is
+	 * running, and only save/restore the active shadow call stack
+	 * pointer when the usual register may be clobbered (e.g. across
+	 * context switches).
+	 *
+	 * The shadow call stack is aligned to SCS_SIZE, and grows
+	 * upwards, so we can mask out the low bits to extract the base
+	 * when the task is not running.
+	 */
+	return (void *)((unsigned long)task_scs(tsk) & ~(SCS_SIZE - 1));
+}
+
+static inline unsigned long *scs_magic(void *s)
+{
+	return (unsigned long *)(s + SCS_SIZE) - 1;
+}
+
+static inline void scs_set_magic(void *s)
+{
+	*scs_magic(s) = SCS_END_MAGIC;
+}
+
+#ifdef CONFIG_SHADOW_CALL_STACK_VMAP
+
+/* Matches NR_CACHED_STACKS for VMAP_STACK */
+#define NR_CACHED_SCS 2
+static DEFINE_PER_CPU(void *, scs_cache[NR_CACHED_SCS]);
+
+static void *scs_alloc(int node)
+{
+	int i;
+	void *s;
+
+	for (i = 0; i < NR_CACHED_SCS; i++) {
+		s = this_cpu_xchg(scs_cache[i], NULL);
+		if (s) {
+			memset(s, 0, SCS_SIZE);
+			goto out;
+		}
+	}
+
+	/*
+	 * We allocate a full page for the shadow stack, which should be
+	 * more than we need. Check the assumption nevertheless.
+	 */
+	BUILD_BUG_ON(SCS_SIZE > PAGE_SIZE);
+
+	s = __vmalloc_node_range(PAGE_SIZE, SCS_SIZE,
+				 VMALLOC_START, VMALLOC_END,
+				 GFP_SCS, PAGE_KERNEL, 0,
+				 node, __builtin_return_address(0));
+
+out:
+	if (s)
+		scs_set_magic(s);
+	/* TODO: poison for KASAN, unpoison in scs_free */
+
+	return s;
+}
+
+static void scs_free(void *s)
+{
+	int i;
+
+	for (i = 0; i < NR_CACHED_SCS; i++)
+		if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL)
+			return;
+
+	vfree_atomic(s);
+}
+
+static int scs_cleanup(unsigned int cpu)
+{
+	int i;
+	void **cache = per_cpu_ptr(scs_cache, cpu);
+
+	for (i = 0; i < NR_CACHED_SCS; i++) {
+		vfree(cache[i]);
+		cache[i] = NULL;
+	}
+
+	return 0;
+}
+
+void __init scs_init(void)
+{
+	WARN_ON(cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "scs:scs_cache", NULL,
+			scs_cleanup) < 0);
+}
+
+#else /* !CONFIG_SHADOW_CALL_STACK_VMAP */
+
+static struct kmem_cache *scs_cache;
+
+static inline void *scs_alloc(int node)
+{
+	void *s;
+
+	s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node);
+	if (s) {
+		scs_set_magic(s);
+		/*
+		 * Poison the allocation to catch unintentional accesses to
+		 * the shadow stack when KASAN is enabled.
+		 */
+		kasan_poison_object_data(scs_cache, s);
+	}
+
+	return s;
+}
+
+static inline void scs_free(void *s)
+{
+	kasan_unpoison_object_data(scs_cache, s);
+	kmem_cache_free(scs_cache, s);
+}
+
+void __init scs_init(void)
+{
+	scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, SCS_SIZE,
+				0, NULL);
+	WARN_ON(!scs_cache);
+}
+
+#endif /* CONFIG_SHADOW_CALL_STACK_VMAP */
+
+void scs_task_reset(struct task_struct *tsk)
+{
+	/*
+	 * Reset the shadow stack to the base address in case the task
+	 * is reused.
+	 */
+	task_set_scs(tsk, __scs_base(tsk));
+}
+
+int scs_prepare(struct task_struct *tsk, int node)
+{
+	void *s;
+
+	s = scs_alloc(node);
+	if (!s)
+		return -ENOMEM;
+
+	task_set_scs(tsk, s);
+	return 0;
+}
+
+bool scs_corrupted(struct task_struct *tsk)
+{
+	unsigned long *magic = scs_magic(__scs_base(tsk));
+
+	return READ_ONCE_NOCHECK(*magic) != SCS_END_MAGIC;
+}
+
+void scs_release(struct task_struct *tsk)
+{
+	void *s;
+
+	s = __scs_base(tsk);
+	if (!s)
+		return;
+
+	WARN_ON(scs_corrupted(tsk));
+
+	task_set_scs(tsk, NULL);
+	scs_free(s);
+}