
[v2,24/39] x86/cet/shstk: Add user-mode shadow stack support

Message ID 20220929222936.14584-25-rick.p.edgecombe@intel.com (mailing list archive)
State New
Series Shadowstacks for userspace

Commit Message

Edgecombe, Rick P Sept. 29, 2022, 10:29 p.m. UTC
From: Yu-cheng Yu <yu-cheng.yu@intel.com>

Introduce basic shadow stack enabling/disabling/allocation routines.
A task's shadow stack is allocated from memory with VM_SHADOW_STACK flag
and has a fixed size of min(RLIMIT_STACK, 4GB).

Keep the task's shadow stack address and size in thread_struct. This will
be copied when cloning new threads, but needs to be cleared during exec,
so add a function to do this.

Do not support IA32 emulation.

Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
Co-developed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Cc: Kees Cook <keescook@chromium.org>

---

v2:
 - Get rid of unnecessary shstk->base checks
 - Don't support IA32 emulation

v1:
 - Switch to xsave helpers.
 - Expand commit log.

Yu-cheng v30:
 - Remove superfluous comments for struct thread_shstk.
 - Replace 'populate' with 'unused'.

Yu-cheng v28:
 - Update shstk_setup() with wrmsrl_safe(); return success when the shadow
   stack feature is not present (since this is a setup function).

 arch/x86/include/asm/cet.h        |  13 +++
 arch/x86/include/asm/msr.h        |  11 +++
 arch/x86/include/asm/processor.h  |   5 ++
 arch/x86/include/uapi/asm/prctl.h |   2 +
 arch/x86/kernel/Makefile          |   2 +
 arch/x86/kernel/process_64.c      |   2 +
 arch/x86/kernel/shstk.c           | 143 ++++++++++++++++++++++++++++++
 7 files changed, 178 insertions(+)

Comments

Kees Cook Oct. 3, 2022, 7:43 p.m. UTC | #1
On Thu, Sep 29, 2022 at 03:29:21PM -0700, Rick Edgecombe wrote:
> From: Yu-cheng Yu <yu-cheng.yu@intel.com>
> 
> Introduce basic shadow stack enabling/disabling/allocation routines.
> A task's shadow stack is allocated from memory with VM_SHADOW_STACK flag
> and has a fixed size of min(RLIMIT_STACK, 4GB).
> 
> Keep the task's shadow stack address and size in thread_struct. This will
> be copied when cloning new threads, but needs to be cleared during exec,
> so add a function to do this.
> 
> Do not support IA32 emulation.
> 
> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
> Co-developed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> Cc: Kees Cook <keescook@chromium.org>
> 
> ---
> 
> v2:
>  - Get rid of unnecessary shstk->base checks
>  - Don't support IA32 emulation
> 
> v1:
>  - Switch to xsave helpers.
>  - Expand commit log.
> 
> Yu-cheng v30:
>  - Remove superfluous comments for struct thread_shstk.
>  - Replace 'populate' with 'unused'.
> 
> Yu-cheng v28:
>  - Update shstk_setup() with wrmsrl_safe(); return success when the shadow
>    stack feature is not present (since this is a setup function).
> 
>  arch/x86/include/asm/cet.h        |  13 +++
>  arch/x86/include/asm/msr.h        |  11 +++
>  arch/x86/include/asm/processor.h  |   5 ++
>  arch/x86/include/uapi/asm/prctl.h |   2 +
>  arch/x86/kernel/Makefile          |   2 +
>  arch/x86/kernel/process_64.c      |   2 +
>  arch/x86/kernel/shstk.c           | 143 ++++++++++++++++++++++++++++++
>  7 files changed, 178 insertions(+)
> 
> diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
> index 0fa4dbc98c49..a4a1f4c0089b 100644
> --- a/arch/x86/include/asm/cet.h
> +++ b/arch/x86/include/asm/cet.h
> @@ -7,12 +7,25 @@
>  
>  struct task_struct;
>  
> +struct thread_shstk {
> +	u64	base;
> +	u64	size;
> +};
> +
>  #ifdef CONFIG_X86_SHADOW_STACK
>  long cet_prctl(struct task_struct *task, int option,
>  		      unsigned long features);
> +int shstk_setup(void);
> +void shstk_free(struct task_struct *p);
> +int shstk_disable(void);
> +void reset_thread_shstk(void);
>  #else
>  static inline long cet_prctl(struct task_struct *task, int option,
>  		      unsigned long features) { return -EINVAL; }
> +static inline int shstk_setup(void) { return -EOPNOTSUPP; }
> +static inline void shstk_free(struct task_struct *p) {}
> +static inline int shstk_disable(void) { return -EOPNOTSUPP; }
> +static inline void reset_thread_shstk(void) {}
>  #endif /* CONFIG_X86_SHADOW_STACK */

shstk_setup() and shstk_disable() are not called outside of shstk.c, so
they can be removed from this header entirely.

>  
>  #endif /* __ASSEMBLY__ */
> diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
> index 65ec1965cd28..a9cb4c434e60 100644
> --- a/arch/x86/include/asm/msr.h
> +++ b/arch/x86/include/asm/msr.h
> @@ -310,6 +310,17 @@ void msrs_free(struct msr *msrs);
>  int msr_set_bit(u32 msr, u8 bit);
>  int msr_clear_bit(u32 msr, u8 bit);
>  
> +static inline void set_clr_bits_msrl(u32 msr, u64 set, u64 clear)
> +{
> +	u64 val, new_val;
> +
> +	rdmsrl(msr, val);
> +	new_val = (val & ~clear) | set;
> +
> +	if (new_val != val)
> +		wrmsrl(msr, new_val);
> +}

I always get uncomfortable when I see these kinds of generalized helper
functions for touching cpu bits, etc. It just begs for future attacker
abuse to muck with arbitrary bits -- even marked inline there is a risk
the compiler will ignore that in some circumstances (not as currently
used in the code, but I'm imagining future changes leading to such a
condition). Will you humor me and change this to a macro instead? That'll
force it always inline (even __always_inline isn't always inline):

/* Helper that can never get accidentally un-inlined. */
#define set_clr_bits_msrl(msr, set, clear)	do {	\
	u64 __val, __new_val;				\
							\
	rdmsrl(msr, __val);				\
	__new_val = (__val & ~(clear)) | (set);		\
							\
	if (__new_val != __val)				\
		wrmsrl(msr, __new_val);			\
} while (0)


> +
>  #ifdef CONFIG_SMP
>  int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
>  int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> index a92bf76edafe..3a0c9d9d4d1d 100644
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -27,6 +27,7 @@ struct vm86;
>  #include <asm/unwind_hints.h>
>  #include <asm/vmxfeatures.h>
>  #include <asm/vdso/processor.h>
> +#include <asm/cet.h>
>  
>  #include <linux/personality.h>
>  #include <linux/cache.h>
> @@ -533,6 +534,10 @@ struct thread_struct {
>  	unsigned long		features;
>  	unsigned long		features_locked;
>  
> +#ifdef CONFIG_X86_SHADOW_STACK
> +	struct thread_shstk	shstk;
> +#endif
> +
>  	/* Floating point and extended processor state */
>  	struct fpu		fpu;
>  	/*
> diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
> index 028158e35269..41af3a8c4fa4 100644
> --- a/arch/x86/include/uapi/asm/prctl.h
> +++ b/arch/x86/include/uapi/asm/prctl.h
> @@ -26,4 +26,6 @@
>  #define ARCH_CET_DISABLE		0x4002
>  #define ARCH_CET_LOCK			0x4003
>  
For readability, maybe add: /* ARCH_CET_* "features" bits */

> +#define CET_SHSTK			0x1

This is UAPI, so the BIT() macro isn't available, but since this is
unsigned long, please use the form:  (1ULL <<  0)  etc...
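
Something like this, for instance (only a sketch of the suggested form; the
actual bit assignments are up to the series):

#define CET_SHSTK			(1ULL <<  0)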

> +
>  #endif /* _ASM_X86_PRCTL_H */
> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> index a20a5ebfacd7..8950d1f71226 100644
> --- a/arch/x86/kernel/Makefile
> +++ b/arch/x86/kernel/Makefile
> @@ -139,6 +139,8 @@ obj-$(CONFIG_UNWINDER_GUESS)		+= unwind_guess.o
>  
>  obj-$(CONFIG_AMD_MEM_ENCRYPT)		+= sev.o
>  
> +obj-$(CONFIG_X86_SHADOW_STACK)		+= shstk.o
> +
>  ###
>  # 64 bit specific files
>  ifeq ($(CONFIG_X86_64),y)
> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
> index 8fa2c2b7de65..be544b4b4c8b 100644
> --- a/arch/x86/kernel/process_64.c
> +++ b/arch/x86/kernel/process_64.c
> @@ -514,6 +514,8 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
>  		load_gs_index(__USER_DS);
>  	}
>  
> +	reset_thread_shstk();
> +
>  	loadsegment(fs, 0);
>  	loadsegment(es, _ds);
>  	loadsegment(ds, _ds);
> diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
> index e3276ac9e9b9..a0b8d4adb2bf 100644
> --- a/arch/x86/kernel/shstk.c
> +++ b/arch/x86/kernel/shstk.c
> @@ -8,8 +8,151 @@
>  
>  #include <linux/sched.h>
>  #include <linux/bitops.h>
> +#include <linux/types.h>
> +#include <linux/mm.h>
> +#include <linux/mman.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +#include <linux/sched/signal.h>
> +#include <linux/compat.h>
> +#include <linux/sizes.h>
> +#include <linux/user.h>
> +#include <asm/msr.h>
> +#include <asm/fpu/xstate.h>
> +#include <asm/fpu/types.h>
> +#include <asm/cet.h>
> +#include <asm/special_insns.h>
> +#include <asm/fpu/api.h>
>  #include <asm/prctl.h>
>  
> +static bool feature_enabled(unsigned long features)
> +{
> +	return current->thread.features & features;
> +}
> +
> +static void feature_set(unsigned long features)
> +{
> +	current->thread.features |= features;
> +}
> +
> +static void feature_clr(unsigned long features)
> +{
> +	current->thread.features &= ~features;
> +}

"feature" vs "features" here is confusing. Should these helpers enforce
the single-bit-set requirements? If so, please switch to a bit number
instead of a mask. If not, please rename these to
"features_{enabled,set,clr}", and fix "features_enabled" to check them
all:
	return (current->thread.features & features) == features;
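
i.e. roughly this (a sketch using the renamed helpers; only the enabled
check changes behavior):

static bool features_enabled(unsigned long features)
{
	/* Require that *all* requested feature bits are set */
	return (current->thread.features & features) == features;
}

static void features_set(unsigned long features)
{
	current->thread.features |= features;
}

static void features_clr(unsigned long features)
{
	current->thread.features &= ~features;
}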

> +static unsigned long alloc_shstk(unsigned long size)
> +{
> +	int flags = MAP_ANONYMOUS | MAP_PRIVATE;
> +	struct mm_struct *mm = current->mm;
> +	unsigned long addr, unused;

WARN_ON + clamp on "size" here, or perhaps move the bounds check from
shstk_setup() into here?
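
For example, one way that could look (an untested sketch; the exact limit
and the failure behavior are judgment calls):

static unsigned long alloc_shstk(unsigned long size)
{
	int flags = MAP_ANONYMOUS | MAP_PRIVATE;
	struct mm_struct *mm = current->mm;
	unsigned long addr, unused;

	/* Bounds check moved here from shstk_setup(), per the comment above */
	size = PAGE_ALIGN(min_t(unsigned long long, size, SZ_4G));
	if (WARN_ON_ONCE(!size))
		return -EINVAL;

	mmap_write_lock(mm);
	addr = do_mmap(NULL, 0, size, PROT_READ, flags,
		       VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);
	mmap_write_unlock(mm);

	return addr;
}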

> +
> +	mmap_write_lock(mm);
> +	addr = do_mmap(NULL, addr, size, PROT_READ, flags,
> +		       VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);

This will use the mmap base address offset randomization, I guess?

> +
> +	mmap_write_unlock(mm);
> +
> +	return addr;
> +}
> +
> +static void unmap_shadow_stack(u64 base, u64 size)
> +{
> +	while (1) {
> +		int r;
> +
> +		r = vm_munmap(base, size);
> +
> +		/*
> +		 * vm_munmap() returns -EINTR when mmap_lock is held by
> +		 * something else, and that lock should not be held for a
> +		 * long time.  Retry it for the case.
> +		 */
> +		if (r == -EINTR) {
> +			cond_resched();
> +			continue;
> +		}
> +
> +		/*
> +		 * For all other types of vm_munmap() failure, either the
> +		 * system is out of memory or there is bug.
> +		 */
> +		WARN_ON_ONCE(r);
> +		break;
> +	}
> +}
> +
> +int shstk_setup(void)

Only called local. Make static?

> +{
> +	struct thread_shstk *shstk = &current->thread.shstk;
> +	unsigned long addr, size;
> +
> +	/* Already enabled */
> +	if (feature_enabled(CET_SHSTK))
> +		return 0;
> +
> +	/* Also not supported for 32 bit */
> +	if (!cpu_feature_enabled(X86_FEATURE_SHSTK) || in_ia32_syscall())
> +		return -EOPNOTSUPP;
> +
> +	size = PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G));
> +	addr = alloc_shstk(size);
> +	if (IS_ERR_VALUE(addr))
> +		return PTR_ERR((void *)addr);
> +
> +	fpu_lock_and_load();
> +	wrmsrl(MSR_IA32_PL3_SSP, addr + size);
> +	wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN);
> +	fpregs_unlock();
> +
> +	shstk->base = addr;
> +	shstk->size = size;
> +	feature_set(CET_SHSTK);
> +
> +	return 0;
> +}
> +
> +void reset_thread_shstk(void)
> +{
> +	memset(&current->thread.shstk, 0, sizeof(struct thread_shstk));
> +	current->thread.features = 0;
> +	current->thread.features_locked = 0;
> +}

If features is always going to be tied to shstk, why not put them in the
shstk struct?

Also, shouldn't this also be called from arch_setup_new_exec() instead
of the open-coded wipe of features there?

> +
> +void shstk_free(struct task_struct *tsk)
> +{
> +	struct thread_shstk *shstk = &tsk->thread.shstk;
> +
> +	if (!cpu_feature_enabled(X86_FEATURE_SHSTK) ||
> +	    !feature_enabled(CET_SHSTK))
> +		return;
> +
> +	if (!tsk->mm)
> +		return;
> +
> +	unmap_shadow_stack(shstk->base, shstk->size);

I feel like base and size should be zeroed here?
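
i.e. something like (sketch):

	unmap_shadow_stack(shstk->base, shstk->size);

	/* Defensive only: don't leave stale values pointing at the freed range */
	shstk->base = 0;
	shstk->size = 0;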

> +}
> +
> +int shstk_disable(void)

This is only called locally. static?

> +{
> +	if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
> +		return -EOPNOTSUPP;
> +
> +	/* Already disabled? */
> +	if (!feature_enabled(CET_SHSTK))
> +		return 0;
> +
> +	fpu_lock_and_load();
> +	/* Disable WRSS too when disabling shadow stack */
> +	set_clr_bits_msrl(MSR_IA32_U_CET, 0, CET_SHSTK_EN);
> +	wrmsrl(MSR_IA32_PL3_SSP, 0);
> +	fpregs_unlock();
> +
> +	shstk_free(current);
> +	feature_clr(CET_SHSTK);
> +
> +	return 0;
> +}
> +
>  long cet_prctl(struct task_struct *task, int option, unsigned long features)
>  {
>  	if (option == ARCH_CET_LOCK) {
> -- 
> 2.17.1
>
Dave Hansen Oct. 3, 2022, 8:04 p.m. UTC | #2
On 10/3/22 12:43, Kees Cook wrote:
>> +static inline void set_clr_bits_msrl(u32 msr, u64 set, u64 clear)
>> +{
>> +	u64 val, new_val;
>> +
>> +	rdmsrl(msr, val);
>> +	new_val = (val & ~clear) | set;
>> +
>> +	if (new_val != val)
>> +		wrmsrl(msr, new_val);
>> +}
> I always get uncomfortable when I see these kinds of generalized helper
> functions for touching cpu bits, etc. It just begs for future attacker
> abuse to muck with arbitrary bits -- even marked inline there is a risk
> the compiler will ignore that in some circumstances (not as currently
> used in the code, but I'm imagining future changes leading to such a
> condition). Will you humor me and change this to a macro instead? That'll
> force it always inline (even __always_inline isn't always inline):

Oh, are you thinking that this is dangerous because it's so surgical and
non-intrusive?  It's even more powerful to an attacker than, say
wrmsrl(), because there they actually have to know what the existing
value is to update it.  With this helper, it's quite easy to flip an
individual bit without disturbing the neighboring bits.

Is that it?

I don't _like_ the #defines, but doing one here doesn't seem too onerous
considering how critical MSRs are.
Kees Cook Oct. 4, 2022, 4:04 a.m. UTC | #3
On Mon, Oct 03, 2022 at 01:04:37PM -0700, Dave Hansen wrote:
> On 10/3/22 12:43, Kees Cook wrote:
> >> +static inline void set_clr_bits_msrl(u32 msr, u64 set, u64 clear)
> >> +{
> >> +	u64 val, new_val;
> >> +
> >> +	rdmsrl(msr, val);
> >> +	new_val = (val & ~clear) | set;
> >> +
> >> +	if (new_val != val)
> >> +		wrmsrl(msr, new_val);
> >> +}
> > I always get uncomfortable when I see these kinds of generalized helper
> > functions for touching cpu bits, etc. It just begs for future attacker
> > abuse to muck with arbitrary bits -- even marked inline there is a risk
> > the compiler will ignore that in some circumstances (not as currently
> > used in the code, but I'm imagining future changes leading to such a
> > condition). Will you humor me and change this to a macro instead? That'll
> > force it always inline (even __always_inline isn't always inline):
> 
> Oh, are you thinking that this is dangerous because it's so surgical and
> non-intrusive?  It's even more powerful to an attacker than, say
> wrmsrl(), because there they actually have to know what the existing
> value is to update it.  With this helper, it's quite easy to flip an
> individual bit without disturbing the neighboring bits.
> 
> Is that it?

Yeah, it was kind of the combo: both a potential entry point to wrmsrl
for arbitrary values, but also one where all the work is done to mask
stuff out.

> I don't _like_ the #defines, but doing one here doesn't seem too onerous
> considering how critical MSRs are.

I bet there are others, but this just weirded me out. I'll live with a
macro, especially since the chance of it mutating in a non-inline is
very small, but I figured I'd mention the idea.
David Laight Oct. 4, 2022, 10:17 a.m. UTC | #4
From: Dave Hansen
> Sent: 03 October 2022 21:05
> 
> On 10/3/22 12:43, Kees Cook wrote:
> >> +static inline void set_clr_bits_msrl(u32 msr, u64 set, u64 clear)
> >> +{
> >> +	u64 val, new_val;
> >> +
> >> +	rdmsrl(msr, val);
> >> +	new_val = (val & ~clear) | set;
> >> +
> >> +	if (new_val != val)
> >> +		wrmsrl(msr, new_val);
> >> +}
> > I always get uncomfortable when I see these kinds of generalized helper
> > functions for touching cpu bits, etc. It just begs for future attacker
> > abuse to muck with arbitrary bits -- even marked inline there is a risk
> > the compiler will ignore that in some circumstances (not as currently
> > used in the code, but I'm imagining future changes leading to such a
> > condition). Will you humor me and change this to a macro instead? That'll
> > force it always inline (even __always_inline isn't always inline):
> 
> Oh, are you thinking that this is dangerous because it's so surgical and
> non-intrusive?  It's even more powerful to an attacker than, say
> wrmsrl(), because there they actually have to know what the existing
> value is to update it.  With this helper, it's quite easy to flip an
> individual bit without disturbing the neighboring bits.
> 
> Is that it?
> 
> I don't _like_ the #defines, but doing one here doesn't seem too onerous
> considering how critical MSRs are.

How often is the 'msr' number not a compile-time constant?
Adding rd/wrmsr variants that verify this would reduce the
attack surface as well.

	David

Edgecombe, Rick P Oct. 4, 2022, 4:25 p.m. UTC | #5
On Mon, 2022-10-03 at 21:04 -0700, Kees Cook wrote:
> > I don't _like_ the #defines, but doing one here doesn't seem too
> > onerous
> > considering how critical MSRs are.
> 
> I bet there are others, but this just weirded me out. I'll live with
> a
> macro, especially since the chance of it mutating in a non-inline is
> very small, but I figured I'd mention the idea.

Makes sense. I'll change it to a define.
Kees Cook Oct. 4, 2022, 7:32 p.m. UTC | #6
On Tue, Oct 04, 2022 at 10:17:57AM +0000, David Laight wrote:
> From: Dave Hansen
> > Sent: 03 October 2022 21:05
> > 
> > On 10/3/22 12:43, Kees Cook wrote:
> > >> +static inline void set_clr_bits_msrl(u32 msr, u64 set, u64 clear)
> > >> +{
> > >> +	u64 val, new_val;
> > >> +
> > >> +	rdmsrl(msr, val);
> > >> +	new_val = (val & ~clear) | set;
> > >> +
> > >> +	if (new_val != val)
> > >> +		wrmsrl(msr, new_val);
> > >> +}
> > > I always get uncomfortable when I see these kinds of generalized helper
> > > functions for touching cpu bits, etc. It just begs for future attacker
> > > abuse to muck with arbitrary bits -- even marked inline there is a risk
> > > the compiler will ignore that in some circumstances (not as currently
> > > used in the code, but I'm imagining future changes leading to such a
> > > condition). Will you humor me and change this to a macro instead? That'll
> > > force it always inline (even __always_inline isn't always inline):
> > 
> > Oh, are you thinking that this is dangerous because it's so surgical and
> > non-intrusive?  It's even more powerful to an attacker than, say
> > wrmsrl(), because there they actually have to know what the existing
> > value is to update it.  With this helper, it's quite easy to flip an
> > individual bit without disturbing the neighboring bits.
> > 
> > Is that it?
> > 
> > I don't _like_ the #defines, but doing one here doesn't seem too onerous
> > considering how critical MSRs are.
> 
> How often is the 'msr' number not a compile-time constant?
> Adding rd/wrmsr variants that verify this would reduce the
> attack surface as well.

Oh, yes! I do this all the time with FORTIFY shenanigans. Right, so,
instead of a macro, the "cannot be un-inlined" could be enforced with
this (untested):

static __always_inline void set_clr_bits_msrl(u32 msr, u64 set, u64 clear)
{
	u64 val, new_val;

	BUILD_BUG_ON(!__builtin_constant_p(msr) ||
		     !__builtin_constant_p(set) ||
		     !__builtin_constant_p(clear));

	rdmsrl(msr, val);
	new_val = (val & ~clear) | set;

	if (new_val != val)
		wrmsrl(msr, new_val);
}
David Laight Oct. 5, 2022, 1:32 p.m. UTC | #7
From: Kees Cook
> Sent: 04 October 2022 20:32
...
> Oh, yes! I do this all the time with FORTIFY shenanigans. Right, so,
> instead of a macro, the "cannot be un-inlined" could be enforced with
> this (untested):
> 
> static __always_inline void set_clr_bits_msrl(u32 msr, u64 set, u64 clear)
> {
> 	u64 val, new_val;
> 
> 	BUILD_BUG_ON(!__builtin_constant_p(msr) ||
> 		     !__builtin_constant_p(set) ||
> 		     !__builtin_constant_p(clear));

You can reduce the amount of text the brain has to parse
by using:

	BUILD_BUG_ON(!__builtin_constant_p(msr + set + clear));

Just requires the brain to do a quick 'oh yes'...

	David

Edgecombe, Rick P Oct. 20, 2022, 9:29 p.m. UTC | #8
Just now realizing, I never responded to most of this feedback as the
later conversation focused in on one area. All seems good (thanks!),
except not sure about the below:

On Mon, 2022-10-03 at 12:43 -0700, Kees Cook wrote:
> On Thu, Sep 29, 2022 at 03:29:21PM -0700, Rick Edgecombe wrote:
> > +
> > +	mmap_write_lock(mm);
> > +	addr = do_mmap(NULL, addr, size, PROT_READ, flags,
> > +		       VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);
> 
> This will use the mmap base address offset randomization, I guess?

Yes.

> 
> > +
> > +	mmap_write_unlock(mm);
> > +
> > +	return addr;
> > +}
> > +
> > +static void unmap_shadow_stack(u64 base, u64 size)
> > +{
> > +	while (1) {
> > +		int r;
> > +
> > +		r = vm_munmap(base, size);
> > +
> > +		/*
> > +		 * vm_munmap() returns -EINTR when mmap_lock is held by
> > +		 * something else, and that lock should not be held for
> > a
> > +		 * long time.  Retry it for the case.
> > +		 */
> > +		if (r == -EINTR) {
> > +			cond_resched();
> > +			continue;
> > +		}
> > +
> > +		/*
> > +		 * For all other types of vm_munmap() failure, either
> > the
> > +		 * system is out of memory or there is bug.
> > +		 */
> > +		WARN_ON_ONCE(r);
> > +		break;
> > +	}
> > +}
> > +
> > +int shstk_setup(void)
> 
> Only called local. Make static?
> 
> > +{
> > +	struct thread_shstk *shstk = &current->thread.shstk;
> > +	unsigned long addr, size;
> > +
> > +	/* Already enabled */
> > +	if (feature_enabled(CET_SHSTK))
> > +		return 0;
> > +
> > +	/* Also not supported for 32 bit */
> > +	if (!cpu_feature_enabled(X86_FEATURE_SHSTK) ||
> > in_ia32_syscall())
> > +		return -EOPNOTSUPP;
> > +
> > +	size = PAGE_ALIGN(min_t(unsigned long long,
> > rlimit(RLIMIT_STACK), SZ_4G));
> > +	addr = alloc_shstk(size);
> > +	if (IS_ERR_VALUE(addr))
> > +		return PTR_ERR((void *)addr);
> > +
> > +	fpu_lock_and_load();
> > +	wrmsrl(MSR_IA32_PL3_SSP, addr + size);
> > +	wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN);
> > +	fpregs_unlock();
> > +
> > +	shstk->base = addr;
> > +	shstk->size = size;
> > +	feature_set(CET_SHSTK);
> > +
> > +	return 0;
> > +}
> > +
> > +void reset_thread_shstk(void)
> > +{
> > +	memset(&current->thread.shstk, 0, sizeof(struct thread_shstk));
> > +	current->thread.features = 0;
> > +	current->thread.features_locked = 0;
> > +}
> 
> If features is always going to be tied to shstk, why not put them in
> the
> shstk struct?

CET and LAM used to share an enabling interface and also kernel side
enablement state tracking. But in the end LAM got its own enabling
interface. Thomas had suggested that they could share a state field on
the kernel side. But then LAM already had enough state tracking for
its needs.

Shadow stack used to track enabling with the fields in the shstk struct
that keep track of the thread's shadow stack. But then we added WRSS
which needs another field to keep track of the status. So I thought to
leave the 'features' field and use it for all the CET features
tracking. I left it outside of the shstk struct so it looks usable for
any other features that might be looking for a status bit. I can
definitely compile it out when there is no user shadow stack.

snip


> > +
> > +void shstk_free(struct task_struct *tsk)
> > +{
> > +	struct thread_shstk *shstk = &tsk->thread.shstk;
> > +
> > +	if (!cpu_feature_enabled(X86_FEATURE_SHSTK) ||
> > +	    !feature_enabled(CET_SHSTK))
> > +		return;
> > +
> > +	if (!tsk->mm)
> > +		return;
> > +
> > +	unmap_shadow_stack(shstk->base, shstk->size);
> 
> I feel like base and size should be zeroed here?
> 

The code used to use shstk->base and shstk->size to keep track of if
shadow stack was enabled. I'm not sure why to zero it now. Just
defensively or did you see a concrete issue?
Kees Cook Oct. 20, 2022, 10:54 p.m. UTC | #9
On Thu, Oct 20, 2022 at 09:29:38PM +0000, Edgecombe, Rick P wrote:
> The code used to use shstk->base and shstk->size to keep track of if
> shadow stack was enabled. I'm not sure why to zero it now. Just
> defensively or did you see a concrete issue?

Just to be defensive. It's not a fast path by any means, so better to
have values that make a bit of sense there. *shrug* It just stood out
as feeling "missing" while I was reading the code.

Patch

diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index 0fa4dbc98c49..a4a1f4c0089b 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -7,12 +7,25 @@ 
 
 struct task_struct;
 
+struct thread_shstk {
+	u64	base;
+	u64	size;
+};
+
 #ifdef CONFIG_X86_SHADOW_STACK
 long cet_prctl(struct task_struct *task, int option,
 		      unsigned long features);
+int shstk_setup(void);
+void shstk_free(struct task_struct *p);
+int shstk_disable(void);
+void reset_thread_shstk(void);
 #else
 static inline long cet_prctl(struct task_struct *task, int option,
 		      unsigned long features) { return -EINVAL; }
+static inline int shstk_setup(void) { return -EOPNOTSUPP; }
+static inline void shstk_free(struct task_struct *p) {}
+static inline int shstk_disable(void) { return -EOPNOTSUPP; }
+static inline void reset_thread_shstk(void) {}
 #endif /* CONFIG_X86_SHADOW_STACK */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 65ec1965cd28..a9cb4c434e60 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -310,6 +310,17 @@  void msrs_free(struct msr *msrs);
 int msr_set_bit(u32 msr, u8 bit);
 int msr_clear_bit(u32 msr, u8 bit);
 
+static inline void set_clr_bits_msrl(u32 msr, u64 set, u64 clear)
+{
+	u64 val, new_val;
+
+	rdmsrl(msr, val);
+	new_val = (val & ~clear) | set;
+
+	if (new_val != val)
+		wrmsrl(msr, new_val);
+}
+
 #ifdef CONFIG_SMP
 int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
 int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a92bf76edafe..3a0c9d9d4d1d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -27,6 +27,7 @@  struct vm86;
 #include <asm/unwind_hints.h>
 #include <asm/vmxfeatures.h>
 #include <asm/vdso/processor.h>
+#include <asm/cet.h>
 
 #include <linux/personality.h>
 #include <linux/cache.h>
@@ -533,6 +534,10 @@  struct thread_struct {
 	unsigned long		features;
 	unsigned long		features_locked;
 
+#ifdef CONFIG_X86_SHADOW_STACK
+	struct thread_shstk	shstk;
+#endif
+
 	/* Floating point and extended processor state */
 	struct fpu		fpu;
 	/*
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index 028158e35269..41af3a8c4fa4 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -26,4 +26,6 @@ 
 #define ARCH_CET_DISABLE		0x4002
 #define ARCH_CET_LOCK			0x4003
 
+#define CET_SHSTK			0x1
+
 #endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index a20a5ebfacd7..8950d1f71226 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -139,6 +139,8 @@  obj-$(CONFIG_UNWINDER_GUESS)		+= unwind_guess.o
 
 obj-$(CONFIG_AMD_MEM_ENCRYPT)		+= sev.o
 
+obj-$(CONFIG_X86_SHADOW_STACK)		+= shstk.o
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 8fa2c2b7de65..be544b4b4c8b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -514,6 +514,8 @@  start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 		load_gs_index(__USER_DS);
 	}
 
+	reset_thread_shstk();
+
 	loadsegment(fs, 0);
 	loadsegment(es, _ds);
 	loadsegment(ds, _ds);
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index e3276ac9e9b9..a0b8d4adb2bf 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -8,8 +8,151 @@ 
 
 #include <linux/sched.h>
 #include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/sched/signal.h>
+#include <linux/compat.h>
+#include <linux/sizes.h>
+#include <linux/user.h>
+#include <asm/msr.h>
+#include <asm/fpu/xstate.h>
+#include <asm/fpu/types.h>
+#include <asm/cet.h>
+#include <asm/special_insns.h>
+#include <asm/fpu/api.h>
 #include <asm/prctl.h>
 
+static bool feature_enabled(unsigned long features)
+{
+	return current->thread.features & features;
+}
+
+static void feature_set(unsigned long features)
+{
+	current->thread.features |= features;
+}
+
+static void feature_clr(unsigned long features)
+{
+	current->thread.features &= ~features;
+}
+
+static unsigned long alloc_shstk(unsigned long size)
+{
+	int flags = MAP_ANONYMOUS | MAP_PRIVATE;
+	struct mm_struct *mm = current->mm;
+	unsigned long addr, unused;
+
+	mmap_write_lock(mm);
+	addr = do_mmap(NULL, addr, size, PROT_READ, flags,
+		       VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);
+
+	mmap_write_unlock(mm);
+
+	return addr;
+}
+
+static void unmap_shadow_stack(u64 base, u64 size)
+{
+	while (1) {
+		int r;
+
+		r = vm_munmap(base, size);
+
+		/*
+		 * vm_munmap() returns -EINTR when mmap_lock is held by
+		 * something else, and that lock should not be held for a
+		 * long time.  Retry it for the case.
+		 */
+		if (r == -EINTR) {
+			cond_resched();
+			continue;
+		}
+
+		/*
+		 * For all other types of vm_munmap() failure, either the
+		 * system is out of memory or there is bug.
+		 */
+		WARN_ON_ONCE(r);
+		break;
+	}
+}
+
+int shstk_setup(void)
+{
+	struct thread_shstk *shstk = &current->thread.shstk;
+	unsigned long addr, size;
+
+	/* Already enabled */
+	if (feature_enabled(CET_SHSTK))
+		return 0;
+
+	/* Also not supported for 32 bit */
+	if (!cpu_feature_enabled(X86_FEATURE_SHSTK) || in_ia32_syscall())
+		return -EOPNOTSUPP;
+
+	size = PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G));
+	addr = alloc_shstk(size);
+	if (IS_ERR_VALUE(addr))
+		return PTR_ERR((void *)addr);
+
+	fpu_lock_and_load();
+	wrmsrl(MSR_IA32_PL3_SSP, addr + size);
+	wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN);
+	fpregs_unlock();
+
+	shstk->base = addr;
+	shstk->size = size;
+	feature_set(CET_SHSTK);
+
+	return 0;
+}
+
+void reset_thread_shstk(void)
+{
+	memset(&current->thread.shstk, 0, sizeof(struct thread_shstk));
+	current->thread.features = 0;
+	current->thread.features_locked = 0;
+}
+
+void shstk_free(struct task_struct *tsk)
+{
+	struct thread_shstk *shstk = &tsk->thread.shstk;
+
+	if (!cpu_feature_enabled(X86_FEATURE_SHSTK) ||
+	    !feature_enabled(CET_SHSTK))
+		return;
+
+	if (!tsk->mm)
+		return;
+
+	unmap_shadow_stack(shstk->base, shstk->size);
+}
+
+int shstk_disable(void)
+{
+	if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+		return -EOPNOTSUPP;
+
+	/* Already disabled? */
+	if (!feature_enabled(CET_SHSTK))
+		return 0;
+
+	fpu_lock_and_load();
+	/* Disable WRSS too when disabling shadow stack */
+	set_clr_bits_msrl(MSR_IA32_U_CET, 0, CET_SHSTK_EN);
+	wrmsrl(MSR_IA32_PL3_SSP, 0);
+	fpregs_unlock();
+
+	shstk_free(current);
+	feature_clr(CET_SHSTK);
+
+	return 0;
+}
+
 long cet_prctl(struct task_struct *task, int option, unsigned long features)
 {
 	if (option == ARCH_CET_LOCK) {