[v29,23/32] x86/cet/shstk: Add user-mode shadow stack support

Message ID: 20210820181201.31490-24-yu-cheng.yu@intel.com
State: New
Series: Control-flow Enforcement: Shadow Stack

Commit Message

Yu-cheng Yu Aug. 20, 2021, 6:11 p.m. UTC
Introduce basic shadow stack enabling/disabling/allocation routines.
A task's shadow stack is allocated from memory with the VM_SHADOW_STACK
flag set and has a fixed size of min(RLIMIT_STACK, 4 GB).
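
[ For example: with the common 8 MiB RLIMIT_STACK default, each task
  gets an 8 MiB shadow stack; the computed size is rounded up to
  PAGE_SIZE. ]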

Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
Cc: Kees Cook <keescook@chromium.org>
---
v28:
- Update shstk_setup() to use wrmsrl_safe(), and return success when the
  shadow stack feature is not present (since this is a setup function).

v27:
- Change 'struct cet_status' to 'struct thread_shstk', and change member
  types from unsigned long to u64.
- Re-order local variables in reverse order of length.
- WARN_ON_ONCE() when vm_munmap() fails.
---
 arch/x86/include/asm/cet.h       |  30 +++++++
 arch/x86/include/asm/processor.h |   5 ++
 arch/x86/kernel/Makefile         |   1 +
 arch/x86/kernel/shstk.c          | 134 +++++++++++++++++++++++++++++++
 4 files changed, 170 insertions(+)
 create mode 100644 arch/x86/include/asm/cet.h
 create mode 100644 arch/x86/kernel/shstk.c

Comments

Borislav Petkov Aug. 26, 2021, 4:25 p.m. UTC | #1
On Fri, Aug 20, 2021 at 11:11:52AM -0700, Yu-cheng Yu wrote:
> diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
> new file mode 100644
> index 000000000000..6432baf4de1f
> --- /dev/null
> +++ b/arch/x86/include/asm/cet.h
> @@ -0,0 +1,30 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_X86_CET_H
> +#define _ASM_X86_CET_H
> +
> +#ifndef __ASSEMBLY__
> +#include <linux/types.h>
> +
> +struct task_struct;
> +
> +/*
> + * Per-thread CET status
> + */

That comment is superfluous and wrong now. The struct name says exactly
what that thing is.

> +static unsigned long alloc_shstk(unsigned long size)
> +{
> +	int flags = MAP_ANONYMOUS | MAP_PRIVATE;
> +	struct mm_struct *mm = current->mm;
> +	unsigned long addr, populate;

s/populate/unused/

> +
> +	mmap_write_lock(mm);
> +	addr = do_mmap(NULL, 0, size, PROT_READ, flags, VM_SHADOW_STACK, 0,
> +		       &populate, NULL);
> +	mmap_write_unlock(mm);
> +
> +	return addr;
> +}
> +
> +int shstk_setup(void)
> +{
> +	struct thread_shstk *shstk = &current->thread.shstk;
> +	unsigned long addr, size;
> +	int err;
> +
> +	if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
> +		return 0;
> +
> +	size = round_up(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G), PAGE_SIZE);
> +	addr = alloc_shstk(size);
> +	if (IS_ERR_VALUE(addr))
> +		return PTR_ERR((void *)addr);
> +
> +	start_update_msrs();

You're setting CET_U with the MSR writes below. Why do you need to do
XRSTOR here? To zero out PL[012]_SSP?

If so, you can WRMSR those too - no need for a full XRSTOR...
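
[ For illustration, the direct-WRMSR alternative being suggested would
  look roughly like this -- a sketch only, using the PL[012]_SSP MSR
  names this series defines:

	wrmsrl_safe(MSR_IA32_PL0_SSP, 0);
	wrmsrl_safe(MSR_IA32_PL1_SSP, 0);
	wrmsrl_safe(MSR_IA32_PL2_SSP, 0);
]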

> +	err = wrmsrl_safe(MSR_IA32_PL3_SSP, addr + size);
> +	if (!err)
> +		wrmsrl_safe(MSR_IA32_U_CET, CET_SHSTK_EN);
> +	end_update_msrs();
> +
> +	if (!err) {
> +		shstk->base = addr;
> +		shstk->size = size;
> +	}
> +
> +	return err;
> +}
Yu-cheng Yu Aug. 27, 2021, 6:10 p.m. UTC | #2
On 8/26/2021 9:25 AM, Borislav Petkov wrote:
> On Fri, Aug 20, 2021 at 11:11:52AM -0700, Yu-cheng Yu wrote:
[...]
>> +
>> +int shstk_setup(void)
>> +{
>> +	struct thread_shstk *shstk = &current->thread.shstk;
>> +	unsigned long addr, size;
>> +	int err;
>> +
>> +	if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
>> +		return 0;
>> +
>> +	size = round_up(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G), PAGE_SIZE);
>> +	addr = alloc_shstk(size);
>> +	if (IS_ERR_VALUE(addr))
>> +		return PTR_ERR((void *)addr);
>> +
>> +	start_update_msrs();
> 
> You're setting CET_U with the MSR writes below. Why do you need to do
> XRSTOR here? To zero out PL[012]_SSP?
> 
> If so, you can WRMSR those too - no need for a full XRSTOR...
> 

Because on context switches the whole xstate is switched together, we
need to make sure all of it is in the registers.

>> +	err = wrmsrl_safe(MSR_IA32_PL3_SSP, addr + size);
>> +	if (!err)
>> +		wrmsrl_safe(MSR_IA32_U_CET, CET_SHSTK_EN);
>> +	end_update_msrs();
>> +
>> +	if (!err) {
>> +		shstk->base = addr;
>> +		shstk->size = size;
>> +	}
>> +
>> +	return err;
>> +}
>
Borislav Petkov Aug. 27, 2021, 6:21 p.m. UTC | #3
On Fri, Aug 27, 2021 at 11:10:31AM -0700, Yu, Yu-cheng wrote:
> Because on context switches the whole xstate is switched together,
> we need to make sure all of it is in the registers.

There's context switch code which does that already.

Why would shstk_setup() be responsible for switching the whole extended
states buffer instead of only the shadow stack stuff?
Yu-cheng Yu Aug. 27, 2021, 6:37 p.m. UTC | #4
On 8/27/2021 11:21 AM, Borislav Petkov wrote:
> On Fri, Aug 27, 2021 at 11:10:31AM -0700, Yu, Yu-cheng wrote:
>> Because on context switches the whole xstate is switched together,
>> we need to make sure all of it is in the registers.
> 
> There's context switch code which does that already.
> 
> Why would shstk_setup() be responsible for switching the whole extended
> states buffer instead of only the shadow stack stuff?
> 

Right now, the kernel does lazy restore: it waits until right before
a task goes back to ring-3 to restore the xstates.  If a task needs to
write to any xstate registers before that (e.g. for signals), it
restores the whole xstate first and clears TIF_NEED_FPU_LOAD, which
prevents the xstates from being restored again later.
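
[ Schematically, the invariant being described:

	TIF_NEED_FPU_LOAD set   -> the task's xstate lives in its XSAVE
	                           buffer; the registers are stale and are
	                           restored before returning to ring-3.
	TIF_NEED_FPU_LOAD clear -> the xstate lives in the registers; the
	                           XSAVE buffer may be stale.
]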
Borislav Petkov Aug. 27, 2021, 6:46 p.m. UTC | #5
On Fri, Aug 27, 2021 at 11:37:34AM -0700, Yu, Yu-cheng wrote:
> Right now, the kernel does lazy restore: it waits until right before a
> task goes back to ring-3 to restore the xstates.  If a task needs to
> write to any xstate registers before that (e.g. for signals), it
> restores the whole xstate first and clears TIF_NEED_FPU_LOAD, which
> prevents the xstates from being restored again later.

shstk_setup() allocates a shadow stack for the task and puts its pointer
in MSR_IA32_PL3_SSP.

What does that have to do with a task having to write any xstate
registers?
Dave Hansen Aug. 27, 2021, 8:25 p.m. UTC | #6
On 8/27/21 11:21 AM, Borislav Petkov wrote:
> On Fri, Aug 27, 2021 at 11:10:31AM -0700, Yu, Yu-cheng wrote:
>> Because on context switches the whole xstate is switched together,
>> we need to make sure all of it is in the registers.
> There's context switch code which does that already.
> 
> Why would shstk_setup() be responsible for switching the whole extended
> states buffer instead of only the shadow stack stuff?

I don't think this has anything to do with context-switching, really.

The code lands in shstk_setup() which wants to make sure that the new
MSR values are set before the task goes out to userspace.  If
TIF_NEED_FPU_LOAD was set, it could do that by going out to the XSAVE
buffer and setting the MSR state in the buffer.  Before returning to
userspace, it would be XRSTOR'd.  A WRMSR by itself would not be
persistent because that XRSTOR would overwrite it.

But, if TIF_NEED_FPU_LOAD is *clear* it means the XSAVE buffer is
out-of-date and the registers are live.  WRMSR can be used and there
will be a XSAVE* to the task buffer during a context switch.

So, this code takes the coward's way out: it *forces* TIF_NEED_FPU_LOAD
to be clear by making the registers live with fpregs_restore_userregs().
That lets it just use WRMSR instead of dealing with the XSAVE buffer
directly.  If it didn't do this with the *WHOLE* set of user FPU state,
we'd need more fine-grained "NEED_*_LOAD" tracking than our one FPU bit.
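
[ In code terms, a correct MSR update has to pick a path based on that
  flag; a sketch of the two cases (new_ssp is illustrative, not from
  the patch):

	fpregs_lock();
	if (test_thread_flag(TIF_NEED_FPU_LOAD)) {
		/* Registers stale: edit the task's XSAVE buffer; it is
		 * XRSTOR'd before the task returns to userspace. */
	} else {
		/* Registers live: WRMSR directly; the value is XSAVE*'d
		 * to the buffer at the next context switch. */
		wrmsrl(MSR_IA32_PL3_SSP, new_ssp);
	}
	fpregs_unlock();
]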

This is also *only* safe because the task is newly-exec()'d and the FPU
state was just reset.  Otherwise, we might have had to worry that the
non-PL3 SSPs have garbage or that non-SHSTK bits are set in MSR_IA32_U_CET.

That said, after staring at it, I *think* this code is functionally
correct and OK performance-wise.  I suspect that the (very blunt) XRSTOR
inside of start_update_msrs()->fpregs_restore_userregs() is quite rare
because TIF_NEED_FPU_LOAD will usually be clear due to the proximity to
execve().  So, adding direct XSAVE buffer manipulation would probably
only make it more error prone.
Borislav Petkov Sept. 1, 2021, 1 p.m. UTC | #7
First of all,

thanks a lot Dave for taking the time to communicate properly with me!

On Fri, Aug 27, 2021 at 01:25:29PM -0700, Dave Hansen wrote:
> I don't think this has anything to do with context-switching, really.
> 
> The code lands in shstk_setup() which wants to make sure that the new
> MSR values are set before the task goes out to userspace.  If
> TIF_NEED_FPU_LOAD was set, it could do that by going out to the XSAVE
> buffer and setting the MSR state in the buffer.  Before returning to
> userspace, it would be XRSTOR'd.  A WRMSR by itself would not be
> persistent because that XRSTOR would overwrite it.
> 
> But, if TIF_NEED_FPU_LOAD is *clear* it means the XSAVE buffer is
> out-of-date and the registers are live.  WRMSR can be used and there
> will be a XSAVE* to the task buffer during a context switch.
> 
> So, this code takes the coward's way out: it *forces* TIF_NEED_FPU_LOAD
> to be clear by making the registers live with fpregs_restore_userregs().
>  That lets it just use WRMSR instead of dealing with the XSAVE buffer
> directly.  If it didn't do this with the *WHOLE* set of user FPU state,
> we'd need more fine-grained "NEED_*_LOAD" tracking than our one FPU bit.
> 
> This is also *only* safe because the task is newly-exec()'d and the FPU
> state was just reset.  Otherwise, we might have had to worry that the
> non-PL3 SSPs have garbage or that non-SHSTK bits are set in MSR_IA32_U_CET.
> 
> That said, after staring at it, I *think* this code is functionally
> correct and OK performance-wise.

Right, except that the same thing is being done in
setup_signal_shadow_stack()/restore_signal_shadow_stack() too, for the
restore token.

Which means, a potential XRSTOR each time just for a single MSR. That
means, twice per signal in the worst case.

Which means, shadow stack should be pretty noticeable in signal-heavy
benchmarks...

> I suspect that the (very blunt) XRSTOR inside of
> start_update_msrs()->fpregs_restore_userregs() is quite rare because
> TIF_NEED_FPU_LOAD will usually be clear due to the proximity to
> execve(). So, adding direct XSAVE buffer manipulation would probably
> only make it more error prone.

@Yu-cheng: please take Dave's explanation as is and stick it over
start_update_msrs() so that it is clear what that thing is doing.

Thx.
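
[ I.e., something like this above the helper -- a sketch of the
  requested comment in Dave's wording, not the actual follow-up patch:

	/*
	 * Force TIF_NEED_FPU_LOAD to be clear by making the xstate
	 * registers live with fpregs_restore_userregs().  A plain WRMSR
	 * is then persistent: the new value is XSAVE*'d to the task
	 * buffer at the next context switch instead of being overwritten
	 * by an XRSTOR from a stale buffer.
	 */
	static void start_update_msrs(void)
	{
		fpregs_lock();
		if (test_thread_flag(TIF_NEED_FPU_LOAD))
			fpregs_restore_userregs();
	}
]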
Dave Hansen Sept. 1, 2021, 3:24 p.m. UTC | #8
On 9/1/21 6:00 AM, Borislav Petkov wrote:
>> So, this code takes the coward's way out: it *forces* TIF_NEED_FPU_LOAD
>> to be clear by making the registers live with fpregs_restore_userregs().
>>  That lets it just use WRMSR instead of dealing with the XSAVE buffer
>> directly.  If it didn't do this with the *WHOLE* set of user FPU state,
>> we'd need more fine-grained "NEED_*_LOAD" tracking than our one FPU bit.
>>
>> This is also *only* safe because the task is newly-exec()'d and the FPU
>> state was just reset.  Otherwise, we might have had to worry that the
>> non-PL3 SSPs have garbage or that non-SHSTK bits are set in MSR_IA32_U_CET.
>>
>> That said, after staring at it, I *think* this code is functionally
>> correct and OK performance-wise.
> Right, except that that is being done in
> setup_signal_shadow_stack()/restore_signal_shadow_stack() too, for the
> restore token.
> 
> Which means, a potential XRSTOR each time just for a single MSR. That
> means, twice per signal in the worst case.
> 
> Which means, shadow stack should be pretty noticeable in signal-heavy
> benchmarks...

Ahh, good point.  I totally missed the signal side.

Yu-cheng, it would probably be wise to take a look at where
TIF_NEED_FPU_LOAD is set in the signal paths.  I suspect the new shadow
stack code is clearing it pretty quickly.  That's not *necessarily* a
waste because it has to be XRSTOR'd somewhere before returning to
userspace.  But, it does impose an extra XSAVE/XRSTOR burden if the code
is preempted somewhere between
setup_signal_shadow_stack()/restore_signal_shadow_stack() and the return
to userspace.

Some performance numbers would be nice.

Even nicer would be to make your code work without requiring
TIF_NEED_FPU_LOAD to be clear, or actively clearing it.
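
[ A sketch of that alternative, assuming the XFEATURE_CET_USER state
  (struct cet_user_state with a user_ssp field) added earlier in this
  series; hypothetical code, glossing over the init-state handling that
  makes direct buffer edits error prone:

	static void update_user_ssp(u64 ssp)
	{
		struct xregs_state *xsave = &current->thread.fpu.state.xsave;
		struct cet_user_state *cet;

		fpregs_lock();
		if (test_thread_flag(TIF_NEED_FPU_LOAD)) {
			/* Registers stale: edit the XSAVE buffer directly
			 * (would also need to mark CET_USER non-init in
			 * xsave->header.xfeatures). */
			cet = get_xsave_addr(xsave, XFEATURE_CET_USER);
			if (cet)
				cet->user_ssp = ssp;
		} else {
			/* Registers live: the WRMSR'd value is saved at
			 * the next context switch. */
			wrmsrl(MSR_IA32_PL3_SSP, ssp);
		}
		fpregs_unlock();
	}
]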

Patch

diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
new file mode 100644
index 000000000000..6432baf4de1f
--- /dev/null
+++ b/arch/x86/include/asm/cet.h
@@ -0,0 +1,30 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CET_H
+#define _ASM_X86_CET_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+struct task_struct;
+
+/*
+ * Per-thread CET status
+ */
+struct thread_shstk {
+	u64	base;
+	u64	size;
+};
+
+#ifdef CONFIG_X86_SHADOW_STACK
+int shstk_setup(void);
+void shstk_free(struct task_struct *p);
+void shstk_disable(void);
+#else
+static inline int shstk_setup(void) { return 0; }
+static inline void shstk_free(struct task_struct *p) {}
+static inline void shstk_disable(void) {}
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_CET_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f3020c54e2cb..10497634b7a4 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -27,6 +27,7 @@  struct vm86;
 #include <asm/unwind_hints.h>
 #include <asm/vmxfeatures.h>
 #include <asm/vdso/processor.h>
+#include <asm/cet.h>
 
 #include <linux/personality.h>
 #include <linux/cache.h>
@@ -527,6 +528,10 @@  struct thread_struct {
 	 */
 	u32			pkru;
 
+#ifdef CONFIG_X86_SHADOW_STACK
+	struct thread_shstk	shstk;
+#endif
+
 	/* Floating point and extended processor state */
 	struct fpu		fpu;
 	/*
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 3e625c61f008..9e064845e497 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -150,6 +150,7 @@  obj-$(CONFIG_UNWINDER_FRAME_POINTER)	+= unwind_frame.o
 obj-$(CONFIG_UNWINDER_GUESS)		+= unwind_guess.o
 
 obj-$(CONFIG_AMD_MEM_ENCRYPT)		+= sev.o
+obj-$(CONFIG_X86_SHADOW_STACK)		+= shstk.o
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
new file mode 100644
index 000000000000..5993aa8db338
--- /dev/null
+++ b/arch/x86/kernel/shstk.c
@@ -0,0 +1,134 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * shstk.c - Intel shadow stack support
+ *
+ * Copyright (c) 2021, Intel Corporation.
+ * Yu-cheng Yu <yu-cheng.yu@intel.com>
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/sched/signal.h>
+#include <linux/compat.h>
+#include <linux/sizes.h>
+#include <linux/user.h>
+#include <asm/msr.h>
+#include <asm/fpu/internal.h>
+#include <asm/fpu/xstate.h>
+#include <asm/fpu/types.h>
+#include <asm/cet.h>
+
+static void start_update_msrs(void)
+{
+	fpregs_lock();
+	if (test_thread_flag(TIF_NEED_FPU_LOAD))
+		fpregs_restore_userregs();
+}
+
+static void end_update_msrs(void)
+{
+	fpregs_unlock();
+}
+
+static unsigned long alloc_shstk(unsigned long size)
+{
+	int flags = MAP_ANONYMOUS | MAP_PRIVATE;
+	struct mm_struct *mm = current->mm;
+	unsigned long addr, populate;
+
+	mmap_write_lock(mm);
+	addr = do_mmap(NULL, 0, size, PROT_READ, flags, VM_SHADOW_STACK, 0,
+		       &populate, NULL);
+	mmap_write_unlock(mm);
+
+	return addr;
+}
+
+int shstk_setup(void)
+{
+	struct thread_shstk *shstk = &current->thread.shstk;
+	unsigned long addr, size;
+	int err;
+
+	if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+		return 0;
+
+	size = round_up(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G), PAGE_SIZE);
+	addr = alloc_shstk(size);
+	if (IS_ERR_VALUE(addr))
+		return PTR_ERR((void *)addr);
+
+	start_update_msrs();
+	err = wrmsrl_safe(MSR_IA32_PL3_SSP, addr + size);
+	if (!err)
+		wrmsrl_safe(MSR_IA32_U_CET, CET_SHSTK_EN);
+	end_update_msrs();
+
+	if (!err) {
+		shstk->base = addr;
+		shstk->size = size;
+	}
+
+	return err;
+}
+
+void shstk_free(struct task_struct *tsk)
+{
+	struct thread_shstk *shstk = &tsk->thread.shstk;
+
+	if (!cpu_feature_enabled(X86_FEATURE_SHSTK) ||
+	    !shstk->size ||
+	    !shstk->base)
+		return;
+
+	if (!tsk->mm)
+		return;
+
+	while (1) {
+		int r;
+
+		r = vm_munmap(shstk->base, shstk->size);
+
+		/*
+		 * vm_munmap() returns -EINTR when mmap_lock is held by
+		 * something else, and that lock should not be held for a
+		 * long time.  Retry in that case.
+		 */
+		if (r == -EINTR) {
+			cond_resched();
+			continue;
+		}
+
+		/*
+		 * For all other types of vm_munmap() failure, either the
+		 * system is out of memory or there is a bug.
+		 */
+		WARN_ON_ONCE(r);
+		break;
+	}
+
+	shstk->base = 0;
+	shstk->size = 0;
+}
+
+void shstk_disable(void)
+{
+	struct thread_shstk *shstk = &current->thread.shstk;
+	u64 msr_val;
+
+	if (!cpu_feature_enabled(X86_FEATURE_SHSTK) ||
+	    !shstk->size ||
+	    !shstk->base)
+		return;
+
+	start_update_msrs();
+	rdmsrl(MSR_IA32_U_CET, msr_val);
+	wrmsrl(MSR_IA32_U_CET, msr_val & ~CET_SHSTK_EN);
+	wrmsrl(MSR_IA32_PL3_SSP, 0);
+	end_update_msrs();
+
+	shstk_free(current);
+}