Message ID | 20240731-clone3-shadow-stack-v7-9-a9532eebfb1d@kernel.org (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | fork: Support shadow stacks in clone3() | expand |
On Wed, Jul 31, 2024 at 01:14:15PM +0100, Mark Brown wrote: > Add basic test coverage for specifying the shadow stack for a newly > created thread via clone3(), including coverage of the newly extended > argument structure. We check that a user specified shadow stack can be > provided, and that invalid combinations of parameters are rejected. > > In order to facilitate testing on systems without userspace shadow stack > support we manually enable shadow stacks on startup, this is architecture > specific due to the use of an arch_prctl() on x86. Due to interactions with > potential userspace locking of features we actually detect support for > shadow stacks on the running system by attempting to allocate a shadow > stack page during initialisation using map_shadow_stack(), warning if this > succeeds when the enable failed. > > In order to allow testing of user configured shadow stacks on > architectures with that feature we need to ensure that we do not return > from the function where the clone3() syscall is called in the child > process, doing so would trigger a shadow stack underflow. To do this we > use inline assembly rather than the standard syscall wrapper to call > clone3(). In order to avoid surprises we also use a syscall rather than > the libc exit() function., this should be overly cautious. > > Signed-off-by: Mark Brown <broonie@kernel.org> > --- > tools/testing/selftests/clone3/clone3.c | 134 +++++++++++++++++++++- > tools/testing/selftests/clone3/clone3_selftests.h | 38 ++++++ > 2 files changed, 171 insertions(+), 1 deletion(-) > > diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c > index 26221661e9ae..81c2e8648e8b 100644 > --- a/tools/testing/selftests/clone3/clone3.c > +++ b/tools/testing/selftests/clone3/clone3.c > @@ -3,6 +3,7 @@ > /* Based on Christian Brauner's clone3() example */ > > #define _GNU_SOURCE > +#include <asm/mman.h> > #include <errno.h> > #include <inttypes.h> > #include <linux/types.h> > @@ -11,6 +12,7 @@ > #include <stdint.h> > #include <stdio.h> > #include <stdlib.h> > +#include <sys/mman.h> > #include <sys/syscall.h> > #include <sys/types.h> > #include <sys/un.h> > @@ -19,8 +21,12 @@ > #include <sched.h> > > #include "../kselftest.h" > +#include "../ksft_shstk.h" > #include "clone3_selftests.h" > > +static bool shadow_stack_supported; > +static size_t max_supported_args_size; > + > enum test_mode { > CLONE3_ARGS_NO_TEST, > CLONE3_ARGS_ALL_0, > @@ -28,6 +34,10 @@ enum test_mode { > CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG, > CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG, > CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG, > + CLONE3_ARGS_SHADOW_STACK, > + CLONE3_ARGS_SHADOW_STACK_NO_SIZE, > + CLONE3_ARGS_SHADOW_STACK_NO_POINTER, > + CLONE3_ARGS_SHADOW_STACK_NO_TOKEN, > }; > > typedef bool (*filter_function)(void); > @@ -44,6 +54,44 @@ struct test { > filter_function filter; > }; > > + > +/* > + * We check for shadow stack support by attempting to use > + * map_shadow_stack() since features may have been locked by the > + * dynamic linker resulting in spurious errors when we attempt to > + * enable on startup. We warn if the enable failed. > + */ > +static void test_shadow_stack_supported(void) > +{ > + long ret; > + > + ret = syscall(__NR_map_shadow_stack, 0, getpagesize(), 0); > + if (ret == -1) { > + ksft_print_msg("map_shadow_stack() not supported\n"); > + } else if ((void *)ret == MAP_FAILED) { > + ksft_print_msg("Failed to map shadow stack\n"); > + } else { > + ksft_print_msg("Shadow stack supportd\n"); typo: supportd -> supported > + shadow_stack_supported = true; > + > + if (!shadow_stack_enabled) > + ksft_print_msg("Mapped but did not enable shadow stack\n"); > + } > +} On my CET system, this reports: ... # clone3() syscall supported # Shadow stack supportd # Running test 'simple clone3()' ... (happily doesn't print "Mapped but did not enable ..."). > + > +static unsigned long long get_shadow_stack_page(unsigned long flags) > +{ > + unsigned long long page; > + > + page = syscall(__NR_map_shadow_stack, 0, getpagesize(), flags); > + if ((void *)page == MAP_FAILED) { > + ksft_print_msg("map_shadow_stack() failed: %d\n", errno); > + return 0; > + } > + > + return page; > +} > + > static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode) > { > struct __clone_args args = { > @@ -89,6 +137,21 @@ static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode) > case CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG: > args.exit_signal = 0x00000000000000f0ULL; > break; > + case CLONE3_ARGS_SHADOW_STACK: > + /* We need to specify a normal stack too to avoid corruption */ > + args.shadow_stack = get_shadow_stack_page(SHADOW_STACK_SET_TOKEN); > + args.shadow_stack_size = getpagesize(); > + break; # Running test 'Shadow stack on system with shadow stack' # [5496] Trying clone3() with flags 0 (size 0) # I am the parent (5496). My child's pid is 5505 # Child exited with signal 11 # [5496] clone3() with flags says: 11 expected 0 # [5496] Result (11) is different than expected (0) not ok 20 Shadow stack on system with shadow stack The child segfaults immediately, it seems? > + case CLONE3_ARGS_SHADOW_STACK_NO_POINTER: > + args.shadow_stack_size = getpagesize(); > + break; # Running test 'Shadow stack with no pointer' # [5496] Trying clone3() with flags 0 (size 0) # Invalid argument - Failed to create new process # [5496] clone3() with flags says: -22 expected -22 ok 21 Shadow stack with no pointer This seems like it misses the failure and reports ok > + case CLONE3_ARGS_SHADOW_STACK_NO_SIZE: > + args.shadow_stack = get_shadow_stack_page(SHADOW_STACK_SET_TOKEN); > + break; # Running test 'Shadow stack with no size' # [5496] Trying clone3() with flags 0 (size 0) # Invalid argument - Failed to create new process # [5496] clone3() with flags says: -22 expected -22 ok 22 Shadow stack with no size Same? > + case CLONE3_ARGS_SHADOW_STACK_NO_TOKEN: > + args.shadow_stack = get_shadow_stack_page(0); > + args.shadow_stack_size = getpagesize(); > + break; This actually segfaults the parent: # Running test 'Shadow stack with no token' # [5496] Trying clone3() with flags 0x100 (size 0) # I am the parent (5496). My child's pid is 5507 Segmentation fault Let me know what would be most helpful to dig into more...
On Mon, Aug 05, 2024 at 08:54:54PM -0700, Kees Cook wrote: > On Wed, Jul 31, 2024 at 01:14:15PM +0100, Mark Brown wrote: > > + case CLONE3_ARGS_SHADOW_STACK: > > + /* We need to specify a normal stack too to avoid corruption */ > > + args.shadow_stack = get_shadow_stack_page(SHADOW_STACK_SET_TOKEN); > > + args.shadow_stack_size = getpagesize(); > > + break; > # Running test 'Shadow stack on system with shadow stack' > # [5496] Trying clone3() with flags 0 (size 0) > # I am the parent (5496). My child's pid is 5505 > # Child exited with signal 11 > # [5496] clone3() with flags says: 11 expected 0 > # [5496] Result (11) is different than expected (0) > not ok 20 Shadow stack on system with shadow stack > The child segfaults immediately, it seems? That's what I'd expect if we either didn't manage to create the shadow stack token in the page we mapped or we messed up in arch_shstk_post_fork() somehow, probably getting the wrong pointer for the token or not mapping things correctly. I'll have done something silly, it'll doubtless be very obvious and embarrassing when I see it but I'm not seeing it right now... > > + case CLONE3_ARGS_SHADOW_STACK_NO_POINTER: > > + args.shadow_stack_size = getpagesize(); > > + break; > # Running test 'Shadow stack with no pointer' > # [5496] Trying clone3() with flags 0 (size 0) > # Invalid argument - Failed to create new process > # [5496] clone3() with flags says: -22 expected -22 > ok 21 Shadow stack with no pointer > This seems like it misses the failure and reports ok No, this is testing to make sure we get the failure - if we have arguments that can't possibly be valid then we should reject them with an error code during validation prior to trying to create the new thread. The "expected -22" in the output says it's looking for an error. Same for the other similar expected error code. > > + case CLONE3_ARGS_SHADOW_STACK_NO_TOKEN: > > + args.shadow_stack = get_shadow_stack_page(0); > > + args.shadow_stack_size = getpagesize(); > > + break; > This actually segfaults the parent: > # Running test 'Shadow stack with no token' > # [5496] Trying clone3() with flags 0x100 (size 0) > # I am the parent (5496). My child's pid is 5507 > Segmentation fault Oh dear. We possibly manage to corrupt the parent's shadow stack somehow? I don't think I managed to do that in my arm64 testing. This should also be something going wrong in arch_shstk_post_fork(). > Let me know what would be most helpful to dig into more... It'll almost certianly be something in arch_shstk_post_fork(), that's the bit I couldn't test. Just making that always return success should avoid the first fault, the second ought to not crash but will report a fail as we should be rejecting the shadow stack when we try to consume the token.
On Mon, Aug 05, 2024 at 08:54:54PM -0700, Kees Cook wrote: > # Running test 'Shadow stack on system with shadow stack' > # [5496] Trying clone3() with flags 0 (size 0) > # I am the parent (5496). My child's pid is 5505 > # Child exited with signal 11 > # [5496] clone3() with flags says: 11 expected 0 > # [5496] Result (11) is different than expected (0) > not ok 20 Shadow stack on system with shadow stack > The child segfaults immediately, it seems? Does this help: diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index 1755fa21e6fb..27acbdf44c5f 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -198,13 +198,14 @@ int arch_shstk_post_fork(struct task_struct *t, struct kernel_clone_args *args) * the token 64-bit. */ struct mm_struct *mm; - unsigned long addr; + unsigned long addr, ssp; u64 expected; u64 val; - int ret = -EINVAL;; + int ret = -EINVAL; - addr = args->shadow_stack + args->shadow_stack_size - sizeof(u64); - expected = (addr - SS_FRAME_SIZE) | BIT(0); + ssp = args->shadow_stack + args->shadow_stack_size; + addr = ssp - SS_FRAME_SIZE; + expected = ssp | BIT(0); mm = get_task_mm(t); if (!mm)
On Tue, Aug 06, 2024 at 09:10:28PM +0100, Mark Brown wrote: > On Mon, Aug 05, 2024 at 08:54:54PM -0700, Kees Cook wrote: > > > # Running test 'Shadow stack on system with shadow stack' > > # [5496] Trying clone3() with flags 0 (size 0) > > # I am the parent (5496). My child's pid is 5505 > > # Child exited with signal 11 > > # [5496] clone3() with flags says: 11 expected 0 > > # [5496] Result (11) is different than expected (0) > > not ok 20 Shadow stack on system with shadow stack > > > The child segfaults immediately, it seems? > > Does this help: > > diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c > index 1755fa21e6fb..27acbdf44c5f 100644 > --- a/arch/x86/kernel/shstk.c > +++ b/arch/x86/kernel/shstk.c > @@ -198,13 +198,14 @@ int arch_shstk_post_fork(struct task_struct *t, struct kernel_clone_args *args) > * the token 64-bit. > */ > struct mm_struct *mm; > - unsigned long addr; > + unsigned long addr, ssp; > u64 expected; > u64 val; > - int ret = -EINVAL;; > + int ret = -EINVAL; > > - addr = args->shadow_stack + args->shadow_stack_size - sizeof(u64); > - expected = (addr - SS_FRAME_SIZE) | BIT(0); > + ssp = args->shadow_stack + args->shadow_stack_size; > + addr = ssp - SS_FRAME_SIZE; > + expected = ssp | BIT(0); > > mm = get_task_mm(t); > if (!mm) Yes indeed! This passes now. "Shadow stack with no token" still crashes the parent. It seems to crash in waitpid(). Under gdb it hangs instead, showing it's in glibc's __GI___wait4(). Ah, it's crashing at c3 (ret), so shadow stack problem, I imagine. Does waitpid() need to be open-coded like the clone3() call too?
On Tue, Aug 06, 2024 at 02:43:22PM -0700, Kees Cook wrote: > On Tue, Aug 06, 2024 at 09:10:28PM +0100, Mark Brown wrote: > > Does this help: > > diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c > > index 1755fa21e6fb..27acbdf44c5f 100644 > > --- a/arch/x86/kernel/shstk.c > > +++ b/arch/x86/kernel/shstk.c > > @@ -198,13 +198,14 @@ int arch_shstk_post_fork(struct task_struct *t, struct kernel_clone_args *args) > > * the token 64-bit. > > */ > > struct mm_struct *mm; > > - unsigned long addr; > > + unsigned long addr, ssp; > Yes indeed! This passes now. Ah, great - thanks! > "Shadow stack with no token" still crashes the parent. It seems to > crash in waitpid(). Under gdb it hangs instead, showing it's in glibc's > __GI___wait4(). Ah, it's crashing at c3 (ret), so shadow stack problem, > I imagine. Yes, likely. They are delivered as a SEGV with SEGV_CPERR. > Does waitpid() need to be open-coded like the clone3() call too? I wouldn't have expected so, it should just be a function call and definitely didn't do anything funky on arm64. It seems more likely that we've managed to corrupt the stack or shadow stack - most likely the new thread is still using the original shadow stack rather than the new one and so corrupts it. Again not immediately seeing where. I'll have another look tomorrow if nobody has any bright ideas before then...
On Tue, Aug 06, 2024 at 10:57:39PM +0100, Mark Brown wrote: > On Tue, Aug 06, 2024 at 02:43:22PM -0700, Kees Cook wrote: > > "Shadow stack with no token" still crashes the parent. It seems to > > crash in waitpid(). Under gdb it hangs instead, showing it's in glibc's > > __GI___wait4(). Ah, it's crashing at c3 (ret), so shadow stack problem, > > I imagine. > Yes, likely. They are delivered as a SEGV with SEGV_CPERR. > > Does waitpid() need to be open-coded like the clone3() call too? > I wouldn't have expected so, it should just be a function call and > definitely didn't do anything funky on arm64. It seems more likely that > we've managed to corrupt the stack or shadow stack - most likely the new > thread is still using the original shadow stack rather than the new one > and so corrupts it. Again not immediately seeing where. I'll have > another look tomorrow if nobody has any bright ideas before then... ...or possibly we're delivering the signal that's generated when we fail to validate the child's shadow stack token to the parent rather than the child. That logic (in shstk_post_fork()) should be shared with arm64 though so it ought to have been failing for me too. Failure to validate the token should look to the parent like the child immediately taking a shadow stack fault.
On Tue, Aug 06, 2024 at 04:10:02PM +0100, Mark Brown wrote: > On Mon, Aug 05, 2024 at 08:54:54PM -0700, Kees Cook wrote: > > This actually segfaults the parent: > > > # Running test 'Shadow stack with no token' > > # [5496] Trying clone3() with flags 0x100 (size 0) > > # I am the parent (5496). My child's pid is 5507 > > Segmentation fault > > Oh dear. We possibly manage to corrupt the parent's shadow stack > somehow? I don't think I managed to do that in my arm64 testing. This > should also be something going wrong in arch_shstk_post_fork(). > > > Let me know what would be most helpful to dig into more... > > It'll almost certianly be something in arch_shstk_post_fork(), that's > the bit I couldn't test. Just making that always return success should > avoid the first fault, the second ought to not crash but will report a > fail as we should be rejecting the shadow stack when we try to consume > the token. It took me a while to figure out where a thread switches shstk (even without this series): kernel_clone, copy_process, copy_thread, fpu_clone, update_fpu_shstk (and shstk_alloc_thread_stack is called just before update_fpu_shstk). I don't understand the token consumption in arch_shstk_post_fork(). This wasn't needed before with the fixed-size new shstk, why is it needed now? Anyway, my attempt to trace the shstk changes for the test: write(1, "TAP version 13\n", 15) = 15 write(1, "1..2\n", 5) = 5 clone3({flags=0, exit_signal=18446744073709551615, stack=NULL, stack_size=0}, 104) = -1 EINVAL (Invalid argument) write(1, "# clone3() syscall supported\n", 29) = 29 map_shadow_stack(NULL, 4096, 0) = 125837480497152 write(1, "# Shadow stack supportd\n", 24) = 24 write(1, "# Running test 'Shadow stack wit"..., 44) = 44 getpid() = 4943 write(1, "# [4943] Trying clone3() with fl"..., 51) = 51 map_shadow_stack(NULL, 4096, 0) = 125837480488960 clone3({flags=CLONE_VM, exit_signal=SIGCHLD, stack=NULL, stack_size=0, /* bytes 88..103 */ "\x00\xf0\x52\xd2\x72\x72\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00"} => {/* bytes 88..103 */ "\x00\xf0\x52\xd2\x72\x72\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00"}, 104) = 4944 getpid() = 4943 write(1, "# I am the parent (4943). My chi"..., 49strace: Process 4944 attached ) = 49 [pid 4944] --- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_CPERR, si_addr=NULL} --- [pid 4943] wait4(-1, <unfinished ...> [pid 4944] +++ killed by SIGSEGV (core dumped) +++ <... wait4 resumed>[{WIFSIGNALED(s) && WTERMSIG(s) == SIGSEGV && WCOREDUMP(s)}], __WALL, NULL) = 4944 --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_DUMPED, si_pid=4944, si_uid=0, si_status=SIGSEGV, si_utime=0, si_stime=0} --- --- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_MAPERR, si_addr=0x7272d21fffe8} --- +++ killed by SIGSEGV (core dumped) +++ [ 569.153288] shstk_setup: clone3[4943] ssp:7272d2200000 [ 569.153998] process: copy_thread: clone3[4943] new_ssp:7272d2530000 [ 569.154002] update_fpu_shstk: clone3[4943] ssp:7272d2530000 [ 569.154008] shstk_post_fork: clone3[4944] [ 569.154011] shstk_post_fork: clone3[4944] sending SIGSEGV post fork I don't see an update_fpu_shstk for 4944? Should I with this test? And the parent dies with SEGV_MAPERR?? I'll keep looking in the morning ...
On Tue, Aug 06, 2024 at 10:08:44PM -0700, Kees Cook wrote: > On Tue, Aug 06, 2024 at 04:10:02PM +0100, Mark Brown wrote: > > > # Running test 'Shadow stack with no token' > It took me a while to figure out where a thread switches shstk (even > without this series): > kernel_clone, copy_process, copy_thread, fpu_clone, update_fpu_shstk > (and shstk_alloc_thread_stack is called just before update_fpu_shstk). > I don't understand the token consumption in arch_shstk_post_fork(). This > wasn't needed before with the fixed-size new shstk, why is it needed > now? Concerns were raised on earlier rounds of review that since instead of allocating the shadow stack as part of creating the new thread we are using a previously allocated shadow stack someone could use this as part of an exploit. You could just jump on top of any existing shadow stack and cause writes to it. > Anyway, my attempt to trace the shstk changes for the test: > write(1, "TAP version 13\n", 15) = 15 > write(1, "1..2\n", 5) = 5 > clone3({flags=0, exit_signal=18446744073709551615, stack=NULL, stack_size=0}, 104) = -1 EINVAL (Invalid argument) > write(1, "# clone3() syscall supported\n", 29) = 29 > map_shadow_stack(NULL, 4096, 0) = 125837480497152 > write(1, "# Shadow stack supportd\n", 24) = 24 > write(1, "# Running test 'Shadow stack wit"..., 44) = 44 > getpid() = 4943 > write(1, "# [4943] Trying clone3() with fl"..., 51) = 51 > map_shadow_stack(NULL, 4096, 0) = 125837480488960 > clone3({flags=CLONE_VM, exit_signal=SIGCHLD, stack=NULL, stack_size=0, /* bytes 88..103 */ "\x00\xf0\x52\xd2\x72\x72\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00"} => {/* bytes 88..103 */ "\x00\xf0\x52\xd2\x72\x72\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00"}, 104) = 4944 > getpid() = 4943 > write(1, "# I am the parent (4943). My chi"..., 49strace: Process 4944 attached > ) = 49 > [pid 4944] --- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_CPERR, si_addr=NULL} --- > [pid 4943] wait4(-1, <unfinished ...> > [pid 4944] +++ killed by SIGSEGV (core dumped) +++ So we created the thread, then before we get to the wait4() in the parent we start delivering a SEGV_CPERR to the child. The flow for the child is as expected. > <... wait4 resumed>[{WIFSIGNALED(s) && WTERMSIG(s) == SIGSEGV && WCOREDUMP(s)}], __WALL, NULL) = 4944 > --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_DUMPED, si_pid=4944, si_uid=0, si_status=SIGSEGV, si_utime=0, si_stime=0} --- > --- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_MAPERR, si_addr=0x7272d21fffe8} --- > +++ killed by SIGSEGV (core dumped) +++ Then the parent gets an ordinary segfault, not a shadow stack specific one, like some memory got deallocated underneath it or a pointer got corrupted. > [ 569.153288] shstk_setup: clone3[4943] ssp:7272d2200000 > [ 569.153998] process: copy_thread: clone3[4943] new_ssp:7272d2530000 > [ 569.154002] update_fpu_shstk: clone3[4943] ssp:7272d2530000 > [ 569.154008] shstk_post_fork: clone3[4944] > [ 569.154011] shstk_post_fork: clone3[4944] sending SIGSEGV post fork > I don't see an update_fpu_shstk for 4944? Should I with this test? I'd only expect to see one update, my understanding is that that update is for the child but happening in the context of the parent as the hild is not yet started. Does this help: diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index 27acbdf44c5f..d7005974aff5 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -258,6 +258,8 @@ unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, if (args->shadow_stack) { addr = args->shadow_stack; size = args->shadow_stack_size; + shstk->base = 0; + shstk->size = 0; } else { /* * For CLONE_VFORK the child will share the parents
On Wed, Aug 07, 2024 at 01:39:27PM +0100, Mark Brown wrote: > On Tue, Aug 06, 2024 at 10:08:44PM -0700, Kees Cook wrote: > > On Tue, Aug 06, 2024 at 04:10:02PM +0100, Mark Brown wrote: > > > > > # Running test 'Shadow stack with no token' > > > It took me a while to figure out where a thread switches shstk (even > > without this series): > > > kernel_clone, copy_process, copy_thread, fpu_clone, update_fpu_shstk > > (and shstk_alloc_thread_stack is called just before update_fpu_shstk). > > > I don't understand the token consumption in arch_shstk_post_fork(). This > > wasn't needed before with the fixed-size new shstk, why is it needed > > now? > > Concerns were raised on earlier rounds of review that since instead of > allocating the shadow stack as part of creating the new thread we are > using a previously allocated shadow stack someone could use this as part > of an exploit. You could just jump on top of any existing shadow stack > and cause writes to it. > > > Anyway, my attempt to trace the shstk changes for the test: > > > write(1, "TAP version 13\n", 15) = 15 > > write(1, "1..2\n", 5) = 5 > > clone3({flags=0, exit_signal=18446744073709551615, stack=NULL, stack_size=0}, 104) = -1 EINVAL (Invalid argument) > > write(1, "# clone3() syscall supported\n", 29) = 29 > > map_shadow_stack(NULL, 4096, 0) = 125837480497152 > > write(1, "# Shadow stack supportd\n", 24) = 24 > > write(1, "# Running test 'Shadow stack wit"..., 44) = 44 > > getpid() = 4943 > > write(1, "# [4943] Trying clone3() with fl"..., 51) = 51 > > map_shadow_stack(NULL, 4096, 0) = 125837480488960 > > clone3({flags=CLONE_VM, exit_signal=SIGCHLD, stack=NULL, stack_size=0, /* bytes 88..103 */ "\x00\xf0\x52\xd2\x72\x72\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00"} => {/* bytes 88..103 */ "\x00\xf0\x52\xd2\x72\x72\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00"}, 104) = 4944 > > getpid() = 4943 > > write(1, "# I am the parent (4943). My chi"..., 49strace: Process 4944 attached > > ) = 49 > > [pid 4944] --- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_CPERR, si_addr=NULL} --- > > [pid 4943] wait4(-1, <unfinished ...> > > [pid 4944] +++ killed by SIGSEGV (core dumped) +++ > > So we created the thread, then before we get to the wait4() in the > parent we start delivering a SEGV_CPERR to the child. The flow for the > child is as expected. > > > <... wait4 resumed>[{WIFSIGNALED(s) && WTERMSIG(s) == SIGSEGV && WCOREDUMP(s)}], __WALL, NULL) = 4944 > > --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_DUMPED, si_pid=4944, si_uid=0, si_status=SIGSEGV, si_utime=0, si_stime=0} --- > > --- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_MAPERR, si_addr=0x7272d21fffe8} --- > > +++ killed by SIGSEGV (core dumped) +++ > > Then the parent gets an ordinary segfault, not a shadow stack specific > one, like some memory got deallocated underneath it or a pointer got > corrupted. > > > [ 569.153288] shstk_setup: clone3[4943] ssp:7272d2200000 > > [ 569.153998] process: copy_thread: clone3[4943] new_ssp:7272d2530000 > > [ 569.154002] update_fpu_shstk: clone3[4943] ssp:7272d2530000 > > [ 569.154008] shstk_post_fork: clone3[4944] > > [ 569.154011] shstk_post_fork: clone3[4944] sending SIGSEGV post fork > > > I don't see an update_fpu_shstk for 4944? Should I with this test? > > I'd only expect to see one update, my understanding is that that update > is for the child but happening in the context of the parent as the hild > is not yet started. What's weird here that I don't understand is that the parent is 4943, so this report makes sense: > > [ 569.153288] shstk_setup: clone3[4943] ssp:7272d2200000 The child is 4944, yet I see: > > [ 569.153998] process: copy_thread: clone3[4943] new_ssp:7272d2530000 > > [ 569.154002] update_fpu_shstk: clone3[4943] ssp:7272d2530000 These map to my logging: copy_thread(struct task_struct *p, const struct kernel_clone_args *args) ... new_ssp = shstk_alloc_thread_stack(p, args); pr_err("%s: %s[%d] new_ssp:%lx\n", __func__, p->comm, task_pid_nr(p), new_ssp); and update_fpu_shstk(struct task_struct *dst, unsigned long ssp) ... xstate->user_ssp = (u64)ssp; pr_err("%s: %s[%d] ssp:%lx\n", __func__, dst->comm, task_pid_nr(dst), ssp); The child should be "p" (and "dst") here -- stuff is being copied from current to p, but p is reporting itself as 4943 here? (Oh, this is reporting pid, not tid... I bet that's what I've got wrong.) > Does this help: > > diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c > index 27acbdf44c5f..d7005974aff5 100644 > --- a/arch/x86/kernel/shstk.c > +++ b/arch/x86/kernel/shstk.c > @@ -258,6 +258,8 @@ unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, > if (args->shadow_stack) { > addr = args->shadow_stack; > size = args->shadow_stack_size; > + shstk->base = 0; > + shstk->size = 0; > } else { > /* > * For CLONE_VFORK the child will share the parents I'll fix my reporting and give this patch a try too. Thanks! -Kees
On Wed, Aug 07, 2024 at 01:39:27PM +0100, Mark Brown wrote: > On Tue, Aug 06, 2024 at 10:08:44PM -0700, Kees Cook wrote: > > On Tue, Aug 06, 2024 at 04:10:02PM +0100, Mark Brown wrote: > > > > > # Running test 'Shadow stack with no token' > > > It took me a while to figure out where a thread switches shstk (even > > without this series): > > > kernel_clone, copy_process, copy_thread, fpu_clone, update_fpu_shstk > > (and shstk_alloc_thread_stack is called just before update_fpu_shstk). > > > I don't understand the token consumption in arch_shstk_post_fork(). This > > wasn't needed before with the fixed-size new shstk, why is it needed > > now? > > Concerns were raised on earlier rounds of review that since instead of > allocating the shadow stack as part of creating the new thread we are > using a previously allocated shadow stack someone could use this as part > of an exploit. You could just jump on top of any existing shadow stack > and cause writes to it. > > > Anyway, my attempt to trace the shstk changes for the test: > > > write(1, "TAP version 13\n", 15) = 15 > > write(1, "1..2\n", 5) = 5 > > clone3({flags=0, exit_signal=18446744073709551615, stack=NULL, stack_size=0}, 104) = -1 EINVAL (Invalid argument) > > write(1, "# clone3() syscall supported\n", 29) = 29 > > map_shadow_stack(NULL, 4096, 0) = 125837480497152 > > write(1, "# Shadow stack supportd\n", 24) = 24 > > write(1, "# Running test 'Shadow stack wit"..., 44) = 44 > > getpid() = 4943 > > write(1, "# [4943] Trying clone3() with fl"..., 51) = 51 > > map_shadow_stack(NULL, 4096, 0) = 125837480488960 > > clone3({flags=CLONE_VM, exit_signal=SIGCHLD, stack=NULL, stack_size=0, /* bytes 88..103 */ "\x00\xf0\x52\xd2\x72\x72\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00"} => {/* bytes 88..103 */ "\x00\xf0\x52\xd2\x72\x72\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00"}, 104) = 4944 > > getpid() = 4943 > > write(1, "# I am the parent (4943). My chi"..., 49strace: Process 4944 attached > > ) = 49 > > [pid 4944] --- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_CPERR, si_addr=NULL} --- > > [pid 4943] wait4(-1, <unfinished ...> > > [pid 4944] +++ killed by SIGSEGV (core dumped) +++ > > So we created the thread, then before we get to the wait4() in the > parent we start delivering a SEGV_CPERR to the child. The flow for the > child is as expected. > > > <... wait4 resumed>[{WIFSIGNALED(s) && WTERMSIG(s) == SIGSEGV && WCOREDUMP(s)}], __WALL, NULL) = 4944 > > --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_DUMPED, si_pid=4944, si_uid=0, si_status=SIGSEGV, si_utime=0, si_stime=0} --- > > --- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_MAPERR, si_addr=0x7272d21fffe8} --- > > +++ killed by SIGSEGV (core dumped) +++ > > Then the parent gets an ordinary segfault, not a shadow stack specific > one, like some memory got deallocated underneath it or a pointer got > corrupted. > > > [ 569.153288] shstk_setup: clone3[4943] ssp:7272d2200000 > > [ 569.153998] process: copy_thread: clone3[4943] new_ssp:7272d2530000 > > [ 569.154002] update_fpu_shstk: clone3[4943] ssp:7272d2530000 > > [ 569.154008] shstk_post_fork: clone3[4944] > > [ 569.154011] shstk_post_fork: clone3[4944] sending SIGSEGV post fork > > > I don't see an update_fpu_shstk for 4944? Should I with this test? > > I'd only expect to see one update, my understanding is that that update > is for the child but happening in the context of the parent as the hild > is not yet started. > > Does this help: > > diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c > index 27acbdf44c5f..d7005974aff5 100644 > --- a/arch/x86/kernel/shstk.c > +++ b/arch/x86/kernel/shstk.c > @@ -258,6 +258,8 @@ unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, > if (args->shadow_stack) { > addr = args->shadow_stack; > size = args->shadow_stack_size; > + shstk->base = 0; > + shstk->size = 0; > } else { > /* > * For CLONE_VFORK the child will share the parents Yup, that fixes it! # Totals: pass:23 fail:0 xfail:0 xpass:0 skip:1 error:0 (The skip is "Shadow stack on system without shadow stack")
On Wed, Aug 07, 2024 at 12:23:01PM -0700, Kees Cook wrote: > On Wed, Aug 07, 2024 at 01:39:27PM +0100, Mark Brown wrote: > > size = args->shadow_stack_size; > > + shstk->base = 0; > > + shstk->size = 0; > Yup, that fixes it! > # Totals: pass:23 fail:0 xfail:0 xpass:0 skip:1 error:0 > (The skip is "Shadow stack on system without shadow stack") Excellent, thanks! It's amazing how many dumb mistakes you can find if you actually try running the code :/ .
On Wed, Aug 07, 2024 at 11:03:24PM +0100, Mark Brown wrote: > On Wed, Aug 07, 2024 at 12:23:01PM -0700, Kees Cook wrote: > > On Wed, Aug 07, 2024 at 01:39:27PM +0100, Mark Brown wrote: > > > > size = args->shadow_stack_size; > > > + shstk->base = 0; > > > + shstk->size = 0; > > > Yup, that fixes it! > > > # Totals: pass:23 fail:0 xfail:0 xpass:0 skip:1 error:0 > > > (The skip is "Shadow stack on system without shadow stack") > > Excellent, thanks! It's amazing how many dumb mistakes you can find if > you actually try running the code :/ . Heh, well, it's tricky work writing it without reference hardware. :) I just wish there was CET emulation in QEmu...
diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c index 26221661e9ae..81c2e8648e8b 100644 --- a/tools/testing/selftests/clone3/clone3.c +++ b/tools/testing/selftests/clone3/clone3.c @@ -3,6 +3,7 @@ /* Based on Christian Brauner's clone3() example */ #define _GNU_SOURCE +#include <asm/mman.h> #include <errno.h> #include <inttypes.h> #include <linux/types.h> @@ -11,6 +12,7 @@ #include <stdint.h> #include <stdio.h> #include <stdlib.h> +#include <sys/mman.h> #include <sys/syscall.h> #include <sys/types.h> #include <sys/un.h> @@ -19,8 +21,12 @@ #include <sched.h> #include "../kselftest.h" +#include "../ksft_shstk.h" #include "clone3_selftests.h" +static bool shadow_stack_supported; +static size_t max_supported_args_size; + enum test_mode { CLONE3_ARGS_NO_TEST, CLONE3_ARGS_ALL_0, @@ -28,6 +34,10 @@ enum test_mode { CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG, CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG, CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG, + CLONE3_ARGS_SHADOW_STACK, + CLONE3_ARGS_SHADOW_STACK_NO_SIZE, + CLONE3_ARGS_SHADOW_STACK_NO_POINTER, + CLONE3_ARGS_SHADOW_STACK_NO_TOKEN, }; typedef bool (*filter_function)(void); @@ -44,6 +54,44 @@ struct test { filter_function filter; }; + +/* + * We check for shadow stack support by attempting to use + * map_shadow_stack() since features may have been locked by the + * dynamic linker resulting in spurious errors when we attempt to + * enable on startup. We warn if the enable failed. + */ +static void test_shadow_stack_supported(void) +{ + long ret; + + ret = syscall(__NR_map_shadow_stack, 0, getpagesize(), 0); + if (ret == -1) { + ksft_print_msg("map_shadow_stack() not supported\n"); + } else if ((void *)ret == MAP_FAILED) { + ksft_print_msg("Failed to map shadow stack\n"); + } else { + ksft_print_msg("Shadow stack supportd\n"); + shadow_stack_supported = true; + + if (!shadow_stack_enabled) + ksft_print_msg("Mapped but did not enable shadow stack\n"); + } +} + +static unsigned long long get_shadow_stack_page(unsigned long flags) +{ + unsigned long long page; + + page = syscall(__NR_map_shadow_stack, 0, getpagesize(), flags); + if ((void *)page == MAP_FAILED) { + ksft_print_msg("map_shadow_stack() failed: %d\n", errno); + return 0; + } + + return page; +} + static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode) { struct __clone_args args = { @@ -89,6 +137,21 @@ static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode) case CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG: args.exit_signal = 0x00000000000000f0ULL; break; + case CLONE3_ARGS_SHADOW_STACK: + /* We need to specify a normal stack too to avoid corruption */ + args.shadow_stack = get_shadow_stack_page(SHADOW_STACK_SET_TOKEN); + args.shadow_stack_size = getpagesize(); + break; + case CLONE3_ARGS_SHADOW_STACK_NO_POINTER: + args.shadow_stack_size = getpagesize(); + break; + case CLONE3_ARGS_SHADOW_STACK_NO_SIZE: + args.shadow_stack = get_shadow_stack_page(SHADOW_STACK_SET_TOKEN); + break; + case CLONE3_ARGS_SHADOW_STACK_NO_TOKEN: + args.shadow_stack = get_shadow_stack_page(0); + args.shadow_stack_size = getpagesize(); + break; } memcpy(&args_ext.args, &args, sizeof(struct __clone_args)); @@ -102,7 +165,12 @@ static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode) if (pid == 0) { ksft_print_msg("I am the child, my PID is %d\n", getpid()); - _exit(EXIT_SUCCESS); + /* + * Use a raw syscall to ensure we don't get issues + * with manually specified shadow stack and exit handlers. + */ + syscall(__NR_exit, EXIT_SUCCESS); + ksft_print_msg("CHILD FAILED TO EXIT PID is %d\n", getpid()); } ksft_print_msg("I am the parent (%d). My child's pid is %d\n", @@ -191,6 +259,26 @@ static bool no_timenamespace(void) return true; } +static bool have_shadow_stack(void) +{ + if (shadow_stack_supported) { + ksft_print_msg("Shadow stack supported\n"); + return true; + } + + return false; +} + +static bool no_shadow_stack(void) +{ + if (!shadow_stack_supported) { + ksft_print_msg("Shadow stack not supported\n"); + return true; + } + + return false; +} + static size_t page_size_plus_8(void) { return getpagesize() + 8; @@ -334,6 +422,47 @@ static const struct test tests[] = { .expected = -EINVAL, .test_mode = CLONE3_ARGS_NO_TEST, }, + { + .name = "Shadow stack on system with shadow stack", + .size = 0, + .expected = 0, + .e2big_valid = true, + .test_mode = CLONE3_ARGS_SHADOW_STACK, + .filter = no_shadow_stack, + }, + { + .name = "Shadow stack with no pointer", + .size = 0, + .expected = -EINVAL, + .e2big_valid = true, + .test_mode = CLONE3_ARGS_SHADOW_STACK_NO_POINTER, + }, + { + .name = "Shadow stack with no size", + .size = 0, + .expected = -EINVAL, + .e2big_valid = true, + .test_mode = CLONE3_ARGS_SHADOW_STACK_NO_SIZE, + .filter = no_shadow_stack, + }, + { + .name = "Shadow stack with no token", + .flags = CLONE_VM, + .size = 0, + .expected = SIGSEGV, + .e2big_valid = true, + .test_mode = CLONE3_ARGS_SHADOW_STACK_NO_TOKEN, + .filter = no_shadow_stack, + }, + { + .name = "Shadow stack on system without shadow stack", + .flags = CLONE_VM, + .size = 0, + .expected = -EINVAL, + .e2big_valid = true, + .test_mode = CLONE3_ARGS_SHADOW_STACK, + .filter = have_shadow_stack, + }, }; int main(int argc, char *argv[]) @@ -341,9 +470,12 @@ int main(int argc, char *argv[]) size_t size; int i; + enable_shadow_stack(); + ksft_print_header(); ksft_set_plan(ARRAY_SIZE(tests)); test_clone3_supported(); + test_shadow_stack_supported(); for (i = 0; i < ARRAY_SIZE(tests); i++) test_clone3(&tests[i]); diff --git a/tools/testing/selftests/clone3/clone3_selftests.h b/tools/testing/selftests/clone3/clone3_selftests.h index 39b5dcba663c..38d82934668a 100644 --- a/tools/testing/selftests/clone3/clone3_selftests.h +++ b/tools/testing/selftests/clone3/clone3_selftests.h @@ -31,12 +31,50 @@ struct __clone_args { __aligned_u64 set_tid; __aligned_u64 set_tid_size; __aligned_u64 cgroup; +#ifndef CLONE_ARGS_SIZE_VER2 +#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */ +#endif + __aligned_u64 shadow_stack; + __aligned_u64 shadow_stack_size; +#ifndef CLONE_ARGS_SIZE_VER3 +#define CLONE_ARGS_SIZE_VER3 104 /* sizeof fourth published struct */ +#endif }; +/* + * For architectures with shadow stack support we need to be + * absolutely sure that the clone3() syscall will be inline and not a + * function call so we open code. + */ +#ifdef __x86_64__ +static pid_t __always_inline sys_clone3(struct __clone_args *args, size_t size) +{ + long ret; + register long _num __asm__ ("rax") = __NR_clone3; + register long _args __asm__ ("rdi") = (long)(args); + register long _size __asm__ ("rsi") = (long)(size); + + __asm__ volatile ( + "syscall\n" + : "=a"(ret) + : "r"(_args), "r"(_size), + "0"(_num) + : "rcx", "r11", "memory", "cc" + ); + + if (ret < 0) { + errno = -ret; + return -1; + } + + return ret; +} +#else static pid_t sys_clone3(struct __clone_args *args, size_t size) { return syscall(__NR_clone3, args, size); } +#endif static inline void test_clone3_supported(void) {
Add basic test coverage for specifying the shadow stack for a newly created thread via clone3(), including coverage of the newly extended argument structure. We check that a user specified shadow stack can be provided, and that invalid combinations of parameters are rejected. In order to facilitate testing on systems without userspace shadow stack support we manually enable shadow stacks on startup, this is architecture specific due to the use of an arch_prctl() on x86. Due to interactions with potential userspace locking of features we actually detect support for shadow stacks on the running system by attempting to allocate a shadow stack page during initialisation using map_shadow_stack(), warning if this succeeds when the enable failed. In order to allow testing of user configured shadow stacks on architectures with that feature we need to ensure that we do not return from the function where the clone3() syscall is called in the child process, doing so would trigger a shadow stack underflow. To do this we use inline assembly rather than the standard syscall wrapper to call clone3(). In order to avoid surprises we also use a syscall rather than the libc exit() function., this should be overly cautious. Signed-off-by: Mark Brown <broonie@kernel.org> --- tools/testing/selftests/clone3/clone3.c | 134 +++++++++++++++++++++- tools/testing/selftests/clone3/clone3_selftests.h | 38 ++++++ 2 files changed, 171 insertions(+), 1 deletion(-)