Message ID | 87h7bzjaer.fsf@oldenburg.str.redhat.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | x86: Implement arch_prctl(ARCH_VSYSCALL_LOCKOUT) to disable vsyscall | expand |
On Fri, Nov 26, 2021, at 5:47 AM, Florian Weimer wrote: > Distributions struggle with changing the default for vsyscall > emulation because it is a clear break of userspace ABI, something > that should not happen. > > The legacy vsyscall interface is supposed to be used by libcs only, > not by applications. This commit adds a new arch_prctl request, > ARCH_VSYSCALL_LOCKOUT. Newer libcs can adopt this request to signal > to the kernel that the process does not need vsyscall emulation. > The kernel can then disable it for the remaining lifetime of the > process. Legacy libcs do not perform this call, so vsyscall remains > enabled for them. This approach should achieves backwards > compatibility (perfect compatibility if the assumption that only libcs > use vsyscall is accurate), and it provides full hardening for new > binaries. Why is a lockout needed instead of just a toggle? By the time an attacker can issue prctls, an emulated vsyscall seems like a pretty minor exploit technique. And programs that load legacy modules or instrument other programs might need to re-enable them. Also, the interaction with emulate mode is somewhat complex. For now, let’s support this in xonly mode only. A complete implementation will require nontrivial mm work. I had that implemented pre-KPTI, but KPTI made it more complicated. Finally, /proc/self/maps should be wired up via the gate_area code. > > The chosen value of ARCH_VSYSCALL_LOCKOUT should avoid conflicts > with outher x86-64 arch_prctl requests. > > Future arch_prctls requests commonly used at process startup can imply > vsyscall lockout, so that a separate system call for the lockout is > not needed. > > Signed-off-by: Florian Weimer <fweimer@redhat.com> > > --- > arch/x86/entry/vsyscall/vsyscall_64.c | 6 + > arch/x86/include/asm/mmu.h | 6 + > arch/x86/include/uapi/asm/prctl.h | 2 + > arch/x86/kernel/process_64.c | 5 + > tools/arch/x86/include/uapi/asm/prctl.h | 2 + > tools/testing/selftests/x86/Makefile | 13 +- > tools/testing/selftests/x86/vsyscall_lockout.c | 431 +++++++++++++++++++++++++ > 7 files changed, 462 insertions(+), 3 deletions(-) > > diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c > b/arch/x86/entry/vsyscall/vsyscall_64.c > index 0b6b277ee050..ac176481cbdf 100644 > --- a/arch/x86/entry/vsyscall/vsyscall_64.c > +++ b/arch/x86/entry/vsyscall/vsyscall_64.c > @@ -174,6 +174,12 @@ bool emulate_vsyscall(unsigned long error_code, > > tsk = current; > > + if (tsk->mm->context.vsyscall_lockout) { > + warn_bad_vsyscall(KERN_WARNING, regs, > + "vsyscall after lockout (exploit attempt?)"); > + goto sigsegv; > + } > + > /* > * Check for access_ok violations and find the syscall nr. > * > diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h > index 5d7494631ea9..59ddac5ad2e7 100644 > --- a/arch/x86/include/asm/mmu.h > +++ b/arch/x86/include/asm/mmu.h > @@ -41,6 +41,12 @@ typedef struct { > #ifdef CONFIG_X86_64 > unsigned short flags; > #endif > +#ifdef CONFIG_X86_VSYSCALL_EMULATION > + /* > + * Set to true by arch_prctl(ARCH_VSYSCALL_LOCKOUT). > + */ > + bool vsyscall_lockout; > +#endif > > struct mutex lock; > void __user *vdso; /* vdso base address */ > diff --git a/arch/x86/include/uapi/asm/prctl.h > b/arch/x86/include/uapi/asm/prctl.h > index 754a07856817..6f2b17ec4798 100644 > --- a/arch/x86/include/uapi/asm/prctl.h > +++ b/arch/x86/include/uapi/asm/prctl.h > @@ -18,4 +18,6 @@ > #define ARCH_MAP_VDSO_32 0x2002 > #define ARCH_MAP_VDSO_64 0x2003 > > +#define ARCH_VSYSCALL_LOCKOUT 0x5001 > + > #endif /* _ASM_X86_PRCTL_H */ > diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c > index 3402edec236c..eaabd365aa63 100644 > --- a/arch/x86/kernel/process_64.c > +++ b/arch/x86/kernel/process_64.c > @@ -816,6 +816,11 @@ long do_arch_prctl_64(struct task_struct *task, > int option, unsigned long arg2) > ret = put_user(base, (unsigned long __user *)arg2); > break; > } > + case ARCH_VSYSCALL_LOCKOUT: > +#ifdef CONFIG_X86_VSYSCALL_EMULATION > + current->mm->context.vsyscall_lockout = true; > +#endif > + break; > > #ifdef CONFIG_CHECKPOINT_RESTORE > # ifdef CONFIG_X86_X32_ABI > diff --git a/tools/arch/x86/include/uapi/asm/prctl.h > b/tools/arch/x86/include/uapi/asm/prctl.h > index 754a07856817..6f2b17ec4798 100644 > --- a/tools/arch/x86/include/uapi/asm/prctl.h > +++ b/tools/arch/x86/include/uapi/asm/prctl.h > @@ -18,4 +18,6 @@ > #define ARCH_MAP_VDSO_32 0x2002 > #define ARCH_MAP_VDSO_64 0x2003 > > +#define ARCH_VSYSCALL_LOCKOUT 0x5001 > + > #endif /* _ASM_X86_PRCTL_H */ > diff --git a/tools/testing/selftests/x86/Makefile > b/tools/testing/selftests/x86/Makefile > index 8a1f62ab3c8e..2269429b77e0 100644 > --- a/tools/testing/selftests/x86/Makefile > +++ b/tools/testing/selftests/x86/Makefile > @@ -18,7 +18,7 @@ TARGETS_C_32BIT_ONLY := entry_from_vm86 > test_syscall_vdso unwind_vdso \ > test_FCMOV test_FCOMI test_FISTTP \ > vdso_restorer > TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering \ > - corrupt_xstate_header amx > + corrupt_xstate_header amx vsyscall_lockout > # Some selftests require 32bit support enabled also on 64bit systems > TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall > > @@ -72,10 +72,12 @@ all_64: $(BINARIES_64) > EXTRA_CLEAN := $(BINARIES_32) $(BINARIES_64) > > $(BINARIES_32): $(OUTPUT)/%_32: %.c helpers.h > - $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl -lm > + $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ \ > + $(or $(LIBS), -lrt -ldl -lm) > > $(BINARIES_64): $(OUTPUT)/%_64: %.c helpers.h > - $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl > + $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ \ > + $(or $(LIBS), -lrt -ldl -lm) > > # x86_64 users should be encouraged to install 32-bit libraries > ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01) > @@ -105,3 +107,8 @@ $(OUTPUT)/test_syscall_vdso_32: thunks_32.S > # state. > $(OUTPUT)/check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start > -static > $(OUTPUT)/check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start > -static > + > +# This test does not link against anything (neither libc nor libgcc). > +$(OUTPUT)/vsyscall_lockout_64: \ > + LIBS := -Wl,-no-pie -static -nostdlib -nostartfiles > + CFLAGS += -fno-pie -fno-stack-protector -fno-builtin -ffreestanding > diff --git a/tools/testing/selftests/x86/vsyscall_lockout.c > b/tools/testing/selftests/x86/vsyscall_lockout.c > new file mode 100644 > index 000000000000..88669b4907ee > --- /dev/null > +++ b/tools/testing/selftests/x86/vsyscall_lockout.c > @@ -0,0 +1,431 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * vsyscall_lockout.c - check that disabling vsyscall works > + * Copyright (C) 2021 Red Hat, Inc. > + */ > + > +#include <stddef.h> > + > +#include <asm/prctl.h> > +#include <asm/vsyscall.h> > +#include <linux/signal.h> > +#include <linux/time.h> > +#include <linux/types.h> > +#include <linux/unistd.h> > + > +#ifndef ARCH_VSYSCALL_LOCKOUT > +#define ARCH_VSYSCALL_LOCKOUT 0x5001 > +#elif ARCH_VSYSCALL_LOCKOUT != 0x5001 > +#error wrong vlaue for ARCH_VSYSCALL_LOCKOUT > +#endif > + > +static inline long syscall0(int nr) > +{ > + unsigned long result; > + > + __asm__ volatile ("syscall" > + : "=a" (result) > + : "0" (nr) > + : "memory", "cc", "r11", "cx"); > + return result; > +} > + > +static inline long syscall1(int nr, long arg0) > +{ > + register long rdi __asm__ ("rdi") = arg0; > + unsigned long result; > + > + __asm__ volatile ("syscall" > + : "=a" (result) > + : "0" (nr), "r" (rdi) > + : "memory", "cc", "r11", "cx"); > + return result; > +} > + > +static inline long syscall2(int nr, long arg0, long arg1) > +{ > + register long rdi __asm__ ("rdi") = arg0; > + register long rsi __asm__ ("rsi") = arg1; > + unsigned long result; > + > + __asm__ volatile ("syscall" > + : "=a" (result) > + : "0" (nr), "r" (rdi), "r" (rsi) > + : "memory", "cc", "r11", "cx"); > + return result; > +} > + > +static inline long syscall3(int nr, long arg0, long arg1, long arg2) > +{ > + register long rdi __asm__ ("rdi") = arg0; > + register long rsi __asm__ ("rsi") = arg1; > + register long rdx __asm__ ("rdx") = arg2; > + unsigned long result; > + > + __asm__ volatile ("syscall" > + : "=a" (result) > + : "0" (nr), "r" (rdi), "r" (rsi), "r" (rdx) > + : "memory", "cc", "r11", "cx"); > + return result; > +} > + > +static inline long syscall4(int nr, long arg0, long arg1, long arg2, > long arg3) > +{ > + register long rdi __asm__ ("rdi") = arg0; > + register long rsi __asm__ ("rsi") = arg1; > + register long rdx __asm__ ("rdx") = arg2; > + register long r10 __asm__ ("r10") = arg2; > + unsigned long result; > + > + __asm__ volatile ("syscall" > + : "=a" (result) > + : "0" (nr), "r" (rdi), "r" (rsi), "r" (rdx), > + "r" (r10) > + : "memory", "cc", "r11", "cx"); > + return result; > +} > + > +static inline long vsyscall1(long addr, long arg0) > +{ > + register long rdi __asm__ ("rdi") = arg0; > + unsigned long result; > + > + __asm__ volatile ("callq *%%rax" > + : "=a" (result) > + : "0" (addr), "r" (rdi) > + : "memory", "cc", "r11", "cx"); > + return result; > +} > + > +static void __attribute__ ((noreturn)) sys_exit(int status) > +{ > + syscall1(__NR_exit, status); > + __builtin_unreachable(); > +} > + > +static void sigabrt(void) > +{ > + syscall2(__NR_kill, syscall0(__NR_getpid), SIGABRT); > +} > + > +static void print_char(char byte) > +{ > + if (syscall3(__NR_write, 1L, (long) &byte, 1L) < 0) > + sigabrt(); > +} > + > +static void print_string(const char *p) > +{ > + while (*p) { > + print_char(*p); > + ++p; > + } > +} > + > +static void print_dec_1(unsigned long val) > +{ > + if (val != 0) { > + print_dec_1(val / 10); > + print_char('0' + (val % 10)); > + } > +} > + > +static void print_dec(unsigned long val) > +{ > + if (val == 0) > + print_char('0'); > + else > + print_dec_1(val); > +} > + > +static void print_signed_dec(long val) > +{ > + if (val < 0) { > + print_char('-'); > + print_dec(-(unsigned long)val); > + } else > + print_dec(val); > +} > + > +static void print_time(const char *label, struct timeval tv) > +{ > + print_string(label); > + print_string(": "); > + print_dec(tv.tv_sec); > + print_char(' '); > + print_dec(tv.tv_usec); > + print_char('\n'); > +} > + > +static void print_failure(const char *label, long ret) > +{ > + print_string("error: "); > + print_string(label); > + print_string(" failed: "); > + print_signed_dec(ret); > + print_char('\n'); > +} > + > +static void xgettimeofday(struct timeval *tv) > +{ > + long ret = syscall1(__NR_gettimeofday, (long) tv); > + > + if (ret != 0) { > + print_failure("gettimeofday", ret); > + sigabrt(); > + } > +} > + > +static void xvgettimeofday(struct timeval *tv) > +{ > + long ret = vsyscall1(VSYSCALL_ADDR, (long) tv); > + > + if (ret) { > + print_failure("vgettimeofday", ret); > + sigabrt(); > + } > +} > + > +static int sys_arch_prctl(int code, unsigned long addr) > +{ > + return syscall2(__NR_arch_prctl, code, addr); > +} > + > +static __kernel_pid_t xfork(void) > +{ > + long ret = syscall0(__NR_fork); > + > + if (ret < 0) { > + print_failure("fork", ret); > + sigabrt(); > + } > + return ret; > +} > + > +static void xexecve(const char *pathname, char **argv, char **envp) > +{ > + long ret; > + > + ret = syscall3(__NR_execve, (long) pathname, (long) argv, > (long) envp); > + print_failure("execve", ret); > + sigabrt(); > +} > + > +static __kernel_pid_t xwaitpid(__kernel_pid_t pid, int *status, int > options) > +{ > + long ret = syscall4(__NR_wait4, pid, (long) status, options, > 0); > + > + if (ret < 0) { > + print_failure("wait4", ret); > + sigabrt(); > + } > + return ret; > +} > + > +static int > +do_lockout(void) > +{ > + long ret = sys_arch_prctl(ARCH_VSYSCALL_LOCKOUT, 0); > + if (ret < 0) > + print_failure("arch_prctl(ARCH_VSYSCALL_LOCKOUT)", > ret); > + return ret; > +} > + > +static long difftime(struct timeval first, struct timeval second) > +{ > + return second.tv_usec - first.tv_usec + > + (second.tv_sec - first.tv_sec) * 1000 * 1000; > +} > + > +/* > + * Second stage: Check that the lockout is not inherited across execve. > + */ > +static int main_2(void) > +{ > + struct timeval vsyscall_time = { -1, -1 }; > + int status = 0; > + > + xvgettimeofday(&vsyscall_time); > + print_time("vsyscall gettimeofday after fork", vsyscall_time); > + if (vsyscall_time.tv_sec < 0 || vsyscall_time.tv_usec < 0) > + status = 1; > + > + return status; > +} > + > +static void check_lockout_after_fork(int *status, int twice) > +{ > + __kernel_pid_t pid; > + struct timeval vsyscall_time; > + int wstatus; > + > + if (twice) { > + __kernel_pid_t pid_outer; > + > + print_string("checking that lockout is inherited by > fork\n"); > + > + pid_outer = xfork(); > + if (pid_outer == 0) { > + if (do_lockout()) > + sys_exit(1); > + /* > + * Logic for the subprocess follows below. > + */ > + } else { > + xwaitpid(pid_outer, &wstatus, 0); > + if (wstatus != 0) { > + print_string("error: unexpected exit > status: "); > + print_signed_dec(wstatus); > + print_char('\n'); > + *status = 1; > + } > + return; > + } > + } else > + print_string("checking that lockout works after one > fork\n"); > + > + pid = xfork(); > + if (pid == 0) { > + if (!twice && do_lockout()) > + sys_exit(1); > + /* > + * This should trigger a fault. > + */ > + xvgettimeofday(&vsyscall_time); > + sys_exit(0); > + } > + xwaitpid(pid, &wstatus, 0); > + switch (wstatus) { > + case 0: > + print_string("error: no crash after lockout\n"); > + *status = 1; > + break; > + case 0x0100: > + *status = 1; > + break; > + case SIGSEGV: > + print_string("termination after lockout\n"); > + break; > + default: > + print_string("error: unexpected exit status: "); > + print_signed_dec(wstatus); > + print_char('\n'); > + *status = 1; > + } > + > + if (twice) > + sys_exit(*status); > + > + /* > + * Status in the parent process should be unaffected. > + */ > + xvgettimeofday(&vsyscall_time); > +} > + > +static void check_no_lockout_after_execve(char **argv, int *status) > +{ > + __kernel_pid_t pid; > + int wstatus; > + > + print_string("checking that lockout is not inherited by > execve\n"); > + pid = xfork(); > + if (pid == 0) { > + struct timeval vsyscall_time; > + char *new_argv[] = { argv[0], "2", NULL }; > + > + xvgettimeofday(&vsyscall_time); > + if (do_lockout()) > + sys_exit(1); > + > + /* > + * Re-exec the second stage. See main_2 above. > + */ > + xexecve(argv[0], new_argv, new_argv + 2); > + } > + > + xwaitpid(pid, &wstatus, 0); > + if (wstatus != 0) { > + print_string("error: unexpected exit status: "); > + print_signed_dec(wstatus); > + print_char('\n'); > + *status = 1; > + } > +} > + > +static int main(int argc, char **argv) > +{ > + struct timeval initial_time = { -1, -1 }; > + struct timeval vsyscall_time = { -1, -1 }; > + struct timeval final_time = { -1, -1 }; > + long vsyscall_diff, final_diff; > + int status = 0; > + > + if (argc > 1) > + switch (*argv[1]) { > + case '2': > + return main_2(); > + default: > + print_string("usage: "); > + print_string(argv[0]); > + print_string("\n"); > + return 1; > + } > + > + > + xgettimeofday(&initial_time); > + xvgettimeofday(&vsyscall_time); > + xgettimeofday(&final_time); > + vsyscall_diff = difftime(initial_time, vsyscall_time); > + final_diff = difftime(vsyscall_time, final_time); > + > + print_time("initial gettimeofday", initial_time); > + print_time("vsyscall gettimeofday", vsyscall_time); > + print_time("final gettimeofday", final_time); > + > + if (initial_time.tv_sec < 0 || initial_time.tv_usec < 0 || > + vsyscall_time.tv_sec < 0 || vsyscall_time.tv_usec < 0 || > + final_time.tv_sec < 0 || final_time.tv_usec < 0) { > + print_string("error: negative time\n"); > + status = 1; > + } > + > + print_string("differences: "); > + print_signed_dec(vsyscall_diff); > + print_char(' '); > + print_signed_dec(final_diff); > + print_char('\n'); > + > + if (vsyscall_diff < 0 || final_diff < 0) { > + /* > + * This may produce false positives if there is an > active NTP. > + */ > + print_string("error: time went backwards\n"); > + status = 1; > + } > + > + check_lockout_after_fork(&status, 0); > + check_lockout_after_fork(&status, 1); > + check_no_lockout_after_execve(argv, &status); > + > + print_string("testing done, exit status: "); > + print_signed_dec(status); > + print_char('\n'); > + return status; > +} > + > +static void __attribute__ ((used)) main_trampoline(long *rsp) > +{ > + sys_exit(main(*rsp, (char **) (rsp + 1))); > +} > + > +__asm__ (".text\n\t" > + ".globl _start\n" > + "_start:\n\t" > + ".cfi_startproc\n\t" > + ".cfi_undefined rip\n\t" > + "movq %rsp, %rdi\n\t" > + "callq main_trampoline\n\t" /* Results in psABI %rsp > alignment. */ > + ".cfi_endproc\n\t" > + ".type _start, @function\n\t" > + ".size _start, . - _start\n\t" > + ".previous");
* Andy Lutomirski: > On Fri, Nov 26, 2021, at 5:47 AM, Florian Weimer wrote: >> Distributions struggle with changing the default for vsyscall >> emulation because it is a clear break of userspace ABI, something >> that should not happen. >> >> The legacy vsyscall interface is supposed to be used by libcs only, >> not by applications. This commit adds a new arch_prctl request, >> ARCH_VSYSCALL_LOCKOUT. Newer libcs can adopt this request to signal >> to the kernel that the process does not need vsyscall emulation. >> The kernel can then disable it for the remaining lifetime of the >> process. Legacy libcs do not perform this call, so vsyscall remains >> enabled for them. This approach should achieves backwards >> compatibility (perfect compatibility if the assumption that only libcs >> use vsyscall is accurate), and it provides full hardening for new >> binaries. > > Why is a lockout needed instead of just a toggle? By the time an > attacker can issue prctls, an emulated vsyscall seems like a pretty > minor exploit technique. And programs that load legacy modules or > instrument other programs might need to re-enable them. For glibc, I plan to add an environment variable to disable the lockout. There's no ELF markup that would allow us to do this during dlopen. (And after this change, you can run an old distribution in a chroot for legacy software, something that the userspace ABI break prevents.) If it can be disabled, people will definitely say, “we get more complete hardening if we break old userspace”. I want to avoid that. (People will say that anyway because there's this fairly large window of libcs that don't use vsyscalls anymore, but have not been patched yet to do the lockout.) Maybe the lockout also simplifies the implementation? > Also, the interaction with emulate mode is somewhat complex. For now, > let’s support this in xonly mode only. A complete implementation will > require nontrivial mm work. I had that implemented pre-KPTI, but KPTI > made it more complicated. I admit I only looked at the code in emulate_vsyscall. It has code that seems to deal with faults not due to instruction fetch, and also checks for vsyscall=emulate mode. But it seems that we don't get to this point for reads in vsyscall=emulate mode, presumably because the page is already mapped? > Finally, /proc/self/maps should be wired up via the gate_area code. Should the "[vsyscall]" string change to something else if execution is disabled? Thanks, Florian
On Fri, Nov 26, 2021, at 12:24 PM, Florian Weimer wrote: > * Andy Lutomirski: > >> On Fri, Nov 26, 2021, at 5:47 AM, Florian Weimer wrote: >>> Distributions struggle with changing the default for vsyscall >>> emulation because it is a clear break of userspace ABI, something >>> that should not happen. >>> >>> The legacy vsyscall interface is supposed to be used by libcs only, >>> not by applications. This commit adds a new arch_prctl request, >>> ARCH_VSYSCALL_LOCKOUT. Newer libcs can adopt this request to signal >>> to the kernel that the process does not need vsyscall emulation. >>> The kernel can then disable it for the remaining lifetime of the >>> process. Legacy libcs do not perform this call, so vsyscall remains >>> enabled for them. This approach should achieves backwards >>> compatibility (perfect compatibility if the assumption that only libcs >>> use vsyscall is accurate), and it provides full hardening for new >>> binaries. >> >> Why is a lockout needed instead of just a toggle? By the time an >> attacker can issue prctls, an emulated vsyscall seems like a pretty >> minor exploit technique. And programs that load legacy modules or >> instrument other programs might need to re-enable them. > > For glibc, I plan to add an environment variable to disable the lockout. > There's no ELF markup that would allow us to do this during dlopen. > (And after this change, you can run an old distribution in a chroot > for legacy software, something that the userspace ABI break prevents.) > > If it can be disabled, people will definitely say, “we get more complete > hardening if we break old userspace”. I want to avoid that. (People > will say that anyway because there's this fairly large window of libcs > that don't use vsyscalls anymore, but have not been patched yet to do > the lockout.) I’m having trouble following the logic. What I mean is that I think it should be possible to do the arch_prctl again to turn vsyscalls back on. > > Maybe the lockout also simplifies the implementation? > >> Also, the interaction with emulate mode is somewhat complex. For now, >> let’s support this in xonly mode only. A complete implementation will >> require nontrivial mm work. I had that implemented pre-KPTI, but KPTI >> made it more complicated. > > I admit I only looked at the code in emulate_vsyscall. It has code that > seems to deal with faults not due to instruction fetch, and also checks > for vsyscall=emulate mode. But it seems that we don't get to this point > for reads in vsyscall=emulate mode, presumably because the page is > already mapped? Yes, and, with KPTI off, it’s nontrivial to unmap it. I have code for this, but I’m not sure the complexity is worthwhile. > >> Finally, /proc/self/maps should be wired up via the gate_area code. > > Should the "[vsyscall]" string change to something else if execution is > disabled? I think the line should disappear entirely, just like booting with vsyscall=none. > > Thanks, > Florian
* Andy Lutomirski: > On Fri, Nov 26, 2021, at 12:24 PM, Florian Weimer wrote: >> * Andy Lutomirski: >> >>> On Fri, Nov 26, 2021, at 5:47 AM, Florian Weimer wrote: >>>> Distributions struggle with changing the default for vsyscall >>>> emulation because it is a clear break of userspace ABI, something >>>> that should not happen. >>>> >>>> The legacy vsyscall interface is supposed to be used by libcs only, >>>> not by applications. This commit adds a new arch_prctl request, >>>> ARCH_VSYSCALL_LOCKOUT. Newer libcs can adopt this request to signal >>>> to the kernel that the process does not need vsyscall emulation. >>>> The kernel can then disable it for the remaining lifetime of the >>>> process. Legacy libcs do not perform this call, so vsyscall remains >>>> enabled for them. This approach should achieves backwards >>>> compatibility (perfect compatibility if the assumption that only libcs >>>> use vsyscall is accurate), and it provides full hardening for new >>>> binaries. >>> >>> Why is a lockout needed instead of just a toggle? By the time an >>> attacker can issue prctls, an emulated vsyscall seems like a pretty >>> minor exploit technique. And programs that load legacy modules or >>> instrument other programs might need to re-enable them. >> >> For glibc, I plan to add an environment variable to disable the lockout. >> There's no ELF markup that would allow us to do this during dlopen. >> (And after this change, you can run an old distribution in a chroot >> for legacy software, something that the userspace ABI break prevents.) >> >> If it can be disabled, people will definitely say, “we get more complete >> hardening if we break old userspace”. I want to avoid that. (People >> will say that anyway because there's this fairly large window of libcs >> that don't use vsyscalls anymore, but have not been patched yet to do >> the lockout.) > > I’m having trouble following the logic. What I mean is that I think it > should be possible to do the arch_prctl again to turn vsyscalls back > on. The “By the time an attacker can issue prctls” argument does resonate with me, but I'm not the one who needs convincing. I can turn this into a toggle, and we could probably default our builds to vsyscalls=xonly. Given the userspace ABI impact, we'd still have to upstream the toggle. Do you see a chance of a patch a long these lines going in at all, given that it's an incomplete solution for vsyscall=emulate? >> Maybe the lockout also simplifies the implementation? >> >>> Also, the interaction with emulate mode is somewhat complex. For now, >>> let’s support this in xonly mode only. A complete implementation will >>> require nontrivial mm work. I had that implemented pre-KPTI, but KPTI >>> made it more complicated. >> >> I admit I only looked at the code in emulate_vsyscall. It has code that >> seems to deal with faults not due to instruction fetch, and also checks >> for vsyscall=emulate mode. But it seems that we don't get to this point >> for reads in vsyscall=emulate mode, presumably because the page is >> already mapped? > > Yes, and, with KPTI off, it’s nontrivial to unmap it. I have code for > this, but I’m not sure the complexity is worthwhile. Huh. KPTI is the new thing, right? Does it make things harder or not? I'm confused. If we knew at execve time that the new process image doesn't have vsyscall, would that be easier to set up? vsyscall opt-out could be triggered by an ELF NOTE segment on the program interpreter (or main program if there isn't one). >>> Finally, /proc/self/maps should be wired up via the gate_area code. >> >> Should the "[vsyscall]" string change to something else if execution is >> disabled? > > I think the line should disappear entirely, just like booting with > vsyscall=none. Hmm. But only for vsyscall=xonly, right? With vsyscall=emulate, reading at those addresses will still succeed. Thanks, Florian
On Fri, Nov 26, 2021, at 3:18 PM, Florian Weimer wrote: > * Andy Lutomirski: > >> On Fri, Nov 26, 2021, at 12:24 PM, Florian Weimer wrote: >>> * Andy Lutomirski: >>> >>>> On Fri, Nov 26, 2021, at 5:47 AM, Florian Weimer wrote: >>>>> Distributions struggle with changing the default for vsyscall >>>>> emulation because it is a clear break of userspace ABI, something >>>>> that should not happen. >>>>> >>>>> The legacy vsyscall interface is supposed to be used by libcs only, >>>>> not by applications. This commit adds a new arch_prctl request, >>>>> ARCH_VSYSCALL_LOCKOUT. Newer libcs can adopt this request to signal >>>>> to the kernel that the process does not need vsyscall emulation. >>>>> The kernel can then disable it for the remaining lifetime of the >>>>> process. Legacy libcs do not perform this call, so vsyscall remains >>>>> enabled for them. This approach should achieves backwards >>>>> compatibility (perfect compatibility if the assumption that only libcs >>>>> use vsyscall is accurate), and it provides full hardening for new >>>>> binaries. >>>> >>>> Why is a lockout needed instead of just a toggle? By the time an >>>> attacker can issue prctls, an emulated vsyscall seems like a pretty >>>> minor exploit technique. And programs that load legacy modules or >>>> instrument other programs might need to re-enable them. >>> >>> For glibc, I plan to add an environment variable to disable the lockout. >>> There's no ELF markup that would allow us to do this during dlopen. >>> (And after this change, you can run an old distribution in a chroot >>> for legacy software, something that the userspace ABI break prevents.) >>> >>> If it can be disabled, people will definitely say, “we get more complete >>> hardening if we break old userspace”. I want to avoid that. (People >>> will say that anyway because there's this fairly large window of libcs >>> that don't use vsyscalls anymore, but have not been patched yet to do >>> the lockout.) >> >> I’m having trouble following the logic. What I mean is that I think it >> should be possible to do the arch_prctl again to turn vsyscalls back >> on. > > The “By the time an attacker can issue prctls” argument does resonate > with me, but I'm not the one who needs convincing. Who else needs convincing? It's your patch. This could possibly be much more generic: have a mask of legacy features to disable and a separate mask of lock bits. > > I can turn this into a toggle, and we could probably default our builds > to vsyscalls=xonly. Given the userspace ABI impact, we'd still have to > upstream the toggle. Do you see a chance of a patch a long these lines > going in at all, given that it's an incomplete solution for > vsyscall=emulate? There is basically no reason for anyone to use vsyscall=emulate any more. I'm aware of exactly one use case, and it's quite bizarre and involves instrumenting an outdated binary with an outdated instrumentation tool. If either one is recent (last few years), vsyscall=xonly is fine. > >>> Maybe the lockout also simplifies the implementation? >>> >>>> Also, the interaction with emulate mode is somewhat complex. For now, >>>> let’s support this in xonly mode only. A complete implementation will >>>> require nontrivial mm work. I had that implemented pre-KPTI, but KPTI >>>> made it more complicated. >>> >>> I admit I only looked at the code in emulate_vsyscall. It has code that >>> seems to deal with faults not due to instruction fetch, and also checks >>> for vsyscall=emulate mode. But it seems that we don't get to this point >>> for reads in vsyscall=emulate mode, presumably because the page is >>> already mapped? >> >> Yes, and, with KPTI off, it’s nontrivial to unmap it. I have code for >> this, but I’m not sure the complexity is worthwhile. > > Huh. KPTI is the new thing, right? Does it make things harder or not? > I'm confused. > > If we knew at execve time that the new process image doesn't have > vsyscall, would that be easier to set up? vsyscall opt-out could be > triggered by an ELF NOTE segment on the program interpreter (or main > program if there isn't one). Nah, it's a different issue. The vsyscall mapping isn't a normal mapping at all. It's in the *kernel* address range, so it's not in the user portion of the page tables. This means that, per mm, there is only the pgd entry that can be changed. With kpti off, it can be fudged using the U bit (hah!). With kpti on, the same trick would work, but the whole pagetable arrangement is different, and the patch would need updating. The patch looks a bit like this: https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/vdso_permm&id=18432aa9942e8c36c3ba008d2908c246127d135c except I screwed up and there's a bunch of irrelevant stuff in there. But the patch would need updating for new kernels. In any event, none of this is needed in xonly mode. > >>>> Finally, /proc/self/maps should be wired up via the gate_area code. >>> >>> Should the "[vsyscall]" string change to something else if execution is >>> disabled? >> >> I think the line should disappear entirely, just like booting with >> vsyscall=none. > > Hmm. But only for vsyscall=xonly, right? With vsyscall=emulate, > reading at those addresses will still succeed. IMO if vsyscall is disabled for a process, reads and executes should both fail. This is trivial in xonly mode. > > Thanks, > Florian
* Andy Lutomirski: > This could possibly be much more generic: have a mask of legacy > features to disable and a separate mask of lock bits. Is that really necessary? Adding additional ARCH_* constants does not seem to be particularly onerous and helps with detection of kernel support. >> I can turn this into a toggle, and we could probably default our builds >> to vsyscalls=xonly. Given the userspace ABI impact, we'd still have to >> upstream the toggle. Do you see a chance of a patch a long these lines >> going in at all, given that it's an incomplete solution for >> vsyscall=emulate? > > There is basically no reason for anyone to use vsyscall=emulate any > more. I'm aware of exactly one use case, and it's quite bizarre and > involves instrumenting an outdated binary with an outdated > instrumentation tool. If either one is recent (last few years), > vsyscall=xonly is fine. Yeah, we plan to stick to vsyscall=xonly. This means that the toggle is easier to implement, of course. >> Hmm. But only for vsyscall=xonly, right? With vsyscall=emulate, >> reading at those addresses will still succeed. > > IMO if vsyscall is disabled for a process, reads and executes should > both fail. This is trivial in xonly mode. Right, I'll document this as a glitch for now. I've got a v2 (with the toggle rather than pure lockout) and will sent it out shortly. Thanks, Florian
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 0b6b277ee050..ac176481cbdf 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -174,6 +174,12 @@ bool emulate_vsyscall(unsigned long error_code, tsk = current; + if (tsk->mm->context.vsyscall_lockout) { + warn_bad_vsyscall(KERN_WARNING, regs, + "vsyscall after lockout (exploit attempt?)"); + goto sigsegv; + } + /* * Check for access_ok violations and find the syscall nr. * diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 5d7494631ea9..59ddac5ad2e7 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -41,6 +41,12 @@ typedef struct { #ifdef CONFIG_X86_64 unsigned short flags; #endif +#ifdef CONFIG_X86_VSYSCALL_EMULATION + /* + * Set to true by arch_prctl(ARCH_VSYSCALL_LOCKOUT). + */ + bool vsyscall_lockout; +#endif struct mutex lock; void __user *vdso; /* vdso base address */ diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index 754a07856817..6f2b17ec4798 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -18,4 +18,6 @@ #define ARCH_MAP_VDSO_32 0x2002 #define ARCH_MAP_VDSO_64 0x2003 +#define ARCH_VSYSCALL_LOCKOUT 0x5001 + #endif /* _ASM_X86_PRCTL_H */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3402edec236c..eaabd365aa63 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -816,6 +816,11 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) ret = put_user(base, (unsigned long __user *)arg2); break; } + case ARCH_VSYSCALL_LOCKOUT: +#ifdef CONFIG_X86_VSYSCALL_EMULATION + current->mm->context.vsyscall_lockout = true; +#endif + break; #ifdef CONFIG_CHECKPOINT_RESTORE # ifdef CONFIG_X86_X32_ABI diff --git a/tools/arch/x86/include/uapi/asm/prctl.h b/tools/arch/x86/include/uapi/asm/prctl.h index 754a07856817..6f2b17ec4798 100644 --- a/tools/arch/x86/include/uapi/asm/prctl.h +++ b/tools/arch/x86/include/uapi/asm/prctl.h @@ -18,4 +18,6 @@ #define ARCH_MAP_VDSO_32 0x2002 #define ARCH_MAP_VDSO_64 0x2003 +#define ARCH_VSYSCALL_LOCKOUT 0x5001 + #endif /* _ASM_X86_PRCTL_H */ diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 8a1f62ab3c8e..2269429b77e0 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -18,7 +18,7 @@ TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering \ - corrupt_xstate_header amx + corrupt_xstate_header amx vsyscall_lockout # Some selftests require 32bit support enabled also on 64bit systems TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall @@ -72,10 +72,12 @@ all_64: $(BINARIES_64) EXTRA_CLEAN := $(BINARIES_32) $(BINARIES_64) $(BINARIES_32): $(OUTPUT)/%_32: %.c helpers.h - $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl -lm + $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ \ + $(or $(LIBS), -lrt -ldl -lm) $(BINARIES_64): $(OUTPUT)/%_64: %.c helpers.h - $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl + $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ \ + $(or $(LIBS), -lrt -ldl -lm) # x86_64 users should be encouraged to install 32-bit libraries ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01) @@ -105,3 +107,8 @@ $(OUTPUT)/test_syscall_vdso_32: thunks_32.S # state. $(OUTPUT)/check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static $(OUTPUT)/check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static + +# This test does not link against anything (neither libc nor libgcc). +$(OUTPUT)/vsyscall_lockout_64: \ + LIBS := -Wl,-no-pie -static -nostdlib -nostartfiles + CFLAGS += -fno-pie -fno-stack-protector -fno-builtin -ffreestanding diff --git a/tools/testing/selftests/x86/vsyscall_lockout.c b/tools/testing/selftests/x86/vsyscall_lockout.c new file mode 100644 index 000000000000..88669b4907ee --- /dev/null +++ b/tools/testing/selftests/x86/vsyscall_lockout.c @@ -0,0 +1,431 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vsyscall_lockout.c - check that disabling vsyscall works + * Copyright (C) 2021 Red Hat, Inc. + */ + +#include <stddef.h> + +#include <asm/prctl.h> +#include <asm/vsyscall.h> +#include <linux/signal.h> +#include <linux/time.h> +#include <linux/types.h> +#include <linux/unistd.h> + +#ifndef ARCH_VSYSCALL_LOCKOUT +#define ARCH_VSYSCALL_LOCKOUT 0x5001 +#elif ARCH_VSYSCALL_LOCKOUT != 0x5001 +#error wrong vlaue for ARCH_VSYSCALL_LOCKOUT +#endif + +static inline long syscall0(int nr) +{ + unsigned long result; + + __asm__ volatile ("syscall" + : "=a" (result) + : "0" (nr) + : "memory", "cc", "r11", "cx"); + return result; +} + +static inline long syscall1(int nr, long arg0) +{ + register long rdi __asm__ ("rdi") = arg0; + unsigned long result; + + __asm__ volatile ("syscall" + : "=a" (result) + : "0" (nr), "r" (rdi) + : "memory", "cc", "r11", "cx"); + return result; +} + +static inline long syscall2(int nr, long arg0, long arg1) +{ + register long rdi __asm__ ("rdi") = arg0; + register long rsi __asm__ ("rsi") = arg1; + unsigned long result; + + __asm__ volatile ("syscall" + : "=a" (result) + : "0" (nr), "r" (rdi), "r" (rsi) + : "memory", "cc", "r11", "cx"); + return result; +} + +static inline long syscall3(int nr, long arg0, long arg1, long arg2) +{ + register long rdi __asm__ ("rdi") = arg0; + register long rsi __asm__ ("rsi") = arg1; + register long rdx __asm__ ("rdx") = arg2; + unsigned long result; + + __asm__ volatile ("syscall" + : "=a" (result) + : "0" (nr), "r" (rdi), "r" (rsi), "r" (rdx) + : "memory", "cc", "r11", "cx"); + return result; +} + +static inline long syscall4(int nr, long arg0, long arg1, long arg2, long arg3) +{ + register long rdi __asm__ ("rdi") = arg0; + register long rsi __asm__ ("rsi") = arg1; + register long rdx __asm__ ("rdx") = arg2; + register long r10 __asm__ ("r10") = arg2; + unsigned long result; + + __asm__ volatile ("syscall" + : "=a" (result) + : "0" (nr), "r" (rdi), "r" (rsi), "r" (rdx), + "r" (r10) + : "memory", "cc", "r11", "cx"); + return result; +} + +static inline long vsyscall1(long addr, long arg0) +{ + register long rdi __asm__ ("rdi") = arg0; + unsigned long result; + + __asm__ volatile ("callq *%%rax" + : "=a" (result) + : "0" (addr), "r" (rdi) + : "memory", "cc", "r11", "cx"); + return result; +} + +static void __attribute__ ((noreturn)) sys_exit(int status) +{ + syscall1(__NR_exit, status); + __builtin_unreachable(); +} + +static void sigabrt(void) +{ + syscall2(__NR_kill, syscall0(__NR_getpid), SIGABRT); +} + +static void print_char(char byte) +{ + if (syscall3(__NR_write, 1L, (long) &byte, 1L) < 0) + sigabrt(); +} + +static void print_string(const char *p) +{ + while (*p) { + print_char(*p); + ++p; + } +} + +static void print_dec_1(unsigned long val) +{ + if (val != 0) { + print_dec_1(val / 10); + print_char('0' + (val % 10)); + } +} + +static void print_dec(unsigned long val) +{ + if (val == 0) + print_char('0'); + else + print_dec_1(val); +} + +static void print_signed_dec(long val) +{ + if (val < 0) { + print_char('-'); + print_dec(-(unsigned long)val); + } else + print_dec(val); +} + +static void print_time(const char *label, struct timeval tv) +{ + print_string(label); + print_string(": "); + print_dec(tv.tv_sec); + print_char(' '); + print_dec(tv.tv_usec); + print_char('\n'); +} + +static void print_failure(const char *label, long ret) +{ + print_string("error: "); + print_string(label); + print_string(" failed: "); + print_signed_dec(ret); + print_char('\n'); +} + +static void xgettimeofday(struct timeval *tv) +{ + long ret = syscall1(__NR_gettimeofday, (long) tv); + + if (ret != 0) { + print_failure("gettimeofday", ret); + sigabrt(); + } +} + +static void xvgettimeofday(struct timeval *tv) +{ + long ret = vsyscall1(VSYSCALL_ADDR, (long) tv); + + if (ret) { + print_failure("vgettimeofday", ret); + sigabrt(); + } +} + +static int sys_arch_prctl(int code, unsigned long addr) +{ + return syscall2(__NR_arch_prctl, code, addr); +} + +static __kernel_pid_t xfork(void) +{ + long ret = syscall0(__NR_fork); + + if (ret < 0) { + print_failure("fork", ret); + sigabrt(); + } + return ret; +} + +static void xexecve(const char *pathname, char **argv, char **envp) +{ + long ret; + + ret = syscall3(__NR_execve, (long) pathname, (long) argv, (long) envp); + print_failure("execve", ret); + sigabrt(); +} + +static __kernel_pid_t xwaitpid(__kernel_pid_t pid, int *status, int options) +{ + long ret = syscall4(__NR_wait4, pid, (long) status, options, 0); + + if (ret < 0) { + print_failure("wait4", ret); + sigabrt(); + } + return ret; +} + +static int +do_lockout(void) +{ + long ret = sys_arch_prctl(ARCH_VSYSCALL_LOCKOUT, 0); + if (ret < 0) + print_failure("arch_prctl(ARCH_VSYSCALL_LOCKOUT)", ret); + return ret; +} + +static long difftime(struct timeval first, struct timeval second) +{ + return second.tv_usec - first.tv_usec + + (second.tv_sec - first.tv_sec) * 1000 * 1000; +} + +/* + * Second stage: Check that the lockout is not inherited across execve. + */ +static int main_2(void) +{ + struct timeval vsyscall_time = { -1, -1 }; + int status = 0; + + xvgettimeofday(&vsyscall_time); + print_time("vsyscall gettimeofday after fork", vsyscall_time); + if (vsyscall_time.tv_sec < 0 || vsyscall_time.tv_usec < 0) + status = 1; + + return status; +} + +static void check_lockout_after_fork(int *status, int twice) +{ + __kernel_pid_t pid; + struct timeval vsyscall_time; + int wstatus; + + if (twice) { + __kernel_pid_t pid_outer; + + print_string("checking that lockout is inherited by fork\n"); + + pid_outer = xfork(); + if (pid_outer == 0) { + if (do_lockout()) + sys_exit(1); + /* + * Logic for the subprocess follows below. + */ + } else { + xwaitpid(pid_outer, &wstatus, 0); + if (wstatus != 0) { + print_string("error: unexpected exit status: "); + print_signed_dec(wstatus); + print_char('\n'); + *status = 1; + } + return; + } + } else + print_string("checking that lockout works after one fork\n"); + + pid = xfork(); + if (pid == 0) { + if (!twice && do_lockout()) + sys_exit(1); + /* + * This should trigger a fault. + */ + xvgettimeofday(&vsyscall_time); + sys_exit(0); + } + xwaitpid(pid, &wstatus, 0); + switch (wstatus) { + case 0: + print_string("error: no crash after lockout\n"); + *status = 1; + break; + case 0x0100: + *status = 1; + break; + case SIGSEGV: + print_string("termination after lockout\n"); + break; + default: + print_string("error: unexpected exit status: "); + print_signed_dec(wstatus); + print_char('\n'); + *status = 1; + } + + if (twice) + sys_exit(*status); + + /* + * Status in the parent process should be unaffected. + */ + xvgettimeofday(&vsyscall_time); +} + +static void check_no_lockout_after_execve(char **argv, int *status) +{ + __kernel_pid_t pid; + int wstatus; + + print_string("checking that lockout is not inherited by execve\n"); + pid = xfork(); + if (pid == 0) { + struct timeval vsyscall_time; + char *new_argv[] = { argv[0], "2", NULL }; + + xvgettimeofday(&vsyscall_time); + if (do_lockout()) + sys_exit(1); + + /* + * Re-exec the second stage. See main_2 above. + */ + xexecve(argv[0], new_argv, new_argv + 2); + } + + xwaitpid(pid, &wstatus, 0); + if (wstatus != 0) { + print_string("error: unexpected exit status: "); + print_signed_dec(wstatus); + print_char('\n'); + *status = 1; + } +} + +static int main(int argc, char **argv) +{ + struct timeval initial_time = { -1, -1 }; + struct timeval vsyscall_time = { -1, -1 }; + struct timeval final_time = { -1, -1 }; + long vsyscall_diff, final_diff; + int status = 0; + + if (argc > 1) + switch (*argv[1]) { + case '2': + return main_2(); + default: + print_string("usage: "); + print_string(argv[0]); + print_string("\n"); + return 1; + } + + + xgettimeofday(&initial_time); + xvgettimeofday(&vsyscall_time); + xgettimeofday(&final_time); + vsyscall_diff = difftime(initial_time, vsyscall_time); + final_diff = difftime(vsyscall_time, final_time); + + print_time("initial gettimeofday", initial_time); + print_time("vsyscall gettimeofday", vsyscall_time); + print_time("final gettimeofday", final_time); + + if (initial_time.tv_sec < 0 || initial_time.tv_usec < 0 || + vsyscall_time.tv_sec < 0 || vsyscall_time.tv_usec < 0 || + final_time.tv_sec < 0 || final_time.tv_usec < 0) { + print_string("error: negative time\n"); + status = 1; + } + + print_string("differences: "); + print_signed_dec(vsyscall_diff); + print_char(' '); + print_signed_dec(final_diff); + print_char('\n'); + + if (vsyscall_diff < 0 || final_diff < 0) { + /* + * This may produce false positives if there is an active NTP. + */ + print_string("error: time went backwards\n"); + status = 1; + } + + check_lockout_after_fork(&status, 0); + check_lockout_after_fork(&status, 1); + check_no_lockout_after_execve(argv, &status); + + print_string("testing done, exit status: "); + print_signed_dec(status); + print_char('\n'); + return status; +} + +static void __attribute__ ((used)) main_trampoline(long *rsp) +{ + sys_exit(main(*rsp, (char **) (rsp + 1))); +} + +__asm__ (".text\n\t" + ".globl _start\n" + "_start:\n\t" + ".cfi_startproc\n\t" + ".cfi_undefined rip\n\t" + "movq %rsp, %rdi\n\t" + "callq main_trampoline\n\t" /* Results in psABI %rsp alignment. */ + ".cfi_endproc\n\t" + ".type _start, @function\n\t" + ".size _start, . - _start\n\t" + ".previous");
Distributions struggle with changing the default for vsyscall emulation because it is a clear break of userspace ABI, something that should not happen. The legacy vsyscall interface is supposed to be used by libcs only, not by applications. This commit adds a new arch_prctl request, ARCH_VSYSCALL_LOCKOUT. Newer libcs can adopt this request to signal to the kernel that the process does not need vsyscall emulation. The kernel can then disable it for the remaining lifetime of the process. Legacy libcs do not perform this call, so vsyscall remains enabled for them. This approach should achieves backwards compatibility (perfect compatibility if the assumption that only libcs use vsyscall is accurate), and it provides full hardening for new binaries. The chosen value of ARCH_VSYSCALL_LOCKOUT should avoid conflicts with outher x86-64 arch_prctl requests. Future arch_prctls requests commonly used at process startup can imply vsyscall lockout, so that a separate system call for the lockout is not needed. Signed-off-by: Florian Weimer <fweimer@redhat.com> --- arch/x86/entry/vsyscall/vsyscall_64.c | 6 + arch/x86/include/asm/mmu.h | 6 + arch/x86/include/uapi/asm/prctl.h | 2 + arch/x86/kernel/process_64.c | 5 + tools/arch/x86/include/uapi/asm/prctl.h | 2 + tools/testing/selftests/x86/Makefile | 13 +- tools/testing/selftests/x86/vsyscall_lockout.c | 431 +++++++++++++++++++++++++ 7 files changed, 462 insertions(+), 3 deletions(-)