| Message ID | 20170311000501.46607-2-thgarnie@google.com (mailing list archive) |
|---|---|
| State | New, archived |
* Thomas Garnier <thgarnie@google.com> wrote:

> Implement specific usage of verify_pre_usermode_state for user-mode
> returns for x86.
> ---
> Based on next-20170308
> ---
>  arch/x86/Kconfig                        |  1 +
>  arch/x86/entry/common.c                 |  3 +++
>  arch/x86/entry/entry_64.S               | 19 +++++++++++++++++++
>  arch/x86/include/asm/pgtable_64_types.h | 11 +++++++++++
>  arch/x86/include/asm/processor.h        | 11 -----------
>  5 files changed, 34 insertions(+), 11 deletions(-)
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 005df7c825f5..6d48e18e6f09 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -63,6 +63,7 @@ config X86
>  	select ARCH_MIGHT_HAVE_ACPI_PDC		if ACPI
>  	select ARCH_MIGHT_HAVE_PC_PARPORT
>  	select ARCH_MIGHT_HAVE_PC_SERIO
> +	select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
>  	select ARCH_SUPPORTS_ATOMIC_RMW
>  	select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
>  	select ARCH_SUPPORTS_NUMA_BALANCING	if X86_64
>
> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> index 370c42c7f046..525edbb77f03 100644
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -22,6 +22,7 @@
>  #include <linux/context_tracking.h>
>  #include <linux/user-return-notifier.h>
>  #include <linux/uprobes.h>
> +#include <linux/syscalls.h>
>
>  #include <asm/desc.h>
>  #include <asm/traps.h>
> @@ -180,6 +181,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
>  	struct thread_info *ti = current_thread_info();
>  	u32 cached_flags;
>
> +	verify_pre_usermode_state();
> +
>  	if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
>  		local_irq_disable();
>
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index d2b2a2948ffe..04db589be466 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -218,6 +218,25 @@ entry_SYSCALL_64_fastpath:
>  	testl	$_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
>  	jnz	1f
>
> +	/*
> +	 * Check user-mode state on fast path return, the same check is done
> +	 * under the slow path through syscall_return_slowpath.
> +	 */
> +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
> +	call	verify_pre_usermode_state
> +#else
> +	/*
> +	 * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
> +	 * warning.
> +	 */
> +	movq	PER_CPU_VAR(current_task), %rax
> +	movq	$TASK_SIZE_MAX, %rcx
> +	cmp	%rcx, TASK_addr_limit(%rax)
> +	jz	1f
> +	movq	%rcx, TASK_addr_limit(%rax)
> +1:
> +#endif
> +
>  	LOCKDEP_SYS_EXIT
>  	TRACE_IRQS_ON			/* user mode is traced as IRQs on */
>  	movq	RIP(%rsp), %rcx

Ugh, so you call an assembly function just to ... call another function.

Plus why is it in assembly to begin with? Is this some older code that got
written when the x86 entry code was in assembly, and never properly
converted to C?

Thanks,

	Ingo
On Sat, Mar 11, 2017 at 1:42 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> Ugh, so you call an assembly function just to ... call another function.
>

The verify_pre_usermode_state function is the architecture-independent
checker. By default it is added to each syscall handler, so calling it
here saves code size.

> Plus why is it in assembly to begin with? Is this some older code that got
> written when the x86 entry code was in assembly, and never properly
> converted to C?

I wrote the assembly to make it faster and save a call on each syscall
return. I can just call verify_pre_usermode_state if you prefer.
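For readers who do not have the rest of the series in front of them: the generic checker being discussed verifies, just before returning to user mode, that the task's address limit is still USER_DS, i.e. that no set_fs(KERNEL_DS) was left in effect. The sketch below is an editorial illustration of that idea only, not the code from the posted series; the exact reporting policy and helpers used are assumptions inferred from the discussion above.

```c
#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/kconfig.h>
#include <linux/uaccess.h>

/*
 * Editorial sketch of the architecture-independent check discussed in
 * this thread -- not the implementation from the actual series.
 */
void verify_pre_usermode_state(void)
{
	if (unlikely(!segment_eq(get_fs(), USER_DS))) {
		/* Report the corruption, or silently repair it. */
		if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION))
			BUG();
		set_fs(USER_DS);
	}
}
```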
On 03/11/17 01:42, Ingo Molnar wrote:
>>
>> +	/*
>> +	 * Check user-mode state on fast path return, the same check is done
>> +	 * under the slow path through syscall_return_slowpath.
>> +	 */
>> +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>> +	call	verify_pre_usermode_state
>> +#else
>> +	/*
>> +	 * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
>> +	 * warning.
>> +	 */
>> +	movq	PER_CPU_VAR(current_task), %rax
>> +	movq	$TASK_SIZE_MAX, %rcx
>> +	cmp	%rcx, TASK_addr_limit(%rax)
>> +	jz	1f
>> +	movq	%rcx, TASK_addr_limit(%rax)
>> +1:
>> +#endif
>> +

How about simply doing...

	movq	PER_CPU_VAR(current_task), %rax
	movq	$TASK_SIZE_MAX, %rcx
#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
	cmpq	%rcx, TASK_addr_limit(%rax)
	jne	syscall_return_slowpath
#else
	movq	%rcx, TASK_addr_limit(%rax)
#endif

... and let the slow path take care of BUG. This should be much faster,
even with the BUG, and is simpler to boot.

	-hpa
On 03/13/17 17:04, H. Peter Anvin wrote:
> On 03/11/17 01:42, Ingo Molnar wrote:
>>>
>>> +	/*
>>> +	 * Check user-mode state on fast path return, the same check is done
>>> +	 * under the slow path through syscall_return_slowpath.
>>> +	 */
>>> +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>>> +	call	verify_pre_usermode_state
>>> +#else
>>> +	/*
>>> +	 * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
>>> +	 * warning.
>>> +	 */
>>> +	movq	PER_CPU_VAR(current_task), %rax
>>> +	movq	$TASK_SIZE_MAX, %rcx
>>> +	cmp	%rcx, TASK_addr_limit(%rax)
>>> +	jz	1f
>>> +	movq	%rcx, TASK_addr_limit(%rax)
>>> +1:
>>> +#endif
>>> +
>
> How about simply doing...
>
> 	movq	PER_CPU_VAR(current_task), %rax
> 	movq	$TASK_SIZE_MAX, %rcx
> #ifdef CONFIG_BUG_ON_DATA_CORRUPTION
> 	cmpq	%rcx, TASK_addr_limit(%rax)
> 	jne	syscall_return_slowpath
> #else
> 	movq	%rcx, TASK_addr_limit(%rax)
> #endif
>
> ... and let the slow path take care of BUG. This should be much faster,
> even with the BUG, and is simpler to boot.
>

In fact, we could even do the cmpq/jne unconditionally. I'm guessing
the occasional branch mispredict will be offset by occasionally touching
a clean cacheline in the case of an unconditional store.

Since this is something that should never happen, performance doesn't
matter.

	-hpa
On Tue, Mar 14, 2017 at 2:40 AM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 03/13/17 17:04, H. Peter Anvin wrote:
>> On 03/11/17 01:42, Ingo Molnar wrote:
>>>>
>>>> +	/*
>>>> +	 * Check user-mode state on fast path return, the same check is done
>>>> +	 * under the slow path through syscall_return_slowpath.
>>>> +	 */
>>>> +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>>>> +	call	verify_pre_usermode_state
>>>> +#else
>>>> +	/*
>>>> +	 * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
>>>> +	 * warning.
>>>> +	 */
>>>> +	movq	PER_CPU_VAR(current_task), %rax
>>>> +	movq	$TASK_SIZE_MAX, %rcx
>>>> +	cmp	%rcx, TASK_addr_limit(%rax)
>>>> +	jz	1f
>>>> +	movq	%rcx, TASK_addr_limit(%rax)
>>>> +1:
>>>> +#endif
>>>> +
>>
>> How about simply doing...
>>
>> 	movq	PER_CPU_VAR(current_task), %rax
>> 	movq	$TASK_SIZE_MAX, %rcx
>> #ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>> 	cmpq	%rcx, TASK_addr_limit(%rax)
>> 	jne	syscall_return_slowpath
>> #else
>> 	movq	%rcx, TASK_addr_limit(%rax)
>> #endif
>>
>> ... and let the slow path take care of BUG. This should be much faster,
>> even with the BUG, and is simpler to boot.
>>
>
> In fact, we could even to the cmpq/jne unconditionally. I'm guessing
> the occasional branch mispredict will be offset by occasionally touching
> a clean cacheline in the case of an unconditional store.
>
> Since this is something that should never happen, performance doesn't
> matter.

Ingo: Which approach do you favor? I want to keep the fast path as
fast as possible obviously.

>
> 	-hpa
>
On Tue, Mar 14, 2017 at 8:17 AM, Thomas Garnier <thgarnie@google.com> wrote:
> On Tue, Mar 14, 2017 at 2:40 AM, H. Peter Anvin <hpa@zytor.com> wrote:
>> On 03/13/17 17:04, H. Peter Anvin wrote:
>>> On 03/11/17 01:42, Ingo Molnar wrote:
>>>>>
>>>>> +	/*
>>>>> +	 * Check user-mode state on fast path return, the same check is done
>>>>> +	 * under the slow path through syscall_return_slowpath.
>>>>> +	 */
>>>>> +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>>>>> +	call	verify_pre_usermode_state
>>>>> +#else
>>>>> +	/*
>>>>> +	 * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
>>>>> +	 * warning.
>>>>> +	 */
>>>>> +	movq	PER_CPU_VAR(current_task), %rax
>>>>> +	movq	$TASK_SIZE_MAX, %rcx
>>>>> +	cmp	%rcx, TASK_addr_limit(%rax)
>>>>> +	jz	1f
>>>>> +	movq	%rcx, TASK_addr_limit(%rax)
>>>>> +1:
>>>>> +#endif
>>>>> +
>>>
>>> How about simply doing...
>>>
>>> 	movq	PER_CPU_VAR(current_task), %rax
>>> 	movq	$TASK_SIZE_MAX, %rcx
>>> #ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>>> 	cmpq	%rcx, TASK_addr_limit(%rax)
>>> 	jne	syscall_return_slowpath
>>> #else
>>> 	movq	%rcx, TASK_addr_limit(%rax)
>>> #endif
>>>
>>> ... and let the slow path take care of BUG. This should be much faster,
>>> even with the BUG, and is simpler to boot.
>>>
>>
>> In fact, we could even to the cmpq/jne unconditionally. I'm guessing
>> the occasional branch mispredict will be offset by occasionally touching
>> a clean cacheline in the case of an unconditional store.
>>
>> Since this is something that should never happen, performance doesn't
>> matter.
>
> Ingo: Which approach do you favor? I want to keep the fast path as
> fast as possible obviously.

Even though my name isn't Ingo, Linus keeps trying to get me to be the
actual maintainer of this file. :)  How about (sorry about whitespace
damage):

#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
	movq PER_CPU_VAR(current_task), %rax
	bt $63, TASK_addr_limit(%rax)
	jc syscall_return_slowpath
#endif

Now the kernel is totally unchanged if the config option is off and
it's fast and simple if the option is on.
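An editorial aside on why testing bit 63 works here: the only two values addr_limit should legitimately hold are USER_DS (TASK_SIZE_MAX, which is below 2^47 and therefore has bit 63 clear) and KERNEL_DS (-1UL, which has it set), so a single sign-bit test is enough to catch a limit left at KERNEL_DS. A hedged C rendering of the same check, for illustration only:

```c
#include <linux/types.h>

/*
 * C equivalent of the "bt $63, TASK_addr_limit(%rax)" test above --
 * editorial illustration, not part of the posted patch.
 */
static inline bool addr_limit_looks_corrupted(unsigned long addr_limit)
{
	/* Bit 63 set means the limit cannot be USER_DS. */
	return (long)addr_limit < 0;
}
```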
On Tue, Mar 14, 2017 at 8:39 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> Even though my name isn't Ingo, Linus keeps trying to get me to be the
> actual maintainer of this file. :) How about (sorry about whitespace
> damage):

:D

>
> #ifdef CONFIG_BUG_ON_DATA_CORRUPTION
> 	movq PER_CPU_VAR(current_task), %rax
> 	bt $63, TASK_addr_limit(%rax)
> 	jc syscall_return_slowpath
> #endif
>
> Now the kernel is totally unchanged if the config option is off and
> it's fast and simple if the option is on.

I like using bt for fast comparison.

We want to enforce the address limit by default, not only when
CONFIG_BUG_ON_DATA_CORRUPTION is enabled. I tested this one:

	/* Check user-mode state on fast path return. */
	movq	PER_CPU_VAR(current_task), %rax
	btq	$63, TASK_addr_limit(%rax)
	jnc	1f
#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
	call	syscall_return_slowpath
	jmp	return_from_SYSCALL_64
#else
	movq	$TASK_SIZE_MAX, %rcx
	movq	%rcx, TASK_addr_limit(%rax)
#endif
1:

I saw that syscall_return_slowpath is supposed to be called, not jumped
to. I could just call verify_pre_usermode_state, which would be about
the same.

If we want to avoid the #ifdef, then I guess this one is the best I can
think of:

	/* Check user-mode state on fast path return. */
	movq	PER_CPU_VAR(current_task), %rax
	btq	$63, TASK_addr_limit(%rax)
	jnc	1f
	call	verify_pre_usermode_state
1:

The check is fast and the call will happen only on corruption.

What do you think?
On 03/14/17 08:39, Andy Lutomirski wrote:
>>
>> Ingo: Which approach do you favor? I want to keep the fast path as
>> fast as possible obviously.
>
> Even though my name isn't Ingo, Linus keeps trying to get me to be the
> actual maintainer of this file. :) How about (sorry about whitespace
> damage):
>
> #ifdef CONFIG_BUG_ON_DATA_CORRUPTION
> 	movq PER_CPU_VAR(current_task), %rax
> 	bt $63, TASK_addr_limit(%rax)
> 	jc syscall_return_slowpath
> #endif
>
> Now the kernel is totally unchanged if the config option is off and
> it's fast and simple if the option is on.
>

The idea as far as I understand was that the option decides whether to
clobber the broken value or BUG on it, not whether to remove the check.

My point, though, was that we can bail out to the slow path if there is
a discrepancy and worry about BUG or not there; performance doesn't
matter one iota if this triggers, regardless of the remediation.

It isn't clear that using bt would be faster, though; although it saves
an instruction, that instruction can be hoisted arbitrarily and so is
extremely likely to be hidden in the pipeline. cmp (which is really a
variant of sub) is one of the basic ALU instructions that are
super-optimized on every CPU, whereas bt is substantially slower on some
implementations.

This version is also "slightly less secure", since it would make it
possible to overwrite the guard page at the end of TASK_SIZE_MAX if one
could figure out a way to put an arbitrary value into this variable, but
I doubt that matters in any way.

	-hpa
On 03/14/17 09:29, Thomas Garnier wrote:
>
> We want to enforce the address limit by default, not only when
> CONFIG_BUG_ON_DATA_CORRUPTION is enabled. I tested this one:
>
> 	/* Check user-mode state on fast path return. */
> 	movq	PER_CPU_VAR(current_task), %rax
> 	btq	$63, TASK_addr_limit(%rax)
> 	jnc	1f
> #ifdef CONFIG_BUG_ON_DATA_CORRUPTION
> 	call	syscall_return_slowpath
> 	jmp	return_from_SYSCALL_64
> #else
> 	movq	$TASK_SIZE_MAX, %rcx
> 	movq	%rcx, TASK_addr_limit(%rax)
> #endif
> 1:
>
> I saw that syscall_return_slowpath is supposed to be called not jumped
> to. I could just call verify_pre_usermode_state that would be about
> the same.

I wanted to comment on that thing: why on earth isn't
verify_pre_usermode_state() an inline?  Making it an out-of-line
function adds pointless extra overhead to the C code when we are talking
about a few instructions.

Second, you never do a branch-around to handle an exceptional condition
on the fast path: you jump *out of line* to handle the special
condition; a forward branch is preferred since it is slightly more
likely to be predicted not taken.

Now, I finally had a chance to actually look at the full file (I was
preoccupied yesterday), and am a bit disappointed, to say the least.

First of all, the jump target you need is only a handful of instructions
further down the code path; you need to do *exactly* what is done when
the _TIF_ALLWORK_MASK test right above triggers!  Not only that, but
you already have PER_CPU_VAR(current_task) in %r11, just ready to be
used!  This was all in the three instructions immediately prior to the
code you modified...

So, all you'd need would be:

	movq	$TASK_SIZE_MAX, %rcx
	cmpq	%rcx, TASK_addr_limit(%r11)
	jne	1f

We even get a short jump instruction!

(Using bt saves one more instruction, but see previous caveats about it.)

	-hpa
On Tue, Mar 14, 2017 at 9:44 AM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 03/14/17 09:29, Thomas Garnier wrote:
>>
>> We want to enforce the address limit by default, not only when
>> CONFIG_BUG_ON_DATA_CORRUPTION is enabled. I tested this one:
>>
>> 	/* Check user-mode state on fast path return. */
>> 	movq	PER_CPU_VAR(current_task), %rax
>> 	btq	$63, TASK_addr_limit(%rax)
>> 	jnc	1f
>> #ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>> 	call	syscall_return_slowpath
>> 	jmp	return_from_SYSCALL_64
>> #else
>> 	movq	$TASK_SIZE_MAX, %rcx
>> 	movq	%rcx, TASK_addr_limit(%rax)
>> #endif
>> 1:
>>
>> I saw that syscall_return_slowpath is supposed to be called not jumped
>> to. I could just call verify_pre_usermode_state that would be about
>> the same.
>
> I wanted to comment on that thing: why on earth isn't
> verify_pre_usermode_state() an inline? Making it an out-of-line
> function adds pointless extra overhead to the C code when we are talking
> about a few instructions.

Because outside of the arch-specific implementations it is called by
each syscall handler; inlining it would increase the code size a lot.

>
> Second, you never do a branch-around to handle an exceptional condition
> on the fast path: you jump *out of line* to handle the special
> condition; a forward branch is preferred since it is slightly more
> likely to be predicted not taken.
>
> Now, I finally had a chance to actually look at the full file (I was
> preoccupied yesterday), and am a bit disappointed, to say the least.
>
> First of all, the jump target you need is only a handful of instructions
> further down the code path; you need to do *exactly* what is done when
> the _TIF_ALLWORK_MASK test right above triggers! Not only that, but
> you already have PER_CPU_VAR(current_task) in %r11, just ready to be
> used! This was all in the three instructions immediately prior to the
> code you modified...

Correct.

>
> So, all you'd need would be:
>
> 	movq	$TASK_SIZE_MAX, %rcx
> 	cmpq	%rcx, TASK_addr_limit(%r11)
> 	jne	1f
>
> We even get a short jump instruction!
>
> (Using bt saves one more instruction, but see previous caveats about it.)

Okay, that seems fair to me. Andy, what do you think (given you
suggested the bt)? Ingo?

Thanks for the feedback.

>
> 	-hpa
>
On 03/14/17 09:51, Thomas Garnier wrote:
>>
>> I wanted to comment on that thing: why on earth isn't
>> verify_pre_usermode_state() an inline?  Making it an out-of-line
>> function adds pointless extra overhead to the C code when we are talking
>> about a few instructions.
>
> Because outside of arch specific implementation it is called by each
> syscall handler. it will increase the code size a lot.
>

Don't assume that.  On a lot of architectures a function call can be
more expensive than a simple compare and branch, because the compiler
has to assume a whole bunch of registers are lost at that point.

Either way, don't penalize the common architectures for it.  Not okay.

	-hpa
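The disagreement here is the usual inline-versus-out-of-line trade-off. One way to reconcile hpa's overhead concern with Thomas's code-size concern is the standard fast-path/slow-path split: keep the check itself inline (a compare and a well-predicted branch at each call site) and move only the rare reporting/fix-up work out of line. The sketch below is editorial, uses a hypothetical slow-path name, and is not something posted in this thread.

```c
#include <linux/compiler.h>
#include <linux/uaccess.h>

/* Out of line: reached only when the address limit is wrong. */
void verify_pre_usermode_state_slowpath(void);

/*
 * Inline fast path: one compare and one unlikely branch per syscall
 * return. Hypothetical compromise, sketched for illustration only.
 */
static inline void verify_pre_usermode_state_check(void)
{
	if (unlikely(!segment_eq(get_fs(), USER_DS)))
		verify_pre_usermode_state_slowpath();
}
```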
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 005df7c825f5..6d48e18e6f09 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -63,6 +63,7 @@ config X86
 	select ARCH_MIGHT_HAVE_ACPI_PDC		if ACPI
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_MIGHT_HAVE_PC_SERIO
+	select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
 	select ARCH_SUPPORTS_NUMA_BALANCING	if X86_64
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 370c42c7f046..525edbb77f03 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -22,6 +22,7 @@
 #include <linux/context_tracking.h>
 #include <linux/user-return-notifier.h>
 #include <linux/uprobes.h>
+#include <linux/syscalls.h>
 
 #include <asm/desc.h>
 #include <asm/traps.h>
@@ -180,6 +181,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
 	struct thread_info *ti = current_thread_info();
 	u32 cached_flags;
 
+	verify_pre_usermode_state();
+
 	if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
 		local_irq_disable();
 
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d2b2a2948ffe..04db589be466 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -218,6 +218,25 @@ entry_SYSCALL_64_fastpath:
 	testl	$_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
 	jnz	1f
 
+	/*
+	 * Check user-mode state on fast path return, the same check is done
+	 * under the slow path through syscall_return_slowpath.
+	 */
+#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
+	call	verify_pre_usermode_state
+#else
+	/*
+	 * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
+	 * warning.
+	 */
+	movq	PER_CPU_VAR(current_task), %rax
+	movq	$TASK_SIZE_MAX, %rcx
+	cmp	%rcx, TASK_addr_limit(%rax)
+	jz	1f
+	movq	%rcx, TASK_addr_limit(%rax)
+1:
+#endif
+
 	LOCKDEP_SYS_EXIT
 	TRACE_IRQS_ON			/* user mode is traced as IRQs on */
 	movq	RIP(%rsp), %rcx
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 3a264200c62f..0fbbb79d058c 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -76,4 +76,15 @@ typedef struct { pteval_t pte; } pte_t;
 
 #define EARLY_DYNAMIC_PAGE_TABLES	64
 
+/*
+ * User space process size. 47bits minus one guard page. The guard
+ * page is necessary on Intel CPUs: if a SYSCALL instruction is at
+ * the highest possible canonical userspace address, then that
+ * syscall will enter the kernel with a non-canonical return
+ * address, and SYSRET will explode dangerously. We avoid this
+ * particular problem by preventing anything from being mapped
+ * at the maximum canonical address.
+ */
+#define TASK_SIZE_MAX	((_AC(1, UL) << 47) - PAGE_SIZE)
+
 #endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f385eca5407a..9bc99d37133e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -829,17 +829,6 @@ static inline void spin_lock_prefetch(const void *x)
 #define KSTK_ESP(task)		(task_pt_regs(task)->sp)
 
 #else
-/*
- * User space process size. 47bits minus one guard page. The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously. We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
- */
-#define TASK_SIZE_MAX	((1UL << 47) - PAGE_SIZE)
-
 /* This decides where the kernel will search for a free chunk of vm
  * space during mmap's.
  */
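One detail worth noting in the last two hunks: TASK_SIZE_MAX moves from processor.h to pgtable_64_types.h and its definition changes from 1UL << 47 to _AC(1, UL) << 47. The _AC() helper is what lets entry_64.S use `movq $TASK_SIZE_MAX, %rcx` directly: it appends the UL suffix when compiling C and drops it when assembling, roughly as paraphrased below from include/uapi/linux/const.h.

```c
/* Paraphrase of the _AC() helper from include/uapi/linux/const.h. */
#ifdef __ASSEMBLY__
# define _AC(X, Y)	X		/* assembler: no UL/ULL suffix */
#else
# define __AC(X, Y)	(X##Y)
# define _AC(X, Y)	__AC(X, Y)	/* C: paste the type suffix on */
#endif
```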