Message ID | 151632014097.21271.16980532033566583357.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 01/19/2018, 01:02 AM, Dan Williams wrote: > The syscall table base is a user controlled function pointer in kernel > space. Like, 'get_user, use 'MASK_NOSPEC' to prevent any out of bounds > speculation. While retpoline prevents speculating into the user > controlled target it does not stop the pointer de-reference, the concern > is leaking memory relative to the syscall table base. > > Reported-by: Linus Torvalds <torvalds@linux-foundation.org> > Cc: Thomas Gleixner <tglx@linutronix.de> > Cc: Ingo Molnar <mingo@redhat.com> > Cc: "H. Peter Anvin" <hpa@zytor.com> > Cc: x86@kernel.org > Cc: Andy Lutomirski <luto@kernel.org> > Signed-off-by: Dan Williams <dan.j.williams@intel.com> > --- > arch/x86/entry/entry_64.S | 2 ++ > arch/x86/include/asm/smap.h | 9 ++++++++- > 2 files changed, 10 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S > index 4f8e1d35a97c..2320017077d4 100644 > --- a/arch/x86/entry/entry_64.S > +++ b/arch/x86/entry/entry_64.S > @@ -35,6 +35,7 @@ > #include <asm/asm.h> > #include <asm/smap.h> > #include <asm/pgtable_types.h> > +#include <asm/smap.h> This is already included 2 lines above thanks,
On Thu, Jan 18, 2018 at 04:02:21PM -0800, Dan Williams wrote: > The syscall table base is a user controlled function pointer in kernel > space. Like, 'get_user, use 'MASK_NOSPEC' to prevent any out of bounds > speculation. While retpoline prevents speculating into the user > controlled target it does not stop the pointer de-reference, the concern > is leaking memory relative to the syscall table base. This patch seems to cause a regression. An easy way to reproduce what I'm seeing is to run the samples/statx/test-statx. Here's what I see when I have this patchset applied: # ./test-statx /tmp statx(/tmp) = -1 /tmp: Bad file descriptor Reverting this single patch seems to fix it. Cheers, -- Luís > > Reported-by: Linus Torvalds <torvalds@linux-foundation.org> > Cc: Thomas Gleixner <tglx@linutronix.de> > Cc: Ingo Molnar <mingo@redhat.com> > Cc: "H. Peter Anvin" <hpa@zytor.com> > Cc: x86@kernel.org > Cc: Andy Lutomirski <luto@kernel.org> > Signed-off-by: Dan Williams <dan.j.williams@intel.com> > --- > arch/x86/entry/entry_64.S | 2 ++ > arch/x86/include/asm/smap.h | 9 ++++++++- > 2 files changed, 10 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S > index 4f8e1d35a97c..2320017077d4 100644 > --- a/arch/x86/entry/entry_64.S > +++ b/arch/x86/entry/entry_64.S > @@ -35,6 +35,7 @@ > #include <asm/asm.h> > #include <asm/smap.h> > #include <asm/pgtable_types.h> > +#include <asm/smap.h> > #include <asm/export.h> > #include <asm/frame.h> > #include <asm/nospec-branch.h> > @@ -264,6 +265,7 @@ entry_SYSCALL_64_fastpath: > cmpl $__NR_syscall_max, %eax > #endif > ja 1f /* return -ENOSYS (already in pt_regs->ax) */ > + MASK_NOSPEC %r11 %rax /* sanitize syscall_nr wrt speculation */ > movq %r10, %rcx > > /* > diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h > index 2b4ad4c6a226..3b5b2cf58dc6 100644 > --- a/arch/x86/include/asm/smap.h > +++ b/arch/x86/include/asm/smap.h > @@ -35,7 +35,14 @@ > * this directs the cpu 
to speculate with a NULL ptr rather than > * something targeting kernel memory. > * > - * assumes CF is set from a previous 'cmp TASK_addr_limit, %ptr' > + * In the syscall entry path it is possible to speculate past the > + * validation of the system call number. Use MASK_NOSPEC to sanitize the > + * syscall array index to zero (sys_read) rather than an arbitrary > + * target. > + * > + * assumes CF is set from a previous 'cmp' i.e.: > + * cmp TASK_addr_limit, %ptr > + * cmp __NR_syscall_max, %idx > */ > .macro MASK_NOSPEC mask val > sbb \mask, \mask > >
On Tue, Feb 6, 2018 at 11:29 AM, Luis Henriques <lhenriques@suse.com> wrote: > On Thu, Jan 18, 2018 at 04:02:21PM -0800, Dan Williams wrote: >> The syscall table base is a user controlled function pointer in kernel >> space. Like, 'get_user, use 'MASK_NOSPEC' to prevent any out of bounds >> speculation. While retpoline prevents speculating into the user >> controlled target it does not stop the pointer de-reference, the concern >> is leaking memory relative to the syscall table base. > > This patch seems to cause a regression. An easy way to reproduce what > I'm seeing is to run the samples/statx/test-statx. Here's what I see > when I have this patchset applied: > > # ./test-statx /tmp > statx(/tmp) = -1 > /tmp: Bad file descriptor > > Reverting this single patch seems to fix it. Just to clarify, when you say "this patch" you mean: 2fbd7af5af86 x86/syscall: Sanitize syscall table de-references under speculation ...not this early MASK_NOSPEC version of the patch, right? > > Cheers, > -- > Luís > >> >> Reported-by: Linus Torvalds <torvalds@linux-foundation.org> >> Cc: Thomas Gleixner <tglx@linutronix.de> >> Cc: Ingo Molnar <mingo@redhat.com> >> Cc: "H. 
Peter Anvin" <hpa@zytor.com> >> Cc: x86@kernel.org >> Cc: Andy Lutomirski <luto@kernel.org> >> Signed-off-by: Dan Williams <dan.j.williams@intel.com> >> --- >> arch/x86/entry/entry_64.S | 2 ++ >> arch/x86/include/asm/smap.h | 9 ++++++++- >> 2 files changed, 10 insertions(+), 1 deletion(-) >> >> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S >> index 4f8e1d35a97c..2320017077d4 100644 >> --- a/arch/x86/entry/entry_64.S >> +++ b/arch/x86/entry/entry_64.S >> @@ -35,6 +35,7 @@ >> #include <asm/asm.h> >> #include <asm/smap.h> >> #include <asm/pgtable_types.h> >> +#include <asm/smap.h> >> #include <asm/export.h> >> #include <asm/frame.h> >> #include <asm/nospec-branch.h> >> @@ -264,6 +265,7 @@ entry_SYSCALL_64_fastpath: >> cmpl $__NR_syscall_max, %eax >> #endif >> ja 1f /* return -ENOSYS (already in pt_regs->ax) */ >> + MASK_NOSPEC %r11 %rax /* sanitize syscall_nr wrt speculation */ >> movq %r10, %rcx >> >> /* >> diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h >> index 2b4ad4c6a226..3b5b2cf58dc6 100644 >> --- a/arch/x86/include/asm/smap.h >> +++ b/arch/x86/include/asm/smap.h >> @@ -35,7 +35,14 @@ >> * this directs the cpu to speculate with a NULL ptr rather than >> * something targeting kernel memory. >> * >> - * assumes CF is set from a previous 'cmp TASK_addr_limit, %ptr' >> + * In the syscall entry path it is possible to speculate past the >> + * validation of the system call number. Use MASK_NOSPEC to sanitize the >> + * syscall array index to zero (sys_read) rather than an arbitrary >> + * target. >> + * >> + * assumes CF is set from a previous 'cmp' i.e.: >> + * cmp TASK_addr_limit, %ptr >> + * cmp __NR_syscall_max, %idx >> */ >> .macro MASK_NOSPEC mask val >> sbb \mask, \mask >> >>
On Tue, Feb 6, 2018 at 11:48 AM, Dan Williams <dan.j.williams@intel.com> wrote: > > Just to clarify, when you say "this patch" you mean: > > 2fbd7af5af86 x86/syscall: Sanitize syscall table de-references > under speculation > > ...not this early MASK_NOSPEC version of the patch, right? I suspect not. If that patch is broken, the system wouldn't even boot. That said, looking at 2fbd7af5af86, I do note that the code generation is horribly stupid. It's due to two different issues: (a) the x86 asm constraints for that inline asm is nasty, and requires a register for 'size', even though an immediate works just fine. (b) the "cmp" is inside the asm, so gcc can't combine it with the *other* cmp in the C code. Fixing (a) is easy: +++ b/arch/x86/include/asm/barrier.h @@ -43 +43 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, - :"r"(size),"r" (index) + :"ir"(size),"r" (index) but fixing (b) looks fundamentally hard. Gcc generates (for do_syscall()): cmpq $332, %rbp #, nr ja .L295 #, cmp $333,%rbp sbb %rax,%rax; #, nr, mask note how it completely pointlessly does the comparison twice, even though it could have just done cmp $333,%rbp jae .L295 #, sbb %rax,%rax; #, nr, mask Ho humm. Sad. Linus
On Tue, Feb 6, 2018 at 12:26 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Tue, Feb 6, 2018 at 11:48 AM, Dan Williams <dan.j.williams@intel.com> wrote: >> >> Just to clarify, when you say "this patch" you mean: >> >> 2fbd7af5af86 x86/syscall: Sanitize syscall table de-references >> under speculation >> >> ...not this early MASK_NOSPEC version of the patch, right? > > I suspect not. If that patch is broken, the system wouldn't even boot. > > That said, looking at 2fbd7af5af86, I do note that the code generation > is horribly stupid. > > It's due to two different issues: > > (a) the x86 asm constraints for that inline asm is nasty, and > requires a register for 'size', even though an immediate works just > fine. > > (b) the "cmp" is inside the asm, so gcc can't combine it with the > *other* cmp in the C code. > > Fixing (a) is easy: > > +++ b/arch/x86/include/asm/barrier.h > @@ -43 +43 @@ static inline unsigned long > array_index_mask_nospec(unsigned long index, > - :"r"(size),"r" (index) > + :"ir"(size),"r" (index) > > but fixing (b) looks fundamentally hard. Gcc generates (for do_syscall()): > > cmpq $332, %rbp #, nr > ja .L295 #, > cmp $333,%rbp > sbb %rax,%rax; #, nr, mask > > note how it completely pointlessly does the comparison twice, even > though it could have just done > > cmp $333,%rbp > jae .L295 #, > sbb %rax,%rax; #, nr, mask > > Ho humm. Sad. Are there any compilers that would miscompile: mask = 0 - (index < size); That might be a way to improve the assembly.
On Tue, Feb 6, 2018 at 12:37 PM, Dan Williams <dan.j.williams@intel.com> wrote: > > Are there any compilers that would miscompile: > > mask = 0 - (index < size); > > That might be a way to improve the assembly. Sadly, that is *very* easy to miscompile. In fact, I'd be very surprised indeed if any compiler worth its name wouldn't combine the comparison with the conditional branch it accompanies, and just turn that into a constant. IOW, you'd get mask = 0 - (index < size); if (index <= size) { ... use mask .. and the compiler would just turn that into if (index <= size) { mask = -1; and be done with it. Linus
On Tue, Feb 6, 2018 at 12:42 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote: > > Sadly, that is *very* easy to miscompile. Side note: don't read email, go watch the Falcon Heavy takeoff. Linus
On Tue, Feb 6, 2018 at 8:42 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Tue, Feb 6, 2018 at 12:37 PM, Dan Williams <dan.j.williams@intel.com> wrote: >> >> Are there any compilers that would miscompile: >> >> mask = 0 - (index < size); >> >> That might be a way to improve the assembly. > > Sadly, that is *very* easy to miscompile. In fact, I'd be very > surprised indeed if any compiler worth its name wouldn't combine the > comparison with the conditional branch it accompanies, and just turn > that into a constant. IOW, you'd get > > mask = 0 - (index < size); > if (index <= size) { > ... use mask .. > > and the compiler would just turn that into > > if (index <= size) { > mask = -1; > > and be done with it. > > Linus Can you use @cc to make an asm statement that outputs both the masked array index and the "if" condition? I can never remember the syntax, but something like: asm ("cmp %[limit], %[index]\n\tcmovae %[zero], %[index]" : [index] "+" (index), "@ccb" (result)); Then you shove this into a statement expression macro so you can do: if (index_mask_nospec(&nr, NR_syscalls)) { ... sys_call_table[nr] ..; } (Caveat emptor: I can also *never* remember which way the $*!& AT&T syntax cmp instruction goes.) A down side is that nr actually ends up containing zero outside the if. *That* could be avoided with jump labels. --Andy
On Tue, Feb 6, 2018 at 12:49 PM, Andy Lutomirski <luto@kernel.org> wrote: > > Can you use @cc to make an asm statement that outputs both the masked > array index and the "if" condition? I can never remember the syntax, > but something like: Yes. Although I'd actually suggest just using an "asm goto" if we really want to optimize this. Give the "index_mask_nospec()" a third argument that is the label to jump to for overflow. Then you can just decide how to implement it best for any particular architecture (and compiler limitation). Linus
On Tue, Feb 6, 2018 at 12:58 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Tue, Feb 6, 2018 at 12:49 PM, Andy Lutomirski <luto@kernel.org> wrote: >> >> Can you use @cc to make an asm statement that outputs both the masked >> array index and the "if" condition? I can never remember the syntax, >> but something like: > > Yes. Although I'd actually suggest just using an "asm goto" if we > really want to optimize this. Give the "index_mask_nospec()" a third > argument that is the label to jump to for overflow. > > Then you can just decide how to implement it best for any particular > architecture (and compiler limitation). At that point we're basically just back to the array_ptr() version that returned a sanitized pointer to an array element. call = array_ptr(sys_call_table, nr & __SYSCALL_MASK, NR_syscalls); if (likely(call)) regs->ax = (*call)( regs->di, regs->si, regs->dx, regs->r10, regs->r8, regs->r9); e1e: ba 4d 01 00 00 mov $0x14d,%edx e23: 48 39 d5 cmp %rdx,%rbp e26: 48 19 d2 sbb %rdx,%rdx call = array_ptr(sys_call_table, nr & __SYSCALL_MASK, NR_syscalls); e29: 48 21 d5 and %rdx,%rbp e2c: 48 8d 04 ed 00 00 00 lea 0x0(,%rbp,8),%rax e33: 00 if (likely(call)) e34: 48 21 d0 and %rdx,%rax e37: 74 1e je e57 <do_syscall_64+0x77> regs->ax = (*call)( e39: 48 8b 4b 38 mov 0x38(%rbx),%rcx e3d: 48 8b 53 60 mov 0x60(%rbx),%rdx e41: 48 8b 73 68 mov 0x68(%rbx),%rsi e45: 48 8b 7b 70 mov 0x70(%rbx),%rdi e49: 4c 8b 4b 40 mov 0x40(%rbx),%r9 e4d: 4c 8b 43 48 mov 0x48(%rbx),%r8 e51: ff 10 callq *(%rax) e53: 48 89 43 50 mov %rax,0x50(%rbx) e57: 65 48 8b 04 25 00 00 mov %gs:0x0,%rax
On Tue, Feb 06, 2018 at 11:48:45AM -0800, Dan Williams wrote: > On Tue, Feb 6, 2018 at 11:29 AM, Luis Henriques <lhenriques@suse.com> wrote: > > On Thu, Jan 18, 2018 at 04:02:21PM -0800, Dan Williams wrote: > >> The syscall table base is a user controlled function pointer in kernel > >> space. Like, 'get_user, use 'MASK_NOSPEC' to prevent any out of bounds > >> speculation. While retpoline prevents speculating into the user > >> controlled target it does not stop the pointer de-reference, the concern > >> is leaking memory relative to the syscall table base. > > > > This patch seems to cause a regression. An easy way to reproduce what > > I'm seeing is to run the samples/statx/test-statx. Here's what I see > > when I have this patchset applied: > > > > # ./test-statx /tmp > > statx(/tmp) = -1 > > /tmp: Bad file descriptor > > > > Reverting this single patch seems to fix it. > > Just to clarify, when you say "this patch" you mean: > > 2fbd7af5af86 x86/syscall: Sanitize syscall table de-references > under speculation > > ...not this early MASK_NOSPEC version of the patch, right? *sigh* Looks like I spent some good amount of time hunting a non-issue just because I have enough old branches hanging around to confuse me :-( Sorry for the noise. Cheers, -- Luís
On Tue, Feb 6, 2018 at 1:37 PM, Dan Williams <dan.j.williams@intel.com> wrote: > > At that point we're basically just back to the array_ptr() version > that returned a sanitized pointer to an array element. .. that one does an extra unnecessary 'andq' instead of the duplicated cmp. But at least it avoids comparing that 32-bit integer twice, so it's probably slightly smaller. (And your code generation is without the "r" -> "ir" fix for the size argument) Probably doesn't matter. But an "asm goto" would give you at least potentially optimal code. Linus
On Tue, Feb 6, 2018 at 2:52 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Tue, Feb 6, 2018 at 1:37 PM, Dan Williams <dan.j.williams@intel.com> wrote: >> >> At that point we're basically just back to the array_ptr() version >> that returned a sanitized pointer to an array element. > > .. that one does an extra unnecessary 'andq' instead of the duplicated > cmp. But at least it avoids comparing that 32-bit integer twice, so > it's probably slightly smaller. > > (And your code generation is without the "r" -> "ir" fix for the size argument) > > Probably doesn't matter. But a "asm goto" would give you at least > potentially optimal code. > Should we go with array_element_nospec() in the meantime? So we're not depending on jump labels? With the constraint fix and killing that superfluous AND the assembly is now: e26: 48 81 fd 4d 01 00 00 cmp $0x14d,%rbp e2d: 48 19 d2 sbb %rdx,%rdx NR_syscalls); if (likely(call)) e30: 48 21 d0 and %rdx,%rax e33: 74 1e je e53 <do_syscall_64+0x73> regs->ax = (*call)(regs->di, regs->si, regs->dx, e35: 48 8b 4b 38 mov 0x38(%rbx),%rcx e39: 48 8b 53 60 mov 0x60(%rbx),%rdx e3d: 48 8b 73 68 mov 0x68(%rbx),%rsi e41: 48 8b 7b 70 mov 0x70(%rbx),%rdi e45: 4c 8b 4b 40 mov 0x40(%rbx),%r9 e49: 4c 8b 43 48 mov 0x48(%rbx),%r8 e4d: ff 10 callq *(%rax) e4f: 48 89 43 50 mov %rax,0x50(%rbx) e53: 65 48 8b 04 25 00 00 mov %gs:0x0,%rax
On Tue, Feb 6, 2018 at 4:33 PM, Dan Williams <dan.j.williams@intel.com> wrote: > > Should we go with array_element_nospec() in the meantime? So we're not > depending on jump labels? With the constraint fix and killing that > superfluous AND the assembly is now: > > e26: 48 81 fd 4d 01 00 00 cmp $0x14d,%rbp > e2d: 48 19 d2 sbb %rdx,%rdx > NR_syscalls); > if (likely(call)) > e30: 48 21 d0 and %rdx,%rax > e33: 74 1e je e53 <do_syscall_64+0x73> > regs->ax = (*call)(regs->di, regs->si, regs->dx, > e35: 48 8b 4b 38 mov 0x38(%rbx),%rcx > e39: 48 8b 53 60 mov 0x60(%rbx),%rdx > e3d: 48 8b 73 68 mov 0x68(%rbx),%rsi > e41: 48 8b 7b 70 mov 0x70(%rbx),%rdi > e45: 4c 8b 4b 40 mov 0x40(%rbx),%r9 > e49: 4c 8b 43 48 mov 0x48(%rbx),%r8 > e4d: ff 10 callq *(%rax) That looks fairly optimal, except for the fact that the callq is through a register. Of course, that register-indirect calling convention is forced on us by retpoline anyway (which you don't have enabled, likely because of a lack of compiler). But without retpoline that callq could be callq sys_call_table(,%rax,8) if the masking is done on the index (and if the conditional jump had been done on the cmp rather than the later 'and'). Instead, you have a leaq sys_call_table(,%rbp,8),%rax hiding somewhere earlier that doesn't show in your asm snippet. Oh well. We'll have an extra instruction however we do this. I guess that's just something we'll have to live with. No more bikeshedding.. Linus
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4f8e1d35a97c..2320017077d4 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -35,6 +35,7 @@ #include <asm/asm.h> #include <asm/smap.h> #include <asm/pgtable_types.h> +#include <asm/smap.h> #include <asm/export.h> #include <asm/frame.h> #include <asm/nospec-branch.h> @@ -264,6 +265,7 @@ entry_SYSCALL_64_fastpath: cmpl $__NR_syscall_max, %eax #endif ja 1f /* return -ENOSYS (already in pt_regs->ax) */ + MASK_NOSPEC %r11 %rax /* sanitize syscall_nr wrt speculation */ movq %r10, %rcx /* diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 2b4ad4c6a226..3b5b2cf58dc6 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -35,7 +35,14 @@ * this directs the cpu to speculate with a NULL ptr rather than * something targeting kernel memory. * - * assumes CF is set from a previous 'cmp TASK_addr_limit, %ptr' + * In the syscall entry path it is possible to speculate past the + * validation of the system call number. Use MASK_NOSPEC to sanitize the + * syscall array index to zero (sys_read) rather than an arbitrary + * target. + * + * assumes CF is set from a previous 'cmp' i.e.: + * cmp TASK_addr_limit, %ptr + * cmp __NR_syscall_max, %idx */ .macro MASK_NOSPEC mask val sbb \mask, \mask
The syscall table base is a user controlled function pointer in kernel space. Like 'get_user', use 'MASK_NOSPEC' to prevent any out of bounds speculation. While retpoline prevents speculating into the user controlled target it does not stop the pointer de-reference; the concern is leaking memory relative to the syscall table base. Reported-by: Linus Torvalds <torvalds@linux-foundation.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Ingo Molnar <mingo@redhat.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: x86@kernel.org Cc: Andy Lutomirski <luto@kernel.org> Signed-off-by: Dan Williams <dan.j.williams@intel.com> --- arch/x86/entry/entry_64.S | 2 ++ arch/x86/include/asm/smap.h | 9 ++++++++- 2 files changed, 10 insertions(+), 1 deletion(-)