Message ID | 20170410164420.64003-1-thgarnie@google.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Mon, Apr 10, 2017 at 9:44 AM, Thomas Garnier <thgarnie@google.com> wrote: > This patch ensures a syscall does not return to user-mode with a kernel > address limit. If that happened, a process can corrupt kernel-mode > memory and elevate privileges. > > For example, it would mitigation this bug: > > - https://bugs.chromium.org/p/project-zero/issues/detail?id=990 > > The CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE option is also > added so each architecture can optimize this change. > > Signed-off-by: Thomas Garnier <thgarnie@google.com> > Tested-by: Kees Cook <keescook@chromium.org> Ingo, I think this series is ready. Can you pull it? (And if not, what should next steps be?) -Kees > --- > Based on next-20170410 > --- > arch/s390/Kconfig | 1 + > include/linux/syscalls.h | 26 +++++++++++++++++++++++++- > init/Kconfig | 6 ++++++ > kernel/sys.c | 13 +++++++++++++ > 4 files changed, 45 insertions(+), 1 deletion(-) > > diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig > index d25435d94b6e..489a0cc6e46b 100644 > --- a/arch/s390/Kconfig > +++ b/arch/s390/Kconfig > @@ -103,6 +103,7 @@ config S390 > select ARCH_INLINE_WRITE_UNLOCK_BH > select ARCH_INLINE_WRITE_UNLOCK_IRQ > select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE > + select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE > select ARCH_SAVE_PAGE_KEYS if HIBERNATION > select ARCH_SUPPORTS_ATOMIC_RMW > select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h > index 980c3c9b06f8..801a7a74fe28 100644 > --- a/include/linux/syscalls.h > +++ b/include/linux/syscalls.h > @@ -191,6 +191,27 @@ extern struct trace_event_functions exit_syscall_print_funcs; > SYSCALL_METADATA(sname, x, __VA_ARGS__) \ > __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) > > + > +/* > + * Called before coming back to user-mode. Returning to user-mode with an > + * address limit different than USER_DS can allow to overwrite kernel memory. > + */ > +static inline void verify_pre_usermode_state(void) { > + BUG_ON(!segment_eq(get_fs(), USER_DS)); > +} > + > +#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE > +#define __CHECK_USER_CALLER() \ > + bool user_caller = segment_eq(get_fs(), USER_DS) > +#define __VERIFY_PRE_USERMODE_STATE() \ > + if (user_caller) verify_pre_usermode_state() > +#else > +#define __CHECK_USER_CALLER() > +#define __VERIFY_PRE_USERMODE_STATE() > +asmlinkage void address_limit_check_failed(void); > +#endif > + > + > #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__) > #define __SYSCALL_DEFINEx(x, name, ...) \ > asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ > @@ -199,7 +220,10 @@ extern struct trace_event_functions exit_syscall_print_funcs; > asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ > asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ > { \ > - long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ > + long ret; \ > + __CHECK_USER_CALLER(); \ > + ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ > + __VERIFY_PRE_USERMODE_STATE(); \ > __MAP(x,__SC_TEST,__VA_ARGS__); \ > __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \ > return ret; \ > diff --git a/init/Kconfig b/init/Kconfig > index 7f7027817bce..e5fbd0becfa7 100644 > --- a/init/Kconfig > +++ b/init/Kconfig > @@ -1958,6 +1958,12 @@ config PROFILING > config TRACEPOINTS > bool > > +config ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE > + bool > + help > + Disable the generic pre-usermode state verification. Allow each > + architecture to optimize how and when the verification is done. > + > source "arch/Kconfig" > > endmenu # General setup > diff --git a/kernel/sys.c b/kernel/sys.c > index 196c7134bee6..d30530ff8166 100644 > --- a/kernel/sys.c > +++ b/kernel/sys.c > @@ -2459,3 +2459,16 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) > return 0; > } > #endif /* CONFIG_COMPAT */ > + > +#ifdef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE > +/* > + * This function is called when an architecture specific implementation detected > + * an invalid address limit. The generic user-mode state checker will finish on > + * the appropriate BUG_ON. > + */ > +asmlinkage void address_limit_check_failed(void) > +{ > + verify_pre_usermode_state(); > + panic("address_limit_check_failed called with a valid user-mode state"); > +} > +#endif > -- > 2.12.2.715.g7642488e1d-goog >
* Kees Cook <keescook@chromium.org> wrote: > On Mon, Apr 10, 2017 at 9:44 AM, Thomas Garnier <thgarnie@google.com> wrote: > > This patch ensures a syscall does not return to user-mode with a kernel > > address limit. If that happened, a process can corrupt kernel-mode > > memory and elevate privileges. > > > > For example, it would mitigation this bug: > > > > - https://bugs.chromium.org/p/project-zero/issues/detail?id=990 > > > > The CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE option is also > > added so each architecture can optimize this change. > > > > Signed-off-by: Thomas Garnier <thgarnie@google.com> > > Tested-by: Kees Cook <keescook@chromium.org> > > Ingo, I think this series is ready. Can you pull it? (And if not, what > should next steps be?) I have some feedback for other patches in this series, plus for this one as well: > > +/* > > + * Called before coming back to user-mode. Returning to user-mode with an > > + * address limit different than USER_DS can allow to overwrite kernel memory. > > + */ > > +static inline void verify_pre_usermode_state(void) { > > + BUG_ON(!segment_eq(get_fs(), USER_DS)); > > +} That's not standard kernel coding style. Also, patch titles should start with a verb - 75% of the series doesn't. > > +#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE > > +#define __CHECK_USER_CALLER() \ > > + bool user_caller = segment_eq(get_fs(), USER_DS) > > +#define __VERIFY_PRE_USERMODE_STATE() \ > > + if (user_caller) verify_pre_usermode_state() > > +#else > > +#define __CHECK_USER_CALLER() > > +#define __VERIFY_PRE_USERMODE_STATE() > > +asmlinkage void address_limit_check_failed(void); > > +#endif > > +#ifdef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE That Kconfig name is way too long. Plus please don't put logical operations into Kconfig names. > > +/* > > + * This function is called when an architecture specific implementation detected > > + * an invalid address limit. The generic user-mode state checker will finish on > > + * the appropriate BUG_ON. > > + */ > > +asmlinkage void address_limit_check_failed(void) > > +{ > > + verify_pre_usermode_state(); > > + panic("address_limit_check_failed called with a valid user-mode state"); > > +} > > +#endif Awful naming all around: verify_pre_usermode_state() address_limit_check_failed() Both names start with very common names that makes one read these again and again. (And yes, there's lots of bad names in the kernel, but we should not follow bad examples.) Best practice for such functionality is to use a common prefix that is both easy to recognize and easy to skip. For example we could use 'addr_limit_check' as the prefix: addr_limit_check_failed() addr_limit_check_syscall() No need to over-specify it that it's a "pre" check - it's obvious from existing implementation and should be documented in the function itself for new implementations. Harmonize the Kconfig namespace to the common prefix as well, i.e. use something like: CONFIG_ADDR_LIMIT_CHECK No need to add 'ARCH' I think - an architecture that enables this should get it unconditionally. etc. It's all cobbled together I'm afraid and will need more iterations. Thanks, Ingo
* Thomas Garnier <thgarnie@google.com> wrote: > This patch ensures a syscall does not return to user-mode with a kernel > address limit. If that happened, a process can corrupt kernel-mode > memory and elevate privileges. Don't start changelogs with 'This patch' - it's obvious that we are talking about this patch. Writing: Ensure that a syscall does not return to user-mode with a kernel address limit. If that happens, a process can corrupt kernel-mode memory and elevate privileges. also note the spelling fix I did. (There's another spelling error elsewhere in this changelog as well.) Please read changelogs! > For example, it would mitigation this bug: > > - https://bugs.chromium.org/p/project-zero/issues/detail?id=990 > > The CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE option is also > added so each architecture can optimize this change. As I pointed it out in my previous reply this Kconfig name is awfully long - but it should have been obvious when this changelog was written ... > Signed-off-by: Thomas Garnier <thgarnie@google.com> > Tested-by: Kees Cook <keescook@chromium.org> > --- > Based on next-20170410 > --- > arch/s390/Kconfig | 1 + > include/linux/syscalls.h | 26 +++++++++++++++++++++++++- > init/Kconfig | 6 ++++++ > kernel/sys.c | 13 +++++++++++++ > 4 files changed, 45 insertions(+), 1 deletion(-) > > diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig > index d25435d94b6e..489a0cc6e46b 100644 > --- a/arch/s390/Kconfig > +++ b/arch/s390/Kconfig > @@ -103,6 +103,7 @@ config S390 > select ARCH_INLINE_WRITE_UNLOCK_BH > select ARCH_INLINE_WRITE_UNLOCK_IRQ > select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE > + select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE > select ARCH_SAVE_PAGE_KEYS if HIBERNATION > select ARCH_SUPPORTS_ATOMIC_RMW > select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h > index 980c3c9b06f8..801a7a74fe28 100644 > --- a/include/linux/syscalls.h > +++ b/include/linux/syscalls.h > @@ -191,6 +191,27 @@ extern struct trace_event_functions exit_syscall_print_funcs; > SYSCALL_METADATA(sname, x, __VA_ARGS__) \ > __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) > > + > +/* > + * Called before coming back to user-mode. Returning to user-mode with an > + * address limit different than USER_DS can allow to overwrite kernel memory. > + */ > +static inline void verify_pre_usermode_state(void) { > + BUG_ON(!segment_eq(get_fs(), USER_DS)); > +} Non-standard coding style. > + > +#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE > +#define __CHECK_USER_CALLER() \ > + bool user_caller = segment_eq(get_fs(), USER_DS) > +#define __VERIFY_PRE_USERMODE_STATE() \ > + if (user_caller) verify_pre_usermode_state() > +#else > +#define __CHECK_USER_CALLER() > +#define __VERIFY_PRE_USERMODE_STATE() > +asmlinkage void address_limit_check_failed(void); > +#endif > + > + > #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__) > #define __SYSCALL_DEFINEx(x, name, ...) \ > asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ > @@ -199,7 +220,10 @@ extern struct trace_event_functions exit_syscall_print_funcs; > asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ > asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ > { \ > - long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ > + long ret; \ > + __CHECK_USER_CALLER(); \ > + ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ > + __VERIFY_PRE_USERMODE_STATE(); \ > __MAP(x,__SC_TEST,__VA_ARGS__); \ > __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \ > return ret; \ BTW., the '__VERIFY_PRE_USERMODE_STATE()' name is highly misleading: the 'pre' prefix suggests that this is done before a system call - while it's done afterwards. The solution is to not try to specify the exact call placement in the name, just describe the functionality (and harmonize along the common prefix). > +config ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE > + bool > + help > + Disable the generic pre-usermode state verification. Allow each > + architecture to optimize how and when the verification is done. > + Please name the Kconfig symbols something like this: CONFIG_ADDR_LIMIT_CHECK CONFIG_ADDR_LIMIT_CHECK_ARCH or so, which tells us whether the check is done by the architecture code, without breaking the col80 limit with a single Kconfig name. BTW: > +#ifdef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE > +/* > + * This function is called when an architecture specific implementation detected > + * an invalid address limit. The generic user-mode state checker will finish on > + * the appropriate BUG_ON. > + */ > +asmlinkage void address_limit_check_failed(void) > +{ > + verify_pre_usermode_state(); > + panic("address_limit_check_failed called with a valid user-mode state"); It's very unconstructive to unconditionally panic the system, just because some kernel code leaked the address limit! Do a warn-once printout and kill the current task (i.e. don't continue execution), but don't crash everything else! Thanks, Ingo
On Mon, Apr 24, 2017 at 11:23 PM, Ingo Molnar <mingo@kernel.org> wrote: > > * Kees Cook <keescook@chromium.org> wrote: > >> On Mon, Apr 10, 2017 at 9:44 AM, Thomas Garnier <thgarnie@google.com> wrote: >> > This patch ensures a syscall does not return to user-mode with a kernel >> > address limit. If that happened, a process can corrupt kernel-mode >> > memory and elevate privileges. >> > >> > For example, it would mitigation this bug: >> > >> > - https://bugs.chromium.org/p/project-zero/issues/detail?id=990 >> > >> > The CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE option is also >> > added so each architecture can optimize this change. >> > >> > Signed-off-by: Thomas Garnier <thgarnie@google.com> >> > Tested-by: Kees Cook <keescook@chromium.org> >> >> Ingo, I think this series is ready. Can you pull it? (And if not, what >> should next steps be?) > > I have some feedback for other patches in this series, plus for this one as well: > >> > +/* >> > + * Called before coming back to user-mode. Returning to user-mode with an >> > + * address limit different than USER_DS can allow to overwrite kernel memory. >> > + */ >> > +static inline void verify_pre_usermode_state(void) { >> > + BUG_ON(!segment_eq(get_fs(), USER_DS)); >> > +} > > That's not standard kernel coding style. > > Also, patch titles should start with a verb - 75% of the series doesn't. Will fix both. > >> > +#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE >> > +#define __CHECK_USER_CALLER() \ >> > + bool user_caller = segment_eq(get_fs(), USER_DS) >> > +#define __VERIFY_PRE_USERMODE_STATE() \ >> > + if (user_caller) verify_pre_usermode_state() >> > +#else >> > +#define __CHECK_USER_CALLER() >> > +#define __VERIFY_PRE_USERMODE_STATE() >> > +asmlinkage void address_limit_check_failed(void); >> > +#endif > >> > +#ifdef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE > > That Kconfig name is way too long. > > Plus please don't put logical operations into Kconfig names. > >> > +/* >> > + * This function is called when an architecture specific implementation detected >> > + * an invalid address limit. The generic user-mode state checker will finish on >> > + * the appropriate BUG_ON. >> > + */ >> > +asmlinkage void address_limit_check_failed(void) >> > +{ >> > + verify_pre_usermode_state(); >> > + panic("address_limit_check_failed called with a valid user-mode state"); >> > +} >> > +#endif > > Awful naming all around: > > verify_pre_usermode_state() > address_limit_check_failed() > > Both names start with very common names that makes one read these again and again. > (And yes, there's lots of bad names in the kernel, but we should not follow bad > examples.) > > Best practice for such functionality is to use a common prefix that is both easy > to recognize and easy to skip. For example we could use 'addr_limit_check' as the > prefix: > > addr_limit_check_failed() > addr_limit_check_syscall() > > No need to over-specify it that it's a "pre" check - it's obvious from existing > implementation and should be documented in the function itself for new > implementations. > > Harmonize the Kconfig namespace to the common prefix as well, i.e. use something > like: > > CONFIG_ADDR_LIMIT_CHECK > > No need to add 'ARCH' I think - an architecture that enables this should get it > unconditionally. > > etc. > > It's all cobbled together I'm afraid and will need more iterations. Make sense. Thanks for the feedback. > > Thanks, > > Ingo
On Mon, Apr 24, 2017 at 11:33 PM, Ingo Molnar <mingo@kernel.org> wrote: > > * Thomas Garnier <thgarnie@google.com> wrote: > >> This patch ensures a syscall does not return to user-mode with a kernel >> address limit. If that happened, a process can corrupt kernel-mode >> memory and elevate privileges. > > Don't start changelogs with 'This patch' - it's obvious that we are talking about > this patch. Writing: > > Ensure that a syscall does not return to user-mode with a kernel address limit. > If that happens, a process can corrupt kernel-mode memory and elevate > privileges. > > also note the spelling fix I did. (There's another spelling error elsewhere in > this changelog as well.) I will take your change and go over other commits. > > Please read changelogs! > >> For example, it would mitigation this bug: >> >> - https://bugs.chromium.org/p/project-zero/issues/detail?id=990 >> >> The CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE option is also >> added so each architecture can optimize this change. > > As I pointed it out in my previous reply this Kconfig name is awfully long - but > it should have been obvious when this changelog was written ... > >> Signed-off-by: Thomas Garnier <thgarnie@google.com> >> Tested-by: Kees Cook <keescook@chromium.org> >> --- >> Based on next-20170410 >> --- >> arch/s390/Kconfig | 1 + >> include/linux/syscalls.h | 26 +++++++++++++++++++++++++- >> init/Kconfig | 6 ++++++ >> kernel/sys.c | 13 +++++++++++++ >> 4 files changed, 45 insertions(+), 1 deletion(-) >> >> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig >> index d25435d94b6e..489a0cc6e46b 100644 >> --- a/arch/s390/Kconfig >> +++ b/arch/s390/Kconfig >> @@ -103,6 +103,7 @@ config S390 >> select ARCH_INLINE_WRITE_UNLOCK_BH >> select ARCH_INLINE_WRITE_UNLOCK_IRQ >> select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE >> + select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE >> select ARCH_SAVE_PAGE_KEYS if HIBERNATION >> select ARCH_SUPPORTS_ATOMIC_RMW >> select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT >> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h >> index 980c3c9b06f8..801a7a74fe28 100644 >> --- a/include/linux/syscalls.h >> +++ b/include/linux/syscalls.h >> @@ -191,6 +191,27 @@ extern struct trace_event_functions exit_syscall_print_funcs; >> SYSCALL_METADATA(sname, x, __VA_ARGS__) \ >> __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) >> >> + >> +/* >> + * Called before coming back to user-mode. Returning to user-mode with an >> + * address limit different than USER_DS can allow to overwrite kernel memory. >> + */ >> +static inline void verify_pre_usermode_state(void) { >> + BUG_ON(!segment_eq(get_fs(), USER_DS)); >> +} > > Non-standard coding style. > >> + >> +#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE >> +#define __CHECK_USER_CALLER() \ >> + bool user_caller = segment_eq(get_fs(), USER_DS) >> +#define __VERIFY_PRE_USERMODE_STATE() \ >> + if (user_caller) verify_pre_usermode_state() >> +#else >> +#define __CHECK_USER_CALLER() >> +#define __VERIFY_PRE_USERMODE_STATE() >> +asmlinkage void address_limit_check_failed(void); >> +#endif >> + >> + >> #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__) >> #define __SYSCALL_DEFINEx(x, name, ...) \ >> asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ >> @@ -199,7 +220,10 @@ extern struct trace_event_functions exit_syscall_print_funcs; >> asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ >> asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ >> { \ >> - long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ >> + long ret; \ >> + __CHECK_USER_CALLER(); \ >> + ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ >> + __VERIFY_PRE_USERMODE_STATE(); \ >> __MAP(x,__SC_TEST,__VA_ARGS__); \ >> __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \ >> return ret; \ > > BTW., the '__VERIFY_PRE_USERMODE_STATE()' name is highly misleading: the 'pre' > prefix suggests that this is done before a system call - while it's done > afterwards. > > The solution is to not try to specify the exact call placement in the name, just > describe the functionality (and harmonize along the common prefix). Yes, I think BEFORE would have made more sense but things are getting really long so I am fine with your proposal in the previous thread. > >> +config ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE >> + bool >> + help >> + Disable the generic pre-usermode state verification. Allow each >> + architecture to optimize how and when the verification is done. >> + > > Please name the Kconfig symbols something like this: > > CONFIG_ADDR_LIMIT_CHECK > CONFIG_ADDR_LIMIT_CHECK_ARCH > > or so, which tells us whether the check is done by the architecture code, without > breaking the col80 limit with a single Kconfig name. > > BTW: > >> +#ifdef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE >> +/* >> + * This function is called when an architecture specific implementation detected >> + * an invalid address limit. The generic user-mode state checker will finish on >> + * the appropriate BUG_ON. >> + */ >> +asmlinkage void address_limit_check_failed(void) >> +{ >> + verify_pre_usermode_state(); >> + panic("address_limit_check_failed called with a valid user-mode state"); > > It's very unconstructive to unconditionally panic the system, just because some > kernel code leaked the address limit! Do a warn-once printout and kill the current > task (i.e. don't continue execution), but don't crash everything else! The original change did not crash the kernel for this exact reason. Through reviews, there was an overall agreement that the kernel should not continue in this state. > > Thanks, > > Ingo
* Thomas Garnier <thgarnie@google.com> wrote: > >> +#ifdef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE > >> +/* > >> + * This function is called when an architecture specific implementation detected > >> + * an invalid address limit. The generic user-mode state checker will finish on > >> + * the appropriate BUG_ON. > >> + */ > >> +asmlinkage void address_limit_check_failed(void) > >> +{ > >> + verify_pre_usermode_state(); > >> + panic("address_limit_check_failed called with a valid user-mode state"); > > > > It's very unconstructive to unconditionally panic the system, just because some > > kernel code leaked the address limit! Do a warn-once printout and kill the current > > task (i.e. don't continue execution), but don't crash everything else! > > The original change did not crash the kernel for this exact reason. > Through reviews, there was an overall agreement that the kernel should > not continue in this state. Ok, I guess we can try that - but the panic message is still pretty misleading: panic("address_limit_check_failed called with a valid user-mode state"); ... so it was called with a _valid_ user-mode state, and we crash due to something valid? Huh? ( Also, the style rule applies to kernel messages as well: function names should be referred to as "function_name()". ) Thanks, Ingo
On Wed, Apr 26, 2017 at 1:12 AM, Ingo Molnar <mingo@kernel.org> wrote: > > * Thomas Garnier <thgarnie@google.com> wrote: > >> >> +#ifdef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE >> >> +/* >> >> + * This function is called when an architecture specific implementation detected >> >> + * an invalid address limit. The generic user-mode state checker will finish on >> >> + * the appropriate BUG_ON. >> >> + */ >> >> +asmlinkage void address_limit_check_failed(void) >> >> +{ >> >> + verify_pre_usermode_state(); >> >> + panic("address_limit_check_failed called with a valid user-mode state"); >> > >> > It's very unconstructive to unconditionally panic the system, just because some >> > kernel code leaked the address limit! Do a warn-once printout and kill the current >> > task (i.e. don't continue execution), but don't crash everything else! >> >> The original change did not crash the kernel for this exact reason. >> Through reviews, there was an overall agreement that the kernel should >> not continue in this state. > > Ok, I guess we can try that - but the panic message is still pretty misleading: > > panic("address_limit_check_failed called with a valid user-mode state"); > > ... so it was called with a _valid_ user-mode state, and we crash due to something > valid? Huh? Yes the message is accurate but I agree that it is misleading and I will improve it. The address_limit_check_failed function is called by assembly code on different architectures once the state was detected as invalid. Instead of crashing at different places, we redirect to the generic handler (verify_pre_usermode_state) that will crash on the appropriate BUG_ON line. The address_limit_check_failed function is not supposed to comeback, the panic call is just a safe guard. > > ( Also, the style rule applies to kernel messages as well: function names should > be referred to as "function_name()". ) Will change. > > Thanks, > > Ingo
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index d25435d94b6e..489a0cc6e46b 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -103,6 +103,7 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_BH select ARCH_INLINE_WRITE_UNLOCK_IRQ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE select ARCH_SAVE_PAGE_KEYS if HIBERNATION select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 980c3c9b06f8..801a7a74fe28 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -191,6 +191,27 @@ extern struct trace_event_functions exit_syscall_print_funcs; SYSCALL_METADATA(sname, x, __VA_ARGS__) \ __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) + +/* + * Called before coming back to user-mode. Returning to user-mode with an + * address limit different than USER_DS can allow to overwrite kernel memory. + */ +static inline void verify_pre_usermode_state(void) { + BUG_ON(!segment_eq(get_fs(), USER_DS)); +} + +#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE +#define __CHECK_USER_CALLER() \ + bool user_caller = segment_eq(get_fs(), USER_DS) +#define __VERIFY_PRE_USERMODE_STATE() \ + if (user_caller) verify_pre_usermode_state() +#else +#define __CHECK_USER_CALLER() +#define __VERIFY_PRE_USERMODE_STATE() +asmlinkage void address_limit_check_failed(void); +#endif + + #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__) #define __SYSCALL_DEFINEx(x, name, ...) \ asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ @@ -199,7 +220,10 @@ extern struct trace_event_functions exit_syscall_print_funcs; asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ { \ - long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ + long ret; \ + __CHECK_USER_CALLER(); \ + ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ + __VERIFY_PRE_USERMODE_STATE(); \ __MAP(x,__SC_TEST,__VA_ARGS__); \ __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \ return ret; \ diff --git a/init/Kconfig b/init/Kconfig index 7f7027817bce..e5fbd0becfa7 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1958,6 +1958,12 @@ config PROFILING config TRACEPOINTS bool +config ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE + bool + help + Disable the generic pre-usermode state verification. Allow each + architecture to optimize how and when the verification is done. + source "arch/Kconfig" endmenu # General setup diff --git a/kernel/sys.c b/kernel/sys.c index 196c7134bee6..d30530ff8166 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2459,3 +2459,16 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) return 0; } #endif /* CONFIG_COMPAT */ + +#ifdef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE +/* + * This function is called when an architecture specific implementation detected + * an invalid address limit. The generic user-mode state checker will finish on + * the appropriate BUG_ON. + */ +asmlinkage void address_limit_check_failed(void) +{ + verify_pre_usermode_state(); + panic("address_limit_check_failed called with a valid user-mode state"); +} +#endif