[v7,1/4] syscalls: Restore address limit after a syscall
diff mbox

Message ID 20170410164420.64003-1-thgarnie@google.com
State New
Headers show

Commit Message

Thomas Garnier April 10, 2017, 4:44 p.m. UTC
This patch ensures a syscall does not return to user-mode with a kernel
address limit. If that happened, a process can corrupt kernel-mode
memory and elevate privileges.

For example, it would mitigation this bug:

- https://bugs.chromium.org/p/project-zero/issues/detail?id=990

The CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE option is also
added so each architecture can optimize this change.

Signed-off-by: Thomas Garnier <thgarnie@google.com>
Tested-by: Kees Cook <keescook@chromium.org>
---
Based on next-20170410
---
 arch/s390/Kconfig        |  1 +
 include/linux/syscalls.h | 26 +++++++++++++++++++++++++-
 init/Kconfig             |  6 ++++++
 kernel/sys.c             | 13 +++++++++++++
 4 files changed, 45 insertions(+), 1 deletion(-)

Comments

Kees Cook April 24, 2017, 11:57 p.m. UTC | #1
On Mon, Apr 10, 2017 at 9:44 AM, Thomas Garnier <thgarnie@google.com> wrote:
> This patch ensures a syscall does not return to user-mode with a kernel
> address limit. If that happened, a process can corrupt kernel-mode
> memory and elevate privileges.
>
> For example, it would mitigation this bug:
>
> - https://bugs.chromium.org/p/project-zero/issues/detail?id=990
>
> The CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE option is also
> added so each architecture can optimize this change.
>
> Signed-off-by: Thomas Garnier <thgarnie@google.com>
> Tested-by: Kees Cook <keescook@chromium.org>

Ingo, I think this series is ready. Can you pull it? (And if not, what
should next steps be?)

-Kees

> ---
> Based on next-20170410
> ---
>  arch/s390/Kconfig        |  1 +
>  include/linux/syscalls.h | 26 +++++++++++++++++++++++++-
>  init/Kconfig             |  6 ++++++
>  kernel/sys.c             | 13 +++++++++++++
>  4 files changed, 45 insertions(+), 1 deletion(-)
>
> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
> index d25435d94b6e..489a0cc6e46b 100644
> --- a/arch/s390/Kconfig
> +++ b/arch/s390/Kconfig
> @@ -103,6 +103,7 @@ config S390
>         select ARCH_INLINE_WRITE_UNLOCK_BH
>         select ARCH_INLINE_WRITE_UNLOCK_IRQ
>         select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
> +       select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
>         select ARCH_SAVE_PAGE_KEYS if HIBERNATION
>         select ARCH_SUPPORTS_ATOMIC_RMW
>         select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 980c3c9b06f8..801a7a74fe28 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -191,6 +191,27 @@ extern struct trace_event_functions exit_syscall_print_funcs;
>         SYSCALL_METADATA(sname, x, __VA_ARGS__)                 \
>         __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
>
> +
> +/*
> + * Called before coming back to user-mode. Returning to user-mode with an
> + * address limit different than USER_DS can allow to overwrite kernel memory.
> + */
> +static inline void verify_pre_usermode_state(void) {
> +       BUG_ON(!segment_eq(get_fs(), USER_DS));
> +}
> +
> +#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
> +#define __CHECK_USER_CALLER() \
> +       bool user_caller = segment_eq(get_fs(), USER_DS)
> +#define __VERIFY_PRE_USERMODE_STATE() \
> +       if (user_caller) verify_pre_usermode_state()
> +#else
> +#define __CHECK_USER_CALLER()
> +#define __VERIFY_PRE_USERMODE_STATE()
> +asmlinkage void address_limit_check_failed(void);
> +#endif
> +
> +
>  #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
>  #define __SYSCALL_DEFINEx(x, name, ...)                                        \
>         asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))       \
> @@ -199,7 +220,10 @@ extern struct trace_event_functions exit_syscall_print_funcs;
>         asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));      \
>         asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))       \
>         {                                                               \
> -               long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));  \
> +               long ret;                                               \
> +               __CHECK_USER_CALLER();                                  \
> +               ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));       \
> +               __VERIFY_PRE_USERMODE_STATE();                          \
>                 __MAP(x,__SC_TEST,__VA_ARGS__);                         \
>                 __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));       \
>                 return ret;                                             \
> diff --git a/init/Kconfig b/init/Kconfig
> index 7f7027817bce..e5fbd0becfa7 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1958,6 +1958,12 @@ config PROFILING
>  config TRACEPOINTS
>         bool
>
> +config ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
> +       bool
> +       help
> +         Disable the generic pre-usermode state verification. Allow each
> +         architecture to optimize how and when the verification is done.
> +
>  source "arch/Kconfig"
>
>  endmenu                # General setup
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 196c7134bee6..d30530ff8166 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2459,3 +2459,16 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
>         return 0;
>  }
>  #endif /* CONFIG_COMPAT */
> +
> +#ifdef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
> +/*
> + * This function is called when an architecture specific implementation detected
> + * an invalid address limit. The generic user-mode state checker will finish on
> + * the appropriate BUG_ON.
> + */
> +asmlinkage void address_limit_check_failed(void)
> +{
> +       verify_pre_usermode_state();
> +       panic("address_limit_check_failed called with a valid user-mode state");
> +}
> +#endif
> --
> 2.12.2.715.g7642488e1d-goog
>
Ingo Molnar April 25, 2017, 6:23 a.m. UTC | #2
* Kees Cook <keescook@chromium.org> wrote:

> On Mon, Apr 10, 2017 at 9:44 AM, Thomas Garnier <thgarnie@google.com> wrote:
> > This patch ensures a syscall does not return to user-mode with a kernel
> > address limit. If that happened, a process can corrupt kernel-mode
> > memory and elevate privileges.
> >
> > For example, it would mitigation this bug:
> >
> > - https://bugs.chromium.org/p/project-zero/issues/detail?id=990
> >
> > The CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE option is also
> > added so each architecture can optimize this change.
> >
> > Signed-off-by: Thomas Garnier <thgarnie@google.com>
> > Tested-by: Kees Cook <keescook@chromium.org>
> 
> Ingo, I think this series is ready. Can you pull it? (And if not, what
> should next steps be?)

I have some feedback for other patches in this series, plus for this one as well:

> > +/*
> > + * Called before coming back to user-mode. Returning to user-mode with an
> > + * address limit different than USER_DS can allow to overwrite kernel memory.
> > + */
> > +static inline void verify_pre_usermode_state(void) {
> > +       BUG_ON(!segment_eq(get_fs(), USER_DS));
> > +}

That's not standard kernel coding style.

Also, patch titles should start with a verb - 75% of the series doesn't.

> > +#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
> > +#define __CHECK_USER_CALLER() \
> > +       bool user_caller = segment_eq(get_fs(), USER_DS)
> > +#define __VERIFY_PRE_USERMODE_STATE() \
> > +       if (user_caller) verify_pre_usermode_state()
> > +#else
> > +#define __CHECK_USER_CALLER()
> > +#define __VERIFY_PRE_USERMODE_STATE()
> > +asmlinkage void address_limit_check_failed(void);
> > +#endif

> > +#ifdef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE

That Kconfig name is way too long.

Plus please don't put logical operations into Kconfig names.

> > +/*
> > + * This function is called when an architecture specific implementation detected
> > + * an invalid address limit. The generic user-mode state checker will finish on
> > + * the appropriate BUG_ON.
> > + */
> > +asmlinkage void address_limit_check_failed(void)
> > +{
> > +       verify_pre_usermode_state();
> > +       panic("address_limit_check_failed called with a valid user-mode state");
> > +}
> > +#endif

Awful naming all around:

	verify_pre_usermode_state()
	address_limit_check_failed()

Both names start with very common names that makes one read these again and again. 
(And yes, there's lots of bad names in the kernel, but we should not follow bad 
examples.)

Best practice for such functionality is to use a common prefix that is both easy 
to recognize and easy to skip. For example we could use 'addr_limit_check' as the 
prefix:

	addr_limit_check_failed()
	addr_limit_check_syscall()

No need to over-specify it that it's a "pre" check - it's obvious from existing 
implementation and should be documented in the function itself for new 
implementations.

Harmonize the Kconfig namespace to the common prefix as well, i.e. use something 
like:

	CONFIG_ADDR_LIMIT_CHECK

No need to add 'ARCH' I think - an architecture that enables this should get it 
unconditionally.

etc.

It's all cobbled together I'm afraid and will need more iterations.

Thanks,

	Ingo
Ingo Molnar April 25, 2017, 6:33 a.m. UTC | #3
* Thomas Garnier <thgarnie@google.com> wrote:

> This patch ensures a syscall does not return to user-mode with a kernel
> address limit. If that happened, a process can corrupt kernel-mode
> memory and elevate privileges.

Don't start changelogs with 'This patch' - it's obvious that we are talking about 
this patch. Writing:

   Ensure that a syscall does not return to user-mode with a kernel address limit. 
   If that happens, a process can corrupt kernel-mode memory and elevate 
   privileges.

also note the spelling fix I did. (There's another spelling error elsewhere in 
this changelog as well.)

Please read changelogs!

> For example, it would mitigation this bug:
> 
> - https://bugs.chromium.org/p/project-zero/issues/detail?id=990
> 
> The CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE option is also
> added so each architecture can optimize this change.

As I pointed it out in my previous reply this Kconfig name is awfully long - but 
it should have been obvious when this changelog was written ...

> Signed-off-by: Thomas Garnier <thgarnie@google.com>
> Tested-by: Kees Cook <keescook@chromium.org>
> ---
> Based on next-20170410
> ---
>  arch/s390/Kconfig        |  1 +
>  include/linux/syscalls.h | 26 +++++++++++++++++++++++++-
>  init/Kconfig             |  6 ++++++
>  kernel/sys.c             | 13 +++++++++++++
>  4 files changed, 45 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
> index d25435d94b6e..489a0cc6e46b 100644
> --- a/arch/s390/Kconfig
> +++ b/arch/s390/Kconfig
> @@ -103,6 +103,7 @@ config S390
>  	select ARCH_INLINE_WRITE_UNLOCK_BH
>  	select ARCH_INLINE_WRITE_UNLOCK_IRQ
>  	select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
> +	select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
>  	select ARCH_SAVE_PAGE_KEYS if HIBERNATION
>  	select ARCH_SUPPORTS_ATOMIC_RMW
>  	select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 980c3c9b06f8..801a7a74fe28 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -191,6 +191,27 @@ extern struct trace_event_functions exit_syscall_print_funcs;
>  	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\
>  	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
>  
> +
> +/*
> + * Called before coming back to user-mode. Returning to user-mode with an
> + * address limit different than USER_DS can allow to overwrite kernel memory.
> + */
> +static inline void verify_pre_usermode_state(void) {
> +	BUG_ON(!segment_eq(get_fs(), USER_DS));
> +}

Non-standard coding style.

> +
> +#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
> +#define __CHECK_USER_CALLER() \
> +	bool user_caller = segment_eq(get_fs(), USER_DS)
> +#define __VERIFY_PRE_USERMODE_STATE() \
> +	if (user_caller) verify_pre_usermode_state()
> +#else
> +#define __CHECK_USER_CALLER()
> +#define __VERIFY_PRE_USERMODE_STATE()
> +asmlinkage void address_limit_check_failed(void);
> +#endif
> +
> +
>  #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
>  #define __SYSCALL_DEFINEx(x, name, ...)					\
>  	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
> @@ -199,7 +220,10 @@ extern struct trace_event_functions exit_syscall_print_funcs;
>  	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
>  	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
>  	{								\
> -		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
> +		long ret;						\
> +		__CHECK_USER_CALLER();					\
> +		ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
> +		__VERIFY_PRE_USERMODE_STATE();				\
>  		__MAP(x,__SC_TEST,__VA_ARGS__);				\
>  		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
>  		return ret;						\

BTW., the '__VERIFY_PRE_USERMODE_STATE()' name is highly misleading: the 'pre' 
prefix suggests that this is done before a system call - while it's done 
afterwards.

The solution is to not try to specify the exact call placement in the name, just 
describe the functionality (and harmonize along the common prefix).

> +config ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
> +	bool
> +	help
> +	  Disable the generic pre-usermode state verification. Allow each
> +	  architecture to optimize how and when the verification is done.
> +

Please name the Kconfig symbols something like this:

	CONFIG_ADDR_LIMIT_CHECK
	CONFIG_ADDR_LIMIT_CHECK_ARCH

or so, which tells us whether the check is done by the architecture code, without 
breaking the col80 limit with a single Kconfig name.

BTW:

> +#ifdef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
> +/*
> + * This function is called when an architecture specific implementation detected
> + * an invalid address limit. The generic user-mode state checker will finish on
> + * the appropriate BUG_ON.
> + */
> +asmlinkage void address_limit_check_failed(void)
> +{
> +	verify_pre_usermode_state();
> +	panic("address_limit_check_failed called with a valid user-mode state");

It's very unconstructive to unconditionally panic the system, just because some 
kernel code leaked the address limit! Do a warn-once printout and kill the current 
task (i.e. don't continue execution), but don't crash everything else!

Thanks,

	Ingo
Thomas Garnier April 25, 2017, 2:12 p.m. UTC | #4
On Mon, Apr 24, 2017 at 11:23 PM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Kees Cook <keescook@chromium.org> wrote:
>
>> On Mon, Apr 10, 2017 at 9:44 AM, Thomas Garnier <thgarnie@google.com> wrote:
>> > This patch ensures a syscall does not return to user-mode with a kernel
>> > address limit. If that happened, a process can corrupt kernel-mode
>> > memory and elevate privileges.
>> >
>> > For example, it would mitigation this bug:
>> >
>> > - https://bugs.chromium.org/p/project-zero/issues/detail?id=990
>> >
>> > The CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE option is also
>> > added so each architecture can optimize this change.
>> >
>> > Signed-off-by: Thomas Garnier <thgarnie@google.com>
>> > Tested-by: Kees Cook <keescook@chromium.org>
>>
>> Ingo, I think this series is ready. Can you pull it? (And if not, what
>> should next steps be?)
>
> I have some feedback for other patches in this series, plus for this one as well:
>
>> > +/*
>> > + * Called before coming back to user-mode. Returning to user-mode with an
>> > + * address limit different than USER_DS can allow to overwrite kernel memory.
>> > + */
>> > +static inline void verify_pre_usermode_state(void) {
>> > +       BUG_ON(!segment_eq(get_fs(), USER_DS));
>> > +}
>
> That's not standard kernel coding style.
>
> Also, patch titles should start with a verb - 75% of the series doesn't.

Will fix both.

>
>> > +#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
>> > +#define __CHECK_USER_CALLER() \
>> > +       bool user_caller = segment_eq(get_fs(), USER_DS)
>> > +#define __VERIFY_PRE_USERMODE_STATE() \
>> > +       if (user_caller) verify_pre_usermode_state()
>> > +#else
>> > +#define __CHECK_USER_CALLER()
>> > +#define __VERIFY_PRE_USERMODE_STATE()
>> > +asmlinkage void address_limit_check_failed(void);
>> > +#endif
>
>> > +#ifdef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
>
> That Kconfig name is way too long.
>
> Plus please don't put logical operations into Kconfig names.
>
>> > +/*
>> > + * This function is called when an architecture specific implementation detected
>> > + * an invalid address limit. The generic user-mode state checker will finish on
>> > + * the appropriate BUG_ON.
>> > + */
>> > +asmlinkage void address_limit_check_failed(void)
>> > +{
>> > +       verify_pre_usermode_state();
>> > +       panic("address_limit_check_failed called with a valid user-mode state");
>> > +}
>> > +#endif
>
> Awful naming all around:
>
>         verify_pre_usermode_state()
>         address_limit_check_failed()
>
> Both names start with very common names that makes one read these again and again.
> (And yes, there's lots of bad names in the kernel, but we should not follow bad
> examples.)
>
> Best practice for such functionality is to use a common prefix that is both easy
> to recognize and easy to skip. For example we could use 'addr_limit_check' as the
> prefix:
>
>         addr_limit_check_failed()
>         addr_limit_check_syscall()
>
> No need to over-specify it that it's a "pre" check - it's obvious from existing
> implementation and should be documented in the function itself for new
> implementations.
>
> Harmonize the Kconfig namespace to the common prefix as well, i.e. use something
> like:
>
>         CONFIG_ADDR_LIMIT_CHECK
>
> No need to add 'ARCH' I think - an architecture that enables this should get it
> unconditionally.
>
> etc.
>
> It's all cobbled together I'm afraid and will need more iterations.

Make sense. Thanks for the feedback.

>
> Thanks,
>
>         Ingo
Thomas Garnier April 25, 2017, 2:18 p.m. UTC | #5
On Mon, Apr 24, 2017 at 11:33 PM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Thomas Garnier <thgarnie@google.com> wrote:
>
>> This patch ensures a syscall does not return to user-mode with a kernel
>> address limit. If that happened, a process can corrupt kernel-mode
>> memory and elevate privileges.
>
> Don't start changelogs with 'This patch' - it's obvious that we are talking about
> this patch. Writing:
>
>    Ensure that a syscall does not return to user-mode with a kernel address limit.
>    If that happens, a process can corrupt kernel-mode memory and elevate
>    privileges.
>
> also note the spelling fix I did. (There's another spelling error elsewhere in
> this changelog as well.)

I will take your change and go over other commits.

>
> Please read changelogs!
>
>> For example, it would mitigation this bug:
>>
>> - https://bugs.chromium.org/p/project-zero/issues/detail?id=990
>>
>> The CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE option is also
>> added so each architecture can optimize this change.
>
> As I pointed it out in my previous reply this Kconfig name is awfully long - but
> it should have been obvious when this changelog was written ...
>
>> Signed-off-by: Thomas Garnier <thgarnie@google.com>
>> Tested-by: Kees Cook <keescook@chromium.org>
>> ---
>> Based on next-20170410
>> ---
>>  arch/s390/Kconfig        |  1 +
>>  include/linux/syscalls.h | 26 +++++++++++++++++++++++++-
>>  init/Kconfig             |  6 ++++++
>>  kernel/sys.c             | 13 +++++++++++++
>>  4 files changed, 45 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
>> index d25435d94b6e..489a0cc6e46b 100644
>> --- a/arch/s390/Kconfig
>> +++ b/arch/s390/Kconfig
>> @@ -103,6 +103,7 @@ config S390
>>       select ARCH_INLINE_WRITE_UNLOCK_BH
>>       select ARCH_INLINE_WRITE_UNLOCK_IRQ
>>       select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
>> +     select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
>>       select ARCH_SAVE_PAGE_KEYS if HIBERNATION
>>       select ARCH_SUPPORTS_ATOMIC_RMW
>>       select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>> index 980c3c9b06f8..801a7a74fe28 100644
>> --- a/include/linux/syscalls.h
>> +++ b/include/linux/syscalls.h
>> @@ -191,6 +191,27 @@ extern struct trace_event_functions exit_syscall_print_funcs;
>>       SYSCALL_METADATA(sname, x, __VA_ARGS__)                 \
>>       __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
>>
>> +
>> +/*
>> + * Called before coming back to user-mode. Returning to user-mode with an
>> + * address limit different than USER_DS can allow to overwrite kernel memory.
>> + */
>> +static inline void verify_pre_usermode_state(void) {
>> +     BUG_ON(!segment_eq(get_fs(), USER_DS));
>> +}
>
> Non-standard coding style.
>
>> +
>> +#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
>> +#define __CHECK_USER_CALLER() \
>> +     bool user_caller = segment_eq(get_fs(), USER_DS)
>> +#define __VERIFY_PRE_USERMODE_STATE() \
>> +     if (user_caller) verify_pre_usermode_state()
>> +#else
>> +#define __CHECK_USER_CALLER()
>> +#define __VERIFY_PRE_USERMODE_STATE()
>> +asmlinkage void address_limit_check_failed(void);
>> +#endif
>> +
>> +
>>  #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
>>  #define __SYSCALL_DEFINEx(x, name, ...)                                      \
>>       asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))       \
>> @@ -199,7 +220,10 @@ extern struct trace_event_functions exit_syscall_print_funcs;
>>       asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));      \
>>       asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))       \
>>       {                                                               \
>> -             long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));  \
>> +             long ret;                                               \
>> +             __CHECK_USER_CALLER();                                  \
>> +             ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));       \
>> +             __VERIFY_PRE_USERMODE_STATE();                          \
>>               __MAP(x,__SC_TEST,__VA_ARGS__);                         \
>>               __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));       \
>>               return ret;                                             \
>
> BTW., the '__VERIFY_PRE_USERMODE_STATE()' name is highly misleading: the 'pre'
> prefix suggests that this is done before a system call - while it's done
> afterwards.
>
> The solution is to not try to specify the exact call placement in the name, just
> describe the functionality (and harmonize along the common prefix).

Yes, I think BEFORE would have made more sense but things are getting
really long so I am fine with your proposal in the previous thread.

>
>> +config ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
>> +     bool
>> +     help
>> +       Disable the generic pre-usermode state verification. Allow each
>> +       architecture to optimize how and when the verification is done.
>> +
>
> Please name the Kconfig symbols something like this:
>
>         CONFIG_ADDR_LIMIT_CHECK
>         CONFIG_ADDR_LIMIT_CHECK_ARCH
>
> or so, which tells us whether the check is done by the architecture code, without
> breaking the col80 limit with a single Kconfig name.
>
> BTW:
>
>> +#ifdef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
>> +/*
>> + * This function is called when an architecture specific implementation detected
>> + * an invalid address limit. The generic user-mode state checker will finish on
>> + * the appropriate BUG_ON.
>> + */
>> +asmlinkage void address_limit_check_failed(void)
>> +{
>> +     verify_pre_usermode_state();
>> +     panic("address_limit_check_failed called with a valid user-mode state");
>
> It's very unconstructive to unconditionally panic the system, just because some
> kernel code leaked the address limit! Do a warn-once printout and kill the current
> task (i.e. don't continue execution), but don't crash everything else!

The original change did not crash the kernel for this exact reason.
Through reviews, there was an overall agreement that the kernel should
not continue in this state.

>
> Thanks,
>
>         Ingo
Ingo Molnar April 26, 2017, 8:12 a.m. UTC | #6
* Thomas Garnier <thgarnie@google.com> wrote:

> >> +#ifdef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
> >> +/*
> >> + * This function is called when an architecture specific implementation detected
> >> + * an invalid address limit. The generic user-mode state checker will finish on
> >> + * the appropriate BUG_ON.
> >> + */
> >> +asmlinkage void address_limit_check_failed(void)
> >> +{
> >> +     verify_pre_usermode_state();
> >> +     panic("address_limit_check_failed called with a valid user-mode state");
> >
> > It's very unconstructive to unconditionally panic the system, just because some
> > kernel code leaked the address limit! Do a warn-once printout and kill the current
> > task (i.e. don't continue execution), but don't crash everything else!
> 
> The original change did not crash the kernel for this exact reason.
> Through reviews, there was an overall agreement that the kernel should
> not continue in this state.

Ok, I guess we can try that - but the panic message is still pretty misleading:

	panic("address_limit_check_failed called with a valid user-mode state");

... so it was called with a _valid_ user-mode state, and we crash due to something 
valid? Huh?

( Also, the style rule applies to kernel messages as well: function names should 
  be referred to as "function_name()". )

Thanks,

	Ingo
Thomas Garnier April 26, 2017, 2:09 p.m. UTC | #7
On Wed, Apr 26, 2017 at 1:12 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Thomas Garnier <thgarnie@google.com> wrote:
>
>> >> +#ifdef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
>> >> +/*
>> >> + * This function is called when an architecture specific implementation detected
>> >> + * an invalid address limit. The generic user-mode state checker will finish on
>> >> + * the appropriate BUG_ON.
>> >> + */
>> >> +asmlinkage void address_limit_check_failed(void)
>> >> +{
>> >> +     verify_pre_usermode_state();
>> >> +     panic("address_limit_check_failed called with a valid user-mode state");
>> >
>> > It's very unconstructive to unconditionally panic the system, just because some
>> > kernel code leaked the address limit! Do a warn-once printout and kill the current
>> > task (i.e. don't continue execution), but don't crash everything else!
>>
>> The original change did not crash the kernel for this exact reason.
>> Through reviews, there was an overall agreement that the kernel should
>> not continue in this state.
>
> Ok, I guess we can try that - but the panic message is still pretty misleading:
>
>         panic("address_limit_check_failed called with a valid user-mode state");
>
> ... so it was called with a _valid_ user-mode state, and we crash due to something
> valid? Huh?

Yes the message is accurate but I agree that it is misleading and I
will improve it. The address_limit_check_failed function is called by
assembly code on different architectures once the state was detected
as invalid. Instead of crashing at different places, we redirect to
the generic handler (verify_pre_usermode_state) that will crash on the
appropriate BUG_ON line. The address_limit_check_failed function is
not supposed to comeback, the panic call is just a safe guard.

>
> ( Also, the style rule applies to kernel messages as well: function names should
>   be referred to as "function_name()". )

Will change.

>
> Thanks,
>
>         Ingo

Patch
diff mbox

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index d25435d94b6e..489a0cc6e46b 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -103,6 +103,7 @@  config S390
 	select ARCH_INLINE_WRITE_UNLOCK_BH
 	select ARCH_INLINE_WRITE_UNLOCK_IRQ
 	select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+	select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
 	select ARCH_SAVE_PAGE_KEYS if HIBERNATION
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 980c3c9b06f8..801a7a74fe28 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -191,6 +191,27 @@  extern struct trace_event_functions exit_syscall_print_funcs;
 	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\
 	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
 
+
+/*
+ * Called before coming back to user-mode. Returning to user-mode with an
+ * address limit different than USER_DS can allow to overwrite kernel memory.
+ */
+static inline void verify_pre_usermode_state(void) {
+	BUG_ON(!segment_eq(get_fs(), USER_DS));
+}
+
+#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
+#define __CHECK_USER_CALLER() \
+	bool user_caller = segment_eq(get_fs(), USER_DS)
+#define __VERIFY_PRE_USERMODE_STATE() \
+	if (user_caller) verify_pre_usermode_state()
+#else
+#define __CHECK_USER_CALLER()
+#define __VERIFY_PRE_USERMODE_STATE()
+asmlinkage void address_limit_check_failed(void);
+#endif
+
+
 #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
 #define __SYSCALL_DEFINEx(x, name, ...)					\
 	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
@@ -199,7 +220,10 @@  extern struct trace_event_functions exit_syscall_print_funcs;
 	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
 	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
 	{								\
-		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
+		long ret;						\
+		__CHECK_USER_CALLER();					\
+		ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
+		__VERIFY_PRE_USERMODE_STATE();				\
 		__MAP(x,__SC_TEST,__VA_ARGS__);				\
 		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
 		return ret;						\
diff --git a/init/Kconfig b/init/Kconfig
index 7f7027817bce..e5fbd0becfa7 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1958,6 +1958,12 @@  config PROFILING
 config TRACEPOINTS
 	bool
 
+config ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
+	bool
+	help
+	  Disable the generic pre-usermode state verification. Allow each
+	  architecture to optimize how and when the verification is done.
+
 source "arch/Kconfig"
 
 endmenu		# General setup
diff --git a/kernel/sys.c b/kernel/sys.c
index 196c7134bee6..d30530ff8166 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2459,3 +2459,16 @@  COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
 	return 0;
 }
 #endif /* CONFIG_COMPAT */
+
+#ifdef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
+/*
+ * This function is called when an architecture specific implementation detected
+ * an invalid address limit. The generic user-mode state checker will finish on
+ * the appropriate BUG_ON.
+ */
+asmlinkage void address_limit_check_failed(void)
+{
+	verify_pre_usermode_state();
+	panic("address_limit_check_failed called with a valid user-mode state");
+}
+#endif