diff mbox

[v2,1/4] syscalls: Restore address limit after a syscall

Message ID 20170309012456.5631-1-thgarnie@google.com
State New, archived
Headers show

Commit Message

Thomas Garnier March 9, 2017, 1:24 a.m. UTC
This patch ensures a syscall does not return to user-mode with a kernel
address limit. If that happened, a process can corrupt kernel-mode
memory and elevate privileges.

For example, it would mitigation this bug:

- https://bugs.chromium.org/p/project-zero/issues/detail?id=990

If the CONFIG_BUG_ON_DATA_CORRUPTION option is enabled, an incorrect
state will result in a BUG_ON.

The CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE option is also
added so each architecture can optimize this change.

Signed-off-by: Thomas Garnier <thgarnie@google.com>
---
Based on next-20170308
---
 include/linux/syscalls.h | 19 +++++++++++++++++++
 init/Kconfig             |  7 +++++++
 kernel/sys.c             |  8 ++++++++
 3 files changed, 34 insertions(+)

Comments

Borislav Petkov March 9, 2017, 8:42 a.m. UTC | #1
On Wed, Mar 08, 2017 at 05:24:53PM -0800, Thomas Garnier wrote:
> @@ -191,6 +191,22 @@ extern struct trace_event_functions exit_syscall_print_funcs;
>  	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\
>  	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
>  
> +asmlinkage void verify_pre_usermode_state(void);
> +
> +#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
> +static inline bool has_user_ds(void) {
> +	bool ret = segment_eq(get_fs(), USER_DS);
> +	// Prevent re-ordering the call

This is not the kernel comments style. Use /* */ instead.

> +	barrier();
> +	return ret;
> +}
> +#else
> +static inline bool has_user_ds(void) {
> +	return false;
> +}
> +#endif

... and then you could slim down the ifdeffery a bit:

static inline bool has_user_ds(void) {
	bool ret = false;

#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
	ret = segment_eq(get_fs(), USER_DS);
	/* Prevent re-ordering the call. */
	barrier();
#endif

	return ret;
}
Sergey Senozhatsky March 9, 2017, 10:39 a.m. UTC | #2
Hello,

On (03/08/17 17:24), Thomas Garnier wrote:
> This patch ensures a syscall does not return to user-mode with a kernel
> address limit. If that happened, a process can corrupt kernel-mode
> memory and elevate privileges.


I like the patch set.


a side note (perhaps a bit irrelevant), the WARN backtrace does not
really tell more than "incorrect get_fs() on user-mode return" message
does

 incorrect get_fs() on user-mode return
 ------------[ cut here ]------------
 kernel BUG at kernel/sys.c:2467!
 invalid opcode: 0000 [#1] PREEMPT SMP
 Modules linked in: FOO
 CPU: 2 PID: 355 Comm: BAR
 Hardware name: BUZ
 task: ffff8801329f4e00 task.stack: ffffc900005d8000
 RIP: 0010:verify_pre_usermode_state+0x31/0x34
 RSP: 0018:ffffc900005dbf48 EFLAGS: 00010096
 RAX: 0000000000000026 RBX: 0000000000000002 RCX: 0000000000000001
 RDX: 0000000000000046 RSI: ffff880130cead88 RDI: ffffffff81095594
 RBP: ffffc900005dbf48 R08: 0000000000000001 R09: 0000000000000001
 R10: ffffc900005dbd58 R11: ffff8801329f4e00 R12: 0000000000000002
 R13: 0000000000000001 R14: 00007fb6c6a7b5e0 R15: 0000000000000002
 FS:  00007fb6c70d3b40(0000) GS:ffff880137d00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 00007ffd9d94f8f8 CR3: 00000001295b2000 CR4: 00000000000006e0
 Call Trace:
  entry_SYSCALL_64_fastpath+0x3a/0xb2
 RIP: 0033:0x7fb6c67ba3c0
 RSP: 002b:00007ffd9d94f5c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
 RAX: 0000000000000002 RBX: 0000000000000000 RCX: 00007fb6c67ba3c0
 RDX: 0000000000000002 RSI: 0000000000ba7310 RDI: 0000000000000001
 RBP: 0000000000000001 R08: 00007fb6c6a7c740 R09: 00007fb6c70d3b40
 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
 Code: 48 8b 14 25 40 c5 00 00 48 b8 00 f0 ff ff ff 7f 00 00 48 39 82 28 11 00 00 74 12 55 48 c7 c7 9b f5 78 81 48 89 e5 e8 14 19 0b 00 <0f> 0b c3 66 66 66 66 90 55 48 89 e5 53 48 8b 47 50 48 89 fb 48 


may be some day someone would be interested in something like
    "incorrect get_fs() on user-mode return from %pS"
and set_fs() would save _RET_IP_.

just a side note.

	-ss
Mark Rutland March 9, 2017, 12:09 p.m. UTC | #3
Hi,

On Wed, Mar 08, 2017 at 05:24:53PM -0800, Thomas Garnier wrote:
> This patch ensures a syscall does not return to user-mode with a kernel
> address limit. If that happened, a process can corrupt kernel-mode
> memory and elevate privileges.
> 
> For example, it would mitigation this bug:
> 
> - https://bugs.chromium.org/p/project-zero/issues/detail?id=990
> 
> If the CONFIG_BUG_ON_DATA_CORRUPTION option is enabled, an incorrect
> state will result in a BUG_ON.
> 
> The CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE option is also
> added so each architecture can optimize this change.

> +#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
> +static inline bool has_user_ds(void) {
> +	bool ret = segment_eq(get_fs(), USER_DS);
> +	// Prevent re-ordering the call
> +	barrier();

What ordering are we trying to ensure, that isn't otherwise given?

We expect get_fs() and set_fs() to be ordered w.r.t. each other and
w.r.t. uaccess uses, or we'd need barriers all over the place.

Given that, I can't see why we need a barrier here. So this needs a
better comment, at least.

> +	return ret;
> +}
> +#else
> +static inline bool has_user_ds(void) {
> +	return false;
> +}
> +#endif

It would be simpler to wrap the call entirely, e.g. have:

#ifdef CONFIG_WHATEVER
static inline void verify_pre_usermode_state(void)
{
	if (segment_eq(get_fs(), USER_DS))
		__verify_pre_usermode_state();
}
#else
static inline void verify_pre_usermode_state(void) { }
#endif

> @@ -199,7 +215,10 @@ extern struct trace_event_functions exit_syscall_print_funcs;
>  	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
>  	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
>  	{								\
> +		bool user_caller = has_user_ds();			\
>  		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
> +		if (user_caller)					\
> +			verify_pre_usermode_state();			\

... then we can unconditionally use verify_pre_usermode_state() here ... 

>  		__MAP(x,__SC_TEST,__VA_ARGS__);				\
>  		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
>  		return ret;						\

[...]

> +/* Called before coming back to user-mode */
> +asmlinkage void verify_pre_usermode_state(void)

... and we just prepend a couple of underscores here.

> +{
> +	if (CHECK_DATA_CORRUPTION(!segment_eq(get_fs(), USER_DS),
> +				  "incorrect get_fs() on user-mode return"))
> +		set_fs(USER_DS);
> +}

Thanks,
Mark.
Christian Borntraeger March 9, 2017, 12:32 p.m. UTC | #4
On 03/09/2017 02:24 AM, Thomas Garnier wrote:
> This patch ensures a syscall does not return to user-mode with a kernel
> address limit. If that happened, a process can corrupt kernel-mode
> memory and elevate privileges.
> 
> For example, it would mitigation this bug:
> 
> - https://bugs.chromium.org/p/project-zero/issues/detail?id=990
> 
> If the CONFIG_BUG_ON_DATA_CORRUPTION option is enabled, an incorrect
> state will result in a BUG_ON.
> 
> The CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE option is also
> added so each architecture can optimize this change.
> 
> Signed-off-by: Thomas Garnier <thgarnie@google.com>
> ---
> Based on next-20170308
> ---
>  include/linux/syscalls.h | 19 +++++++++++++++++++
>  init/Kconfig             |  7 +++++++
>  kernel/sys.c             |  8 ++++++++
>  3 files changed, 34 insertions(+)
> 
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 980c3c9b06f8..78a2268ecd6e 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -191,6 +191,22 @@ extern struct trace_event_functions exit_syscall_print_funcs;
>  	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\
>  	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
> 
> +asmlinkage void verify_pre_usermode_state(void);
> +
> +#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
> +static inline bool has_user_ds(void) {
> +	bool ret = segment_eq(get_fs(), USER_DS);
> +	// Prevent re-ordering the call
> +	barrier();
> +	return ret;
> +}

Can you please disable that for s390?  (e.g. by setting 
CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE for s390)
We have a separate address space for kernel/user so the logic will
be slightly different and is already handled in

commit b5a882fcf146c87cb6b67c6df353e1c042b8773d
Author: Heiko Carstens <heiko.carstens@de.ibm.com>
Date:   Fri Feb 17 08:13:28 2017 +0100

    s390: restore address space when returning to user space



> +#else
> +static inline bool has_user_ds(void) {
> +	return false;
> +}
> +#endif
> +
> +
>  #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
>  #define __SYSCALL_DEFINEx(x, name, ...)					\
>  	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
> @@ -199,7 +215,10 @@ extern struct trace_event_functions exit_syscall_print_funcs;
>  	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
>  	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
>  	{								\
> +		bool user_caller = has_user_ds();			\
>  		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
> +		if (user_caller)					\
> +			verify_pre_usermode_state();			\
>  		__MAP(x,__SC_TEST,__VA_ARGS__);				\
>  		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
>  		return ret;						\
> diff --git a/init/Kconfig b/init/Kconfig
> index c859c993c26f..c4efc3a95e4a 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1929,6 +1929,13 @@ config PROFILING
>  config TRACEPOINTS
>  	bool
> 
> +#
> +# Set by each architecture that want to optimize how verify_pre_usermode_state
> +# is called.
> +#
> +config ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
> +	bool
> +
>  source "arch/Kconfig"
> 
>  endmenu		# General setup
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 196c7134bee6..411163ac9dc3 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2459,3 +2459,11 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
>  	return 0;
>  }
>  #endif /* CONFIG_COMPAT */
> +
> +/* Called before coming back to user-mode */
> +asmlinkage void verify_pre_usermode_state(void)
> +{
> +	if (CHECK_DATA_CORRUPTION(!segment_eq(get_fs(), USER_DS),
> +				  "incorrect get_fs() on user-mode return"))
> +		set_fs(USER_DS);
> +}
>
Russell King - ARM Linux admin March 9, 2017, 1:44 p.m. UTC | #5
On Thu, Mar 09, 2017 at 12:09:55PM +0000, Mark Rutland wrote:
> Hi,
> 
> On Wed, Mar 08, 2017 at 05:24:53PM -0800, Thomas Garnier wrote:
> > This patch ensures a syscall does not return to user-mode with a kernel
> > address limit. If that happened, a process can corrupt kernel-mode
> > memory and elevate privileges.
> > 
> > For example, it would mitigation this bug:
> > 
> > - https://bugs.chromium.org/p/project-zero/issues/detail?id=990
> > 
> > If the CONFIG_BUG_ON_DATA_CORRUPTION option is enabled, an incorrect
> > state will result in a BUG_ON.
> > 
> > The CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE option is also
> > added so each architecture can optimize this change.
> 
> > +#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
> > +static inline bool has_user_ds(void) {
> > +	bool ret = segment_eq(get_fs(), USER_DS);
> > +	// Prevent re-ordering the call
> > +	barrier();
> 
> What ordering are we trying to ensure, that isn't otherwise given?
> 
> We expect get_fs() and set_fs() to be ordered w.r.t. each other and
> w.r.t. uaccess uses, or we'd need barriers all over the place.
> 
> Given that, I can't see why we need a barrier here. So this needs a
> better comment, at least.
> 
> > +	return ret;
> > +}
> > +#else
> > +static inline bool has_user_ds(void) {
> > +	return false;
> > +}
> > +#endif
> 
> It would be simpler to wrap the call entirely, e.g. have:
> 
> #ifdef CONFIG_WHATEVER
> static inline void verify_pre_usermode_state(void)
> {
> 	if (segment_eq(get_fs(), USER_DS))
> 		__verify_pre_usermode_state();
> }
> #else
> static inline void verify_pre_usermode_state(void) { }
> #endif

That's utterly pointless - you've missed a detail.

> > @@ -199,7 +215,10 @@ extern struct trace_event_functions exit_syscall_print_funcs;
> >  	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
> >  	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
> >  	{								\
> > +		bool user_caller = has_user_ds();			\
> >  		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
> > +		if (user_caller)					\
> > +			verify_pre_usermode_state();			\
> 
> ... then we can unconditionally use verify_pre_usermode_state() here ... 

Look at this closely.  has_user_ds() is called _before_ the syscall code
is invoked.  It's checking what conditions the syscall was entered from.
If the syscall was entered with the user segment selected, then we run
a check on the system state _after_ the syscall code has returned.

Putting both after the syscall code has returned is completely pointless -
it turns it into this code:

	if (segment_eq(get_fs(), USER_DS))
		if (CHECK_DATA_CORRUPTION(!segment_eq(get_fs(), USER_DS),
					  "incorrect get_fs() on user-mode return"))
			set_fs(USER_DS);

which is obviously bogus (it'll never fire.)
Mark Rutland March 9, 2017, 3:21 p.m. UTC | #6
On Thu, Mar 09, 2017 at 01:44:56PM +0000, Russell King - ARM Linux wrote:
> On Thu, Mar 09, 2017 at 12:09:55PM +0000, Mark Rutland wrote:
> > On Wed, Mar 08, 2017 at 05:24:53PM -0800, Thomas Garnier wrote:

> > It would be simpler to wrap the call entirely, e.g. have:
> > 
> > #ifdef CONFIG_WHATEVER
> > static inline void verify_pre_usermode_state(void)
> > {
> > 	if (segment_eq(get_fs(), USER_DS))
> > 		__verify_pre_usermode_state();
> > }
> > #else
> > static inline void verify_pre_usermode_state(void) { }
> > #endif
> 
> That's utterly pointless - you've missed a detail.
> 
> > > @@ -199,7 +215,10 @@ extern struct trace_event_functions exit_syscall_print_funcs;
> > >  	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
> > >  	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
> > >  	{								\
> > > +		bool user_caller = has_user_ds();			\
> > >  		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
> > > +		if (user_caller)					\
> > > +			verify_pre_usermode_state();			\
> > 
> > ... then we can unconditionally use verify_pre_usermode_state() here ... 
> 
> Look at this closely.  has_user_ds() is called _before_ the syscall code
> is invoked.  It's checking what conditions the syscall was entered from.
> If the syscall was entered with the user segment selected, then we run
> a check on the system state _after_ the syscall code has returned.

Indeed; I clearly did not consider this correctly. 

Sorry for the noise.

Thanks,
Mark.
Thomas Garnier March 9, 2017, 3:48 p.m. UTC | #7
On Thu, Mar 9, 2017 at 12:42 AM, Borislav Petkov <bp@alien8.de> wrote:
> On Wed, Mar 08, 2017 at 05:24:53PM -0800, Thomas Garnier wrote:
>> @@ -191,6 +191,22 @@ extern struct trace_event_functions exit_syscall_print_funcs;
>>       SYSCALL_METADATA(sname, x, __VA_ARGS__)                 \
>>       __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
>>
>> +asmlinkage void verify_pre_usermode_state(void);
>> +
>> +#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
>> +static inline bool has_user_ds(void) {
>> +     bool ret = segment_eq(get_fs(), USER_DS);
>> +     // Prevent re-ordering the call
>
> This is not the kernel comments style. Use /* */ instead.
>
>> +     barrier();
>> +     return ret;
>> +}
>> +#else
>> +static inline bool has_user_ds(void) {
>> +     return false;
>> +}
>> +#endif
>
> ... and then you could slim down the ifdeffery a bit:
>
> static inline bool has_user_ds(void) {
>         bool ret = false;
>
> #ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
>         ret = segment_eq(get_fs(), USER_DS);
>         /* Prevent re-ordering the call. */
>         barrier();
> #endif
>
>         return ret;
> }
>

I agree, cleaner. I will look to do this change on next iteration.

> --
> Regards/Gruss,
>     Boris.
>
> Good mailing practices for 400: avoid top-posting and trim the reply.
Thomas Garnier March 9, 2017, 3:52 p.m. UTC | #8
On Thu, Mar 9, 2017 at 4:09 AM, Mark Rutland <mark.rutland@arm.com> wrote:
> Hi,
>
> On Wed, Mar 08, 2017 at 05:24:53PM -0800, Thomas Garnier wrote:
>> This patch ensures a syscall does not return to user-mode with a kernel
>> address limit. If that happened, a process can corrupt kernel-mode
>> memory and elevate privileges.
>>
>> For example, it would mitigation this bug:
>>
>> - https://bugs.chromium.org/p/project-zero/issues/detail?id=990
>>
>> If the CONFIG_BUG_ON_DATA_CORRUPTION option is enabled, an incorrect
>> state will result in a BUG_ON.
>>
>> The CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE option is also
>> added so each architecture can optimize this change.
>
>> +#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
>> +static inline bool has_user_ds(void) {
>> +     bool ret = segment_eq(get_fs(), USER_DS);
>> +     // Prevent re-ordering the call
>> +     barrier();
>
> What ordering are we trying to ensure, that isn't otherwise given?
>
> We expect get_fs() and set_fs() to be ordered w.r.t. each other and
> w.r.t. uaccess uses, or we'd need barriers all over the place.
>
> Given that, I can't see why we need a barrier here. So this needs a
> better comment, at least.
>

I was half sure of that so that's why I added the barrier. If it is
not needed then I can remove it. Thanks!

>> +     return ret;
>> +}
>> +#else
>> +static inline bool has_user_ds(void) {
>> +     return false;
>> +}
>> +#endif
>
> It would be simpler to wrap the call entirely, e.g. have:
>
> #ifdef CONFIG_WHATEVER
> static inline void verify_pre_usermode_state(void)
> {
>         if (segment_eq(get_fs(), USER_DS))
>                 __verify_pre_usermode_state();
> }
> #else
> static inline void verify_pre_usermode_state(void) { }
> #endif
>
>> @@ -199,7 +215,10 @@ extern struct trace_event_functions exit_syscall_print_funcs;
>>       asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));      \
>>       asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))       \
>>       {                                                               \
>> +             bool user_caller = has_user_ds();                       \
>>               long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));  \
>> +             if (user_caller)                                        \
>> +                     verify_pre_usermode_state();                    \
>
> ... then we can unconditionally use verify_pre_usermode_state() here ...

Not sure I understood that point. The goal is to see if get_fs was
changed, that's why I check before the syscall and I want to ensure
the call is not shuffled after the syscall, therefore the original
barrier.

>
>>               __MAP(x,__SC_TEST,__VA_ARGS__);                         \
>>               __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));       \
>>               return ret;                                             \
>
> [...]
>
>> +/* Called before coming back to user-mode */
>> +asmlinkage void verify_pre_usermode_state(void)
>
> ... and we just prepend a couple of underscores here.
>
>> +{
>> +     if (CHECK_DATA_CORRUPTION(!segment_eq(get_fs(), USER_DS),
>> +                               "incorrect get_fs() on user-mode return"))
>> +             set_fs(USER_DS);
>> +}
>
> Thanks,
> Mark.
Thomas Garnier March 9, 2017, 3:53 p.m. UTC | #9
> On Thu, Mar 9, 2017 at 4:32 AM, Christian Borntraeger <borntraeger@de.ibm.com> wrote:
> Can you please disable that for s390?  (e.g. by setting
> CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE for s390)
> We have a separate address space for kernel/user so the logic will
> be slightly different and is already handled in
>
> commit b5a882fcf146c87cb6b67c6df353e1c042b8773d
> Author: Heiko Carstens <heiko.carstens@de.ibm.com>
> Date:   Fri Feb 17 08:13:28 2017 +0100
>
>     s390: restore address space when returning to user space
>

No problem, I will add it on the next iteration of this change.
Thomas Garnier March 9, 2017, 3:54 p.m. UTC | #10
On Thu, Mar 9, 2017 at 7:21 AM, Mark Rutland <mark.rutland@arm.com> wrote:
> On Thu, Mar 09, 2017 at 01:44:56PM +0000, Russell King - ARM Linux wrote:
>> On Thu, Mar 09, 2017 at 12:09:55PM +0000, Mark Rutland wrote:
>> > On Wed, Mar 08, 2017 at 05:24:53PM -0800, Thomas Garnier wrote:
>
>> > It would be simpler to wrap the call entirely, e.g. have:
>> >
>> > #ifdef CONFIG_WHATEVER
>> > static inline void verify_pre_usermode_state(void)
>> > {
>> >     if (segment_eq(get_fs(), USER_DS))
>> >             __verify_pre_usermode_state();
>> > }
>> > #else
>> > static inline void verify_pre_usermode_state(void) { }
>> > #endif
>>
>> That's utterly pointless - you've missed a detail.
>>
>> > > @@ -199,7 +215,10 @@ extern struct trace_event_functions exit_syscall_print_funcs;
>> > >   asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));      \
>> > >   asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))       \
>> > >   {                                                               \
>> > > +         bool user_caller = has_user_ds();                       \
>> > >           long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));  \
>> > > +         if (user_caller)                                        \
>> > > +                 verify_pre_usermode_state();                    \
>> >
>> > ... then we can unconditionally use verify_pre_usermode_state() here ...
>>
>> Look at this closely.  has_user_ds() is called _before_ the syscall code
>> is invoked.  It's checking what conditions the syscall was entered from.
>> If the syscall was entered with the user segment selected, then we run
>> a check on the system state _after_ the syscall code has returned.
>
> Indeed; I clearly did not consider this correctly.
>
> Sorry for the noise.
>

No problem, I missed that reply so discard my question on the email
few seconds ago.

> Thanks,
> Mark.
Andy Lutomirski March 9, 2017, 5:27 p.m. UTC | #11
On Thu, Mar 9, 2017 at 12:42 AM, Borislav Petkov <bp@alien8.de> wrote:
> On Wed, Mar 08, 2017 at 05:24:53PM -0800, Thomas Garnier wrote:
>> @@ -191,6 +191,22 @@ extern struct trace_event_functions exit_syscall_print_funcs;
>>       SYSCALL_METADATA(sname, x, __VA_ARGS__)                 \
>>       __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
>>
>> +asmlinkage void verify_pre_usermode_state(void);
>> +
>> +#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
>> +static inline bool has_user_ds(void) {
>> +     bool ret = segment_eq(get_fs(), USER_DS);
>> +     // Prevent re-ordering the call
>
> This is not the kernel comments style. Use /* */ instead.
>
>> +     barrier();
>> +     return ret;
>> +}
>> +#else
>> +static inline bool has_user_ds(void) {
>> +     return false;
>> +}
>> +#endif
>
> ... and then you could slim down the ifdeffery a bit:
>
> static inline bool has_user_ds(void) {
>         bool ret = false;
>
> #ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
>         ret = segment_eq(get_fs(), USER_DS);
>         /* Prevent re-ordering the call. */
>         barrier();
> #endif
>
>         return ret;
> }

I don't like having any kernel configuration in which has_user_ds()
unconditionally returns false.  Can we put the ifdeffery in the caller
instead?

--Andy
Thomas Garnier March 9, 2017, 5:41 p.m. UTC | #12
On Thu, Mar 9, 2017 at 9:27 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Thu, Mar 9, 2017 at 12:42 AM, Borislav Petkov <bp@alien8.de> wrote:
>> On Wed, Mar 08, 2017 at 05:24:53PM -0800, Thomas Garnier wrote:
>>> @@ -191,6 +191,22 @@ extern struct trace_event_functions exit_syscall_print_funcs;
>>>       SYSCALL_METADATA(sname, x, __VA_ARGS__)                 \
>>>       __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
>>>
>>> +asmlinkage void verify_pre_usermode_state(void);
>>> +
>>> +#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
>>> +static inline bool has_user_ds(void) {
>>> +     bool ret = segment_eq(get_fs(), USER_DS);
>>> +     // Prevent re-ordering the call
>>
>> This is not the kernel comments style. Use /* */ instead.
>>
>>> +     barrier();
>>> +     return ret;
>>> +}
>>> +#else
>>> +static inline bool has_user_ds(void) {
>>> +     return false;
>>> +}
>>> +#endif
>>
>> ... and then you could slim down the ifdeffery a bit:
>>
>> static inline bool has_user_ds(void) {
>>         bool ret = false;
>>
>> #ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
>>         ret = segment_eq(get_fs(), USER_DS);
>>         /* Prevent re-ordering the call. */
>>         barrier();
>> #endif
>>
>>         return ret;
>> }
>
> I don't like having any kernel configuration in which has_user_ds()
> unconditionally returns false.  Can we put the ifdeffery in the caller
> instead?

I don't like it either to be honest. We could have something like:

#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
#define CHECK_USER_CALLER(_x) bool _x = segment_eq(get_fs(), USER_DS)
#else
#define CHECK_USER_CALLER(_x) bool _x = false
#endif

// In the syscall macro:
CHECK_CALLED_BY_USER(user_caller);

>
> --Andy
diff mbox

Patch

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 980c3c9b06f8..78a2268ecd6e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -191,6 +191,22 @@  extern struct trace_event_functions exit_syscall_print_funcs;
 	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\
 	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
 
+asmlinkage void verify_pre_usermode_state(void);
+
+#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
+static inline bool has_user_ds(void) {
+	bool ret = segment_eq(get_fs(), USER_DS);
+	// Prevent re-ordering the call
+	barrier();
+	return ret;
+}
+#else
+static inline bool has_user_ds(void) {
+	return false;
+}
+#endif
+
+
 #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
 #define __SYSCALL_DEFINEx(x, name, ...)					\
 	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
@@ -199,7 +215,10 @@  extern struct trace_event_functions exit_syscall_print_funcs;
 	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
 	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
 	{								\
+		bool user_caller = has_user_ds();			\
 		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
+		if (user_caller)					\
+			verify_pre_usermode_state();			\
 		__MAP(x,__SC_TEST,__VA_ARGS__);				\
 		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
 		return ret;						\
diff --git a/init/Kconfig b/init/Kconfig
index c859c993c26f..c4efc3a95e4a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1929,6 +1929,13 @@  config PROFILING
 config TRACEPOINTS
 	bool
 
+#
+# Set by each architecture that want to optimize how verify_pre_usermode_state
+# is called.
+#
+config ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
+	bool
+
 source "arch/Kconfig"
 
 endmenu		# General setup
diff --git a/kernel/sys.c b/kernel/sys.c
index 196c7134bee6..411163ac9dc3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2459,3 +2459,11 @@  COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
 	return 0;
 }
 #endif /* CONFIG_COMPAT */
+
+/* Called before coming back to user-mode */
+asmlinkage void verify_pre_usermode_state(void)
+{
+	if (CHECK_DATA_CORRUPTION(!segment_eq(get_fs(), USER_DS),
+				  "incorrect get_fs() on user-mode return"))
+		set_fs(USER_DS);
+}