[v8,1/4] syscalls: Verify address limit before returning to user-mode
diff mbox

Message ID 20170426183425.32158-1-thgarnie@google.com
State New
Headers show

Commit Message

Thomas Garnier April 26, 2017, 6:34 p.m. UTC
Ensure that a syscall does not return to user-mode with a kernel address
limit. If that happens, a process can corrupt kernel-mode memory and
elevate privileges [1].

The CONFIG_ADDR_LIMIT_CHECK option disables the generic check so each
architecture can create optimized versions.

[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990

Signed-off-by: Thomas Garnier <thgarnie@google.com>
Tested-by: Kees Cook <keescook@chromium.org>
---
Based on next-20170426
---
 arch/s390/Kconfig        |  1 +
 include/linux/syscalls.h | 27 ++++++++++++++++++++++++++-
 init/Kconfig             |  6 ++++++
 kernel/sys.c             | 13 +++++++++++++
 4 files changed, 46 insertions(+), 1 deletion(-)

Comments

Ingo Molnar April 27, 2017, 6:49 a.m. UTC | #1
* Thomas Garnier <thgarnie@google.com> wrote:

> +
> +/*
> + * Called before coming back to user-mode. Returning to user-mode with an
> + * address limit different than USER_DS can allow to overwrite kernel memory.
> + */
> +static inline void addr_limit_check_syscall(void)
> +{
> +	BUG_ON(!segment_eq(get_fs(), USER_DS));
> +}
> +
> +#ifndef CONFIG_ADDR_LIMIT_CHECK
> +#define __CHECK_USERMODE_SYSCALL() \
> +	bool user_caller = segment_eq(get_fs(), USER_DS)
> +#define __VERIFY_ADDR_LIMIT() \
> +	if (user_caller) addr_limit_check_syscall()
> +#else
> +#define __CHECK_USERMODE_SYSCALL()
> +#define __VERIFY_ADDR_LIMIT()
> +asmlinkage void addr_limit_check_failed(void) __noreturn;
> +#endif

_Please_ harmonize all the externally exposed names and symbols.

There's no reason for this mismash of names:

	CONFIG_ADDR_LIMIT_CHECK

	__CHECK_USERMODE_SYSCALL
	__VERIFY_ADDR_LIMIT

When we could just as easily name them consistently, along the existing pattern:

	CONFIG_ADDR_LIMIT_CHECK

	__SYSCALL_ADDR_LIMIT_CHECK
	__ADDR_LIMIT_CHECK

which should fit into existing nomenclature:

>  #define __SYSCALL_DEFINEx(x, name, ...)					\

But even with that fixed, the whole construct still looks pretty weird:

>  	{								\
> -		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
> +		long ret;						\
> +		__CHECK_USERMODE_SYSCALL();				\
> +		ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
> +		__ADDR_LIMIT_CHECK();					\
>  		__MAP(x,__SC_TEST,__VA_ARGS__);				\
>  		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
>  		return ret;						\

I think something like this would be more natural to read:

> +		ADDR_LIMIT_CHECK_PRE();					\
> +		ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
> +		ADDR_LIMIT_CHECK_POST();				\

it's a clear pre/post construct. Also note the lack of double underscores.

BTW., a further simplification would be:

#ifndef ADDR_LIMIT_CHECK_PRE
# define ADDR_LIMIT_CHECK_PRE ...
#endif

This way architectures could override this generic functionality simply by 
defining the helpers. Architectures that don't do that get the generic version. 

Thanks,

	Ingo
Thomas Garnier April 27, 2017, 2:16 p.m. UTC | #2
On Wed, Apr 26, 2017 at 11:49 PM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Thomas Garnier <thgarnie@google.com> wrote:
>
>> +
>> +/*
>> + * Called before coming back to user-mode. Returning to user-mode with an
>> + * address limit different than USER_DS can allow to overwrite kernel memory.
>> + */
>> +static inline void addr_limit_check_syscall(void)
>> +{
>> +     BUG_ON(!segment_eq(get_fs(), USER_DS));
>> +}
>> +
>> +#ifndef CONFIG_ADDR_LIMIT_CHECK
>> +#define __CHECK_USERMODE_SYSCALL() \
>> +     bool user_caller = segment_eq(get_fs(), USER_DS)
>> +#define __VERIFY_ADDR_LIMIT() \
>> +     if (user_caller) addr_limit_check_syscall()
>> +#else
>> +#define __CHECK_USERMODE_SYSCALL()
>> +#define __VERIFY_ADDR_LIMIT()
>> +asmlinkage void addr_limit_check_failed(void) __noreturn;
>> +#endif
>
> _Please_ harmonize all the externally exposed names and symbols.
>
> There's no reason for this mismash of names:
>
>         CONFIG_ADDR_LIMIT_CHECK
>
>         __CHECK_USERMODE_SYSCALL
>         __VERIFY_ADDR_LIMIT
>
> When we could just as easily name them consistently, along the existing pattern:
>
>         CONFIG_ADDR_LIMIT_CHECK
>
>         __SYSCALL_ADDR_LIMIT_CHECK
>         __ADDR_LIMIT_CHECK
>
> which should fit into existing nomenclature:
>
>>  #define __SYSCALL_DEFINEx(x, name, ...)                                      \
>
> But even with that fixed, the whole construct still looks pretty weird:
>
>>       {                                                               \
>> -             long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));  \
>> +             long ret;                                               \
>> +             __CHECK_USERMODE_SYSCALL();                             \
>> +             ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));       \
>> +             __ADDR_LIMIT_CHECK();                                   \
>>               __MAP(x,__SC_TEST,__VA_ARGS__);                         \
>>               __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));       \
>>               return ret;                                             \
>
> I think something like this would be more natural to read:
>
>> +             ADDR_LIMIT_CHECK_PRE();                                 \
>> +             ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));       \
>> +             ADDR_LIMIT_CHECK_POST();                                \
>
> it's a clear pre/post construct. Also note the lack of double underscores.

I think this construct makes more sense because the first macro check
if the syscall was called by user-mode. I will send an update for this
on this thread.

>
> BTW., a further simplification would be:
>
> #ifndef ADDR_LIMIT_CHECK_PRE
> # define ADDR_LIMIT_CHECK_PRE ...
> #endif
>
> This way architectures could override this generic functionality simply by
> defining the helpers. Architectures that don't do that get the generic version.

I don't think architectures need to do that. The optimizations are
embedding the checks on their architecture-specific code to make it
faster and remove the size impact. The pre/post is fine for the rest.

>
> Thanks,
>
>         Ingo
Ingo Molnar April 28, 2017, 6:33 a.m. UTC | #3
* Thomas Garnier <thgarnie@google.com> wrote:

> > BTW., a further simplification would be:
> >
> > #ifndef ADDR_LIMIT_CHECK_PRE
> > # define ADDR_LIMIT_CHECK_PRE ...
> > #endif
> >
> > This way architectures could override this generic functionality simply by
> > defining the helpers. Architectures that don't do that get the generic version.
> 
> I don't think architectures need to do that. The optimizations are
> embedding the checks on their architecture-specific code to make it
> faster and remove the size impact. The pre/post is fine for the rest.

Indeed, only the generic code needs to turn off that code - architectures will 
place these callbacks elsewhere.

Thanks,

	Ingo

Patch
diff mbox

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index d25435d94b6e..164de1d24e92 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -103,6 +103,7 @@  config S390
 	select ARCH_INLINE_WRITE_UNLOCK_BH
 	select ARCH_INLINE_WRITE_UNLOCK_IRQ
 	select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+	select ADDR_LIMIT_CHECK
 	select ARCH_SAVE_PAGE_KEYS if HIBERNATION
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 980c3c9b06f8..ebde64f1622c 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -191,6 +191,28 @@  extern struct trace_event_functions exit_syscall_print_funcs;
 	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\
 	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
 
+
+/*
+ * Called before coming back to user-mode. Returning to user-mode with an
+ * address limit different than USER_DS can allow to overwrite kernel memory.
+ */
+static inline void addr_limit_check_syscall(void)
+{
+	BUG_ON(!segment_eq(get_fs(), USER_DS));
+}
+
+#ifndef CONFIG_ADDR_LIMIT_CHECK
+#define __CHECK_USERMODE_SYSCALL() \
+	bool user_caller = segment_eq(get_fs(), USER_DS)
+#define __VERIFY_ADDR_LIMIT() \
+	if (user_caller) addr_limit_check_syscall()
+#else
+#define __CHECK_USERMODE_SYSCALL()
+#define __VERIFY_ADDR_LIMIT()
+asmlinkage void addr_limit_check_failed(void) __noreturn;
+#endif
+
+
 #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
 #define __SYSCALL_DEFINEx(x, name, ...)					\
 	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
@@ -199,7 +221,10 @@  extern struct trace_event_functions exit_syscall_print_funcs;
 	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
 	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
 	{								\
-		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
+		long ret;						\
+		__CHECK_USERMODE_SYSCALL();				\
+		ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
+		__VERIFY_ADDR_LIMIT();					\
 		__MAP(x,__SC_TEST,__VA_ARGS__);				\
 		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
 		return ret;						\
diff --git a/init/Kconfig b/init/Kconfig
index 42a346b0df43..599d9fe30703 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1961,6 +1961,12 @@  config PROFILING
 config TRACEPOINTS
 	bool
 
+config ADDR_LIMIT_CHECK
+	bool
+	help
+	  Disable the generic address limit check. Allow each architecture to
+	  optimize how and when the verification is done.
+
 source "arch/Kconfig"
 
 endmenu		# General setup
diff --git a/kernel/sys.c b/kernel/sys.c
index 8a94b4eabcaa..a1cbcd715d62 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2458,3 +2458,16 @@  COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
 	return 0;
 }
 #endif /* CONFIG_COMPAT */
+
+#ifdef CONFIG_ADDR_LIMIT_CHECK
+/*
+ * Used when an architecture specific implementation detects an invalid address
+ * limit. This function does not return.
+ */
+asmlinkage void addr_limit_check_failed(void)
+{
+	/* Try to fail on the generic address limit check */
+	addr_limit_check_syscall();
+	panic("Invalid address limit before returning to user-mode");
+}
+#endif