diff mbox

[v9,1/4] syscalls: Verify address limit before returning to user-mode

Message ID 20170428153213.137279-1-thgarnie@google.com
State New, archived
Headers show

Commit Message

Thomas Garnier April 28, 2017, 3:32 p.m. UTC
Ensure that a syscall does not return to user-mode with a kernel address
limit. If that happens, a process can corrupt kernel-mode memory and
elevate privileges [1].

The CONFIG_ADDR_LIMIT_CHECK option disables the generic check so each
architecture can create optimized versions. This option is enabled by
default on s390 because a similar feature already exists.

[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990

Signed-off-by: Thomas Garnier <thgarnie@google.com>
Tested-by: Kees Cook <keescook@chromium.org>
---
Based on next-20170426
---
 arch/s390/Kconfig        |  1 +
 include/linux/syscalls.h | 27 ++++++++++++++++++++++++++-
 init/Kconfig             |  6 ++++++
 kernel/sys.c             | 13 +++++++++++++
 4 files changed, 46 insertions(+), 1 deletion(-)

Comments

Thomas Garnier May 5, 2017, 10:18 p.m. UTC | #1
On Fri, Apr 28, 2017 at 8:32 AM, Thomas Garnier <thgarnie@google.com> wrote:
> Ensure that a syscall does not return to user-mode with a kernel address
> limit. If that happens, a process can corrupt kernel-mode memory and
> elevate privileges [1].
>
> The CONFIG_ADDR_LIMIT_CHECK option disables the generic check so each
> architecture can create optimized versions. This option is enabled by
> default on s390 because a similar feature already exists.
>
> [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
>
> Signed-off-by: Thomas Garnier <thgarnie@google.com>
> Tested-by: Kees Cook <keescook@chromium.org>

Ingo: Do you want to take the set?

> ---
> Based on next-20170426
> ---
>  arch/s390/Kconfig        |  1 +
>  include/linux/syscalls.h | 27 ++++++++++++++++++++++++++-
>  init/Kconfig             |  6 ++++++
>  kernel/sys.c             | 13 +++++++++++++
>  4 files changed, 46 insertions(+), 1 deletion(-)
>
> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
> index d25435d94b6e..3d2ec084d5fc 100644
> --- a/arch/s390/Kconfig
> +++ b/arch/s390/Kconfig
> @@ -64,6 +64,7 @@ config ARCH_SUPPORTS_UPROBES
>
>  config S390
>         def_bool y
> +       select ADDR_LIMIT_CHECK
>         select ARCH_HAS_DEVMEM_IS_ALLOWED
>         select ARCH_HAS_ELF_RANDOMIZE
>         select ARCH_HAS_GCOV_PROFILE_ALL
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 980c3c9b06f8..e534b93ce43a 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -191,6 +191,28 @@ extern struct trace_event_functions exit_syscall_print_funcs;
>         SYSCALL_METADATA(sname, x, __VA_ARGS__)                 \
>         __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
>
> +
> +/*
> + * Called before coming back to user-mode. Returning to user-mode with an
> + * address limit different than USER_DS can allow to overwrite kernel memory.
> + */
> +static inline void addr_limit_check_syscall(void)
> +{
> +       BUG_ON(!segment_eq(get_fs(), USER_DS));
> +}
> +
> +#ifndef CONFIG_ADDR_LIMIT_CHECK
> +#define ADDR_LIMIT_CHECK_PRE() \
> +       bool user_caller = segment_eq(get_fs(), USER_DS)
> +#define ADDR_LIMIT_CHECK_POST() \
> +       if (user_caller) addr_limit_check_syscall()
> +#else
> +#define ADDR_LIMIT_CHECK_PRE()
> +#define ADDR_LIMIT_CHECK_POST()
> +asmlinkage void addr_limit_check_failed(void) __noreturn;
> +#endif
> +
> +
>  #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
>  #define __SYSCALL_DEFINEx(x, name, ...)                                        \
>         asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))       \
> @@ -199,7 +221,10 @@ extern struct trace_event_functions exit_syscall_print_funcs;
>         asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));      \
>         asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))       \
>         {                                                               \
> -               long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));  \
> +               long ret;                                               \
> +               ADDR_LIMIT_CHECK_PRE();                                 \
> +               ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));       \
> +               ADDR_LIMIT_CHECK_POST();                                \
>                 __MAP(x,__SC_TEST,__VA_ARGS__);                         \
>                 __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));       \
>                 return ret;                                             \
> diff --git a/init/Kconfig b/init/Kconfig
> index 42a346b0df43..599d9fe30703 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1961,6 +1961,12 @@ config PROFILING
>  config TRACEPOINTS
>         bool
>
> +config ADDR_LIMIT_CHECK
> +       bool
> +       help
> +         Disable the generic address limit check. Allow each architecture to
> +         optimize how and when the verification is done.
> +
>  source "arch/Kconfig"
>
>  endmenu                # General setup
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 8a94b4eabcaa..a1cbcd715d62 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2458,3 +2458,16 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
>         return 0;
>  }
>  #endif /* CONFIG_COMPAT */
> +
> +#ifdef CONFIG_ADDR_LIMIT_CHECK
> +/*
> + * Used when an architecture specific implementation detects an invalid address
> + * limit. This function does not return.
> + */
> +asmlinkage void addr_limit_check_failed(void)
> +{
> +       /* Try to fail on the generic address limit check */
> +       addr_limit_check_syscall();
> +       panic("Invalid address limit before returning to user-mode");
> +}
> +#endif
> --
> 2.13.0.rc0.306.g87b477812d-goog
>
Ingo Molnar May 8, 2017, 7:33 a.m. UTC | #2
(added more Cc:s)

* Thomas Garnier <thgarnie@google.com> wrote:

> On Fri, Apr 28, 2017 at 8:32 AM, Thomas Garnier <thgarnie@google.com> wrote:
> > Ensure that a syscall does not return to user-mode with a kernel address
> > limit. If that happens, a process can corrupt kernel-mode memory and
> > elevate privileges [1].
> >
> > The CONFIG_ADDR_LIMIT_CHECK option disables the generic check so each
> > architecture can create optimized versions. This option is enabled by
> > default on s390 because a similar feature already exists.
> >
> > [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
> >
> > Signed-off-by: Thomas Garnier <thgarnie@google.com>
> > Tested-by: Kees Cook <keescook@chromium.org>
> 
> Ingo: Do you want to take the set?

Yeah, so now I'm questioning the whole premise of the feature, sorry :-/

A big disavantage is that the "security check" will add 2-5 instructions to the 
system call fast path. Every one of them, and essentially forever. Just to handle 
a CVE that was caused by a buggy touch-screen driver helper function leaking 
KERNEL_DS and which was fixed long ago ...

And yes, I realize that there were other such bugs and that such bugs might occur 
in the future - but why not push the overhead of the security check to the kernel 
build phase? I.e. I'm wondering how well we could do static analysis during kernel 
build - would a limited mode of Sparse be good enough for that? Or we could add a 
new static checker to tools/, built from first principles and used primarily for 
extended syntactical checking.

For example I'd consider it a good practice to mandate that if a kernel function 
sets KERNEL_DS then it must restore it as well. Any function that does not do 
that, or is too complex for the static analysis to prove correctness for sure 
should be considered buggy!

Are there any common kernel APIs outside set_fs() that set KERNEL_DS 
intentionally? The overwhelming pattern ought to be:

        orig_fs = get_fs();
        set_fs(KERNEL_DS);
	...
        set_fs(orig_fs);

... and even a relatively simple static analysis tool ought to be able to see 
through that.

I'd even suggest we do it not like Sparse builds are done today, but in a more 
integrated fashion: do static analysis as part of a typical kernel defconfig build 
and not tolerate warnings but go for a 'zero warnings' policy like Linus uses for 
modconfig builds.

_That_ solution I'd feel very, very good about - it would be so much better than 
any runtime checks...

Not to mention that such an integrated static analysis facility would allow many 
other things to be checked during build time, which we couldn't possibly check 
runtime.

Thanks,

	Ingo
Ingo Molnar May 8, 2017, 7:52 a.m. UTC | #3
* Ingo Molnar <mingo@kernel.org> wrote:

> ... and even a relatively simple static analysis tool ought to be able to see 
> through that.
> 
> I'd even suggest we do it not like Sparse builds are done today, but in a more 
> integrated fashion: do static analysis as part of a typical kernel defconfig 
> build and not tolerate warnings but go for a 'zero warnings' policy like Linus 
> uses for modconfig builds.
> 
> _That_ solution I'd feel very, very good about - it would be so much better than 
> any runtime checks...

So the problem I have with Sparse is that it is very spammy. For example:

	make C=1 kernel/sched/

... produces:

    kernel/sched/core.c:792:6: warning: symbol 'sched_set_stop_task' was not declared. Should it be static?
    kernel/sched/core.c:1298:5: warning: symbol 'migrate_swap' was not declared. Should it be static?
    kernel/sched/core.c:3648:35: warning: symbol 'preempt_schedule_irq' was not declared. Should it be static?
    ./include/linux/uaccess.h:166:18: warning: incorrect type in argument 1 (different modifiers)
    ./include/linux/uaccess.h:166:18:    expected void *<noident>
    ./include/linux/uaccess.h:166:18:    got void const *from
    ./include/linux/uaccess.h:166:18: warning: incorrect type in argument 1 (different modifiers)
    ./include/linux/uaccess.h:166:18:    expected void *<noident>
    ./include/linux/uaccess.h:166:18:    got void const *from
    ./include/linux/uaccess.h:166:18: warning: incorrect type in argument 1 (different modifiers)
    ./include/linux/uaccess.h:166:18:    expected void *<noident>
    ./include/linux/uaccess.h:166:18:    got void const *from
    ./include/linux/uaccess.h:166:18: warning: incorrect type in argument 1 (different modifiers)
    ./include/linux/uaccess.h:166:18:    expected void *<noident>
    ./include/linux/uaccess.h:166:18:    got void const *from
    kernel/sched/clock.c:80:19: warning: symbol 'sched_clock_running' was not declared. Should it be static?
    ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool
    ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool
    ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool
    ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool
    ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool
    ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool
    kernel/sched/cputime.c:335:33: warning: context imbalance in 'thread_group_cputime' - different lock contexts for basic block
    kernel/sched/fair.c:54:14: warning: symbol 'normalized_sysctl_sched_latency' was not declared. Should it be static?
    kernel/sched/fair.c:75:14: warning: symbol 'normalized_sysctl_sched_min_granularity' was not declared. Should it be static?
    kernel/sched/fair.c:98:14: warning: symbol 'normalized_sysctl_sched_wakeup_granularity' was not declared. Should it be static?
    kernel/sched/fair.c:132:14: warning: symbol 'capacity_margin' was not declared. Should it be static?
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/fair.c:4688:35: error: marked inline, but without a definition
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/fair.c:5817:19: error: cannot dereference this type
    kernel/sched/fair.c:5817:19: error: cannot dereference this type
    kernel/sched/fair.c:5817:19: error: internal error: bad type in derived(11)
    kernel/sched/fair.c:5817:19: error: cannot dereference this type
    kernel/sched/fair.c:5817:19: error: incompatible types in comparison expression (different base types)
    kernel/sched/fair.c:5817:19: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/rt.c:635:6: warning: symbol 'sched_rt_bandwidth_account' was not declared. Should it be static?
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool
    ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool
    kernel/sched/topology.c:499:28: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:499:28:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:499:28:    got struct sched_domain **<noident>
    kernel/sched/topology.c:534:28: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:534:28:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:534:28:    got struct sched_domain **<noident>
    kernel/sched/topology.c:554:28: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:554:28:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:554:28:    got struct sched_group_capacity **<noident>
    kernel/sched/topology.c:594:36: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:594:36:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:594:36:    got struct sched_domain **<noident>
    kernel/sched/topology.c:601:24: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:601:24:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:601:24:    got struct sched_group **<noident>
    kernel/sched/topology.c:602:31: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:602:31:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:602:31:    got struct sched_group_capacity **<noident>
    kernel/sched/topology.c:1330:39: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1330:39:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1330:39:    got struct sched_domain **<noident>
    kernel/sched/topology.c:1333:40: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1333:40:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1333:40:    got struct sched_domain **<noident>
    kernel/sched/topology.c:1337:40: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1337:40:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1337:40:    got struct sched_domain_shared **<noident>
    kernel/sched/topology.c:1339:40: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1339:40:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1339:40:    got struct sched_group **<noident>
    kernel/sched/topology.c:1341:40: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1341:40:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1341:40:    got struct sched_group_capacity **<noident>
    kernel/sched/topology.c:1343:32: warning: incorrect type in argument 1 (different address spaces)
    kernel/sched/topology.c:1343:32:    expected void [noderef] <asn:3>*__pdata
    kernel/sched/topology.c:1343:32:    got struct sched_domain **[noderef] sd
    kernel/sched/topology.c:1345:32: warning: incorrect type in argument 1 (different address spaces)
    kernel/sched/topology.c:1345:32:    expected void [noderef] <asn:3>*__pdata
    kernel/sched/topology.c:1345:32:    got struct sched_domain_shared **[noderef] sds
    kernel/sched/topology.c:1347:32: warning: incorrect type in argument 1 (different address spaces)
    kernel/sched/topology.c:1347:32:    expected void [noderef] <asn:3>*__pdata
    kernel/sched/topology.c:1347:32:    got struct sched_group **[noderef] sg
    kernel/sched/topology.c:1349:32: warning: incorrect type in argument 1 (different address spaces)
    kernel/sched/topology.c:1349:32:    expected void [noderef] <asn:3>*__pdata
    kernel/sched/topology.c:1349:32:    got struct sched_group_capacity **[noderef] sgc
    kernel/sched/topology.c:1261:25: warning: incorrect type in assignment (different address spaces)
    kernel/sched/topology.c:1261:25:    expected struct sched_domain **[noderef] sd
    kernel/sched/topology.c:1261:25:    got struct sched_domain *[noderef] <asn:3>*<noident>
    kernel/sched/topology.c:1265:26: warning: incorrect type in assignment (different address spaces)
    kernel/sched/topology.c:1265:26:    expected struct sched_domain_shared **[noderef] sds
    kernel/sched/topology.c:1265:26:    got struct sched_domain_shared *[noderef] <asn:3>*<noident>
    kernel/sched/topology.c:1269:25: warning: incorrect type in assignment (different address spaces)
    kernel/sched/topology.c:1269:25:    expected struct sched_group **[noderef] sg
    kernel/sched/topology.c:1269:25:    got struct sched_group *[noderef] <asn:3>*<noident>
    kernel/sched/topology.c:1273:26: warning: incorrect type in assignment (different address spaces)
    kernel/sched/topology.c:1273:26:    expected struct sched_group_capacity **[noderef] sgc
    kernel/sched/topology.c:1273:26:    got struct sched_group_capacity *[noderef] <asn:3>*<noident>
    kernel/sched/topology.c:1288:26: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1288:26:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1288:26:    got struct sched_domain **<noident>
    kernel/sched/topology.c:1295:26: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1295:26:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1295:26:    got struct sched_domain_shared **<noident>
    kernel/sched/topology.c:1304:26: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1304:26:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1304:26:    got struct sched_group **<noident>
    kernel/sched/topology.c:1311:26: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1311:26:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1311:26:    got struct sched_group_capacity **<noident>
    kernel/sched/topology.c:759:30: warning: incorrect type in argument 1 (different address spaces)
    kernel/sched/topology.c:759:30:    expected void [noderef] <asn:3>*__pdata
    kernel/sched/topology.c:759:30:    got struct sched_domain **[noderef] sd
    kernel/sched/topology.c:776:15: warning: incorrect type in assignment (different address spaces)
    kernel/sched/topology.c:776:15:    expected struct sched_domain **[noderef] sd
    kernel/sched/topology.c:776:15:    got struct sched_domain *[noderef] <asn:3>*<noident>
    kernel/sched/topology.c:794:9: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:794:9:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:794:9:    got struct sched_domain **<noident>
    kernel/sched/topology.c:795:10: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:795:10:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:795:10:    got struct sched_domain **<noident>
    kernel/sched/topology.c:797:28: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:797:28:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:797:28:    got struct sched_domain_shared **<noident>
    kernel/sched/topology.c:798:18: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:798:18:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:798:18:    got struct sched_domain_shared **<noident>
    kernel/sched/topology.c:800:28: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:800:28:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:800:28:    got struct sched_group **<noident>
    kernel/sched/topology.c:801:18: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:801:18:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:801:18:    got struct sched_group **<noident>
    kernel/sched/topology.c:803:28: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:803:28:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:803:28:    got struct sched_group_capacity **<noident>
    kernel/sched/topology.c:804:18: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:804:18:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:804:18:    got struct sched_group_capacity **<noident>
    kernel/sched/topology.c:848:36: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:848:36:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:848:36:    got struct sched_domain **<noident>
    kernel/sched/topology.c:954:31: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:954:31:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:954:31:    got struct sched_domain_shared **<noident>
    kernel/sched/topology.c:1354:21: warning: symbol 'build_sched_domain' was not declared. Should it be static?
    kernel/sched/topology.c:1409:34: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1409:34:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1409:34:    got struct sched_domain **<noident>
    kernel/sched/topology.c:1419:28: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1419:28:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1419:28:    got struct sched_domain **<noident>
    kernel/sched/topology.c:1436:28: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1436:28:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1436:28:    got struct sched_domain **<noident>
    kernel/sched/topology.c:1446:23: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1446:23:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1446:23:    got struct sched_domain **<noident>
    kernel/sched/topology.c:759:29: warning: dereference of noderef expression
    kernel/sched/topology.c:777:14: warning: dereference of noderef expression
    kernel/sched/topology.c:1262:22: warning: dereference of noderef expression
    kernel/sched/topology.c:1266:22: warning: dereference of noderef expression
    kernel/sched/topology.c:1270:22: warning: dereference of noderef expression
    kernel/sched/topology.c:1274:22: warning: dereference of noderef expression
    kernel/sched/topology.c:1329:29: warning: dereference of noderef expression
    kernel/sched/topology.c:1336:29: warning: dereference of noderef expression
    kernel/sched/topology.c:1338:29: warning: dereference of noderef expression
    kernel/sched/topology.c:1340:29: warning: dereference of noderef expression
    kernel/sched/topology.c:1343:29: warning: dereference of noderef expression
    kernel/sched/topology.c:1344:17: warning: dereference of noderef expression
    kernel/sched/topology.c:1345:29: warning: dereference of noderef expression
    kernel/sched/topology.c:1346:17: warning: dereference of noderef expression
    kernel/sched/topology.c:1347:29: warning: dereference of noderef expression
    kernel/sched/topology.c:1348:17: warning: dereference of noderef expression
    kernel/sched/topology.c:1349:29: warning: dereference of noderef expression
    kernel/sched/topology.c:1350:17: warning: dereference of noderef expression

... it's just not usable in that form for a regular maintenance flow.

So what would be more useful is to add a specific Sparse check that only checks 
KERNEL_DS, to add it as a regular (.config driven) build option and make sure the 
kernel build has zero warnings.

From that point on we can declare that this kind of bug won't occur anymore, if 
the Sparse implementation of the check is correct.

But there's a (big) problem with that development model: Sparse is not part of the 
kernel tree and adding a feature to it while making the kernel depend on that 
brand new feature is a logistical nightmare. The overhead is quite similar to 
adding new features to a compiler - it happens at a glacial pace and is only done 
for major features really, at considerable expense. I don't think this is an 
adequate model for 'extended syntax checking' of the kernel, especially when it 
comes to correctness that has such obvious security impact.

Thanks,

	Ingo
Greg KH May 8, 2017, 12:46 p.m. UTC | #4
On Mon, May 08, 2017 at 09:33:52AM +0200, Ingo Molnar wrote:
> 
> (added more Cc:s)
> 
> * Thomas Garnier <thgarnie@google.com> wrote:
> 
> > On Fri, Apr 28, 2017 at 8:32 AM, Thomas Garnier <thgarnie@google.com> wrote:
> > > Ensure that a syscall does not return to user-mode with a kernel address
> > > limit. If that happens, a process can corrupt kernel-mode memory and
> > > elevate privileges [1].
> > >
> > > The CONFIG_ADDR_LIMIT_CHECK option disables the generic check so each
> > > architecture can create optimized versions. This option is enabled by
> > > default on s390 because a similar feature already exists.
> > >
> > > [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
> > >
> > > Signed-off-by: Thomas Garnier <thgarnie@google.com>
> > > Tested-by: Kees Cook <keescook@chromium.org>
> > 
> > Ingo: Do you want to take the set?
> 
> Yeah, so now I'm questioning the whole premise of the feature, sorry :-/
> 
> A big disavantage is that the "security check" will add 2-5 instructions to the 
> system call fast path. Every one of them, and essentially forever. Just to handle 
> a CVE that was caused by a buggy touch-screen driver helper function leaking 
> KERNEL_DS and which was fixed long ago ...
> 
> And yes, I realize that there were other such bugs and that such bugs might occur 
> in the future - but why not push the overhead of the security check to the kernel 
> build phase? I.e. I'm wondering how well we could do static analysis during kernel 
> build - would a limited mode of Sparse be good enough for that? Or we could add a 
> new static checker to tools/, built from first principles and used primarily for 
> extended syntactical checking.
> 
> For example I'd consider it a good practice to mandate that if a kernel function 
> sets KERNEL_DS then it must restore it as well. Any function that does not do 
> that, or is too complex for the static analysis to prove correctness for sure 
> should be considered buggy!
> 
> Are there any common kernel APIs outside set_fs() that set KERNEL_DS 
> intentionally? The overwhelming pattern ought to be:
> 
>         orig_fs = get_fs();
>         set_fs(KERNEL_DS);
> 	...
>         set_fs(orig_fs);
> 
> ... and even a relatively simple static analysis tool ought to be able to see 
> through that.
> 
> I'd even suggest we do it not like Sparse builds are done today, but in a more 
> integrated fashion: do static analysis as part of a typical kernel defconfig build 
> and not tolerate warnings but go for a 'zero warnings' policy like Linus uses for 
> modconfig builds.
> 
> _That_ solution I'd feel very, very good about - it would be so much better than 
> any runtime checks...
> 
> Not to mention that such an integrated static analysis facility would allow many 
> other things to be checked during build time, which we couldn't possibly check 
> runtime.

What about a simple coccinelle script to test for this type of thing?
We write it once, add it to the in-kernel body of tests, and then 0-day
runs it on all trees all the time.  That should catch this type of
issue, like all of the other "bad programming bus" that the tool
currently catches.

thanks,

greg k-h
Kees Cook May 8, 2017, 1:09 p.m. UTC | #5
On Mon, May 8, 2017 at 12:33 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> (added more Cc:s)
>
> * Thomas Garnier <thgarnie@google.com> wrote:
>
>> On Fri, Apr 28, 2017 at 8:32 AM, Thomas Garnier <thgarnie@google.com> wrote:
>> > Ensure that a syscall does not return to user-mode with a kernel address
>> > limit. If that happens, a process can corrupt kernel-mode memory and
>> > elevate privileges [1].
>> >
>> > The CONFIG_ADDR_LIMIT_CHECK option disables the generic check so each
>> > architecture can create optimized versions. This option is enabled by
>> > default on s390 because a similar feature already exists.
>> >
>> > [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
>> >
>> > Signed-off-by: Thomas Garnier <thgarnie@google.com>
>> > Tested-by: Kees Cook <keescook@chromium.org>
>>
>> Ingo: Do you want to take the set?
>
> Yeah, so now I'm questioning the whole premise of the feature, sorry :-/
>
> A big disavantage is that the "security check" will add 2-5 instructions to the
> system call fast path. Every one of them, and essentially forever. Just to handle
> a CVE that was caused by a buggy touch-screen driver helper function leaking
> KERNEL_DS and which was fixed long ago ...
>
> And yes, I realize that there were other such bugs and that such bugs might occur
> in the future - but why not push the overhead of the security check to the kernel
> build phase? I.e. I'm wondering how well we could do static analysis during kernel
> build - would a limited mode of Sparse be good enough for that? Or we could add a
> new static checker to tools/, built from first principles and used primarily for
> extended syntactical checking.

Static analysis is just not going to cover all cases. We've had
vulnerabilities where interrupt handlers left KERNEL_DS set, for
example. If there are performance concerns, let's put this behind a
CONFIG. 2-5 instructions is not an issue for most people that want
this coverage.

> For example I'd consider it a good practice to mandate that if a kernel function
> sets KERNEL_DS then it must restore it as well. Any function that does not do
> that, or is too complex for the static analysis to prove correctness for sure
> should be considered buggy!
>
> Are there any common kernel APIs outside set_fs() that set KERNEL_DS
> intentionally? The overwhelming pattern ought to be:
>
>         orig_fs = get_fs();
>         set_fs(KERNEL_DS);
>         ...
>         set_fs(orig_fs);
>
> ... and even a relatively simple static analysis tool ought to be able to see
> through that.

This pattern was, in fact, what the interrupt handler bug escaped
from. We have to build proactive defenses, and this check has a clear
defensive advantage. It's a noble goal to improve the static analyzers
and simplify the source, but we have too much history to prove that
this just isn't enough. This instruction cost of this is extremely
small, too. Until we can eliminate set_fs(), we need to add this
check.


> I'd even suggest we do it not like Sparse builds are done today, but in a more
> integrated fashion: do static analysis as part of a typical kernel defconfig build
> and not tolerate warnings but go for a 'zero warnings' policy like Linus uses for
> modconfig builds.
>
> _That_ solution I'd feel very, very good about - it would be so much better than
> any runtime checks...

I'm not opposed to this, but there will be push-back on "making the
build slower", and it still won't catch everything. Bug-finding is
different from making a bug class just unexploitable at all. As we've
done before, it's the difference between trying to find format string
attacks vs just removing %n from the format parser.

> Not to mention that such an integrated static analysis facility would allow many
> other things to be checked during build time, which we couldn't possibly check
> runtime.

Absolutely! But it's orthogonal to proactive runtime exploit blocking.
We've got one that works and defends against an entire class of
vulnerability for very low cost. It it's truly too costly for default,
let's put it behind a CONFIG and see who wants it. (Most distros, I
suspect, will enable it, just like hardened usercopy which is much
more expensive than this.)

-Kees
Ingo Molnar May 8, 2017, 2:02 p.m. UTC | #6
* Kees Cook <keescook@chromium.org> wrote:

> > And yes, I realize that there were other such bugs and that such bugs might 
> > occur in the future - but why not push the overhead of the security check to 
> > the kernel build phase? I.e. I'm wondering how well we could do static 
> > analysis during kernel build - would a limited mode of Sparse be good enough 
> > for that? Or we could add a new static checker to tools/, built from first 
> > principles and used primarily for extended syntactical checking.
> 
> Static analysis is just not going to cover all cases. We've had vulnerabilities 
> where interrupt handlers left KERNEL_DS set, for example. [...]

Got any commit ID of that bug - was it because a function executed by the 
interrupt handler leaked KERNEL_DS?

> [...] If there are performance concerns, let's put this behind a CONFIG. 2-5 
> instructions is not an issue for most people that want this coverage.

That doesn't really _solve_ the performance concerns, it just forces most people 
to enable it by creating a 'security or performance' false dichotomy ...

> [...] and it still won't catch everything. Bug-finding is different from making 
> a bug class just unexploitable at all. As we've done before, it's the difference 
> between trying to find format string attacks vs just removing %n from the format 
> parser.

No, it does not make it unexploitable, it could still be exploitable if the 
runtime check is buggy or if there's kernel execution outside of the regular 
system call paths - there's plenty of such hardware functionality on x86 for 
example.

Thanks,

	Ingo
Jann Horn May 8, 2017, 2:06 p.m. UTC | #7
On Mon, May 8, 2017 at 4:02 PM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Kees Cook <keescook@chromium.org> wrote:
>
>> > And yes, I realize that there were other such bugs and that such bugs might
>> > occur in the future - but why not push the overhead of the security check to
>> > the kernel build phase? I.e. I'm wondering how well we could do static
>> > analysis during kernel build - would a limited mode of Sparse be good enough
>> > for that? Or we could add a new static checker to tools/, built from first
>> > principles and used primarily for extended syntactical checking.
>>
>> Static analysis is just not going to cover all cases. We've had vulnerabilities
>> where interrupt handlers left KERNEL_DS set, for example. [...]
>
> Got any commit ID of that bug - was it because a function executed by the
> interrupt handler leaked KERNEL_DS?

I think Kees might be talking about
https://bugs.chromium.org/p/project-zero/issues/detail?id=822, fixed in
commit e6978e4bf181fb3b5f8cb6f71b4fe30fbf1b655c. The issue was that
perf code that can run in pretty much any context called access_ok().
Daniel Micay May 8, 2017, 3:22 p.m. UTC | #8
On Mon, 2017-05-08 at 09:52 +0200, Ingo Molnar wrote:
> 
> ... it's just not usable in that form for a regular maintenance flow.
> 
> So what would be more useful is to add a specific Sparse check that
> only checks 
> KERNEL_DS, to add it as a regular (.config driven) build option and
> make sure the 
> kernel build has zero warnings.
> 
> From that point on we can declare that this kind of bug won't occur
> anymore, if 
> the Sparse implementation of the check is correct.
> 
> But there's a (big) problem with that development model: Sparse is not
> part of the 
> kernel tree and adding a feature to it while making the kernel depend
> on that 
> brand new feature is a logistical nightmare. The overhead is quite
> similar to 
> adding new features to a compiler - it happens at a glacial pace and
> is only done 
> for major features really, at considerable expense. I don't think this
> is an 
> adequate model for 'extended syntax checking' of the kernel,
> especially when it 
> comes to correctness that has such obvious security impact.
> 
> Thanks,
> 
> 	Ingo

There's the option of using GCC plugins now that the infrastructure was
upstreamed from grsecurity. It can be used as part of the regular build
process and as long as the analysis is pretty simple it shouldn't hurt
compile time much.

The problem with doing that is I don't think there are people with much
experience with GCC contributing upstream and it's going to be more work
to develop/maintain than some kind of specialized DSL for analysis. I
think a few static analysis plugins are used as part of maintaining
grsecurity for solving issues like finding false positives for the
REFCOUNT overflow checking feature, so it's something that's already
being done in practice elsewhere.
Kees Cook May 8, 2017, 3:24 p.m. UTC | #9
On Mon, May 8, 2017 at 7:02 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Kees Cook <keescook@chromium.org> wrote:
>
>> > And yes, I realize that there were other such bugs and that such bugs might
>> > occur in the future - but why not push the overhead of the security check to
>> > the kernel build phase? I.e. I'm wondering how well we could do static
>> > analysis during kernel build - would a limited mode of Sparse be good enough
>> > for that? Or we could add a new static checker to tools/, built from first
>> > principles and used primarily for extended syntactical checking.
>>
>> Static analysis is just not going to cover all cases. We've had vulnerabilities
>> where interrupt handlers left KERNEL_DS set, for example. [...]
>
> Got any commit ID of that bug - was it because a function executed by the
> interrupt handler leaked KERNEL_DS?

Ah, it was an exception handler, but the one I was thinking of was this:
https://lwn.net/Articles/419141/

>> [...] If there are performance concerns, let's put this behind a CONFIG. 2-5
>> instructions is not an issue for most people that want this coverage.
>
> That doesn't really _solve_ the performance concerns, it just forces most people
> to enable it by creating a 'security or performance' false dichotomy ...

That's fair, but what I'm trying to say is that many people will want
this, so rejecting it because it's 2 more instructions seems
unreasonable. We have had much more invasive changes added to the
kernel.

>> [...] and it still won't catch everything. Bug-finding is different from making
>> a bug class just unexploitable at all. As we've done before, it's the difference
>> between trying to find format string attacks vs just removing %n from the format
>> parser.
>
> No, it does not make it unexploitable, it could still be exploitable if the
> runtime check is buggy or if there's kernel execution outside of the regular
> system call paths - there's plenty of such hardware functionality on x86 for
> example.

Fine, but this is splitting hairs. This does protect a specific
situation, and it does so very cheaply. The real fix would be to
remove set_fs() entirely. :P

-Kees
Kees Cook May 8, 2017, 3:26 p.m. UTC | #10
On Mon, May 8, 2017 at 8:22 AM, Daniel Micay <danielmicay@gmail.com> wrote:
> On Mon, 2017-05-08 at 09:52 +0200, Ingo Molnar wrote:
>>
>> ... it's just not usable in that form for a regular maintenance flow.
>>
>> So what would be more useful is to add a specific Sparse check that
>> only checks
>> KERNEL_DS, to add it as a regular (.config driven) build option and
>> make sure the
>> kernel build has zero warnings.
>>
>> From that point on we can declare that this kind of bug won't occur
>> anymore, if
>> the Sparse implementation of the check is correct.
>>
>> But there's a (big) problem with that development model: Sparse is not
>> part of the
>> kernel tree and adding a feature to it while making the kernel depend
>> on that
>> brand new feature is a logistical nightmare. The overhead is quite
>> similar to
>> adding new features to a compiler - it happens at a glacial pace and
>> is only done
>> for major features really, at considerable expense. I don't think this
>> is an
>> adequate model for 'extended syntax checking' of the kernel,
>> especially when it
>> comes to correctness that has such obvious security impact.
>>
>> Thanks,
>>
>>       Ingo
>
> There's the option of using GCC plugins now that the infrastructure was
> upstreamed from grsecurity. It can be used as part of the regular build
> process and as long as the analysis is pretty simple it shouldn't hurt
> compile time much.

Well, and that the situation may arise due to memory corruption, not
from poorly-matched set_fs() calls, which static analysis won't help
solve. We need to catch this bad kernel state because it is a very bad
state to run in.

-Kees
Thomas Garnier May 8, 2017, 7:51 p.m. UTC | #11
On Mon, May 8, 2017 at 8:26 AM, Kees Cook <keescook@chromium.org> wrote:
> On Mon, May 8, 2017 at 8:22 AM, Daniel Micay <danielmicay@gmail.com> wrote:
>> On Mon, 2017-05-08 at 09:52 +0200, Ingo Molnar wrote:
>>>
>>> ... it's just not usable in that form for a regular maintenance flow.
>>>
>>> So what would be more useful is to add a specific Sparse check that
>>> only checks
>>> KERNEL_DS, to add it as a regular (.config driven) build option and
>>> make sure the
>>> kernel build has zero warnings.
>>>
>>> From that point on we can declare that this kind of bug won't occur
>>> anymore, if
>>> the Sparse implementation of the check is correct.
>>>
>>> But there's a (big) problem with that development model: Sparse is not
>>> part of the
>>> kernel tree and adding a feature to it while making the kernel depend
>>> on that
>>> brand new feature is a logistical nightmare. The overhead is quite
>>> similar to
>>> adding new features to a compiler - it happens at a glacial pace and
>>> is only done
>>> for major features really, at considerable expense. I don't think this
>>> is an
>>> adequate model for 'extended syntax checking' of the kernel,
>>> especially when it
>>> comes to correctness that has such obvious security impact.
>>>
>>> Thanks,
>>>
>>>       Ingo
>>
>> There's the option of using GCC plugins now that the infrastructure was
>> upstreamed from grsecurity. It can be used as part of the regular build
>> process and as long as the analysis is pretty simple it shouldn't hurt
>> compile time much.
>
> Well, and that the situation may arise due to memory corruption, not
> from poorly-matched set_fs() calls, which static analysis won't help
> solve. We need to catch this bad kernel state because it is a very bad
> state to run in.

Of course, I agree with Kees points on this and previous emails.

A static analysis solution is hard to scale across functions and build
time can suffer. I don't think the coverage will be good enough to
consider this change and static analysis as similar.

>
> -Kees
>
> --
> Kees Cook
> Pixel Security
Al Viro May 8, 2017, 8:48 p.m. UTC | #12
On Mon, May 08, 2017 at 04:06:35PM +0200, Jann Horn wrote:

> I think Kees might be talking about
> https://bugs.chromium.org/p/project-zero/issues/detail?id=822, fixed in
> commit e6978e4bf181fb3b5f8cb6f71b4fe30fbf1b655c. The issue was that
> perf code that can run in pretty much any context called access_ok().

And that commit has *NOT* solved the problem.  perf_callchain_user()
can be called synchronously, without passing through that code.
Tracepoint shite...

That set_fs() should be done in get_perf_callchain(), just around the call of
perf_callchain_user().  Along with pagefault_disable(), actually.

BTW, that's a nice example demonstrating why doing that on the kernel
boundary is wrong.  Wider (in theory) area being "protected" => easier
to miss the ways not crossing its border.
Ingo Molnar May 9, 2017, 6:34 a.m. UTC | #13
* Kees Cook <keescook@chromium.org> wrote:

> On Mon, May 8, 2017 at 7:02 AM, Ingo Molnar <mingo@kernel.org> wrote:
> >
> > * Kees Cook <keescook@chromium.org> wrote:
> >
> >> > And yes, I realize that there were other such bugs and that such bugs might
> >> > occur in the future - but why not push the overhead of the security check to
> >> > the kernel build phase? I.e. I'm wondering how well we could do static
> >> > analysis during kernel build - would a limited mode of Sparse be good enough
> >> > for that? Or we could add a new static checker to tools/, built from first
> >> > principles and used primarily for extended syntactical checking.
> >>
> >> Static analysis is just not going to cover all cases. We've had vulnerabilities
> >> where interrupt handlers left KERNEL_DS set, for example. [...]
> >
> > Got any commit ID of that bug - was it because a function executed by the
> > interrupt handler leaked KERNEL_DS?
> 
> Ah, it was an exception handler, but the one I was thinking of was this:
> https://lwn.net/Articles/419141/

Ok, so that's CVE-2010-4258, where an oops with KERNEL_DS set was used to escalate 
privileges, due to the kernel's oops handler not cleaning up the KERNEL_DS. The 
exploit used another bug, a crash in a network protocol handler, to execute the 
oops handler with KERNEL_DS set.

The explanation of the exploit itself points out that it's a very interesting bug 
and I agree, it's not a general kernel bug but a bug in a very narrow code path 
(oops handling) that caused this, and I don't see how that example can be turned 
into a general example: it was a bug in oops handling to let the process continue 
execution (and perform the CLEARTID operation) *and* leak the address limit at 
KERNEL_DS.

By similar argument a bug in the runtime checking of the address limit may allow 
exploits. Consider the oops path cleanup a similarly sensitive code path as the 
address limit check.

To handle this category of exploits it would be enough to add a runtime check to 
the _oops handling code itself_ (to make sure we've set addr_limit back to USER_DS 
even if we crash in a KERNEL_DS code area), not to every system call!

That check would avoid that particular historic pattern, if combined with static 
analysis that ensured that KERNEL_DS is always set/restored correctly. (Which btw. 
I believe some of the regular static scans of the kernel are already doing today.)

Furthermore, to go back to your original argument:

> Static analysis is just not going to cover all cases.

it's not even true that a runtime check will 'cover all cases': for example a 
similar bug to CVE-2010-4258 could still be exploited:

 - Note that the actual put_user() was not prevented via the runtime check - the
   runtime check would run *after* the buggy put_user() was done. The runtime 
   check warns or panics after the fact, which might (or might not) be enough to 
   prevent the exploit.

 - Also note that a slightly different form of the bug would still be exploitable, 
   even with the runtime check: for example if the task-shutdown code can be made 
   to unconditionally set KERNEL_DS, but after the put_user(), then the runtime
   check would not 'cover all cases'.

So the argument for doing this runtime check after every system call is very 
dubious.

Thanks,

	Ingo
Ingo Molnar May 9, 2017, 6:45 a.m. UTC | #14
* Greg KH <greg@kroah.com> wrote:

> What about a simple coccinelle script to test for this type of thing?
> We write it once, add it to the in-kernel body of tests, and then 0-day
> runs it on all trees all the time.  That should catch this type of
> issue, like all of the other "bad programming bus" that the tool
> currently catches.

Yeah, that would work - but today most of our coccinelle scripts are still pretty 
verbose, and I think it's important to make this a different category of 
coccinelle script, which is .config driven where a loud warning yells at us.

I.e. force the 'zero warnings tolerated' model.

I also noticed that Coccinelle builds are pretty slow, so it would still make 
sense to have a performance oriented static checking facility that does not have 
the performance baggage of high level functional languages.

I.e. either integrate it into Sparse - or start a kernel integrated static 
analysis tooling project that would only follow control flow initially - which is 
what we need here I believe.

We only have ~115 code blocks in the kernel that set/restore KERNEL_DS, it would 
be a pity to add a runtime check to every system call ...

We could also add a runtime check to oops handling to make sure we don't leak 
KERNEL_DS through kernel crashes, to ease worries about CVE-2010-4258.

Thanks,

	Ingo
Ingo Molnar May 9, 2017, 6:56 a.m. UTC | #15
* Kees Cook <keescook@chromium.org> wrote:

> > There's the option of using GCC plugins now that the infrastructure was 
> > upstreamed from grsecurity. It can be used as part of the regular build 
> > process and as long as the analysis is pretty simple it shouldn't hurt compile 
> > time much.
> 
> Well, and that the situation may arise due to memory corruption, not from 
> poorly-matched set_fs() calls, which static analysis won't help solve. We need 
> to catch this bad kernel state because it is a very bad state to run in.

If memory corruption corrupted the task state into having addr_limit set to 
KERNEL_DS then there's already a fair chance that it's game over: it could also 
have set *uid to 0, or changed a sensitive PF_ flag, or a number of other 
things...

Furthermore, think about it: there's literally an infinite amount of corrupted 
task states that could be a security problem and that could be checked after every 
system call. Do we want to check every one of them?

Thanks,

	Ingo
Christoph Hellwig May 9, 2017, 8:56 a.m. UTC | #16
On Tue, May 09, 2017 at 08:45:22AM +0200, Ingo Molnar wrote:
> We only have ~115 code blocks in the kernel that set/restore KERNEL_DS, it would 
> be a pity to add a runtime check to every system call ...

I think we should simply strive to remove all of them that aren't
in core scheduler / arch code.  Basically evetyytime we do the

	oldfs = get_fs();
	set_fs(KERNEL_DS); 
	..
	set_fs(oldfs);

trick we're doing something wrong, and there should always be better
ways to archive it.  E.g. using iov_iter with a ITER_KVEC type
consistently would already remove most of them.
Greg KH May 9, 2017, 11:10 a.m. UTC | #17
On Tue, May 09, 2017 at 08:56:19AM +0200, Ingo Molnar wrote:
> 
> * Kees Cook <keescook@chromium.org> wrote:
> 
> > > There's the option of using GCC plugins now that the infrastructure was 
> > > upstreamed from grsecurity. It can be used as part of the regular build 
> > > process and as long as the analysis is pretty simple it shouldn't hurt compile 
> > > time much.
> > 
> > Well, and that the situation may arise due to memory corruption, not from 
> > poorly-matched set_fs() calls, which static analysis won't help solve. We need 
> > to catch this bad kernel state because it is a very bad state to run in.
> 
> If memory corruption corrupted the task state into having addr_limit set to 
> KERNEL_DS then there's already a fair chance that it's game over: it could also 
> have set *uid to 0, or changed a sensitive PF_ flag, or a number of other 
> things...
> 
> Furthermore, think about it: there's literally an infinite amount of corrupted 
> task states that could be a security problem and that could be checked after every 
> system call. Do we want to check every one of them?

Ok, I'm all for not checking lots of stuff all the time, just to protect
from crappy drivers that.  Especially as we _can_ audit and run checks
on the source code for them in the kernel tree.

But, and here's the problem, outside of the desktop/enterprise world,
there are a ton of out-of-tree code that is crap.  The number of
security/bug fixes and kernel crashes for out-of-tree code in systems
like Android phones is just so high it's laughable.

When you have a device that is running 3.2 million lines of kernel code,
yet the diffstat of the tree compared to mainline adds 3 million lines
of code, there is bound to be a ton of issues/problems there.

So this is an entirely different thing we need to try to protect
ourselves from.  A long time ago I laughed when I saw that Microsoft had
to do lots of "hardening" of their kernel to protect themselves from
crappy drivers, as I knew we didn't have to do that because we had the
source for them and could fix the root issues.  But that has changed and
now we don't all have that option.  That code is out-of-tree because the
vendor doesn't care, and doesn't want to take any time at all to do
anything resembling a real code review[1].

So, how about options like the ones being proposed here, go behind a new
config option:
	CONFIG_PROTECT_FROM_CRAPPY_DRIVERS
that device owners can enable if they do not trust their vendor-provided
code (hint, I sure don't.)  That way the "normal" path that all of us
are used to running will be fine, but if you want to take the speed hit
to try to protect yourself, then you can do that as well.

Anyway, just an idea...

thanks,

greg k-h

[1] I am working really hard with lots of vendors to try to fix their
    broken development model, but that is going to take years to resolve
    as their device pipelines are years long, and changing their
    mindsets takes a long time...
Andy Lutomirski May 9, 2017, 1 p.m. UTC | #18
On Tue, May 9, 2017 at 1:56 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Tue, May 09, 2017 at 08:45:22AM +0200, Ingo Molnar wrote:
>> We only have ~115 code blocks in the kernel that set/restore KERNEL_DS, it would
>> be a pity to add a runtime check to every system call ...
>
> I think we should simply strive to remove all of them that aren't
> in core scheduler / arch code.  Basically evetyytime we do the
>
>         oldfs = get_fs();
>         set_fs(KERNEL_DS);
>         ..
>         set_fs(oldfs);
>
> trick we're doing something wrong, and there should always be better
> ways to archive it.  E.g. using iov_iter with a ITER_KVEC type
> consistently would already remove most of them.

How about trying to remove all of them?  If we could actually get rid
of all of them, we could drop the arch support, and we'd get faster,
simpler, shorter uaccess code throughout the kernel.

The ones in kernel/compat.c are generally garbage.  They should be
using compat_alloc_user_space().  Ditto for kernel/power/user.c.

flush_module_icache() is a potentially silly arch thing.  Does the
code in kernel/module.c that uses set_fs() actually work?

kernel/signal.c's set_fs() is laziness.

__probe_kernel_read() and __probe_kernel_write() use set_fs(), but
that usage only matters on sane arches* like s390x.  We should
arguably have a set_uaccess_address_space() or similar for this
purpose that's a nop on normal arches like x86.

fs/splice.c has some, ahem, interesting uses that have been the source
of nasty exploits in the past.  Converting them to use iov_iter
properly would be really, really nice.  Christoph, I don't suppose
you'd like to do that?

The others seem to mostly be fixable, but I haven't looked that closely.

Overall, I suspect that a big part of why mitigations like the one
being discussed in this thread were developed is because addr_limit
used to be on the stack, making it (along with restart_block) a really
nice target.  This is fixed now on x86, arm64, and s390x, I believe,
and other arches can easily opt in to the fix.

* I'm strongly in favor of arches that have totally separate user and
kernel address spaces.  Sadly, the most common arches don't do this.
Christoph Hellwig May 9, 2017, 1:02 p.m. UTC | #19
On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote:
> fs/splice.c has some, ahem, interesting uses that have been the source
> of nasty exploits in the past.  Converting them to use iov_iter
> properly would be really, really nice.  Christoph, I don't suppose
> you'd like to do that?

I can take care of all the fs code including this one.
Thomas Garnier May 9, 2017, 2:29 p.m. UTC | #20
On Tue, May 9, 2017 at 4:10 AM, Greg KH <greg@kroah.com> wrote:
> On Tue, May 09, 2017 at 08:56:19AM +0200, Ingo Molnar wrote:
>>
>> * Kees Cook <keescook@chromium.org> wrote:
>>
>> > > There's the option of using GCC plugins now that the infrastructure was
>> > > upstreamed from grsecurity. It can be used as part of the regular build
>> > > process and as long as the analysis is pretty simple it shouldn't hurt compile
>> > > time much.
>> >
>> > Well, and that the situation may arise due to memory corruption, not from
>> > poorly-matched set_fs() calls, which static analysis won't help solve. We need
>> > to catch this bad kernel state because it is a very bad state to run in.
>>
>> If memory corruption corrupted the task state into having addr_limit set to
>> KERNEL_DS then there's already a fair chance that it's game over: it could also
>> have set *uid to 0, or changed a sensitive PF_ flag, or a number of other
>> things...
>>
>> Furthermore, think about it: there's literally an infinite amount of corrupted
>> task states that could be a security problem and that could be checked after every
>> system call. Do we want to check every one of them?
>
> Ok, I'm all for not checking lots of stuff all the time, just to protect
> from crappy drivers that.  Especially as we _can_ audit and run checks
> on the source code for them in the kernel tree.
>
> But, and here's the problem, outside of the desktop/enterprise world,
> there are a ton of out-of-tree code that is crap.  The number of
> security/bug fixes and kernel crashes for out-of-tree code in systems
> like Android phones is just so high it's laughable.
>
> When you have a device that is running 3.2 million lines of kernel code,
> yet the diffstat of the tree compared to mainline adds 3 million lines
> of code, there is bound to be a ton of issues/problems there.
>
> So this is an entirely different thing we need to try to protect
> ourselves from.  A long time ago I laughed when I saw that Microsoft had
> to do lots of "hardening" of their kernel to protect themselves from
> crappy drivers, as I knew we didn't have to do that because we had the
> source for them and could fix the root issues.  But that has changed and
> now we don't all have that option.  That code is out-of-tree because the
> vendor doesn't care, and doesn't want to take any time at all to do
> anything resembling a real code review[1].

That's a big part of why I thought would be useful. I am less worried
about edge cases upstream right now than forks with custom codes not
using set_fs correctly.

>
> So, how about options like the ones being proposed here, go behind a new
> config option:
>         CONFIG_PROTECT_FROM_CRAPPY_DRIVERS
> that device owners can enable if they do not trust their vendor-provided
> code (hint, I sure don't.)  That way the "normal" path that all of us
> are used to running will be fine, but if you want to take the speed hit
> to try to protect yourself, then you can do that as well.

Maybe another name but why not.

>
> Anyway, just an idea...
>
> thanks,
>
> greg k-h
>
> [1] I am working really hard with lots of vendors to try to fix their
>     broken development model, but that is going to take years to resolve
>     as their device pipelines are years long, and changing their
>     mindsets takes a long time...
Christoph Hellwig May 9, 2017, 4:03 p.m. UTC | #21
On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote:
> On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote:
> > fs/splice.c has some, ahem, interesting uses that have been the source
> > of nasty exploits in the past.  Converting them to use iov_iter
> > properly would be really, really nice.  Christoph, I don't suppose
> > you'd like to do that?
> 
> I can take care of all the fs code including this one.

I spent the afternoon hacking up where I'd like this to head.  It's
completely untested as of now:

	http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination
Brian Gerst May 9, 2017, 4:05 p.m. UTC | #22
On Tue, May 9, 2017 at 9:00 AM, Andy Lutomirski <luto@kernel.org> wrote:
> On Tue, May 9, 2017 at 1:56 AM, Christoph Hellwig <hch@infradead.org> wrote:
>> On Tue, May 09, 2017 at 08:45:22AM +0200, Ingo Molnar wrote:
>>> We only have ~115 code blocks in the kernel that set/restore KERNEL_DS, it would
>>> be a pity to add a runtime check to every system call ...
>>
>> I think we should simply strive to remove all of them that aren't
>> in core scheduler / arch code.  Basically evetyytime we do the
>>
>>         oldfs = get_fs();
>>         set_fs(KERNEL_DS);
>>         ..
>>         set_fs(oldfs);
>>
>> trick we're doing something wrong, and there should always be better
>> ways to archive it.  E.g. using iov_iter with a ITER_KVEC type
>> consistently would already remove most of them.
>
> How about trying to remove all of them?  If we could actually get rid
> of all of them, we could drop the arch support, and we'd get faster,
> simpler, shorter uaccess code throughout the kernel.
>
> The ones in kernel/compat.c are generally garbage.  They should be
> using compat_alloc_user_space().  Ditto for kernel/power/user.c.

compat_alloc_user_space() is a hack that should go away too.  It ends
up copying the data three times.

The more efficient solution to this is to have a core syscall function
that only accesses kernel memory, and then have two front-end
functions (native and compat) that do the actual reads and writes to
userspace, with conversion in the compat case.

--
Brian Gerst
Kees Cook May 9, 2017, 4:30 p.m. UTC | #23
On Mon, May 8, 2017 at 11:56 PM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Kees Cook <keescook@chromium.org> wrote:
>
>> > There's the option of using GCC plugins now that the infrastructure was
>> > upstreamed from grsecurity. It can be used as part of the regular build
>> > process and as long as the analysis is pretty simple it shouldn't hurt compile
>> > time much.
>>
>> Well, and that the situation may arise due to memory corruption, not from
>> poorly-matched set_fs() calls, which static analysis won't help solve. We need
>> to catch this bad kernel state because it is a very bad state to run in.

[attempting some thread-merging]

> Ok, so that's CVE-2010-4258, where an oops with KERNEL_DS set was used to escalate
> privileges, due to the kernel's oops handler not cleaning up the KERNEL_DS. The
> exploit used another bug, a crash in a network protocol handler, to execute the
> oops handler with KERNEL_DS set.

Right, I didn't mean to suggest that vulnerability would be fixed by
this solution. I was trying to show how there can be some pretty
complex interaction with exceptions/interrupts/etc that would make
pure static analysis still miss things.

> If memory corruption corrupted the task state into having addr_limit set to
> KERNEL_DS then there's already a fair chance that it's game over: it could also
> have set *uid to 0, or changed a sensitive PF_ flag, or a number of other
> things...
>
> Furthermore, think about it: there's literally an infinite amount of corrupted
> task states that could be a security problem and that could be checked after every
> system call. Do we want to check every one of them?

Right, but this "slippery slope" argument isn't the best way to reject
security changes. Let me take a step back and describe the threat, and
where we should likely spend time:

The primary threat with addr_limit getting changed is that a
narrowly-scoped attack (traditionally stack exhaustion or
adjacent-stack large-index writes) could be leveraged into opening the
entire kernel to writes (by allowing all syscalls with a
copy_to_user() call to suddenly be able to write to kernel memory).
So, really, the flaw is having addr_limit at all. Removing set_fs()
should, I think, allow this to become a const (or at least should get
us a lot closer).

The main path to corrupting addr_limit has been via stack corruption.
On architectures with CONFIG_THREAD_INFO_IN_TASK, this risk is greatly
reduced already, but it's not universally available yet. (And as long
as we're talking about stack attacks, CONFIG_VMAP_STACK makes
cross-stack overflows go away, and cross-stack indexing harder, but
that's not really about addr_limit since currently nothing with
VMAP_STACK doesn't already have THREAD_INFO_IN_TASK.)

So, left with a still exploitable target in memory that allows such an
expansion of attack method, I still think it's worth keeping this
patch series, but if we can drop set_fs() I could probably be
convinced the benefit of the series doesn't exceed the cost on
THREAD_INFO_IN_TASK-architectures (x86, arm64, s390). But that means
at least currently keeping it on arm, for example. If we can make
addr_limit const, well, we don't need the series at all.

-Kees
Kees Cook May 9, 2017, 4:50 p.m. UTC | #24
On Tue, May 9, 2017 at 9:03 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote:
>> On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote:
>> > fs/splice.c has some, ahem, interesting uses that have been the source
>> > of nasty exploits in the past.  Converting them to use iov_iter
>> > properly would be really, really nice.  Christoph, I don't suppose
>> > you'd like to do that?
>>
>> I can take care of all the fs code including this one.
>
> I spent the afternoon hacking up where I'd like this to head.  It's
> completely untested as of now:
>
>         http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination

Ooooh yes! Nice work.

I love this:
http://git.infradead.org/users/hch/vfs.git/commitdiff/51e83f50f824ca23f5584c172138e6b7c2ff786d
but I wonder what it'll cause out-of-tree code to do. I mean, I'd
rather nothing out-of-tree be calling these, but I'd hate 3rd party
hacks even more.

http://git.infradead.org/users/hch/vfs.git/commitdiff/018e0e9030777121fe87e89d43066691e7366587
This accidentally(?) removes the kernel-doc comments.

http://git.infradead.org/users/hch/vfs.git/commitdiff/78b62c730254fc39fa888cdbdca08fde6e09a798
Could this be made defensive? (Return 0 if ret wraps, for example?) I
see what the comment says, but not everyone will read that. :(

http://git.infradead.org/users/hch/vfs.git/commitdiff/a106276ca0294be054bc89ce97219933fe543df1
Perhaps unconditionally set USER_DS on exit instead of retaining
whatever was there?

-Kees
Andy Lutomirski May 9, 2017, 10:52 p.m. UTC | #25
On Tue, May 9, 2017 at 9:50 AM, Kees Cook <keescook@chromium.org> wrote:
> On Tue, May 9, 2017 at 9:03 AM, Christoph Hellwig <hch@infradead.org> wrote:
>> On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote:
>>> On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote:
>>> > fs/splice.c has some, ahem, interesting uses that have been the source
>>> > of nasty exploits in the past.  Converting them to use iov_iter
>>> > properly would be really, really nice.  Christoph, I don't suppose
>>> > you'd like to do that?
>>>
>>> I can take care of all the fs code including this one.
>>
>> I spent the afternoon hacking up where I'd like this to head.  It's
>> completely untested as of now:
>>
>>         http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination
>
> Ooooh yes! Nice work.
>
> I love this:
> http://git.infradead.org/users/hch/vfs.git/commitdiff/51e83f50f824ca23f5584c172138e6b7c2ff786d
> but I wonder what it'll cause out-of-tree code to do. I mean, I'd
> rather nothing out-of-tree be calling these, but I'd hate 3rd party
> hacks even more.
>
> http://git.infradead.org/users/hch/vfs.git/commitdiff/018e0e9030777121fe87e89d43066691e7366587
> This accidentally(?) removes the kernel-doc comments.
>
> http://git.infradead.org/users/hch/vfs.git/commitdiff/78b62c730254fc39fa888cdbdca08fde6e09a798
> Could this be made defensive? (Return 0 if ret wraps, for example?) I
> see what the comment says, but not everyone will read that. :(
>
> http://git.infradead.org/users/hch/vfs.git/commitdiff/a106276ca0294be054bc89ce97219933fe543df1
> Perhaps unconditionally set USER_DS on exit instead of retaining
> whatever was there?

I don't like silent fixups.  If we want to do this, we should BUG or
at least WARN, not just change the addr limit.  But I'm also not
convinced it's indicative of an actual bug here.

--Andy
Kees Cook May 9, 2017, 11:31 p.m. UTC | #26
On Tue, May 9, 2017 at 3:52 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Tue, May 9, 2017 at 9:50 AM, Kees Cook <keescook@chromium.org> wrote:
>> On Tue, May 9, 2017 at 9:03 AM, Christoph Hellwig <hch@infradead.org> wrote:
>>> On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote:
>>>> On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote:
>>>> > fs/splice.c has some, ahem, interesting uses that have been the source
>>>> > of nasty exploits in the past.  Converting them to use iov_iter
>>>> > properly would be really, really nice.  Christoph, I don't suppose
>>>> > you'd like to do that?
>>>>
>>>> I can take care of all the fs code including this one.
>>>
>>> I spent the afternoon hacking up where I'd like this to head.  It's
>>> completely untested as of now:
>>>
>>>         http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination
>>
>> Ooooh yes! Nice work.
>>
>> I love this:
>> http://git.infradead.org/users/hch/vfs.git/commitdiff/51e83f50f824ca23f5584c172138e6b7c2ff786d
>> but I wonder what it'll cause out-of-tree code to do. I mean, I'd
>> rather nothing out-of-tree be calling these, but I'd hate 3rd party
>> hacks even more.
>>
>> http://git.infradead.org/users/hch/vfs.git/commitdiff/018e0e9030777121fe87e89d43066691e7366587
>> This accidentally(?) removes the kernel-doc comments.
>>
>> http://git.infradead.org/users/hch/vfs.git/commitdiff/78b62c730254fc39fa888cdbdca08fde6e09a798
>> Could this be made defensive? (Return 0 if ret wraps, for example?) I
>> see what the comment says, but not everyone will read that. :(
>>
>> http://git.infradead.org/users/hch/vfs.git/commitdiff/a106276ca0294be054bc89ce97219933fe543df1
>> Perhaps unconditionally set USER_DS on exit instead of retaining
>> whatever was there?
>
> I don't like silent fixups.  If we want to do this, we should BUG or
> at least WARN, not just change the addr limit.  But I'm also not
> convinced it's indicative of an actual bug here.

Nothing should enter that function with KERNEL_DS set, right?

BUG_ON(get_fs() != USER_DS);
set_fs(KERNEL_DS);
...
set_fs(USER_DS);

-Kees
Andy Lutomirski May 10, 2017, 1:59 a.m. UTC | #27
On Tue, May 9, 2017 at 4:31 PM, Kees Cook <keescook@chromium.org> wrote:
> On Tue, May 9, 2017 at 3:52 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Tue, May 9, 2017 at 9:50 AM, Kees Cook <keescook@chromium.org> wrote:
>>> On Tue, May 9, 2017 at 9:03 AM, Christoph Hellwig <hch@infradead.org> wrote:
>>>> On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote:
>>>>> On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote:
>>>>> > fs/splice.c has some, ahem, interesting uses that have been the source
>>>>> > of nasty exploits in the past.  Converting them to use iov_iter
>>>>> > properly would be really, really nice.  Christoph, I don't suppose
>>>>> > you'd like to do that?
>>>>>
>>>>> I can take care of all the fs code including this one.
>>>>
>>>> I spent the afternoon hacking up where I'd like this to head.  It's
>>>> completely untested as of now:
>>>>
>>>>         http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination
>>>
>>> Ooooh yes! Nice work.
>>>
>>> I love this:
>>> http://git.infradead.org/users/hch/vfs.git/commitdiff/51e83f50f824ca23f5584c172138e6b7c2ff786d
>>> but I wonder what it'll cause out-of-tree code to do. I mean, I'd
>>> rather nothing out-of-tree be calling these, but I'd hate 3rd party
>>> hacks even more.
>>>
>>> http://git.infradead.org/users/hch/vfs.git/commitdiff/018e0e9030777121fe87e89d43066691e7366587
>>> This accidentally(?) removes the kernel-doc comments.
>>>
>>> http://git.infradead.org/users/hch/vfs.git/commitdiff/78b62c730254fc39fa888cdbdca08fde6e09a798
>>> Could this be made defensive? (Return 0 if ret wraps, for example?) I
>>> see what the comment says, but not everyone will read that. :(
>>>
>>> http://git.infradead.org/users/hch/vfs.git/commitdiff/a106276ca0294be054bc89ce97219933fe543df1
>>> Perhaps unconditionally set USER_DS on exit instead of retaining
>>> whatever was there?
>>
>> I don't like silent fixups.  If we want to do this, we should BUG or
>> at least WARN, not just change the addr limit.  But I'm also not
>> convinced it's indicative of an actual bug here.
>
> Nothing should enter that function with KERNEL_DS set, right?
>
> BUG_ON(get_fs() != USER_DS);
> set_fs(KERNEL_DS);
> ...
> set_fs(USER_DS);
>

It's not immediately obvious to me that this shouldn't happen.  Why
not do it the way Christoph did and then, once the rest of set_fs() is
tamed, consider this change (or trying to mass-convert file_operations
implementations to get rid of it entirely)?

--Andy
Al Viro May 10, 2017, 2:11 a.m. UTC | #28
On Tue, May 09, 2017 at 09:03:22AM -0700, Christoph Hellwig wrote:
> On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote:
> > On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote:
> > > fs/splice.c has some, ahem, interesting uses that have been the source
> > > of nasty exploits in the past.  Converting them to use iov_iter
> > > properly would be really, really nice.  Christoph, I don't suppose
> > > you'd like to do that?
> > 
> > I can take care of all the fs code including this one.

Oh?

> I spent the afternoon hacking up where I'd like this to head.  It's
> completely untested as of now:
> 
> 	http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination

And just what happens to driver that has no ->read_iter()?  Unless I'm
seriously misreading that, NAK with extreme prejudice.
Al Viro May 10, 2017, 2:45 a.m. UTC | #29
On Wed, May 10, 2017 at 03:11:18AM +0100, Al Viro wrote:
> On Tue, May 09, 2017 at 09:03:22AM -0700, Christoph Hellwig wrote:
> > On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote:
> > > On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote:
> > > > fs/splice.c has some, ahem, interesting uses that have been the source
> > > > of nasty exploits in the past.  Converting them to use iov_iter
> > > > properly would be really, really nice.  Christoph, I don't suppose
> > > > you'd like to do that?
> > > 
> > > I can take care of all the fs code including this one.
> 
> Oh?
> 
> > I spent the afternoon hacking up where I'd like this to head.  It's
> > completely untested as of now:
> > 
> > 	http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination
> 
> And just what happens to driver that has no ->read_iter()?  Unless I'm
> seriously misreading that, NAK with extreme prejudice.

FWIW, some parts of that queue are obviously sane; it's the conversions of
kernel_write() and friends to ->read_iter/->write_iter() that are non-starters.
That stuff is used in too many situations; we can't guarantee that all of
them will be for files that have those.

As for default_file_splice_read(), I seriously suspect that with your change
we could as well just make it return -EINVAL and be done with that; places
that have ->read_iter() tend to have explicit ->splice_read() and it looks
like the ones that do not should simply use generic_file_read_iter().
I hadn't checked that, but there's not a lot of those:

arch/s390/hypfs/inode.c:437:    .read_iter      = hypfs_read_iter,
drivers/char/mem.c:798: .read_iter      = read_iter_null,
drivers/char/mem.c:813: .read_iter      = read_iter_zero,
drivers/char/mem.c:824: .read_iter      = read_iter_zero,
drivers/char/raw.c:286: .read_iter      = blkdev_read_iter,
drivers/net/tap.c:1134: .read_iter      = tap_read_iter,
drivers/net/tun.c:2423: .read_iter  = tun_chr_read_iter,
drivers/usb/gadget/function/f_fs.c:1255:        .read_iter =    ffs_epfile_read_iter,
drivers/usb/gadget/legacy/inode.c:703:  .read_iter =    ep_read_iter,
drivers/vhost/net.c:1252:       .read_iter      = vhost_net_chr_read_iter,
fs/9p/vfs_file.c:641:   .read_iter = generic_file_read_iter,
fs/9p/vfs_file.c:652:   .read_iter = generic_file_read_iter,
fs/9p/vfs_file.c:664:   .read_iter = v9fs_file_read_iter,
fs/9p/vfs_file.c:675:   .read_iter = v9fs_file_read_iter,
fs/9p/vfs_file.c:687:   .read_iter = v9fs_mmap_file_read_iter,
fs/9p/vfs_file.c:698:   .read_iter = v9fs_mmap_file_read_iter,
fs/fuse/cuse.c:180:     .read_iter              = cuse_read_iter,
fs/fuse/file.c:3015:    .read_iter      = fuse_direct_read_iter,
fs/hugetlbfs/inode.c:980:       .read_iter              = hugetlbfs_read_iter,
fs/ncpfs/file.c:248:    .read_iter      = ncp_file_read_iter,
fs/orangefs/file.c:742: .read_iter      = orangefs_file_read_iter,
fs/pipe.c:1011: .read_iter      = pipe_read,
sound/core/pcm_native.c:3696:           .read_iter =            snd_pcm_readv,

is the full list and I'm fairly certain that most of them will work with
generic_file_splice_read() just fine.  drivers/char definitely will, so
will ncpfs/orangefs/hugetlbfs/most of 9p ones (two of the latter might
need some care in p9_client_read(), but that should be doable easily enough).
pipe is irrelevant (->splice_read() won't be called for those).  fuse ones
should be doable, but that might take a bit more infrastructure work in
lib/iov_iter.c.  vhost, gadgetfs, tun/tap - no idea at the moment.
Al Viro May 10, 2017, 3:12 a.m. UTC | #30
On Wed, May 10, 2017 at 03:45:24AM +0100, Al Viro wrote:

> FWIW, some parts of that queue are obviously sane; it's the conversions of
> kernel_write() and friends to ->read_iter/->write_iter() that are non-starters.

Egads...  OK, I *have* misread what you are doing there.  Your vfs_iter_read()
works for files sans ->read_iter().  For strange values of "works" - you
hardwire "it's either iovec or kvec iterator" into its calling conventions,
which is a trouble waiting to happen.

What's the point?  What's wrong with having kernel_read()/kernel_readv()/etc.?
You still have set_fs() in there; doing that one level up in call chain would
be just fine...  IDGI.

Broken commit: "net: don't play with address limits in kernel_recvmsg".
It would be OK if it was only about data.  Unfortunately, that's not
true in one case: svc_udp_recvfrom() wants ->msg_control.

Another delicate place: you can't assume that write() always advances
file position by its (positive) return value.  btrfs stuff is sensitive
to that.

ashmem probably _is_ OK with demanding ->read_iter(), but I'm not sure
about blind asma->file->f_pos += ret.  That's begging for races.  Actually,
scratch that - it *is* racy.
Al Viro May 10, 2017, 3:21 a.m. UTC | #31
On Wed, May 10, 2017 at 04:12:54AM +0100, Al Viro wrote:

> Broken commit: "net: don't play with address limits in kernel_recvmsg".
> It would be OK if it was only about data.  Unfortunately, that's not
> true in one case: svc_udp_recvfrom() wants ->msg_control.
> 
> Another delicate place: you can't assume that write() always advances
> file position by its (positive) return value.  btrfs stuff is sensitive
> to that.
> 
> ashmem probably _is_ OK with demanding ->read_iter(), but I'm not sure
> about blind asma->file->f_pos += ret.  That's begging for races.  Actually,
> scratch that - it *is* racy.

kvec_length(): please, don't.  I would rather have the last remaining
iov_length() gone...   What do you need it for, anyway?  You have only
two users and both have the count passed to them (as *count and *cnt resp.)
Al Viro May 10, 2017, 3:39 a.m. UTC | #32
On Wed, May 10, 2017 at 04:21:37AM +0100, Al Viro wrote:
> On Wed, May 10, 2017 at 04:12:54AM +0100, Al Viro wrote:
> 
> > Broken commit: "net: don't play with address limits in kernel_recvmsg".
> > It would be OK if it was only about data.  Unfortunately, that's not
> > true in one case: svc_udp_recvfrom() wants ->msg_control.
> > 
> > Another delicate place: you can't assume that write() always advances
> > file position by its (positive) return value.  btrfs stuff is sensitive
> > to that.
> > 
> > ashmem probably _is_ OK with demanding ->read_iter(), but I'm not sure
> > about blind asma->file->f_pos += ret.  That's begging for races.  Actually,
> > scratch that - it *is* racy.
> 
> kvec_length(): please, don't.  I would rather have the last remaining
> iov_length() gone...   What do you need it for, anyway?  You have only
> two users and both have the count passed to them (as *count and *cnt resp.)

fcntl stuff: I've decided not to put something similar into work.compat
since I couldn't decide what to do with compat stuff - word-by-word copy
from userland converting to struct flock + conversion to posix_lock +
actual work + conversion to flock + word-by-word copy to userland...  Smells
like we might be better off with compat_flock_to_posix_lock() et.al.
I'm still not sure; played a bit one way and another and dediced to drop
it for now.  Hell knows...
Christoph Hellwig May 10, 2017, 6:46 a.m. UTC | #33
On Tue, May 09, 2017 at 09:50:32AM -0700, Kees Cook wrote:
> http://git.infradead.org/users/hch/vfs.git/commitdiff/018e0e9030777121fe87e89d43066691e7366587
> This accidentally(?) removes the kernel-doc comments.

That was sort of intentional, the kerneldoc boilerplate adds very little
value for method instances.

> http://git.infradead.org/users/hch/vfs.git/commitdiff/78b62c730254fc39fa888cdbdca08fde6e09a798
> Could this be made defensive? (Return 0 if ret wraps, for example?) I
> see what the comment says, but not everyone will read that. :(

It's a straight copy of iov_length, and I'd rather keep it that way.
Christoph Hellwig May 10, 2017, 6:49 a.m. UTC | #34
On Wed, May 10, 2017 at 03:45:24AM +0100, Al Viro wrote:
> FWIW, some parts of that queue are obviously sane; it's the conversions of
> kernel_write() and friends to ->read_iter/->write_iter() that are non-starters.

And that part is the main point!

> That stuff is used in too many situations; we can't guarantee that all of
> them will be for files that have those.

That's why this series handles ITER_KVEC for this case, which is all
that's really needed for kernel_read/write.  If you insiste the bvec
and pipe cases are handled as well that couod be added fairly easily.

> As for default_file_splice_read(), I seriously suspect that with your change
> we could as well just make it return -EINVAL and be done with that; places
> that have ->read_iter() tend to have explicit ->splice_read() and it looks
> like the ones that do not should simply use generic_file_read_iter().
> I hadn't checked that, but there's not a lot of those:

Making ->splice_read to default to the ->read_iter based implementation
and returning -EINVAL if neither that nor an explicit ->splice_read
is provided is useful, but wasn't the aim of this series.  Similar
on the write side.
Christoph Hellwig May 10, 2017, 6:53 a.m. UTC | #35
On Wed, May 10, 2017 at 04:12:54AM +0100, Al Viro wrote:
> What's the point?  What's wrong with having kernel_read()/kernel_readv()/etc.?
> You still have set_fs() in there; doing that one level up in call chain would
> be just fine...  IDGI.

The problem is that they modify the address limit, which the whole
subthread here wants to get rid of.

> Broken commit: "net: don't play with address limits in kernel_recvmsg".
> It would be OK if it was only about data.  Unfortunately, that's not
> true in one case: svc_udp_recvfrom() wants ->msg_control.

Dropped, but we'll need to fix that eventually.

> Another delicate place: you can't assume that write() always advances
> file position by its (positive) return value.  btrfs stuff is sensitive
> to that.

If we don't want to assume that we need to pass pointer to pos to
kernel_read/write.  Which might be a good idea in general.

> ashmem probably _is_ OK with demanding ->read_iter(), but I'm not sure
> about blind asma->file->f_pos += ret.  That's begging for races.  Actually,
> scratch that - it *is* racy.

I think the proper fix is to not even bother to maintain f_pos of the
backing file, as we don't ever use it - all reads from it pass in
an explicit position anyway.
Christoph Hellwig May 10, 2017, 6:54 a.m. UTC | #36
On Wed, May 10, 2017 at 04:39:12AM +0100, Al Viro wrote:
> fcntl stuff: I've decided not to put something similar into work.compat
> since I couldn't decide what to do with compat stuff - word-by-word copy
> from userland converting to struct flock + conversion to posix_lock +
> actual work + conversion to flock + word-by-word copy to userland...  Smells
> like we might be better off with compat_flock_to_posix_lock() et.al.
> I'm still not sure; played a bit one way and another and dediced to drop
> it for now.  Hell knows...

My version already is an improvement in lines of code alone.  Between
that and stopping to mess with the address limit I think it's a clear
winner.  But it's pretty independent of the rest, and I'll just run
it through Jeff and Bruce and ask them what they think.
Christoph Hellwig May 10, 2017, 7:15 a.m. UTC | #37
On Tue, May 09, 2017 at 04:31:00PM -0700, Kees Cook wrote:
> > I don't like silent fixups.  If we want to do this, we should BUG or
> > at least WARN, not just change the addr limit.  But I'm also not
> > convinced it's indicative of an actual bug here.
> 
> Nothing should enter that function with KERNEL_DS set, right?

It might very well do.  Various drivers or the networking code mess
with the address limits for fairly broad sections of code.
Al Viro May 10, 2017, 7:27 a.m. UTC | #38
On Tue, May 09, 2017 at 11:53:01PM -0700, Christoph Hellwig wrote:
> On Wed, May 10, 2017 at 04:12:54AM +0100, Al Viro wrote:
> > What's the point?  What's wrong with having kernel_read()/kernel_readv()/etc.?
> > You still have set_fs() in there; doing that one level up in call chain would
> > be just fine...  IDGI.
> 
> The problem is that they modify the address limit, which the whole
> subthread here wants to get rid of.

And you *still* do the same.  Christoph, this is ridiculous - the worst
part of the area is not a couple of functions in fs/read_write.c, it's
a fucking lot of ->read() and ->write() instances in shitty driver code,
pardon the redundance.  And _that_ is still done under set_fs(KERNEL_DS).

Claiming that set_fs() done one function deeper in callchain (both in
fs/read_write.c) is somehow better because it reduces the amount of code
under that thing...  Get real, please - helpers that encapsulate those
set_fs() pairs (a-la kernel_read(), etc.) absolutely make sense and
converting their open-coded instances to calls of those helpers is clearly
a good thing.  However, we are not
	* getting rid of low-quality code run under KERNEL_DS
	* gettind rid of set_fs() itself
	* getting a generic kernel_read() variant that would really take
an iov_iter.

That's what I'm objecting to.  Centralized kernel_readv() et.al. - sure,
and fs/read_write.c is the right place for those.  No arguments here.
Conversion to those - absolutely; drivers have no fucking business touching
set_fs() at all.  But your primitives are trouble waiting to happen.
Let them take kvec arrays.  And let them, in case when there's no
->read_iter()/->write_iter(), do set_fs().  Statically, without this
if (iter->type & ITER_KVEC) ... stuff.

> > Another delicate place: you can't assume that write() always advances
> > file position by its (positive) return value.  btrfs stuff is sensitive
> > to that.
> 
> If we don't want to assume that we need to pass pointer to pos to
> kernel_read/write.  Which might be a good idea in general.

Yes.

> > ashmem probably _is_ OK with demanding ->read_iter(), but I'm not sure
> > about blind asma->file->f_pos += ret.  That's begging for races.  Actually,
> > scratch that - it *is* racy.
> 
> I think the proper fix is to not even bother to maintain f_pos of the
> backing file, as we don't ever use it - all reads from it pass in
> an explicit position anyway.

vfs_llseek() used by ashmem_llseek()...
Arnd Bergmann May 10, 2017, 7:28 a.m. UTC | #39
On Tue, May 9, 2017 at 6:03 PM, Christoph Hellwig <hch@infradead.org> wrote:
> On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote:
>> On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote:
>> > fs/splice.c has some, ahem, interesting uses that have been the source
>> > of nasty exploits in the past.  Converting them to use iov_iter
>> > properly would be really, really nice.  Christoph, I don't suppose
>> > you'd like to do that?
>>
>> I can take care of all the fs code including this one.
>
> I spent the afternoon hacking up where I'd like this to head.  It's
> completely untested as of now:
>
>         http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination

My older time64_t syscall series has the side-effect of doing something
like this to the time-related compat handlers in kernel/compat.c. If nobody
else has started looking at removing set_fs from those, I can extract
the relevant parts from my series.

      Arnd
Christoph Hellwig May 10, 2017, 7:35 a.m. UTC | #40
On Wed, May 10, 2017 at 08:27:47AM +0100, Al Viro wrote:
> And you *still* do the same.  Christoph, this is ridiculous - the worst
> part of the area is not a couple of functions in fs/read_write.c, it's
> a fucking lot of ->read() and ->write() instances in shitty driver code,
> pardon the redundance.  And _that_ is still done under set_fs(KERNEL_DS).

For now yes.  But it provides a sane API on top, and then we can
start moving the drivers that matter to the iter variants and drop
support for the rest soon.  Most in-kernel I/O is done to files, and
the rest to a very limited set of devices. (not accounting for sockets
through their own APIs, thats another story)

> That's what I'm objecting to.  Centralized kernel_readv() et.al. - sure,
> and fs/read_write.c is the right place for those.  No arguments here.
> Conversion to those - absolutely; drivers have no fucking business touching
> set_fs() at all.  But your primitives are trouble waiting to happen.
> Let them take kvec arrays.  And let them, in case when there's no
> ->read_iter()/->write_iter(), do set_fs().  Statically, without this
> if (iter->type & ITER_KVEC) ... stuff.

Oh weĺl.  I can do that first, but I think eventually we'll have to
get back to what I've done now.
Christoph Hellwig May 10, 2017, 7:35 a.m. UTC | #41
On Wed, May 10, 2017 at 09:28:48AM +0200, Arnd Bergmann wrote:
> My older time64_t syscall series has the side-effect of doing something
> like this to the time-related compat handlers in kernel/compat.c. If nobody
> else has started looking at removing set_fs from those, I can extract
> the relevant parts from my series.

That would be great.
Arnd Bergmann May 10, 2017, 7:37 a.m. UTC | #42
On Tue, May 9, 2017 at 3:00 PM, Andy Lutomirski <luto@kernel.org> wrote:
> On Tue, May 9, 2017 at 1:56 AM, Christoph Hellwig <hch@infradead.org> wrote:
>> On Tue, May 09, 2017 at 08:45:22AM +0200, Ingo Molnar wrote:
>>> We only have ~115 code blocks in the kernel that set/restore KERNEL_DS, it would
>>> be a pity to add a runtime check to every system call ...
>>
>> I think we should simply strive to remove all of them that aren't
>> in core scheduler / arch code.  Basically evetyytime we do the
>>
>>         oldfs = get_fs();
>>         set_fs(KERNEL_DS);
>>         ..
>>         set_fs(oldfs);
>>
>> trick we're doing something wrong, and there should always be better
>> ways to archive it.  E.g. using iov_iter with a ITER_KVEC type
>> consistently would already remove most of them.
>
> How about trying to remove all of them?  If we could actually get rid
> of all of them, we could drop the arch support, and we'd get faster,
> simpler, shorter uaccess code throughout the kernel.
>
> The ones in kernel/compat.c are generally garbage.  They should be
> using compat_alloc_user_space().  Ditto for kernel/power/user.c.

compat_alloc_user_space() has some problems too, it adds
complexity to a rarely-tested code path and can add some noticeable
overhead in cases where user space access is slow because of
extra checks.

It's clearly better than set_fs(), but the way I prefer to convert the
code is to avoid both and instead move compat handlers next to
the native code, and splitting out the common code between native
and compat mode into a helper that takes a regular kernel pointer.

I think that's what both Al has done in the past on compat_ioctl()
and select() and what Christoph does in his latest series, but
it seems worth pointing out for others that decide to help out here.

     Arnd
Al Viro May 10, 2017, 8:08 a.m. UTC | #43
On Wed, May 10, 2017 at 09:37:04AM +0200, Arnd Bergmann wrote:

> > How about trying to remove all of them?  If we could actually get rid
> > of all of them, we could drop the arch support, and we'd get faster,
> > simpler, shorter uaccess code throughout the kernel.

BTW, not all get_user() under KERNEL_DS are plain loads.  There is an
exception - probe_kernel_read().

> > The ones in kernel/compat.c are generally garbage.  They should be
> > using compat_alloc_user_space().  Ditto for kernel/power/user.c.
> 
> compat_alloc_user_space() has some problems too, it adds
> complexity to a rarely-tested code path and can add some noticeable
> overhead in cases where user space access is slow because of
> extra checks.
> 
> It's clearly better than set_fs(), but the way I prefer to convert the
> code is to avoid both and instead move compat handlers next to
> the native code, and splitting out the common code between native
> and compat mode into a helper that takes a regular kernel pointer.
> 
> I think that's what both Al has done in the past on compat_ioctl()
> and select() and what Christoph does in his latest series, but
> it seems worth pointing out for others that decide to help out here.

Folks, reducing the amount of places where we play with set_fs() is certainly
a good thing.  Getting rid of them completely is something entirely different;
I have tried to plot out patch series in this direction many times during the
last 5 years or so, but it's not going to be easy.  Tomorrow I can start
posting my notes in that direction (and there are tons of those, unfortunately
mixed with git grep results, highly unprintable personal comments, etc.);
just let me grab some sleep first...

BTW, slow userland access is not just due to extra checks; access_ok(),
in particular, is pretty much noise.  The real PITA comes from the things
like STAC/CLAC on recent x86.  Or hardware overhead of cross-address-space
block copy insn (e.g. on s390, where it's optimized for multi-cacheline
blocks).  Or things like uml, where it's a matter of walking the page
tables for each sodding __get_user().  It's not always just a matter of
address space limit...
Christoph Hellwig May 10, 2017, 8:14 a.m. UTC | #44
On Wed, May 10, 2017 at 09:08:41AM +0100, Al Viro wrote:
> On Wed, May 10, 2017 at 09:37:04AM +0200, Arnd Bergmann wrote:
> 
> > > How about trying to remove all of them?  If we could actually get rid
> > > of all of them, we could drop the arch support, and we'd get faster,
> > > simpler, shorter uaccess code throughout the kernel.
> 
> BTW, not all get_user() under KERNEL_DS are plain loads.  There is an
> exception - probe_kernel_read().

And various calls that looks like opencoded versions, e.g. drivers/dio
or the ELF loader.

But in the long run we'll just need a separate primitive for that,
but that can wait until the set_fs calls outside the core code are
gone.
Andy Lutomirski May 11, 2017, 12:18 a.m. UTC | #45
On Wed, May 10, 2017 at 1:14 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Wed, May 10, 2017 at 09:08:41AM +0100, Al Viro wrote:
>> On Wed, May 10, 2017 at 09:37:04AM +0200, Arnd Bergmann wrote:
>>
>> > > How about trying to remove all of them?  If we could actually get rid
>> > > of all of them, we could drop the arch support, and we'd get faster,
>> > > simpler, shorter uaccess code throughout the kernel.
>>
>> BTW, not all get_user() under KERNEL_DS are plain loads.  There is an
>> exception - probe_kernel_read().
>
> And various calls that looks like opencoded versions, e.g. drivers/dio
> or the ELF loader.
>
> But in the long run we'll just need a separate primitive for that,
> but that can wait until the set_fs calls outside the core code are
> gone.

I suspect that, on most arches, the primitive is called
__copy_from_user().  We could make the generic code do that except
where overridden.
Borislav Petkov May 11, 2017, 11:22 a.m. UTC | #46
On Tue, May 09, 2017 at 04:31:00PM -0700, Kees Cook wrote:
> > I don't like silent fixups.  If we want to do this, we should BUG or
> > at least WARN, not just change the addr limit.  But I'm also not
> > convinced it's indicative of an actual bug here.
> 
> Nothing should enter that function with KERNEL_DS set, right?
> 
> BUG_ON(get_fs() != USER_DS);

We're feeling triggerhappy, aren't we? A nice juicy WARN-splat along
with a fixup looks much better than killing the box, to me.
Thomas Garnier May 11, 2017, 11:17 p.m. UTC | #47
On Tue, May 9, 2017 at 7:29 AM, Thomas Garnier <thgarnie@google.com> wrote:
>
> On Tue, May 9, 2017 at 4:10 AM, Greg KH <greg@kroah.com> wrote:
> > On Tue, May 09, 2017 at 08:56:19AM +0200, Ingo Molnar wrote:
> >>
> >> * Kees Cook <keescook@chromium.org> wrote:
> >>
> >> > > There's the option of using GCC plugins now that the infrastructure was
> >> > > upstreamed from grsecurity. It can be used as part of the regular build
> >> > > process and as long as the analysis is pretty simple it shouldn't hurt compile
> >> > > time much.
> >> >
> >> > Well, and that the situation may arise due to memory corruption, not from
> >> > poorly-matched set_fs() calls, which static analysis won't help solve. We need
> >> > to catch this bad kernel state because it is a very bad state to run in.
> >>
> >> If memory corruption corrupted the task state into having addr_limit set to
> >> KERNEL_DS then there's already a fair chance that it's game over: it could also
> >> have set *uid to 0, or changed a sensitive PF_ flag, or a number of other
> >> things...
> >>
> >> Furthermore, think about it: there's literally an infinite amount of corrupted
> >> task states that could be a security problem and that could be checked after every
> >> system call. Do we want to check every one of them?
> >
> > Ok, I'm all for not checking lots of stuff all the time, just to protect
> > from crappy drivers that.  Especially as we _can_ audit and run checks
> > on the source code for them in the kernel tree.
> >
> > But, and here's the problem, outside of the desktop/enterprise world,
> > there are a ton of out-of-tree code that is crap.  The number of
> > security/bug fixes and kernel crashes for out-of-tree code in systems
> > like Android phones is just so high it's laughable.
> >
> > When you have a device that is running 3.2 million lines of kernel code,
> > yet the diffstat of the tree compared to mainline adds 3 million lines
> > of code, there is bound to be a ton of issues/problems there.
> >
> > So this is an entirely different thing we need to try to protect
> > ourselves from.  A long time ago I laughed when I saw that Microsoft had
> > to do lots of "hardening" of their kernel to protect themselves from
> > crappy drivers, as I knew we didn't have to do that because we had the
> > source for them and could fix the root issues.  But that has changed and
> > now we don't all have that option.  That code is out-of-tree because the
> > vendor doesn't care, and doesn't want to take any time at all to do
> > anything resembling a real code review[1].
>
> That's a big part of why I thought would be useful. I am less worried
> about edge cases upstream right now than forks with custom codes not
> using set_fs correctly.
>
> >
> > So, how about options like the ones being proposed here, go behind a new
> > config option:
> >         CONFIG_PROTECT_FROM_CRAPPY_DRIVERS
> > that device owners can enable if they do not trust their vendor-provided
> > code (hint, I sure don't.)  That way the "normal" path that all of us
> > are used to running will be fine, but if you want to take the speed hit
> > to try to protect yourself, then you can do that as well.
>
> Maybe another name but why not.

Ingo: Do you want the change as-is? Would you like it to be optional?
What do you think?

>
> >
> > Anyway, just an idea...
> >
> > thanks,
> >
> > greg k-h
> >
> > [1] I am working really hard with lots of vendors to try to fix their
> >     broken development model, but that is going to take years to resolve
> >     as their device pipelines are years long, and changing their
> >     mindsets takes a long time...
>
>
>
> --
> Thomas
Linus Torvalds May 11, 2017, 11:44 p.m. UTC | #48
On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie@google.com> wrote:
>
> Ingo: Do you want the change as-is? Would you like it to be optional?
> What do you think?

I'm not ingo, but I don't like that patch. It's in the wrong place -
that system call return code is too timing-critical to add address
limit checks.

Now what I think you *could* do is:

 - make "set_fs()" actually set a work flag in the current thread flags

 - do the test in the slow-path (syscall_return_slowpath).

Yes, yes, that ends up being architecture-specific, but it's fairly simple.

And it only slows down the system calls that actually use "set_fs()".
Sure, it will slow those down a fair amount, but they are hopefully a
small subset of all cases.

How does that sound to people?  Thats' where we currently do that

        if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
            WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
regs->orig_ax))
                local_irq_enable();

check too, which is a fairly similar issue.

               Linus
Martin Schwidefsky May 12, 2017, 5:28 a.m. UTC | #49
On Thu, 11 May 2017 16:44:07 -0700
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie@google.com> wrote:
> >
> > Ingo: Do you want the change as-is? Would you like it to be optional?
> > What do you think?  
> 
> I'm not ingo, but I don't like that patch. It's in the wrong place -
> that system call return code is too timing-critical to add address
> limit checks.
> 
> Now what I think you *could* do is:
> 
>  - make "set_fs()" actually set a work flag in the current thread flags
> 
>  - do the test in the slow-path (syscall_return_slowpath).
> 
> Yes, yes, that ends up being architecture-specific, but it's fairly simple.
> 
> And it only slows down the system calls that actually use "set_fs()".
> Sure, it will slow those down a fair amount, but they are hopefully a
> small subset of all cases.
> 
> How does that sound to people?  Thats' where we currently do that
> 
>         if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
>             WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
> regs->orig_ax))
>                 local_irq_enable();
> 
> check too, which is a fairly similar issue.

This is exactly what Heiko did for the s390 backend as a result of this
discussion. See the _CIF_ASCE_SECONDARY bit in arch/s390/kernel/entry.S,
for the hot patch the check for the bit is included in the general
_CIF_WORK test. Only the slow patch gets a bit slower.

git commit b5a882fcf146c87cb6b67c6df353e1c042b8773d
"s390: restore address space when returning to user space".
Kees Cook May 12, 2017, 5:34 a.m. UTC | #50
On Thu, May 11, 2017 at 10:28 PM, Martin Schwidefsky
<schwidefsky@de.ibm.com> wrote:
> On Thu, 11 May 2017 16:44:07 -0700
> Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
>> On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie@google.com> wrote:
>> >
>> > Ingo: Do you want the change as-is? Would you like it to be optional?
>> > What do you think?
>>
>> I'm not ingo, but I don't like that patch. It's in the wrong place -
>> that system call return code is too timing-critical to add address
>> limit checks.
>>
>> Now what I think you *could* do is:
>>
>>  - make "set_fs()" actually set a work flag in the current thread flags
>>
>>  - do the test in the slow-path (syscall_return_slowpath).
>>
>> Yes, yes, that ends up being architecture-specific, but it's fairly simple.
>>
>> And it only slows down the system calls that actually use "set_fs()".
>> Sure, it will slow those down a fair amount, but they are hopefully a
>> small subset of all cases.
>>
>> How does that sound to people?  Thats' where we currently do that
>>
>>         if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
>>             WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
>> regs->orig_ax))
>>                 local_irq_enable();
>>
>> check too, which is a fairly similar issue.
>
> This is exactly what Heiko did for the s390 backend as a result of this
> discussion. See the _CIF_ASCE_SECONDARY bit in arch/s390/kernel/entry.S,
> for the hot patch the check for the bit is included in the general
> _CIF_WORK test. Only the slow patch gets a bit slower.
>
> git commit b5a882fcf146c87cb6b67c6df353e1c042b8773d
> "s390: restore address space when returning to user space".

If I'm understanding this, it won't catch corruption of addr_limit
during fast-path syscalls, though (i.e. addr_limit changed without a
call to set_fs()). :( This addr_limit corruption is mostly only a risk
archs without THREAD_INFO_IN_TASK, but it would still be nice to catch
unbalanced set_fs() code, so I like the idea. I like getting rid of
addr_limit entirely even more, but that'll take some time. :)

-Kees
Martin Schwidefsky May 12, 2017, 5:54 a.m. UTC | #51
On Thu, 11 May 2017 22:34:31 -0700
Kees Cook <keescook@chromium.org> wrote:

> On Thu, May 11, 2017 at 10:28 PM, Martin Schwidefsky
> <schwidefsky@de.ibm.com> wrote:
> > On Thu, 11 May 2017 16:44:07 -0700
> > Linus Torvalds <torvalds@linux-foundation.org> wrote:
> >  
> >> On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie@google.com> wrote:  
> >> >
> >> > Ingo: Do you want the change as-is? Would you like it to be optional?
> >> > What do you think?  
> >>
> >> I'm not ingo, but I don't like that patch. It's in the wrong place -
> >> that system call return code is too timing-critical to add address
> >> limit checks.
> >>
> >> Now what I think you *could* do is:
> >>
> >>  - make "set_fs()" actually set a work flag in the current thread flags
> >>
> >>  - do the test in the slow-path (syscall_return_slowpath).
> >>
> >> Yes, yes, that ends up being architecture-specific, but it's fairly simple.
> >>
> >> And it only slows down the system calls that actually use "set_fs()".
> >> Sure, it will slow those down a fair amount, but they are hopefully a
> >> small subset of all cases.
> >>
> >> How does that sound to people?  Thats' where we currently do that
> >>
> >>         if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
> >>             WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
> >> regs->orig_ax))
> >>                 local_irq_enable();
> >>
> >> check too, which is a fairly similar issue.  
> >
> > This is exactly what Heiko did for the s390 backend as a result of this
> > discussion. See the _CIF_ASCE_SECONDARY bit in arch/s390/kernel/entry.S,
> > for the hot patch the check for the bit is included in the general
> > _CIF_WORK test. Only the slow patch gets a bit slower.
> >
> > git commit b5a882fcf146c87cb6b67c6df353e1c042b8773d
> > "s390: restore address space when returning to user space".  
> 
> If I'm understanding this, it won't catch corruption of addr_limit
> during fast-path syscalls, though (i.e. addr_limit changed without a
> call to set_fs()). :( This addr_limit corruption is mostly only a risk
> archs without THREAD_INFO_IN_TASK, but it would still be nice to catch
> unbalanced set_fs() code, so I like the idea. I like getting rid of
> addr_limit entirely even more, but that'll take some time. :)

Well for s390 there is no addr_limit as we use two separate address space
for kernel vs. user. The equivalent to the addr_limit corruption on a
fast-path syscall would be changing CR7 outside of set_fs. This boils
down to the question what we are protection against? Bad code with 
unbalanced set_fs or evil code that changes addr_limit/CR7 outside of
set_fs
Andy Lutomirski May 12, 2017, 6:13 a.m. UTC | #52
[resending because kernel.org seems to have mangled my SMTP
credentials.  I wonder if this is a common problem.]

On Thu, May 11, 2017 at 4:44 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie@google.com> wrote:
>>
>> Ingo: Do you want the change as-is? Would you like it to be optional?
>> What do you think?
>
> I'm not ingo, but I don't like that patch. It's in the wrong place -
> that system call return code is too timing-critical to add address
> limit checks.
>
> Now what I think you *could* do is:
>
>  - make "set_fs()" actually set a work flag in the current thread flags
>
>  - do the test in the slow-path (syscall_return_slowpath).
>
> Yes, yes, that ends up being architecture-specific, but it's fairly simple.
>
> And it only slows down the system calls that actually use "set_fs()".
> Sure, it will slow those down a fair amount, but they are hopefully a
> small subset of all cases.
>
> How does that sound to people?  Thats' where we currently do that
>
>         if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
>             WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
> regs->orig_ax))
>                 local_irq_enable();
>
> check too, which is a fairly similar issue.
>

I like this.  It wouldn't help the problem that I suspect is a major
part of the motivation for this patch: a stack overflow could
overwrite addr_limit.  But we fixed that for real already.

Slightly off-topic: I would *love* to see syscall_return_slowpath() or
similar moved or at least mostly moved into generic code.  Aside from
the fact that it used to be written in asm, there's nothing
fundamentally arch-specific about it.

>
> And it only slows down the system calls that actually use "set_fs()".
> Sure, it will slow those down a fair amount, but they are hopefully a
> small subset of all cases.

It won't even slow them down that much.  The slow path is reasonably
fast these days.
Ingo Molnar May 12, 2017, 6:57 a.m. UTC | #53
* Kees Cook <keescook@chromium.org> wrote:

> > git commit b5a882fcf146c87cb6b67c6df353e1c042b8773d
> > "s390: restore address space when returning to user space".
> 
> If I'm understanding this, it won't catch corruption of addr_limit
> during fast-path syscalls, though (i.e. addr_limit changed without a
> call to set_fs()). :(

Nor does it, or the patch you propose, protect against against something 
corrupting task->mm pointer, or the task->*uid values, or any of the myriads of 
security relevant values stored in the task structure!

Making sure API (set_fs()) usage is bug-free and protecting against the effects of 
general data corruption are two unrelated things that should not mixed.

Thanks,

	Ingo
Ingo Molnar May 12, 2017, 6:58 a.m. UTC | #54
* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie@google.com> wrote:
> >
> > Ingo: Do you want the change as-is? Would you like it to be optional?
> > What do you think?
> 
> I'm not ingo, but I don't like that patch. It's in the wrong place -
> that system call return code is too timing-critical to add address
> limit checks.
> 
> Now what I think you *could* do is:
> 
>  - make "set_fs()" actually set a work flag in the current thread flags
> 
>  - do the test in the slow-path (syscall_return_slowpath).
> 
> Yes, yes, that ends up being architecture-specific, but it's fairly simple.
> 
> And it only slows down the system calls that actually use "set_fs()".
> Sure, it will slow those down a fair amount, but they are hopefully a
> small subset of all cases.
> 
> How does that sound to people?  Thats' where we currently do that
> 
>         if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
>             WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
> regs->orig_ax))
>                 local_irq_enable();
> 
> check too, which is a fairly similar issue.

I really like that idea and I'd be perfectly fine with that solution, because it 
puts the overhead where the problem comes from, and adds an extra incentive for 
code to move away from set_fs() facilities. Win-win.

Thanks,

	Ingo
Ingo Molnar May 12, 2017, 7 a.m. UTC | #55
* Andy Lutomirski <luto@kernel.org> wrote:

> On Tue, May 9, 2017 at 1:56 AM, Christoph Hellwig <hch@infradead.org> wrote:
> > On Tue, May 09, 2017 at 08:45:22AM +0200, Ingo Molnar wrote:
> >> We only have ~115 code blocks in the kernel that set/restore KERNEL_DS, it would
> >> be a pity to add a runtime check to every system call ...
> >
> > I think we should simply strive to remove all of them that aren't
> > in core scheduler / arch code.  Basically evetyytime we do the
> >
> >         oldfs = get_fs();
> >         set_fs(KERNEL_DS);
> >         ..
> >         set_fs(oldfs);
> >
> > trick we're doing something wrong, and there should always be better
> > ways to archive it.  E.g. using iov_iter with a ITER_KVEC type
> > consistently would already remove most of them.
> 
> How about trying to remove all of them?  If we could actually get rid
> of all of them, we could drop the arch support, and we'd get faster,
> simpler, shorter uaccess code throughout the kernel.

I'm all for that!

Thanks,

	Ingo
Al Viro May 12, 2017, 7:15 a.m. UTC | #56
On Fri, May 12, 2017 at 09:00:12AM +0200, Ingo Molnar wrote:

> > How about trying to remove all of them?  If we could actually get rid
> > of all of them, we could drop the arch support, and we'd get faster,
> > simpler, shorter uaccess code throughout the kernel.
> 
> I'm all for that!

Oh, for...  Ingo, do you really want to go through all ->write() and ->read()
instances, converting all of them to iov_iter?  Or, better yet, deal with
the patch flood from Nick Krause sock puppet brigade?

Folks, seriously, have you even looked through that zoo?  I have, and it's
really, really not fun.  Sure, we can say "fuck 'em, no need to allow
splice() on random crap".  Would be perfectly reasonable, expect that
it's not the only place doing kernel_write() and its ilk...

And converting everything to ->read_iter()/->write_iter() means an insane
amount of code churn, not to mention coping with random bogosities in
semantics.  ->read() and ->write() are going to stay around, pretty
much indefinitely.
Christoph Hellwig May 12, 2017, 7:35 a.m. UTC | #57
On Fri, May 12, 2017 at 08:15:49AM +0100, Al Viro wrote:
> And converting everything to ->read_iter()/->write_iter() means an insane
> amount of code churn, not to mention coping with random bogosities in
> semantics.  ->read() and ->write() are going to stay around, pretty
> much indefinitely.

But I don't think kernel users of them have to.  I've been digging
through the calllers and will send an analysis to the list in a bit.
Arnd Bergmann May 12, 2017, 7:43 a.m. UTC | #58
On Fri, May 12, 2017 at 9:15 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Fri, May 12, 2017 at 09:00:12AM +0200, Ingo Molnar wrote:
>
>> > How about trying to remove all of them?  If we could actually get rid
>> > of all of them, we could drop the arch support, and we'd get faster,
>> > simpler, shorter uaccess code throughout the kernel.
>>
>> I'm all for that!
>
> Oh, for...  Ingo, do you really want to go through all ->write() and ->read()
> instances, converting all of them to iov_iter?  Or, better yet, deal with
> the patch flood from Nick Krause sock puppet brigade?

How realistic and how useful would it be to first completely eliminate
the ones that are in loadable modules and then wrapping the definition
in #ifndef MODULE (or even make it an extern function)?

This should be a fairly complete list of the modular users:

drivers/block/drbd/drbd_main.c: set_fs(KERNEL_DS);
drivers/input/serio/hp_sdc.c:   set_fs(KERNEL_DS);
drivers/media/v4l2-core/v4l2-compat-ioctl32.c:          set_fs(KERNEL_DS);
drivers/misc/lkdtm_bugs.c:      set_fs(KERNEL_DS);
drivers/s390/crypto/pkey_api.c: set_fs(KERNEL_DS);
drivers/staging/comedi/drivers/serial2002.c:    set_fs(KERNEL_DS);
drivers/staging/lustre/lnet/libcfs/tracefile.c: set_fs(get_ds());
drivers/staging/media/atomisp/pci/atomisp2/atomisp_compat_ioctl32.c:
 set_fs(KERNEL_DS);
drivers/staging/rtl8723bs/os_dep/osdep_service.c:               oldfs
= get_fs(); set_fs(get_ds());
drivers/usb/gadget/function/f_mass_storage.c:   set_fs(get_ds());
drivers/usb/gadget/function/u_uac1.c:   set_fs(KERNEL_DS);
drivers/vhost/vhost.c:  set_fs(USER_DS);
drivers/video/fbdev/core/fbmem.c:       set_fs(KERNEL_DS);
drivers/video/fbdev/hpfb.c:     set_fs(KERNEL_DS);
fs/autofs4/waitq.c:     set_fs(KERNEL_DS);
fs/binfmt_aout.c:       set_fs(KERNEL_DS);
fs/binfmt_elf.c:                set_fs(USER_DS);
fs/binfmt_elf_fdpic.c:  set_fs(KERNEL_DS);
fs/btrfs/send.c:        set_fs(KERNEL_DS);
fs/ext4/ioctl.c:                set_fs(KERNEL_DS);
fs/nfsd/vfs.c:  set_fs(KERNEL_DS);
net/9p/trans_fd.c:      set_fs(get_ds());
net/ipv6/addrconf.c:                    set_fs(KERNEL_DS);
net/ipv6/exthdrs.c:     set_fs(KERNEL_DS);
net/sunrpc/svcsock.c:   oldfs = get_fs(); set_fs(KERNEL_DS);
sound/core/oss/pcm_oss.c:       set_fs(get_ds());
sound/core/pcm_native.c:        set_fs(get_ds());
sound/drivers/opl3/opl3_oss.c:  set_fs(get_ds());
sound/oss/dmabuf.c:     set_fs(get_ds());
sound/oss/swarm_cs4297a.c:                set_fs(KERNEL_DS);
sound/pci/emu10k1/emufx.c:      set_fs(get_ds());
sound/pci/hda/hda_codec.c:              set_fs(get_ds());

     Arnd
Christoph Hellwig May 12, 2017, 8:07 a.m. UTC | #59
On Fri, May 12, 2017 at 12:35:21AM -0700, Christoph Hellwig wrote:
> On Fri, May 12, 2017 at 08:15:49AM +0100, Al Viro wrote:
> > And converting everything to ->read_iter()/->write_iter() means an insane
> > amount of code churn, not to mention coping with random bogosities in
> > semantics.  ->read() and ->write() are going to stay around, pretty
> > much indefinitely.
> 
> But I don't think kernel users of them have to.  I've been digging
> through the calllers and will send an analysis to the list in a bit.

Ok, here is the cleaned up list.  The targets for in-kernel I/O
are basically regular files on normal fs, block devices, pipes
and sockets, with a few narrow exceptions and a few unclear cases.

I've added a few maintainers to Cc to clarify, and the list is below:

When looking at kernel_read/kernel_readv/kernel_write/__kernel_write
instances, or vfs_read/vfs_readv/vfs_write/vfs_writev inststances with
set_fs tricks most of them simply require a regular file on a "real"
fs or a block device, pipe, socket:

 - various binary loaders: need to be regular files (+ on a "real" fs)
 - nandsim: expect a regular file or maybe a block device
 - code: expects a regular file
 - ecryptfs: expects a regular file
 - splice: can be a regular file or device file
 - security/keys: regular file
 - cachefiles: needs to be a regular file
 - coredump: usually a regular file, but possibly a pipe
 - acct: regular file
 - target: regular file
 - nfsd: regular file
 - lustre: regular file
 - f_mass_storage: regular file, or maybe block device
 - autofs: pipe
 - btrfs: regular file / pipe / socket
 - ima: regular file on specific file systems

Then there are a few interesting ones with specific targets:

 - sysctl: regular file on procfs
 - mconsole: procfs as far as I can tell.  Might be able to further
   narrow it down?
 - ashmem: regular file on shmem

The nommu mmap code needs to read everything that wants to support
a MAP_PRIVATE mmap, but as far as a I can tell: do a) address limits
not matter for nommu, and b) I have no f***cking idea why it doesn't
use readpage(s) to start with like the MMU code.

And a few that I can't figure:

 - cx25821: no idea what this opens, need to confirm with the maintainers
   (on Cc)
 - 9p: not clear.  looks like it might be sockets/pipes
 
 Last but not least we have a driver that's a complete mess:

 - series2002: /dev/tty* (drivers is in staging and needs to be rewritten
   using the proper in-kernel APIs anyway)
Christoph Hellwig May 12, 2017, 8:11 a.m. UTC | #60
On Fri, May 12, 2017 at 09:43:40AM +0200, Arnd Bergmann wrote:
> How realistic and how useful would it be to first completely eliminate
> the ones that are in loadable modules and then wrapping the definition
> in #ifndef MODULE (or even make it an extern function)?

Should be fairly doable and might be a nice step towards cleaning the
mess up.  In fact with my seres a large part of those are gone, and
most of the remaining handler are ioctl handlers or what seems like
opencoded versions of probe_kernel_read.

But it won't help against exploits modifying addr_limit manually.
Al Viro May 12, 2017, 8:11 a.m. UTC | #61
On Fri, May 12, 2017 at 09:43:40AM +0200, Arnd Bergmann wrote:

> How realistic and how useful would it be to first completely eliminate
> the ones that are in loadable modules and then wrapping the definition
> in #ifndef MODULE (or even make it an extern function)?

Eliminate _what_?  ->read() and ->write() instances?

> This should be a fairly complete list of the modular users:
> 
> drivers/block/drbd/drbd_main.c: set_fs(KERNEL_DS);

Ah, set_fs()...  Sure, many of those can be killed off.  Wouldn't be
a bad idea, but I don't understand what difference does modular/built-in
make here...

This one: AFAICS doesn't give a damn about set_fs() at all.

> drivers/input/serio/hp_sdc.c:   set_fs(KERNEL_DS);

Open-coded probe_kernel_read(), apparently.

> drivers/media/v4l2-core/v4l2-compat-ioctl32.c:          set_fs(KERNEL_DS);

massive compat ioctl crap.

> drivers/misc/lkdtm_bugs.c:      set_fs(KERNEL_DS);

insane.

> drivers/s390/crypto/pkey_api.c: set_fs(KERNEL_DS);

No idea.

> drivers/staging/comedi/drivers/serial2002.c:    set_fs(KERNEL_DS);

Open-coded kernel_write(); to some character device, no less...  And similar
for kernel_read(), apparently.

> drivers/staging/lustre/lnet/libcfs/tracefile.c: set_fs(get_ds());

Fuck knows; kernel_write() might do it.  Depends upon what it's writing
to.

You've missed other places in lustre, BTW - including the ioctls on
sockets, etc.

> drivers/staging/media/atomisp/pci/atomisp2/atomisp_compat_ioctl32.c:
>  set_fs(KERNEL_DS);

Compat ioctl crap, again.

> drivers/staging/rtl8723bs/os_dep/osdep_service.c:               oldfs
> = get_fs(); set_fs(get_ds());

Oh, lovely - reading an arbitrary (as in, specified by pathname) file.
Firmware (mis)handling?

> drivers/usb/gadget/function/f_mass_storage.c:   set_fs(get_ds());

No idea.

> drivers/usb/gadget/function/u_uac1.c:   set_fs(KERNEL_DS);

kernel_write(), by the look of it.  Or something similar.

> drivers/vhost/vhost.c:  set_fs(USER_DS);

kernel thread doing use_mm()

> drivers/video/fbdev/core/fbmem.c:       set_fs(KERNEL_DS);

compat ioctl.

> drivers/video/fbdev/hpfb.c:     set_fs(KERNEL_DS);

probe_kernel_read()

> fs/autofs4/waitq.c:     set_fs(KERNEL_DS);

kernel_write()

> fs/binfmt_aout.c:       set_fs(KERNEL_DS);
> fs/binfmt_elf.c:                set_fs(USER_DS);
> fs/binfmt_elf_fdpic.c:  set_fs(KERNEL_DS);

coredump stuff.

> fs/btrfs/send.c:        set_fs(KERNEL_DS);
kernel_write()

Anyway, what's special about modules?  IDGI...
Al Viro May 12, 2017, 8:16 a.m. UTC | #62
On Fri, May 12, 2017 at 01:11:26AM -0700, Christoph Hellwig wrote:

> But it won't help against exploits modifying addr_limit manually.

Or the ones setting current->cred to that of init.  Your point being?
Arnd Bergmann May 12, 2017, 8:20 a.m. UTC | #63
On Fri, May 12, 2017 at 10:11 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> Anyway, what's special about modules?  IDGI...

One of the arguments that came up earlier was code in external modules
being mostly unaudited, sometimes without any source code available
at all but still used in devices.

If modules can't do set_fs() any more, this could eliminate bugs with
unpaired set_fs in those modules.

Limiting factors of course are:

- embedded systems that ship come with their own kernels (as opposed
  to using whatever users have, or relying on binary distros) can just
  make it available to modules again, by reverting the patch

- As Christoph said, they could have an open-coded set_fs in the
  driver

- Whatever other method a clueless driver write might come up with
  isn't necessarily better than set_fs().

     Arnd
Greg KH May 12, 2017, 8:23 a.m. UTC | #64
On Fri, May 12, 2017 at 01:07:41AM -0700, Christoph Hellwig wrote:
>  Last but not least we have a driver that's a complete mess:
> 
>  - series2002: /dev/tty* (drivers is in staging and needs to be rewritten
>    using the proper in-kernel APIs anyway)

I agree, that's a mess, and it needs to use the new serdev interface at
the least.  Worse case, we just delete the thing :)

thanks,

greg k-h
Thomas Garnier May 12, 2017, 5:05 p.m. UTC | #65
On Thu, May 11, 2017 at 11:58 PM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
>> On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie@google.com> wrote:
>> >
>> > Ingo: Do you want the change as-is? Would you like it to be optional?
>> > What do you think?
>>
>> I'm not ingo, but I don't like that patch. It's in the wrong place -
>> that system call return code is too timing-critical to add address
>> limit checks.
>>
>> Now what I think you *could* do is:
>>
>>  - make "set_fs()" actually set a work flag in the current thread flags
>>
>>  - do the test in the slow-path (syscall_return_slowpath).
>>
>> Yes, yes, that ends up being architecture-specific, but it's fairly simple.
>>
>> And it only slows down the system calls that actually use "set_fs()".
>> Sure, it will slow those down a fair amount, but they are hopefully a
>> small subset of all cases.
>>
>> How does that sound to people?  Thats' where we currently do that
>>
>>         if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
>>             WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
>> regs->orig_ax))
>>                 local_irq_enable();
>>
>> check too, which is a fairly similar issue.
>
> I really like that idea and I'd be perfectly fine with that solution, because it
> puts the overhead where the problem comes from, and adds an extra incentive for
> code to move away from set_fs() facilities. Win-win.

Great, I will adapt the patch for that.

>
> Thanks,
>
>         Ingo
Kees Cook May 12, 2017, 7:01 p.m. UTC | #66
On Thu, May 11, 2017 at 10:54 PM, Martin Schwidefsky
<schwidefsky@de.ibm.com> wrote:
> On Thu, 11 May 2017 22:34:31 -0700
> Kees Cook <keescook@chromium.org> wrote:
>
>> On Thu, May 11, 2017 at 10:28 PM, Martin Schwidefsky
>> <schwidefsky@de.ibm.com> wrote:
>> > On Thu, 11 May 2017 16:44:07 -0700
>> > Linus Torvalds <torvalds@linux-foundation.org> wrote:
>> >
>> >> On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie@google.com> wrote:
>> >> >
>> >> > Ingo: Do you want the change as-is? Would you like it to be optional?
>> >> > What do you think?
>> >>
>> >> I'm not ingo, but I don't like that patch. It's in the wrong place -
>> >> that system call return code is too timing-critical to add address
>> >> limit checks.
>> >>
>> >> Now what I think you *could* do is:
>> >>
>> >>  - make "set_fs()" actually set a work flag in the current thread flags
>> >>
>> >>  - do the test in the slow-path (syscall_return_slowpath).
>> >>
>> >> Yes, yes, that ends up being architecture-specific, but it's fairly simple.
>> >>
>> >> And it only slows down the system calls that actually use "set_fs()".
>> >> Sure, it will slow those down a fair amount, but they are hopefully a
>> >> small subset of all cases.
>> >>
>> >> How does that sound to people?  Thats' where we currently do that
>> >>
>> >>         if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
>> >>             WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
>> >> regs->orig_ax))
>> >>                 local_irq_enable();
>> >>
>> >> check too, which is a fairly similar issue.
>> >
>> > This is exactly what Heiko did for the s390 backend as a result of this
>> > discussion. See the _CIF_ASCE_SECONDARY bit in arch/s390/kernel/entry.S,
>> > for the hot patch the check for the bit is included in the general
>> > _CIF_WORK test. Only the slow patch gets a bit slower.
>> >
>> > git commit b5a882fcf146c87cb6b67c6df353e1c042b8773d
>> > "s390: restore address space when returning to user space".
>>
>> If I'm understanding this, it won't catch corruption of addr_limit
>> during fast-path syscalls, though (i.e. addr_limit changed without a
>> call to set_fs()). :( This addr_limit corruption is mostly only a risk
>> archs without THREAD_INFO_IN_TASK, but it would still be nice to catch
>> unbalanced set_fs() code, so I like the idea. I like getting rid of
>> addr_limit entirely even more, but that'll take some time. :)
>
> Well for s390 there is no addr_limit as we use two separate address space
> for kernel vs. user. The equivalent to the addr_limit corruption on a
> fast-path syscall would be changing CR7 outside of set_fs. This boils
> down to the question what we are protection against? Bad code with
> unbalanced set_fs or evil code that changes addr_limit/CR7 outside of
> set_fs

Yeah, the risk for "corrupted addr_limit" is mainly a concern for
archs with addr_limit on the kernel stack. If I'm reading things
correctly, that means, from the archs I've been paying closer
attention to, it's an issue for arm, mips, and powerpc:

arch/arm/include/asm/uaccess.h: current_thread_info()->addr_limit = fs;
arch/arm/include/asm/thread_info.h:             (current_stack_pointer
& ~(THREAD_SIZE - 1));

arch/mips/include/asm/uaccess.h:#define set_fs(x)
(current_thread_info()->addr_limit = (x))
arch/mips/kernel/process.c:        * task stacks at THREAD_SIZE - 32

arch/powerpc/include/asm/uaccess.h:#define set_fs(val)
(current->thread.fs = (val))
arch/powerpc/kernel/process.c:          struct pt_regs *regs =
task_stack_page(current) + THREAD_SIZE;

(s390 uses a register, x86 and arm64 implement THREAD_INFO_IN_TASK.)
Targeting addr_limit through arbitrary write attacks isn't too common
since ... it's an arbitrary write. The issue with addr_limit was that
it can live on the kernel stack, which meant all kinds of
stack-related bugs can lead to it getting stomped on.

So, two goals to protect addr_limit:

- get it off the stack to make the difficulty of corruption on par
with other sensitive things that would require an arbitrary write
flaw.

- detect/block unbalanced set_fs() calls.

If we can get the former addressed by the remaining architectures,
then that class of attack will go away. For the latter, it sounds like
Linus's slowpath-exit will work nicely.

To me it looks like he architectures with addr_limit still on the
stack would still benefit from always-check-addr_limit on syscall
exit, but that would be arch-specific anyway.

And then, of course, we've got the parallel task of just removing
set_fs() entirely. :)

-Kees
Russell King - ARM Linux admin May 12, 2017, 7:08 p.m. UTC | #67
On Fri, May 12, 2017 at 12:01:59PM -0700, Kees Cook wrote:
> Yeah, the risk for "corrupted addr_limit" is mainly a concern for
> archs with addr_limit on the kernel stack. If I'm reading things
> correctly, that means, from the archs I've been paying closer
> attention to, it's an issue for arm, mips, and powerpc:

I'd first want to uninline everything in uaccess.h first that makes
use of access_ok() - which I think is something that needs to happen
anyway.
Linus Torvalds May 12, 2017, 7:08 p.m. UTC | #68
On Fri, May 12, 2017 at 12:01 PM, Kees Cook <keescook@chromium.org> wrote:
>
> Yeah, the risk for "corrupted addr_limit" is mainly a concern for
> archs with addr_limit on the kernel stack. If I'm reading things
> correctly, that means, from the archs I've been paying closer
> attention to, it's an issue for arm, mips, and powerpc:

I don't understand why people are looking at addr_limit as some kind
of special thing.

If somebody is smashing the stack and corrupting thread info data, the
game is over. addr_limit is the *least* of your problems, and it's not
even all that likely that it will be increasing (it's much more likely
that it would be overwritten with a smaller value).

Quite frankly, this kind of idiotic discussion just makes me question
the whole idea of the patch.

Any "security" that is this specific is not real security, it's just
masturbatory garbage.

It may be worth checking that people use "set_fs()" properly. But stop
this idiotic crap. It just makes the kernel security people look like
the crazies.

There are enough incompetent crazy security people, don't go there.
The kinds of things it is worth protecting against are the big class
of generic issues, not the kind of "oh, but imagine if a cosmic ray
flips this particular word in memory" kind of crap that ignores all
the other words of memory.

Seriously, Kees. You are just making security people look bad. Stop it.

                       Linus
Kees Cook May 12, 2017, 7:30 p.m. UTC | #69
On Fri, May 12, 2017 at 12:08 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Fri, May 12, 2017 at 12:01 PM, Kees Cook <keescook@chromium.org> wrote:
>> Yeah, the risk for "corrupted addr_limit" is mainly a concern for
>> archs with addr_limit on the kernel stack. If I'm reading things
>> correctly, that means, from the archs I've been paying closer
>> attention to, it's an issue for arm, mips, and powerpc:
>
> I don't understand why people are looking at addr_limit as some kind
> of special thing.
>
> If somebody is smashing the stack and corrupting thread info data, the
> game is over. addr_limit is the *least* of your problems, and it's not
> even all that likely that it will be increasing (it's much more likely
> that it would be overwritten with a smaller value).
>
> Quite frankly, this kind of idiotic discussion just makes me question
> the whole idea of the patch.
>
> Any "security" that is this specific is not real security, it's just
> masturbatory garbage.
>
> It may be worth checking that people use "set_fs()" properly. But stop
> this idiotic crap. It just makes the kernel security people look like
> the crazies.
>
> There are enough incompetent crazy security people, don't go there.
> The kinds of things it is worth protecting against are the big class
> of generic issues, not the kind of "oh, but imagine if a cosmic ray
> flips this particular word in memory" kind of crap that ignores all
> the other words of memory.
>
> Seriously, Kees. You are just making security people look bad. Stop it.

I'm clearly not explaining things well enough. I shouldn't say
"corruption", I should say "malicious manipulation". The methodology
of attacks against the stack are quite different from the other kinds
of attacks like use-after-free, heap overflow, etc. Being able to
exhaust the kernel stack (either due to deep recursion or unbounded
alloca()) means attackers can control a write to addr_limit, and then
leverage that into an actual arbitrary write via subsequent calls to
copy_to_user() pointed at kernel memory. This isn't theoretical, this
is how those attacks are performed. It may sound crazy, but it's real.

With thread_info off the stack, the whole problem goes away. It's
wonderful that this has happened for x86, arm64, and s390. There are
always going to be new methods of attack for everything, but while we
slowly address the design weakness (set_fs()), we can fix the low
hanging fruit too.

-Kees
Russell King - ARM Linux admin May 12, 2017, 8:21 p.m. UTC | #70
On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote:
> I'm clearly not explaining things well enough. I shouldn't say
> "corruption", I should say "malicious manipulation". The methodology
> of attacks against the stack are quite different from the other kinds
> of attacks like use-after-free, heap overflow, etc. Being able to
> exhaust the kernel stack (either due to deep recursion or unbounded
> alloca())

I really hope we don't have alloca() use in the kernel.  Do you have
evidence to support that assertion?

IMHO alloca() (or similar) should not be present in any kernel code
because we have a limited stack - we have kmalloc() etc for that kind
of thing.
Peter Zijlstra May 12, 2017, 8:30 p.m. UTC | #71
On Fri, May 12, 2017 at 09:21:06PM +0100, Russell King - ARM Linux wrote:
> On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote:
> > I'm clearly not explaining things well enough. I shouldn't say
> > "corruption", I should say "malicious manipulation". The methodology
> > of attacks against the stack are quite different from the other kinds
> > of attacks like use-after-free, heap overflow, etc. Being able to
> > exhaust the kernel stack (either due to deep recursion or unbounded
> > alloca())
> 
> I really hope we don't have alloca() use in the kernel.  Do you have
> evidence to support that assertion?
> 
> IMHO alloca() (or similar) should not be present in any kernel code
> because we have a limited stack - we have kmalloc() etc for that kind
> of thing.

On stack variable length arrays get implemented by the compiler doing
alloca(), and we sadly have a few of those around.

But yes, fully agreed on the desirability of alloca() and things.
Russell King - ARM Linux admin May 12, 2017, 8:45 p.m. UTC | #72
On Fri, May 12, 2017 at 10:30:44PM +0200, Peter Zijlstra wrote:
> On Fri, May 12, 2017 at 09:21:06PM +0100, Russell King - ARM Linux wrote:
> > On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote:
> > > I'm clearly not explaining things well enough. I shouldn't say
> > > "corruption", I should say "malicious manipulation". The methodology
> > > of attacks against the stack are quite different from the other kinds
> > > of attacks like use-after-free, heap overflow, etc. Being able to
> > > exhaust the kernel stack (either due to deep recursion or unbounded
> > > alloca())
> > 
> > I really hope we don't have alloca() use in the kernel.  Do you have
> > evidence to support that assertion?
> > 
> > IMHO alloca() (or similar) should not be present in any kernel code
> > because we have a limited stack - we have kmalloc() etc for that kind
> > of thing.
> 
> On stack variable length arrays get implemented by the compiler doing
> alloca(), and we sadly have a few of those around.

I hope their size is appropriately limited, but something tells me it
would be foolish to assume that.

> But yes, fully agreed on the desirability of alloca() and things.

Hmm, I wonder if -fno-builtin-alloca would prevent those... it looks
like it certainly would prevent an explicit alloca() call.
Kees Cook May 12, 2017, 9 p.m. UTC | #73
On Fri, May 12, 2017 at 1:45 PM, Russell King - ARM Linux
<linux@armlinux.org.uk> wrote:
> On Fri, May 12, 2017 at 10:30:44PM +0200, Peter Zijlstra wrote:
>> On Fri, May 12, 2017 at 09:21:06PM +0100, Russell King - ARM Linux wrote:
>> > On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote:
>> > > I'm clearly not explaining things well enough. I shouldn't say
>> > > "corruption", I should say "malicious manipulation". The methodology
>> > > of attacks against the stack are quite different from the other kinds
>> > > of attacks like use-after-free, heap overflow, etc. Being able to
>> > > exhaust the kernel stack (either due to deep recursion or unbounded
>> > > alloca())
>> >
>> > I really hope we don't have alloca() use in the kernel.  Do you have
>> > evidence to support that assertion?
>> >
>> > IMHO alloca() (or similar) should not be present in any kernel code
>> > because we have a limited stack - we have kmalloc() etc for that kind
>> > of thing.
>>
>> On stack variable length arrays get implemented by the compiler doing
>> alloca(), and we sadly have a few of those around.
>
> I hope their size is appropriately limited, but something tells me it
> would be foolish to assume that.
>
>> But yes, fully agreed on the desirability of alloca() and things.
>
> Hmm, I wonder if -fno-builtin-alloca would prevent those... it looks
> like it certainly would prevent an explicit alloca() call.

Building with -Werror=vla is exciting. :)

A lot of it is in crypto (which are relatively static sizes, just
using function callbacks), but there is plenty more.

-Kees
Kees Cook May 12, 2017, 9:04 p.m. UTC | #74
On Fri, May 12, 2017 at 2:00 PM, Kees Cook <keescook@chromium.org> wrote:
> On Fri, May 12, 2017 at 1:45 PM, Russell King - ARM Linux
> <linux@armlinux.org.uk> wrote:
>> On Fri, May 12, 2017 at 10:30:44PM +0200, Peter Zijlstra wrote:
>>> On Fri, May 12, 2017 at 09:21:06PM +0100, Russell King - ARM Linux wrote:
>>> > On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote:
>>> > > I'm clearly not explaining things well enough. I shouldn't say
>>> > > "corruption", I should say "malicious manipulation". The methodology
>>> > > of attacks against the stack are quite different from the other kinds
>>> > > of attacks like use-after-free, heap overflow, etc. Being able to
>>> > > exhaust the kernel stack (either due to deep recursion or unbounded
>>> > > alloca())
>>> >
>>> > I really hope we don't have alloca() use in the kernel.  Do you have
>>> > evidence to support that assertion?
>>> >
>>> > IMHO alloca() (or similar) should not be present in any kernel code
>>> > because we have a limited stack - we have kmalloc() etc for that kind
>>> > of thing.
>>>
>>> On stack variable length arrays get implemented by the compiler doing
>>> alloca(), and we sadly have a few of those around.
>>
>> I hope their size is appropriately limited, but something tells me it
>> would be foolish to assume that.
>>
>>> But yes, fully agreed on the desirability of alloca() and things.
>>
>> Hmm, I wonder if -fno-builtin-alloca would prevent those... it looks
>> like it certainly would prevent an explicit alloca() call.
>
> Building with -Werror=vla is exciting. :)
>
> A lot of it is in crypto (which are relatively static sizes, just
> using function callbacks), but there is plenty more.

I meant to also paste an example (which is harmless, I haven't looked
extensively at other examples):

        unsigned long alignmask = crypto_tfm_alg_alignmask(tfm);
        unsigned int size = crypto_tfm_alg_blocksize(tfm);
        u8 buffer[size + alignmask];

Looking at all the places (and having tried to remove a few of these
in pstore), I think it might be quite frustrating to eliminate them
all and then declare VLAs dead. I'm not against trying, though. :)

-Kees
Al Viro May 12, 2017, 9:06 p.m. UTC | #75
On Fri, May 12, 2017 at 09:21:06PM +0100, Russell King - ARM Linux wrote:
> On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote:
> > I'm clearly not explaining things well enough. I shouldn't say
> > "corruption", I should say "malicious manipulation". The methodology
> > of attacks against the stack are quite different from the other kinds
> > of attacks like use-after-free, heap overflow, etc. Being able to
> > exhaust the kernel stack (either due to deep recursion or unbounded
> > alloca())
> 
> I really hope we don't have alloca() use in the kernel.  Do you have
> evidence to support that assertion?
> 
> IMHO alloca() (or similar) should not be present in any kernel code
> because we have a limited stack - we have kmalloc() etc for that kind
> of thing.

No alloca(), but there are VLAs.  Said that, the whole "what if they
can bugger thread_info and/or task_struct and go after set_fs() state"
is idiocy, of course - in that case the box is fucked, no matter what.
Daniel Micay May 12, 2017, 9:16 p.m. UTC | #76
On Fri, 2017-05-12 at 22:06 +0100, Al Viro wrote:
> On Fri, May 12, 2017 at 09:21:06PM +0100, Russell King - ARM Linux
> wrote:
> > On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote:
> > > I'm clearly not explaining things well enough. I shouldn't say
> > > "corruption", I should say "malicious manipulation". The
> > > methodology
> > > of attacks against the stack are quite different from the other
> > > kinds
> > > of attacks like use-after-free, heap overflow, etc. Being able to
> > > exhaust the kernel stack (either due to deep recursion or
> > > unbounded
> > > alloca())
> > 
> > I really hope we don't have alloca() use in the kernel.  Do you have
> > evidence to support that assertion?
> > 
> > IMHO alloca() (or similar) should not be present in any kernel code
> > because we have a limited stack - we have kmalloc() etc for that
> > kind
> > of thing.
> 
> No alloca(), but there are VLAs.  Said that, the whole "what if they
> can bugger thread_info and/or task_struct and go after set_fs() state"
> is idiocy, of course - in that case the box is fucked, no matter what.

VMAP_STACK + -fstack-check would prevent exploiting even an unbounded
VLA / alloca size vs. it being an arbitrary write. -fstack-check
guarantees that there's one byte per page as the stack grows, although
there are some unfortunate GCC bugs making it less than perfect right
now... but they recently started caring about it more including making
it near zero overhead as it was always supposed to be.
Kees Cook May 12, 2017, 9:17 p.m. UTC | #77
On Fri, May 12, 2017 at 2:06 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Fri, May 12, 2017 at 09:21:06PM +0100, Russell King - ARM Linux wrote:
>> On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote:
>> > I'm clearly not explaining things well enough. I shouldn't say
>> > "corruption", I should say "malicious manipulation". The methodology
>> > of attacks against the stack are quite different from the other kinds
>> > of attacks like use-after-free, heap overflow, etc. Being able to
>> > exhaust the kernel stack (either due to deep recursion or unbounded
>> > alloca())
>>
>> I really hope we don't have alloca() use in the kernel.  Do you have
>> evidence to support that assertion?
>>
>> IMHO alloca() (or similar) should not be present in any kernel code
>> because we have a limited stack - we have kmalloc() etc for that kind
>> of thing.
>
> No alloca(), but there are VLAs.  Said that, the whole "what if they
> can bugger thread_info and/or task_struct and go after set_fs() state"
> is idiocy, of course - in that case the box is fucked, no matter what.

Two things are at risk from stack exhaustion: thread_info (mainly
addr_limit) when on the stack (fixed by THREAD_INFO_IN_TASK), and
overflow into adjacent allocations (fixed by VMAP_STACK). The latter
is fundamentally a heap overflow.

-Kees
Daniel Micay May 12, 2017, 9:23 p.m. UTC | #78
> overflow into adjacent allocations (fixed by VMAP_STACK).

99% fixed, but it's possible to skip over the guard page without
-fstack-check enabled (plus some edge cases need to be fixed in GCC),
unless VLAs were forbidden in addition to the existing large frame size
warning.

I'm not sure about in-tree code, but Qualcomm had some of these
improperly bounded VLA vulnerabilities in their MSM kernel...
Al Viro May 12, 2017, 9:41 p.m. UTC | #79
On Fri, May 12, 2017 at 02:17:19PM -0700, Kees Cook wrote:

> Two things are at risk from stack exhaustion: thread_info (mainly
> addr_limit) when on the stack (fixed by THREAD_INFO_IN_TASK), and

Really?  Let's take a look at arm, for example:

struct thread_info {
        unsigned long           flags;          /* low level flags */
        int                     preempt_count;  /* 0 => preemptable, <0 => bug */
        mm_segment_t            addr_limit;     /* address limit */
        struct task_struct      *task;          /* main task structure */

and current() is defined as current_thread_info()->task.

Seriously, look at these beasts.  Overwriting ->addr_limit is nowhere near
the top threat.  If attacker can overwrite thread_info, you have lost.
Rik van Riel May 12, 2017, 9:47 p.m. UTC | #80
On Fri, 2017-05-12 at 22:41 +0100, Al Viro wrote:
> On Fri, May 12, 2017 at 02:17:19PM -0700, Kees Cook wrote:
> 
> > Two things are at risk from stack exhaustion: thread_info (mainly
> > addr_limit) when on the stack (fixed by THREAD_INFO_IN_TASK), and
> 
> Really?  Let's take a look at arm, for example:
> 
> struct thread_info {
>         unsigned long           flags;          /* low level flags */
>         int                     preempt_count;  /* 0 => preemptable,
> <0 => bug */
>         mm_segment_t            addr_limit;     /* address limit */
>         struct task_struct      *task;          /* main task
> structure */
> 
> and current() is defined as current_thread_info()->task.
> 
> Seriously, look at these beasts.  Overwriting ->addr_limit is nowhere
> near
> the top threat.  If attacker can overwrite thread_info, you have
> lost.

That is why THREAD_INFO_IN_TASK exists. It moves
the struct thread_info to a location away from the
stack, which means a stack overflow will not overwrite
the thread_info.
Kees Cook May 12, 2017, 9:50 p.m. UTC | #81
On Fri, May 12, 2017 at 2:41 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Fri, May 12, 2017 at 02:17:19PM -0700, Kees Cook wrote:
>
>> Two things are at risk from stack exhaustion: thread_info (mainly
>> addr_limit) when on the stack (fixed by THREAD_INFO_IN_TASK), and
>
> Really?  Let's take a look at arm, for example:
>
> struct thread_info {
>         unsigned long           flags;          /* low level flags */
>         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
>         mm_segment_t            addr_limit;     /* address limit */
>         struct task_struct      *task;          /* main task structure */
>
> and current() is defined as current_thread_info()->task.
>
> Seriously, look at these beasts.  Overwriting ->addr_limit is nowhere near
> the top threat.  If attacker can overwrite thread_info, you have lost.

I don't disagree, but the type of attack is different. If the attacker
overwrites task_struct pointer, then they need to have built an false
one, and that may be made difficult by PAN, or need to know more about
kernel memory layout (rather than only stack depth), etc. Attacking
addr_limit makes it very very easy to upgrade attack capabilities. I'm
not say thread_info shouldn't be moved off the stack.

-Kees
Al Viro May 12, 2017, 10:57 p.m. UTC | #82
On Fri, May 12, 2017 at 05:47:55PM -0400, Rik van Riel wrote:

> > Seriously, look at these beasts.  Overwriting ->addr_limit is nowhere
> > near
> > the top threat.  If attacker can overwrite thread_info, you have
> > lost.
> 
> That is why THREAD_INFO_IN_TASK exists. It moves
> the struct thread_info to a location away from the
> stack, which means a stack overflow will not overwrite
> the thread_info.

... in which case such attacks on ->addr_limit also become a non-issue.

AFAICS, we are mixing several unrelated issues here:
	* amount of places where set_fs() is called.  Sure, reducing it
is a good idea and we want to move to primitives like kernel_write() et.al.
Fewer users => lower odds of screwing it up.
	* making sure that remaining callers are properly paired.  Ditto.
	* switching to ->read_iter()/->write_iter() where it makes sense.
Again, no problem with that.
	* providing sane environment for places like perf/oprofile.  Again,
a good idea, and set_fs(USER_DS) is only a part of what's needed there.
	* switching _everything_ to ->read_iter()/->write_iter().  Flat-out
insane and AFAICS nobody is signing up for that.
	* getting rid of set_fs() entirely.  I'm afraid that it's not feasible
without the previous one and frankly, I don't see much point.
	* sanity-checking on return to userland.  Maybe useful, maybe not.
	* taking thread_info out of the way of stack overflows.  Reasonable,
but has very little to do with the rest of that.
	* protecting against Lovecraftian horrors slithering in from the outer
space only to commit unspeakable acts against ->addr_limit and ignoring much
tastier targets next to it, but then what do you expect from degenerate
spawn of Great Old Ones - sanity?
Andy Lutomirski May 12, 2017, 11:15 p.m. UTC | #83
On Mon, May 8, 2017 at 1:48 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Mon, May 08, 2017 at 04:06:35PM +0200, Jann Horn wrote:
>
>> I think Kees might be talking about
>> https://bugs.chromium.org/p/project-zero/issues/detail?id=822, fixed in
>> commit e6978e4bf181fb3b5f8cb6f71b4fe30fbf1b655c. The issue was that
>> perf code that can run in pretty much any context called access_ok().
>
> And that commit has *NOT* solved the problem.  perf_callchain_user()
> can be called synchronously, without passing through that code.
> Tracepoint shite...
>
> That set_fs() should be done in get_perf_callchain(), just around the call of
> perf_callchain_user().  Along with pagefault_disable(), actually.
>

Even that's not quite enough because of a different issue: perf nmis
can hit during scheduling or when we're in lazy mm, leading to the
entirely wrong set of page tables being used.  We need
nmi_uaccess_begin() and nmi_uaccess_end(), and the former needs to be
allowed to fail.

AFAIK this isn't presently a security problem because it mainly
affects kernel threads, and you need to be root to profile them, but
maybe there's some race where it does matter.
Andy Lutomirski May 12, 2017, 11:20 p.m. UTC | #84
On Fri, May 12, 2017 at 12:15 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:

> Folks, seriously, have you even looked through that zoo?  I have, and it's
> really, really not fun.  Sure, we can say "fuck 'em, no need to allow
> splice() on random crap".  Would be perfectly reasonable, expect that
> it's not the only place doing kernel_write() and its ilk...

Can you clarify this?  I think we really may be able to do exactly
this.  From Christoph's list, there are only two things that need
kernel_read/kernel_write to user-supplied fds that may come from a
variety of sources: splice and exec.  If you're execing a chardev from
a crappy driver, something is seriously wrong.  And returning -EINVAL
from splice() to or from files that use ->read and ->write seems find
(and splice(2) even documents -EINVAL as meaning that the target
doesn't support splicing).

--Andy
Christoph Hellwig May 13, 2017, 7:21 a.m. UTC | #85
On Fri, May 12, 2017 at 10:30:44PM +0200, Peter Zijlstra wrote:
> On stack variable length arrays get implemented by the compiler doing
> alloca(), and we sadly have a few of those around.

I've just got rid of one of those and I wish they would appear
entirely as they are horrible in so many different ways.  Sparse
warns about them, btw.
diff mbox

Patch

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index d25435d94b6e..3d2ec084d5fc 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -64,6 +64,7 @@  config ARCH_SUPPORTS_UPROBES
 
 config S390
 	def_bool y
+	select ADDR_LIMIT_CHECK
 	select ARCH_HAS_DEVMEM_IS_ALLOWED
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_GCOV_PROFILE_ALL
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 980c3c9b06f8..e534b93ce43a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -191,6 +191,28 @@  extern struct trace_event_functions exit_syscall_print_funcs;
 	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\
 	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
 
+
+/*
+ * Called before coming back to user-mode. Returning to user-mode with an
+ * address limit different than USER_DS can allow to overwrite kernel memory.
+ */
+static inline void addr_limit_check_syscall(void)
+{
+	BUG_ON(!segment_eq(get_fs(), USER_DS));
+}
+
+#ifndef CONFIG_ADDR_LIMIT_CHECK
+#define ADDR_LIMIT_CHECK_PRE() \
+	bool user_caller = segment_eq(get_fs(), USER_DS)
+#define ADDR_LIMIT_CHECK_POST() \
+	if (user_caller) addr_limit_check_syscall()
+#else
+#define ADDR_LIMIT_CHECK_PRE()
+#define ADDR_LIMIT_CHECK_POST()
+asmlinkage void addr_limit_check_failed(void) __noreturn;
+#endif
+
+
 #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
 #define __SYSCALL_DEFINEx(x, name, ...)					\
 	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
@@ -199,7 +221,10 @@  extern struct trace_event_functions exit_syscall_print_funcs;
 	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
 	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
 	{								\
-		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
+		long ret;						\
+		ADDR_LIMIT_CHECK_PRE();					\
+		ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
+		ADDR_LIMIT_CHECK_POST();				\
 		__MAP(x,__SC_TEST,__VA_ARGS__);				\
 		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
 		return ret;						\
diff --git a/init/Kconfig b/init/Kconfig
index 42a346b0df43..599d9fe30703 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1961,6 +1961,12 @@  config PROFILING
 config TRACEPOINTS
 	bool
 
+config ADDR_LIMIT_CHECK
+	bool
+	help
+	  Disable the generic address limit check. Allow each architecture to
+	  optimize how and when the verification is done.
+
 source "arch/Kconfig"
 
 endmenu		# General setup
diff --git a/kernel/sys.c b/kernel/sys.c
index 8a94b4eabcaa..a1cbcd715d62 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2458,3 +2458,16 @@  COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
 	return 0;
 }
 #endif /* CONFIG_COMPAT */
+
+#ifdef CONFIG_ADDR_LIMIT_CHECK
+/*
+ * Used when an architecture specific implementation detects an invalid address
+ * limit. This function does not return.
+ */
+asmlinkage void addr_limit_check_failed(void)
+{
+	/* Try to fail on the generic address limit check */
+	addr_limit_check_syscall();
+	panic("Invalid address limit before returning to user-mode");
+}
+#endif