Message ID | 20170428153213.137279-1-thgarnie@google.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, Apr 28, 2017 at 8:32 AM, Thomas Garnier <thgarnie@google.com> wrote: > Ensure that a syscall does not return to user-mode with a kernel address > limit. If that happens, a process can corrupt kernel-mode memory and > elevate privileges [1]. > > The CONFIG_ADDR_LIMIT_CHECK option disables the generic check so each > architecture can create optimized versions. This option is enabled by > default on s390 because a similar feature already exists. > > [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990 > > Signed-off-by: Thomas Garnier <thgarnie@google.com> > Tested-by: Kees Cook <keescook@chromium.org> Ingo: Do you want to take the set? > --- > Based on next-20170426 > --- > arch/s390/Kconfig | 1 + > include/linux/syscalls.h | 27 ++++++++++++++++++++++++++- > init/Kconfig | 6 ++++++ > kernel/sys.c | 13 +++++++++++++ > 4 files changed, 46 insertions(+), 1 deletion(-) > > diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig > index d25435d94b6e..3d2ec084d5fc 100644 > --- a/arch/s390/Kconfig > +++ b/arch/s390/Kconfig > @@ -64,6 +64,7 @@ config ARCH_SUPPORTS_UPROBES > > config S390 > def_bool y > + select ADDR_LIMIT_CHECK > select ARCH_HAS_DEVMEM_IS_ALLOWED > select ARCH_HAS_ELF_RANDOMIZE > select ARCH_HAS_GCOV_PROFILE_ALL > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h > index 980c3c9b06f8..e534b93ce43a 100644 > --- a/include/linux/syscalls.h > +++ b/include/linux/syscalls.h > @@ -191,6 +191,28 @@ extern struct trace_event_functions exit_syscall_print_funcs; > SYSCALL_METADATA(sname, x, __VA_ARGS__) \ > __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) > > + > +/* > + * Called before coming back to user-mode. Returning to user-mode with an > + * address limit different than USER_DS can allow to overwrite kernel memory. > + */ > +static inline void addr_limit_check_syscall(void) > +{ > + BUG_ON(!segment_eq(get_fs(), USER_DS)); > +} > + > +#ifndef CONFIG_ADDR_LIMIT_CHECK > +#define ADDR_LIMIT_CHECK_PRE() \ > + bool user_caller = segment_eq(get_fs(), USER_DS) > +#define ADDR_LIMIT_CHECK_POST() \ > + if (user_caller) addr_limit_check_syscall() > +#else > +#define ADDR_LIMIT_CHECK_PRE() > +#define ADDR_LIMIT_CHECK_POST() > +asmlinkage void addr_limit_check_failed(void) __noreturn; > +#endif > + > + > #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__) > #define __SYSCALL_DEFINEx(x, name, ...) \ > asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ > @@ -199,7 +221,10 @@ extern struct trace_event_functions exit_syscall_print_funcs; > asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ > asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ > { \ > - long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ > + long ret; \ > + ADDR_LIMIT_CHECK_PRE(); \ > + ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ > + ADDR_LIMIT_CHECK_POST(); \ > __MAP(x,__SC_TEST,__VA_ARGS__); \ > __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \ > return ret; \ > diff --git a/init/Kconfig b/init/Kconfig > index 42a346b0df43..599d9fe30703 100644 > --- a/init/Kconfig > +++ b/init/Kconfig > @@ -1961,6 +1961,12 @@ config PROFILING > config TRACEPOINTS > bool > > +config ADDR_LIMIT_CHECK > + bool > + help > + Disable the generic address limit check. Allow each architecture to > + optimize how and when the verification is done. > + > source "arch/Kconfig" > > endmenu # General setup > diff --git a/kernel/sys.c b/kernel/sys.c > index 8a94b4eabcaa..a1cbcd715d62 100644 > --- a/kernel/sys.c > +++ b/kernel/sys.c > @@ -2458,3 +2458,16 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) > return 0; > } > #endif /* CONFIG_COMPAT */ > + > +#ifdef CONFIG_ADDR_LIMIT_CHECK > +/* > + * Used when an architecture specific implementation detects an invalid address > + * limit. This function does not return. > + */ > +asmlinkage void addr_limit_check_failed(void) > +{ > + /* Try to fail on the generic address limit check */ > + addr_limit_check_syscall(); > + panic("Invalid address limit before returning to user-mode"); > +} > +#endif > -- > 2.13.0.rc0.306.g87b477812d-goog >
(added more Cc:s) * Thomas Garnier <thgarnie@google.com> wrote: > On Fri, Apr 28, 2017 at 8:32 AM, Thomas Garnier <thgarnie@google.com> wrote: > > Ensure that a syscall does not return to user-mode with a kernel address > > limit. If that happens, a process can corrupt kernel-mode memory and > > elevate privileges [1]. > > > > The CONFIG_ADDR_LIMIT_CHECK option disables the generic check so each > > architecture can create optimized versions. This option is enabled by > > default on s390 because a similar feature already exists. > > > > [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990 > > > > Signed-off-by: Thomas Garnier <thgarnie@google.com> > > Tested-by: Kees Cook <keescook@chromium.org> > > Ingo: Do you want to take the set? Yeah, so now I'm questioning the whole premise of the feature, sorry :-/ A big disavantage is that the "security check" will add 2-5 instructions to the system call fast path. Every one of them, and essentially forever. Just to handle a CVE that was caused by a buggy touch-screen driver helper function leaking KERNEL_DS and which was fixed long ago ... And yes, I realize that there were other such bugs and that such bugs might occur in the future - but why not push the overhead of the security check to the kernel build phase? I.e. I'm wondering how well we could do static analysis during kernel build - would a limited mode of Sparse be good enough for that? Or we could add a new static checker to tools/, built from first principles and used primarily for extended syntactical checking. For example I'd consider it a good practice to mandate that if a kernel function sets KERNEL_DS then it must restore it as well. Any function that does not do that, or is too complex for the static analysis to prove correctness for sure should be considered buggy! Are there any common kernel APIs outside set_fs() that set KERNEL_DS intentionally? The overwhelming pattern ought to be: orig_fs = get_fs(); set_fs(KERNEL_DS); ... set_fs(orig_fs); ... and even a relatively simple static analysis tool ought to be able to see through that. I'd even suggest we do it not like Sparse builds are done today, but in a more integrated fashion: do static analysis as part of a typical kernel defconfig build and not tolerate warnings but go for a 'zero warnings' policy like Linus uses for modconfig builds. _That_ solution I'd feel very, very good about - it would be so much better than any runtime checks... Not to mention that such an integrated static analysis facility would allow many other things to be checked during build time, which we couldn't possibly check runtime. Thanks, Ingo
* Ingo Molnar <mingo@kernel.org> wrote: > ... and even a relatively simple static analysis tool ought to be able to see > through that. > > I'd even suggest we do it not like Sparse builds are done today, but in a more > integrated fashion: do static analysis as part of a typical kernel defconfig > build and not tolerate warnings but go for a 'zero warnings' policy like Linus > uses for modconfig builds. > > _That_ solution I'd feel very, very good about - it would be so much better than > any runtime checks... So the problem I have with Sparse is that it is very spammy. For example: make C=1 kernel/sched/ ... produces: kernel/sched/core.c:792:6: warning: symbol 'sched_set_stop_task' was not declared. Should it be static? kernel/sched/core.c:1298:5: warning: symbol 'migrate_swap' was not declared. Should it be static? kernel/sched/core.c:3648:35: warning: symbol 'preempt_schedule_irq' was not declared. Should it be static? ./include/linux/uaccess.h:166:18: warning: incorrect type in argument 1 (different modifiers) ./include/linux/uaccess.h:166:18: expected void *<noident> ./include/linux/uaccess.h:166:18: got void const *from ./include/linux/uaccess.h:166:18: warning: incorrect type in argument 1 (different modifiers) ./include/linux/uaccess.h:166:18: expected void *<noident> ./include/linux/uaccess.h:166:18: got void const *from ./include/linux/uaccess.h:166:18: warning: incorrect type in argument 1 (different modifiers) ./include/linux/uaccess.h:166:18: expected void *<noident> ./include/linux/uaccess.h:166:18: got void const *from ./include/linux/uaccess.h:166:18: warning: incorrect type in argument 1 (different modifiers) ./include/linux/uaccess.h:166:18: expected void *<noident> ./include/linux/uaccess.h:166:18: got void const *from kernel/sched/clock.c:80:19: warning: symbol 'sched_clock_running' was not declared. Should it be static? ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool kernel/sched/cputime.c:335:33: warning: context imbalance in 'thread_group_cputime' - different lock contexts for basic block kernel/sched/fair.c:54:14: warning: symbol 'normalized_sysctl_sched_latency' was not declared. Should it be static? kernel/sched/fair.c:75:14: warning: symbol 'normalized_sysctl_sched_min_granularity' was not declared. Should it be static? kernel/sched/fair.c:98:14: warning: symbol 'normalized_sysctl_sched_wakeup_granularity' was not declared. Should it be static? kernel/sched/fair.c:132:14: warning: symbol 'capacity_margin' was not declared. Should it be static? kernel/sched/sched.h:1988:16: error: cannot dereference this type kernel/sched/sched.h:1988:16: error: cannot dereference this type kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11) kernel/sched/sched.h:1988:16: error: cannot dereference this type kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types) kernel/sched/sched.h:1988:16: error: cannot dereference this type kernel/sched/fair.c:4688:35: error: marked inline, but without a definition kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11) kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types) kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11) kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types) kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11) kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types) kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11) kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types) kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11) kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types) kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11) kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types) kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11) kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types) kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11) kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types) kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11) kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types) kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11) kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types) kernel/sched/fair.c:5817:19: error: cannot dereference this type kernel/sched/fair.c:5817:19: error: cannot dereference this type kernel/sched/fair.c:5817:19: error: internal error: bad type in derived(11) kernel/sched/fair.c:5817:19: error: cannot dereference this type kernel/sched/fair.c:5817:19: error: incompatible types in comparison expression (different base types) kernel/sched/fair.c:5817:19: error: cannot dereference this type kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11) kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types) kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11) kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types) kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11) kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types) kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11) kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types) kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11) kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types) kernel/sched/rt.c:635:6: warning: symbol 'sched_rt_bandwidth_account' was not declared. Should it be static? kernel/sched/sched.h:1988:16: error: cannot dereference this type kernel/sched/sched.h:1988:16: error: cannot dereference this type kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11) kernel/sched/sched.h:1988:16: error: cannot dereference this type kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types) kernel/sched/sched.h:1988:16: error: cannot dereference this type kernel/sched/sched.h:1988:16: error: cannot dereference this type kernel/sched/sched.h:1988:16: error: cannot dereference this type kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11) kernel/sched/sched.h:1988:16: error: cannot dereference this type kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types) kernel/sched/sched.h:1988:16: error: cannot dereference this type ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool kernel/sched/topology.c:499:28: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:499:28: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:499:28: got struct sched_domain **<noident> kernel/sched/topology.c:534:28: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:534:28: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:534:28: got struct sched_domain **<noident> kernel/sched/topology.c:554:28: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:554:28: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:554:28: got struct sched_group_capacity **<noident> kernel/sched/topology.c:594:36: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:594:36: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:594:36: got struct sched_domain **<noident> kernel/sched/topology.c:601:24: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:601:24: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:601:24: got struct sched_group **<noident> kernel/sched/topology.c:602:31: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:602:31: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:602:31: got struct sched_group_capacity **<noident> kernel/sched/topology.c:1330:39: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:1330:39: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:1330:39: got struct sched_domain **<noident> kernel/sched/topology.c:1333:40: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:1333:40: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:1333:40: got struct sched_domain **<noident> kernel/sched/topology.c:1337:40: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:1337:40: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:1337:40: got struct sched_domain_shared **<noident> kernel/sched/topology.c:1339:40: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:1339:40: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:1339:40: got struct sched_group **<noident> kernel/sched/topology.c:1341:40: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:1341:40: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:1341:40: got struct sched_group_capacity **<noident> kernel/sched/topology.c:1343:32: warning: incorrect type in argument 1 (different address spaces) kernel/sched/topology.c:1343:32: expected void [noderef] <asn:3>*__pdata kernel/sched/topology.c:1343:32: got struct sched_domain **[noderef] sd kernel/sched/topology.c:1345:32: warning: incorrect type in argument 1 (different address spaces) kernel/sched/topology.c:1345:32: expected void [noderef] <asn:3>*__pdata kernel/sched/topology.c:1345:32: got struct sched_domain_shared **[noderef] sds kernel/sched/topology.c:1347:32: warning: incorrect type in argument 1 (different address spaces) kernel/sched/topology.c:1347:32: expected void [noderef] <asn:3>*__pdata kernel/sched/topology.c:1347:32: got struct sched_group **[noderef] sg kernel/sched/topology.c:1349:32: warning: incorrect type in argument 1 (different address spaces) kernel/sched/topology.c:1349:32: expected void [noderef] <asn:3>*__pdata kernel/sched/topology.c:1349:32: got struct sched_group_capacity **[noderef] sgc kernel/sched/topology.c:1261:25: warning: incorrect type in assignment (different address spaces) kernel/sched/topology.c:1261:25: expected struct sched_domain **[noderef] sd kernel/sched/topology.c:1261:25: got struct sched_domain *[noderef] <asn:3>*<noident> kernel/sched/topology.c:1265:26: warning: incorrect type in assignment (different address spaces) kernel/sched/topology.c:1265:26: expected struct sched_domain_shared **[noderef] sds kernel/sched/topology.c:1265:26: got struct sched_domain_shared *[noderef] <asn:3>*<noident> kernel/sched/topology.c:1269:25: warning: incorrect type in assignment (different address spaces) kernel/sched/topology.c:1269:25: expected struct sched_group **[noderef] sg kernel/sched/topology.c:1269:25: got struct sched_group *[noderef] <asn:3>*<noident> kernel/sched/topology.c:1273:26: warning: incorrect type in assignment (different address spaces) kernel/sched/topology.c:1273:26: expected struct sched_group_capacity **[noderef] sgc kernel/sched/topology.c:1273:26: got struct sched_group_capacity *[noderef] <asn:3>*<noident> kernel/sched/topology.c:1288:26: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:1288:26: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:1288:26: got struct sched_domain **<noident> kernel/sched/topology.c:1295:26: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:1295:26: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:1295:26: got struct sched_domain_shared **<noident> kernel/sched/topology.c:1304:26: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:1304:26: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:1304:26: got struct sched_group **<noident> kernel/sched/topology.c:1311:26: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:1311:26: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:1311:26: got struct sched_group_capacity **<noident> kernel/sched/topology.c:759:30: warning: incorrect type in argument 1 (different address spaces) kernel/sched/topology.c:759:30: expected void [noderef] <asn:3>*__pdata kernel/sched/topology.c:759:30: got struct sched_domain **[noderef] sd kernel/sched/topology.c:776:15: warning: incorrect type in assignment (different address spaces) kernel/sched/topology.c:776:15: expected struct sched_domain **[noderef] sd kernel/sched/topology.c:776:15: got struct sched_domain *[noderef] <asn:3>*<noident> kernel/sched/topology.c:794:9: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:794:9: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:794:9: got struct sched_domain **<noident> kernel/sched/topology.c:795:10: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:795:10: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:795:10: got struct sched_domain **<noident> kernel/sched/topology.c:797:28: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:797:28: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:797:28: got struct sched_domain_shared **<noident> kernel/sched/topology.c:798:18: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:798:18: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:798:18: got struct sched_domain_shared **<noident> kernel/sched/topology.c:800:28: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:800:28: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:800:28: got struct sched_group **<noident> kernel/sched/topology.c:801:18: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:801:18: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:801:18: got struct sched_group **<noident> kernel/sched/topology.c:803:28: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:803:28: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:803:28: got struct sched_group_capacity **<noident> kernel/sched/topology.c:804:18: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:804:18: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:804:18: got struct sched_group_capacity **<noident> kernel/sched/topology.c:848:36: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:848:36: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:848:36: got struct sched_domain **<noident> kernel/sched/topology.c:954:31: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:954:31: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:954:31: got struct sched_domain_shared **<noident> kernel/sched/topology.c:1354:21: warning: symbol 'build_sched_domain' was not declared. Should it be static? kernel/sched/topology.c:1409:34: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:1409:34: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:1409:34: got struct sched_domain **<noident> kernel/sched/topology.c:1419:28: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:1419:28: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:1419:28: got struct sched_domain **<noident> kernel/sched/topology.c:1436:28: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:1436:28: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:1436:28: got struct sched_domain **<noident> kernel/sched/topology.c:1446:23: warning: incorrect type in initializer (different address spaces) kernel/sched/topology.c:1446:23: expected void const [noderef] <asn:3>*__vpp_verify kernel/sched/topology.c:1446:23: got struct sched_domain **<noident> kernel/sched/topology.c:759:29: warning: dereference of noderef expression kernel/sched/topology.c:777:14: warning: dereference of noderef expression kernel/sched/topology.c:1262:22: warning: dereference of noderef expression kernel/sched/topology.c:1266:22: warning: dereference of noderef expression kernel/sched/topology.c:1270:22: warning: dereference of noderef expression kernel/sched/topology.c:1274:22: warning: dereference of noderef expression kernel/sched/topology.c:1329:29: warning: dereference of noderef expression kernel/sched/topology.c:1336:29: warning: dereference of noderef expression kernel/sched/topology.c:1338:29: warning: dereference of noderef expression kernel/sched/topology.c:1340:29: warning: dereference of noderef expression kernel/sched/topology.c:1343:29: warning: dereference of noderef expression kernel/sched/topology.c:1344:17: warning: dereference of noderef expression kernel/sched/topology.c:1345:29: warning: dereference of noderef expression kernel/sched/topology.c:1346:17: warning: dereference of noderef expression kernel/sched/topology.c:1347:29: warning: dereference of noderef expression kernel/sched/topology.c:1348:17: warning: dereference of noderef expression kernel/sched/topology.c:1349:29: warning: dereference of noderef expression kernel/sched/topology.c:1350:17: warning: dereference of noderef expression ... it's just not usable in that form for a regular maintenance flow. So what would be more useful is to add a specific Sparse check that only checks KERNEL_DS, to add it as a regular (.config driven) build option and make sure the kernel build has zero warnings. From that point on we can declare that this kind of bug won't occur anymore, if the Sparse implementation of the check is correct. But there's a (big) problem with that development model: Sparse is not part of the kernel tree and adding a feature to it while making the kernel depend on that brand new feature is a logistical nightmare. The overhead is quite similar to adding new features to a compiler - it happens at a glacial pace and is only done for major features really, at considerable expense. I don't think this is an adequate model for 'extended syntax checking' of the kernel, especially when it comes to correctness that has such obvious security impact. Thanks, Ingo
On Mon, May 08, 2017 at 09:33:52AM +0200, Ingo Molnar wrote: > > (added more Cc:s) > > * Thomas Garnier <thgarnie@google.com> wrote: > > > On Fri, Apr 28, 2017 at 8:32 AM, Thomas Garnier <thgarnie@google.com> wrote: > > > Ensure that a syscall does not return to user-mode with a kernel address > > > limit. If that happens, a process can corrupt kernel-mode memory and > > > elevate privileges [1]. > > > > > > The CONFIG_ADDR_LIMIT_CHECK option disables the generic check so each > > > architecture can create optimized versions. This option is enabled by > > > default on s390 because a similar feature already exists. > > > > > > [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990 > > > > > > Signed-off-by: Thomas Garnier <thgarnie@google.com> > > > Tested-by: Kees Cook <keescook@chromium.org> > > > > Ingo: Do you want to take the set? > > Yeah, so now I'm questioning the whole premise of the feature, sorry :-/ > > A big disavantage is that the "security check" will add 2-5 instructions to the > system call fast path. Every one of them, and essentially forever. Just to handle > a CVE that was caused by a buggy touch-screen driver helper function leaking > KERNEL_DS and which was fixed long ago ... > > And yes, I realize that there were other such bugs and that such bugs might occur > in the future - but why not push the overhead of the security check to the kernel > build phase? I.e. I'm wondering how well we could do static analysis during kernel > build - would a limited mode of Sparse be good enough for that? Or we could add a > new static checker to tools/, built from first principles and used primarily for > extended syntactical checking. > > For example I'd consider it a good practice to mandate that if a kernel function > sets KERNEL_DS then it must restore it as well. Any function that does not do > that, or is too complex for the static analysis to prove correctness for sure > should be considered buggy! > > Are there any common kernel APIs outside set_fs() that set KERNEL_DS > intentionally? The overwhelming pattern ought to be: > > orig_fs = get_fs(); > set_fs(KERNEL_DS); > ... > set_fs(orig_fs); > > ... and even a relatively simple static analysis tool ought to be able to see > through that. > > I'd even suggest we do it not like Sparse builds are done today, but in a more > integrated fashion: do static analysis as part of a typical kernel defconfig build > and not tolerate warnings but go for a 'zero warnings' policy like Linus uses for > modconfig builds. > > _That_ solution I'd feel very, very good about - it would be so much better than > any runtime checks... > > Not to mention that such an integrated static analysis facility would allow many > other things to be checked during build time, which we couldn't possibly check > runtime. What about a simple coccinelle script to test for this type of thing? We write it once, add it to the in-kernel body of tests, and then 0-day runs it on all trees all the time. That should catch this type of issue, like all of the other "bad programming bus" that the tool currently catches. thanks, greg k-h
On Mon, May 8, 2017 at 12:33 AM, Ingo Molnar <mingo@kernel.org> wrote: > > (added more Cc:s) > > * Thomas Garnier <thgarnie@google.com> wrote: > >> On Fri, Apr 28, 2017 at 8:32 AM, Thomas Garnier <thgarnie@google.com> wrote: >> > Ensure that a syscall does not return to user-mode with a kernel address >> > limit. If that happens, a process can corrupt kernel-mode memory and >> > elevate privileges [1]. >> > >> > The CONFIG_ADDR_LIMIT_CHECK option disables the generic check so each >> > architecture can create optimized versions. This option is enabled by >> > default on s390 because a similar feature already exists. >> > >> > [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990 >> > >> > Signed-off-by: Thomas Garnier <thgarnie@google.com> >> > Tested-by: Kees Cook <keescook@chromium.org> >> >> Ingo: Do you want to take the set? > > Yeah, so now I'm questioning the whole premise of the feature, sorry :-/ > > A big disavantage is that the "security check" will add 2-5 instructions to the > system call fast path. Every one of them, and essentially forever. Just to handle > a CVE that was caused by a buggy touch-screen driver helper function leaking > KERNEL_DS and which was fixed long ago ... > > And yes, I realize that there were other such bugs and that such bugs might occur > in the future - but why not push the overhead of the security check to the kernel > build phase? I.e. I'm wondering how well we could do static analysis during kernel > build - would a limited mode of Sparse be good enough for that? Or we could add a > new static checker to tools/, built from first principles and used primarily for > extended syntactical checking. Static analysis is just not going to cover all cases. We've had vulnerabilities where interrupt handlers left KERNEL_DS set, for example. If there are performance concerns, let's put this behind a CONFIG. 2-5 instructions is not an issue for most people that want this coverage. > For example I'd consider it a good practice to mandate that if a kernel function > sets KERNEL_DS then it must restore it as well. Any function that does not do > that, or is too complex for the static analysis to prove correctness for sure > should be considered buggy! > > Are there any common kernel APIs outside set_fs() that set KERNEL_DS > intentionally? The overwhelming pattern ought to be: > > orig_fs = get_fs(); > set_fs(KERNEL_DS); > ... > set_fs(orig_fs); > > ... and even a relatively simple static analysis tool ought to be able to see > through that. This pattern was, in fact, what the interrupt handler bug escaped from. We have to build proactive defenses, and this check has a clear defensive advantage. It's a noble goal to improve the static analyzers and simplify the source, but we have too much history to prove that this just isn't enough. This instruction cost of this is extremely small, too. Until we can eliminate set_fs(), we need to add this check. > I'd even suggest we do it not like Sparse builds are done today, but in a more > integrated fashion: do static analysis as part of a typical kernel defconfig build > and not tolerate warnings but go for a 'zero warnings' policy like Linus uses for > modconfig builds. > > _That_ solution I'd feel very, very good about - it would be so much better than > any runtime checks... I'm not opposed to this, but there will be push-back on "making the build slower", and it still won't catch everything. Bug-finding is different from making a bug class just unexploitable at all. As we've done before, it's the difference between trying to find format string attacks vs just removing %n from the format parser. > Not to mention that such an integrated static analysis facility would allow many > other things to be checked during build time, which we couldn't possibly check > runtime. Absolutely! But it's orthogonal to proactive runtime exploit blocking. We've got one that works and defends against an entire class of vulnerability for very low cost. It it's truly too costly for default, let's put it behind a CONFIG and see who wants it. (Most distros, I suspect, will enable it, just like hardened usercopy which is much more expensive than this.) -Kees
* Kees Cook <keescook@chromium.org> wrote: > > And yes, I realize that there were other such bugs and that such bugs might > > occur in the future - but why not push the overhead of the security check to > > the kernel build phase? I.e. I'm wondering how well we could do static > > analysis during kernel build - would a limited mode of Sparse be good enough > > for that? Or we could add a new static checker to tools/, built from first > > principles and used primarily for extended syntactical checking. > > Static analysis is just not going to cover all cases. We've had vulnerabilities > where interrupt handlers left KERNEL_DS set, for example. [...] Got any commit ID of that bug - was it because a function executed by the interrupt handler leaked KERNEL_DS? > [...] If there are performance concerns, let's put this behind a CONFIG. 2-5 > instructions is not an issue for most people that want this coverage. That doesn't really _solve_ the performance concerns, it just forces most people to enable it by creating a 'security or performance' false dichotomy ... > [...] and it still won't catch everything. Bug-finding is different from making > a bug class just unexploitable at all. As we've done before, it's the difference > between trying to find format string attacks vs just removing %n from the format > parser. No, it does not make it unexploitable, it could still be exploitable if the runtime check is buggy or if there's kernel execution outside of the regular system call paths - there's plenty of such hardware functionality on x86 for example. Thanks, Ingo
On Mon, May 8, 2017 at 4:02 PM, Ingo Molnar <mingo@kernel.org> wrote: > > * Kees Cook <keescook@chromium.org> wrote: > >> > And yes, I realize that there were other such bugs and that such bugs might >> > occur in the future - but why not push the overhead of the security check to >> > the kernel build phase? I.e. I'm wondering how well we could do static >> > analysis during kernel build - would a limited mode of Sparse be good enough >> > for that? Or we could add a new static checker to tools/, built from first >> > principles and used primarily for extended syntactical checking. >> >> Static analysis is just not going to cover all cases. We've had vulnerabilities >> where interrupt handlers left KERNEL_DS set, for example. [...] > > Got any commit ID of that bug - was it because a function executed by the > interrupt handler leaked KERNEL_DS? I think Kees might be talking about https://bugs.chromium.org/p/project-zero/issues/detail?id=822, fixed in commit e6978e4bf181fb3b5f8cb6f71b4fe30fbf1b655c. The issue was that perf code that can run in pretty much any context called access_ok().
On Mon, 2017-05-08 at 09:52 +0200, Ingo Molnar wrote: > > ... it's just not usable in that form for a regular maintenance flow. > > So what would be more useful is to add a specific Sparse check that > only checks > KERNEL_DS, to add it as a regular (.config driven) build option and > make sure the > kernel build has zero warnings. > > From that point on we can declare that this kind of bug won't occur > anymore, if > the Sparse implementation of the check is correct. > > But there's a (big) problem with that development model: Sparse is not > part of the > kernel tree and adding a feature to it while making the kernel depend > on that > brand new feature is a logistical nightmare. The overhead is quite > similar to > adding new features to a compiler - it happens at a glacial pace and > is only done > for major features really, at considerable expense. I don't think this > is an > adequate model for 'extended syntax checking' of the kernel, > especially when it > comes to correctness that has such obvious security impact. > > Thanks, > > Ingo There's the option of using GCC plugins now that the infrastructure was upstreamed from grsecurity. It can be used as part of the regular build process and as long as the analysis is pretty simple it shouldn't hurt compile time much. The problem with doing that is I don't think there are people with much experience with GCC contributing upstream and it's going to be more work to develop/maintain than some kind of specialized DSL for analysis. I think a few static analysis plugins are used as part of maintaining grsecurity for solving issues like finding false positives for the REFCOUNT overflow checking feature, so it's something that's already being done in practice elsewhere.
On Mon, May 8, 2017 at 7:02 AM, Ingo Molnar <mingo@kernel.org> wrote: > > * Kees Cook <keescook@chromium.org> wrote: > >> > And yes, I realize that there were other such bugs and that such bugs might >> > occur in the future - but why not push the overhead of the security check to >> > the kernel build phase? I.e. I'm wondering how well we could do static >> > analysis during kernel build - would a limited mode of Sparse be good enough >> > for that? Or we could add a new static checker to tools/, built from first >> > principles and used primarily for extended syntactical checking. >> >> Static analysis is just not going to cover all cases. We've had vulnerabilities >> where interrupt handlers left KERNEL_DS set, for example. [...] > > Got any commit ID of that bug - was it because a function executed by the > interrupt handler leaked KERNEL_DS? Ah, it was an exception handler, but the one I was thinking of was this: https://lwn.net/Articles/419141/ >> [...] If there are performance concerns, let's put this behind a CONFIG. 2-5 >> instructions is not an issue for most people that want this coverage. > > That doesn't really _solve_ the performance concerns, it just forces most people > to enable it by creating a 'security or performance' false dichotomy ... That's fair, but what I'm trying to say is that many people will want this, so rejecting it because it's 2 more instructions seems unreasonable. We have had much more invasive changes added to the kernel. >> [...] and it still won't catch everything. Bug-finding is different from making >> a bug class just unexploitable at all. As we've done before, it's the difference >> between trying to find format string attacks vs just removing %n from the format >> parser. > > No, it does not make it unexploitable, it could still be exploitable if the > runtime check is buggy or if there's kernel execution outside of the regular > system call paths - there's plenty of such hardware functionality on x86 for > example. Fine, but this is splitting hairs. This does protect a specific situation, and it does so very cheaply. The real fix would be to remove set_fs() entirely. :P -Kees
On Mon, May 8, 2017 at 8:22 AM, Daniel Micay <danielmicay@gmail.com> wrote: > On Mon, 2017-05-08 at 09:52 +0200, Ingo Molnar wrote: >> >> ... it's just not usable in that form for a regular maintenance flow. >> >> So what would be more useful is to add a specific Sparse check that >> only checks >> KERNEL_DS, to add it as a regular (.config driven) build option and >> make sure the >> kernel build has zero warnings. >> >> From that point on we can declare that this kind of bug won't occur >> anymore, if >> the Sparse implementation of the check is correct. >> >> But there's a (big) problem with that development model: Sparse is not >> part of the >> kernel tree and adding a feature to it while making the kernel depend >> on that >> brand new feature is a logistical nightmare. The overhead is quite >> similar to >> adding new features to a compiler - it happens at a glacial pace and >> is only done >> for major features really, at considerable expense. I don't think this >> is an >> adequate model for 'extended syntax checking' of the kernel, >> especially when it >> comes to correctness that has such obvious security impact. >> >> Thanks, >> >> Ingo > > There's the option of using GCC plugins now that the infrastructure was > upstreamed from grsecurity. It can be used as part of the regular build > process and as long as the analysis is pretty simple it shouldn't hurt > compile time much. Well, and that the situation may arise due to memory corruption, not from poorly-matched set_fs() calls, which static analysis won't help solve. We need to catch this bad kernel state because it is a very bad state to run in. -Kees
On Mon, May 8, 2017 at 8:26 AM, Kees Cook <keescook@chromium.org> wrote: > On Mon, May 8, 2017 at 8:22 AM, Daniel Micay <danielmicay@gmail.com> wrote: >> On Mon, 2017-05-08 at 09:52 +0200, Ingo Molnar wrote: >>> >>> ... it's just not usable in that form for a regular maintenance flow. >>> >>> So what would be more useful is to add a specific Sparse check that >>> only checks >>> KERNEL_DS, to add it as a regular (.config driven) build option and >>> make sure the >>> kernel build has zero warnings. >>> >>> From that point on we can declare that this kind of bug won't occur >>> anymore, if >>> the Sparse implementation of the check is correct. >>> >>> But there's a (big) problem with that development model: Sparse is not >>> part of the >>> kernel tree and adding a feature to it while making the kernel depend >>> on that >>> brand new feature is a logistical nightmare. The overhead is quite >>> similar to >>> adding new features to a compiler - it happens at a glacial pace and >>> is only done >>> for major features really, at considerable expense. I don't think this >>> is an >>> adequate model for 'extended syntax checking' of the kernel, >>> especially when it >>> comes to correctness that has such obvious security impact. >>> >>> Thanks, >>> >>> Ingo >> >> There's the option of using GCC plugins now that the infrastructure was >> upstreamed from grsecurity. It can be used as part of the regular build >> process and as long as the analysis is pretty simple it shouldn't hurt >> compile time much. > > Well, and that the situation may arise due to memory corruption, not > from poorly-matched set_fs() calls, which static analysis won't help > solve. We need to catch this bad kernel state because it is a very bad > state to run in. Of course, I agree with Kees points on this and previous emails. A static analysis solution is hard to scale across functions and build time can suffer. I don't think the coverage will be good enough to consider this change and static analysis as similar. > > -Kees > > -- > Kees Cook > Pixel Security
On Mon, May 08, 2017 at 04:06:35PM +0200, Jann Horn wrote: > I think Kees might be talking about > https://bugs.chromium.org/p/project-zero/issues/detail?id=822, fixed in > commit e6978e4bf181fb3b5f8cb6f71b4fe30fbf1b655c. The issue was that > perf code that can run in pretty much any context called access_ok(). And that commit has *NOT* solved the problem. perf_callchain_user() can be called synchronously, without passing through that code. Tracepoint shite... That set_fs() should be done in get_perf_callchain(), just around the call of perf_callchain_user(). Along with pagefault_disable(), actually. BTW, that's a nice example demonstrating why doing that on the kernel boundary is wrong. Wider (in theory) area being "protected" => easier to miss the ways not crossing its border.
* Kees Cook <keescook@chromium.org> wrote: > On Mon, May 8, 2017 at 7:02 AM, Ingo Molnar <mingo@kernel.org> wrote: > > > > * Kees Cook <keescook@chromium.org> wrote: > > > >> > And yes, I realize that there were other such bugs and that such bugs might > >> > occur in the future - but why not push the overhead of the security check to > >> > the kernel build phase? I.e. I'm wondering how well we could do static > >> > analysis during kernel build - would a limited mode of Sparse be good enough > >> > for that? Or we could add a new static checker to tools/, built from first > >> > principles and used primarily for extended syntactical checking. > >> > >> Static analysis is just not going to cover all cases. We've had vulnerabilities > >> where interrupt handlers left KERNEL_DS set, for example. [...] > > > > Got any commit ID of that bug - was it because a function executed by the > > interrupt handler leaked KERNEL_DS? > > Ah, it was an exception handler, but the one I was thinking of was this: > https://lwn.net/Articles/419141/ Ok, so that's CVE-2010-4258, where an oops with KERNEL_DS set was used to escalate privileges, due to the kernel's oops handler not cleaning up the KERNEL_DS. The exploit used another bug, a crash in a network protocol handler, to execute the oops handler with KERNEL_DS set. The explanation of the exploit itself points out that it's a very interesting bug and I agree, it's not a general kernel bug but a bug in a very narrow code path (oops handling) that caused this, and I don't see how that example can be turned into a general example: it was a bug in oops handling to let the process continue execution (and perform the CLEARTID operation) *and* leak the address limit at KERNEL_DS. By similar argument a bug in the runtime checking of the address limit may allow exploits. Consider the oops path cleanup a similarly sensitive code path as the address limit check. To handle this category of exploits it would be enough to add a runtime check to the _oops handling code itself_ (to make sure we've set addr_limit back to USER_DS even if we crash in a KERNEL_DS code area), not to every system call! That check would avoid that particular historic pattern, if combined with static analysis that ensured that KERNEL_DS is always set/restored correctly. (Which btw. I believe some of the regular static scans of the kernel are already doing today.) Furthermore, to go back to your original argument: > Static analysis is just not going to cover all cases. it's not even true that a runtime check will 'cover all cases': for example a similar bug to CVE-2010-4258 could still be exploited: - Note that the actual put_user() was not prevented via the runtime check - the runtime check would run *after* the buggy put_user() was done. The runtime check warns or panics after the fact, which might (or might not) be enough to prevent the exploit. - Also note that a slightly different form of the bug would still be exploitable, even with the runtime check: for example if the task-shutdown code can be made to unconditionally set KERNEL_DS, but after the put_user(), then the runtime check would not 'cover all cases'. So the argument for doing this runtime check after every system call is very dubious. Thanks, Ingo
* Greg KH <greg@kroah.com> wrote: > What about a simple coccinelle script to test for this type of thing? > We write it once, add it to the in-kernel body of tests, and then 0-day > runs it on all trees all the time. That should catch this type of > issue, like all of the other "bad programming bus" that the tool > currently catches. Yeah, that would work - but today most of our coccinelle scripts are still pretty verbose, and I think it's important to make this a different category of coccinelle script, which is .config driven where a loud warning yells at us. I.e. force the 'zero warnings tolerated' model. I also noticed that Coccinelle builds are pretty slow, so it would still make sense to have a performance oriented static checking facility that does not have the performance baggage of high level functional languages. I.e. either integrate it into Sparse - or start a kernel integrated static analysis tooling project that would only follow control flow initially - which is what we need here I believe. We only have ~115 code blocks in the kernel that set/restore KERNEL_DS, it would be a pity to add a runtime check to every system call ... We could also add a runtime check to oops handling to make sure we don't leak KERNEL_DS through kernel crashes, to ease worries about CVE-2010-4258. Thanks, Ingo
* Kees Cook <keescook@chromium.org> wrote: > > There's the option of using GCC plugins now that the infrastructure was > > upstreamed from grsecurity. It can be used as part of the regular build > > process and as long as the analysis is pretty simple it shouldn't hurt compile > > time much. > > Well, and that the situation may arise due to memory corruption, not from > poorly-matched set_fs() calls, which static analysis won't help solve. We need > to catch this bad kernel state because it is a very bad state to run in. If memory corruption corrupted the task state into having addr_limit set to KERNEL_DS then there's already a fair chance that it's game over: it could also have set *uid to 0, or changed a sensitive PF_ flag, or a number of other things... Furthermore, think about it: there's literally an infinite amount of corrupted task states that could be a security problem and that could be checked after every system call. Do we want to check every one of them? Thanks, Ingo
On Tue, May 09, 2017 at 08:45:22AM +0200, Ingo Molnar wrote: > We only have ~115 code blocks in the kernel that set/restore KERNEL_DS, it would > be a pity to add a runtime check to every system call ... I think we should simply strive to remove all of them that aren't in core scheduler / arch code. Basically evetyytime we do the oldfs = get_fs(); set_fs(KERNEL_DS); .. set_fs(oldfs); trick we're doing something wrong, and there should always be better ways to archive it. E.g. using iov_iter with a ITER_KVEC type consistently would already remove most of them.
On Tue, May 09, 2017 at 08:56:19AM +0200, Ingo Molnar wrote: > > * Kees Cook <keescook@chromium.org> wrote: > > > > There's the option of using GCC plugins now that the infrastructure was > > > upstreamed from grsecurity. It can be used as part of the regular build > > > process and as long as the analysis is pretty simple it shouldn't hurt compile > > > time much. > > > > Well, and that the situation may arise due to memory corruption, not from > > poorly-matched set_fs() calls, which static analysis won't help solve. We need > > to catch this bad kernel state because it is a very bad state to run in. > > If memory corruption corrupted the task state into having addr_limit set to > KERNEL_DS then there's already a fair chance that it's game over: it could also > have set *uid to 0, or changed a sensitive PF_ flag, or a number of other > things... > > Furthermore, think about it: there's literally an infinite amount of corrupted > task states that could be a security problem and that could be checked after every > system call. Do we want to check every one of them? Ok, I'm all for not checking lots of stuff all the time, just to protect from crappy drivers that. Especially as we _can_ audit and run checks on the source code for them in the kernel tree. But, and here's the problem, outside of the desktop/enterprise world, there are a ton of out-of-tree code that is crap. The number of security/bug fixes and kernel crashes for out-of-tree code in systems like Android phones is just so high it's laughable. When you have a device that is running 3.2 million lines of kernel code, yet the diffstat of the tree compared to mainline adds 3 million lines of code, there is bound to be a ton of issues/problems there. So this is an entirely different thing we need to try to protect ourselves from. A long time ago I laughed when I saw that Microsoft had to do lots of "hardening" of their kernel to protect themselves from crappy drivers, as I knew we didn't have to do that because we had the source for them and could fix the root issues. But that has changed and now we don't all have that option. That code is out-of-tree because the vendor doesn't care, and doesn't want to take any time at all to do anything resembling a real code review[1]. So, how about options like the ones being proposed here, go behind a new config option: CONFIG_PROTECT_FROM_CRAPPY_DRIVERS that device owners can enable if they do not trust their vendor-provided code (hint, I sure don't.) That way the "normal" path that all of us are used to running will be fine, but if you want to take the speed hit to try to protect yourself, then you can do that as well. Anyway, just an idea... thanks, greg k-h [1] I am working really hard with lots of vendors to try to fix their broken development model, but that is going to take years to resolve as their device pipelines are years long, and changing their mindsets takes a long time...
On Tue, May 9, 2017 at 1:56 AM, Christoph Hellwig <hch@infradead.org> wrote: > On Tue, May 09, 2017 at 08:45:22AM +0200, Ingo Molnar wrote: >> We only have ~115 code blocks in the kernel that set/restore KERNEL_DS, it would >> be a pity to add a runtime check to every system call ... > > I think we should simply strive to remove all of them that aren't > in core scheduler / arch code. Basically evetyytime we do the > > oldfs = get_fs(); > set_fs(KERNEL_DS); > .. > set_fs(oldfs); > > trick we're doing something wrong, and there should always be better > ways to archive it. E.g. using iov_iter with a ITER_KVEC type > consistently would already remove most of them. How about trying to remove all of them? If we could actually get rid of all of them, we could drop the arch support, and we'd get faster, simpler, shorter uaccess code throughout the kernel. The ones in kernel/compat.c are generally garbage. They should be using compat_alloc_user_space(). Ditto for kernel/power/user.c. flush_module_icache() is a potentially silly arch thing. Does the code in kernel/module.c that uses set_fs() actually work? kernel/signal.c's set_fs() is laziness. __probe_kernel_read() and __probe_kernel_write() use set_fs(), but that usage only matters on sane arches* like s390x. We should arguably have a set_uaccess_address_space() or similar for this purpose that's a nop on normal arches like x86. fs/splice.c has some, ahem, interesting uses that have been the source of nasty exploits in the past. Converting them to use iov_iter properly would be really, really nice. Christoph, I don't suppose you'd like to do that? The others seem to mostly be fixable, but I haven't looked that closely. Overall, I suspect that a big part of why mitigations like the one being discussed in this thread were developed is because addr_limit used to be on the stack, making it (along with restart_block) a really nice target. This is fixed now on x86, arm64, and s390x, I believe, and other arches can easily opt in to the fix. * I'm strongly in favor of arches that have totally separate user and kernel address spaces. Sadly, the most common arches don't do this.
On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote: > fs/splice.c has some, ahem, interesting uses that have been the source > of nasty exploits in the past. Converting them to use iov_iter > properly would be really, really nice. Christoph, I don't suppose > you'd like to do that? I can take care of all the fs code including this one.
On Tue, May 9, 2017 at 4:10 AM, Greg KH <greg@kroah.com> wrote: > On Tue, May 09, 2017 at 08:56:19AM +0200, Ingo Molnar wrote: >> >> * Kees Cook <keescook@chromium.org> wrote: >> >> > > There's the option of using GCC plugins now that the infrastructure was >> > > upstreamed from grsecurity. It can be used as part of the regular build >> > > process and as long as the analysis is pretty simple it shouldn't hurt compile >> > > time much. >> > >> > Well, and that the situation may arise due to memory corruption, not from >> > poorly-matched set_fs() calls, which static analysis won't help solve. We need >> > to catch this bad kernel state because it is a very bad state to run in. >> >> If memory corruption corrupted the task state into having addr_limit set to >> KERNEL_DS then there's already a fair chance that it's game over: it could also >> have set *uid to 0, or changed a sensitive PF_ flag, or a number of other >> things... >> >> Furthermore, think about it: there's literally an infinite amount of corrupted >> task states that could be a security problem and that could be checked after every >> system call. Do we want to check every one of them? > > Ok, I'm all for not checking lots of stuff all the time, just to protect > from crappy drivers that. Especially as we _can_ audit and run checks > on the source code for them in the kernel tree. > > But, and here's the problem, outside of the desktop/enterprise world, > there are a ton of out-of-tree code that is crap. The number of > security/bug fixes and kernel crashes for out-of-tree code in systems > like Android phones is just so high it's laughable. > > When you have a device that is running 3.2 million lines of kernel code, > yet the diffstat of the tree compared to mainline adds 3 million lines > of code, there is bound to be a ton of issues/problems there. > > So this is an entirely different thing we need to try to protect > ourselves from. A long time ago I laughed when I saw that Microsoft had > to do lots of "hardening" of their kernel to protect themselves from > crappy drivers, as I knew we didn't have to do that because we had the > source for them and could fix the root issues. But that has changed and > now we don't all have that option. That code is out-of-tree because the > vendor doesn't care, and doesn't want to take any time at all to do > anything resembling a real code review[1]. That's a big part of why I thought would be useful. I am less worried about edge cases upstream right now than forks with custom codes not using set_fs correctly. > > So, how about options like the ones being proposed here, go behind a new > config option: > CONFIG_PROTECT_FROM_CRAPPY_DRIVERS > that device owners can enable if they do not trust their vendor-provided > code (hint, I sure don't.) That way the "normal" path that all of us > are used to running will be fine, but if you want to take the speed hit > to try to protect yourself, then you can do that as well. Maybe another name but why not. > > Anyway, just an idea... > > thanks, > > greg k-h > > [1] I am working really hard with lots of vendors to try to fix their > broken development model, but that is going to take years to resolve > as their device pipelines are years long, and changing their > mindsets takes a long time...
On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote: > On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote: > > fs/splice.c has some, ahem, interesting uses that have been the source > > of nasty exploits in the past. Converting them to use iov_iter > > properly would be really, really nice. Christoph, I don't suppose > > you'd like to do that? > > I can take care of all the fs code including this one. I spent the afternoon hacking up where I'd like this to head. It's completely untested as of now: http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination
On Tue, May 9, 2017 at 9:00 AM, Andy Lutomirski <luto@kernel.org> wrote: > On Tue, May 9, 2017 at 1:56 AM, Christoph Hellwig <hch@infradead.org> wrote: >> On Tue, May 09, 2017 at 08:45:22AM +0200, Ingo Molnar wrote: >>> We only have ~115 code blocks in the kernel that set/restore KERNEL_DS, it would >>> be a pity to add a runtime check to every system call ... >> >> I think we should simply strive to remove all of them that aren't >> in core scheduler / arch code. Basically evetyytime we do the >> >> oldfs = get_fs(); >> set_fs(KERNEL_DS); >> .. >> set_fs(oldfs); >> >> trick we're doing something wrong, and there should always be better >> ways to archive it. E.g. using iov_iter with a ITER_KVEC type >> consistently would already remove most of them. > > How about trying to remove all of them? If we could actually get rid > of all of them, we could drop the arch support, and we'd get faster, > simpler, shorter uaccess code throughout the kernel. > > The ones in kernel/compat.c are generally garbage. They should be > using compat_alloc_user_space(). Ditto for kernel/power/user.c. compat_alloc_user_space() is a hack that should go away too. It ends up copying the data three times. The more efficient solution to this is to have a core syscall function that only accesses kernel memory, and then have two front-end functions (native and compat) that do the actual reads and writes to userspace, with conversion in the compat case. -- Brian Gerst
On Mon, May 8, 2017 at 11:56 PM, Ingo Molnar <mingo@kernel.org> wrote: > > * Kees Cook <keescook@chromium.org> wrote: > >> > There's the option of using GCC plugins now that the infrastructure was >> > upstreamed from grsecurity. It can be used as part of the regular build >> > process and as long as the analysis is pretty simple it shouldn't hurt compile >> > time much. >> >> Well, and that the situation may arise due to memory corruption, not from >> poorly-matched set_fs() calls, which static analysis won't help solve. We need >> to catch this bad kernel state because it is a very bad state to run in. [attempting some thread-merging] > Ok, so that's CVE-2010-4258, where an oops with KERNEL_DS set was used to escalate > privileges, due to the kernel's oops handler not cleaning up the KERNEL_DS. The > exploit used another bug, a crash in a network protocol handler, to execute the > oops handler with KERNEL_DS set. Right, I didn't mean to suggest that vulnerability would be fixed by this solution. I was trying to show how there can be some pretty complex interaction with exceptions/interrupts/etc that would make pure static analysis still miss things. > If memory corruption corrupted the task state into having addr_limit set to > KERNEL_DS then there's already a fair chance that it's game over: it could also > have set *uid to 0, or changed a sensitive PF_ flag, or a number of other > things... > > Furthermore, think about it: there's literally an infinite amount of corrupted > task states that could be a security problem and that could be checked after every > system call. Do we want to check every one of them? Right, but this "slippery slope" argument isn't the best way to reject security changes. Let me take a step back and describe the threat, and where we should likely spend time: The primary threat with addr_limit getting changed is that a narrowly-scoped attack (traditionally stack exhaustion or adjacent-stack large-index writes) could be leveraged into opening the entire kernel to writes (by allowing all syscalls with a copy_to_user() call to suddenly be able to write to kernel memory). So, really, the flaw is having addr_limit at all. Removing set_fs() should, I think, allow this to become a const (or at least should get us a lot closer). The main path to corrupting addr_limit has been via stack corruption. On architectures with CONFIG_THREAD_INFO_IN_TASK, this risk is greatly reduced already, but it's not universally available yet. (And as long as we're talking about stack attacks, CONFIG_VMAP_STACK makes cross-stack overflows go away, and cross-stack indexing harder, but that's not really about addr_limit since currently nothing with VMAP_STACK doesn't already have THREAD_INFO_IN_TASK.) So, left with a still exploitable target in memory that allows such an expansion of attack method, I still think it's worth keeping this patch series, but if we can drop set_fs() I could probably be convinced the benefit of the series doesn't exceed the cost on THREAD_INFO_IN_TASK-architectures (x86, arm64, s390). But that means at least currently keeping it on arm, for example. If we can make addr_limit const, well, we don't need the series at all. -Kees
On Tue, May 9, 2017 at 9:03 AM, Christoph Hellwig <hch@infradead.org> wrote: > On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote: >> On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote: >> > fs/splice.c has some, ahem, interesting uses that have been the source >> > of nasty exploits in the past. Converting them to use iov_iter >> > properly would be really, really nice. Christoph, I don't suppose >> > you'd like to do that? >> >> I can take care of all the fs code including this one. > > I spent the afternoon hacking up where I'd like this to head. It's > completely untested as of now: > > http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination Ooooh yes! Nice work. I love this: http://git.infradead.org/users/hch/vfs.git/commitdiff/51e83f50f824ca23f5584c172138e6b7c2ff786d but I wonder what it'll cause out-of-tree code to do. I mean, I'd rather nothing out-of-tree be calling these, but I'd hate 3rd party hacks even more. http://git.infradead.org/users/hch/vfs.git/commitdiff/018e0e9030777121fe87e89d43066691e7366587 This accidentally(?) removes the kernel-doc comments. http://git.infradead.org/users/hch/vfs.git/commitdiff/78b62c730254fc39fa888cdbdca08fde6e09a798 Could this be made defensive? (Return 0 if ret wraps, for example?) I see what the comment says, but not everyone will read that. :( http://git.infradead.org/users/hch/vfs.git/commitdiff/a106276ca0294be054bc89ce97219933fe543df1 Perhaps unconditionally set USER_DS on exit instead of retaining whatever was there? -Kees
On Tue, May 9, 2017 at 9:50 AM, Kees Cook <keescook@chromium.org> wrote: > On Tue, May 9, 2017 at 9:03 AM, Christoph Hellwig <hch@infradead.org> wrote: >> On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote: >>> On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote: >>> > fs/splice.c has some, ahem, interesting uses that have been the source >>> > of nasty exploits in the past. Converting them to use iov_iter >>> > properly would be really, really nice. Christoph, I don't suppose >>> > you'd like to do that? >>> >>> I can take care of all the fs code including this one. >> >> I spent the afternoon hacking up where I'd like this to head. It's >> completely untested as of now: >> >> http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination > > Ooooh yes! Nice work. > > I love this: > http://git.infradead.org/users/hch/vfs.git/commitdiff/51e83f50f824ca23f5584c172138e6b7c2ff786d > but I wonder what it'll cause out-of-tree code to do. I mean, I'd > rather nothing out-of-tree be calling these, but I'd hate 3rd party > hacks even more. > > http://git.infradead.org/users/hch/vfs.git/commitdiff/018e0e9030777121fe87e89d43066691e7366587 > This accidentally(?) removes the kernel-doc comments. > > http://git.infradead.org/users/hch/vfs.git/commitdiff/78b62c730254fc39fa888cdbdca08fde6e09a798 > Could this be made defensive? (Return 0 if ret wraps, for example?) I > see what the comment says, but not everyone will read that. :( > > http://git.infradead.org/users/hch/vfs.git/commitdiff/a106276ca0294be054bc89ce97219933fe543df1 > Perhaps unconditionally set USER_DS on exit instead of retaining > whatever was there? I don't like silent fixups. If we want to do this, we should BUG or at least WARN, not just change the addr limit. But I'm also not convinced it's indicative of an actual bug here. --Andy
On Tue, May 9, 2017 at 3:52 PM, Andy Lutomirski <luto@amacapital.net> wrote: > On Tue, May 9, 2017 at 9:50 AM, Kees Cook <keescook@chromium.org> wrote: >> On Tue, May 9, 2017 at 9:03 AM, Christoph Hellwig <hch@infradead.org> wrote: >>> On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote: >>>> On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote: >>>> > fs/splice.c has some, ahem, interesting uses that have been the source >>>> > of nasty exploits in the past. Converting them to use iov_iter >>>> > properly would be really, really nice. Christoph, I don't suppose >>>> > you'd like to do that? >>>> >>>> I can take care of all the fs code including this one. >>> >>> I spent the afternoon hacking up where I'd like this to head. It's >>> completely untested as of now: >>> >>> http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination >> >> Ooooh yes! Nice work. >> >> I love this: >> http://git.infradead.org/users/hch/vfs.git/commitdiff/51e83f50f824ca23f5584c172138e6b7c2ff786d >> but I wonder what it'll cause out-of-tree code to do. I mean, I'd >> rather nothing out-of-tree be calling these, but I'd hate 3rd party >> hacks even more. >> >> http://git.infradead.org/users/hch/vfs.git/commitdiff/018e0e9030777121fe87e89d43066691e7366587 >> This accidentally(?) removes the kernel-doc comments. >> >> http://git.infradead.org/users/hch/vfs.git/commitdiff/78b62c730254fc39fa888cdbdca08fde6e09a798 >> Could this be made defensive? (Return 0 if ret wraps, for example?) I >> see what the comment says, but not everyone will read that. :( >> >> http://git.infradead.org/users/hch/vfs.git/commitdiff/a106276ca0294be054bc89ce97219933fe543df1 >> Perhaps unconditionally set USER_DS on exit instead of retaining >> whatever was there? > > I don't like silent fixups. If we want to do this, we should BUG or > at least WARN, not just change the addr limit. But I'm also not > convinced it's indicative of an actual bug here. Nothing should enter that function with KERNEL_DS set, right? BUG_ON(get_fs() != USER_DS); set_fs(KERNEL_DS); ... set_fs(USER_DS); -Kees
On Tue, May 9, 2017 at 4:31 PM, Kees Cook <keescook@chromium.org> wrote: > On Tue, May 9, 2017 at 3:52 PM, Andy Lutomirski <luto@amacapital.net> wrote: >> On Tue, May 9, 2017 at 9:50 AM, Kees Cook <keescook@chromium.org> wrote: >>> On Tue, May 9, 2017 at 9:03 AM, Christoph Hellwig <hch@infradead.org> wrote: >>>> On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote: >>>>> On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote: >>>>> > fs/splice.c has some, ahem, interesting uses that have been the source >>>>> > of nasty exploits in the past. Converting them to use iov_iter >>>>> > properly would be really, really nice. Christoph, I don't suppose >>>>> > you'd like to do that? >>>>> >>>>> I can take care of all the fs code including this one. >>>> >>>> I spent the afternoon hacking up where I'd like this to head. It's >>>> completely untested as of now: >>>> >>>> http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination >>> >>> Ooooh yes! Nice work. >>> >>> I love this: >>> http://git.infradead.org/users/hch/vfs.git/commitdiff/51e83f50f824ca23f5584c172138e6b7c2ff786d >>> but I wonder what it'll cause out-of-tree code to do. I mean, I'd >>> rather nothing out-of-tree be calling these, but I'd hate 3rd party >>> hacks even more. >>> >>> http://git.infradead.org/users/hch/vfs.git/commitdiff/018e0e9030777121fe87e89d43066691e7366587 >>> This accidentally(?) removes the kernel-doc comments. >>> >>> http://git.infradead.org/users/hch/vfs.git/commitdiff/78b62c730254fc39fa888cdbdca08fde6e09a798 >>> Could this be made defensive? (Return 0 if ret wraps, for example?) I >>> see what the comment says, but not everyone will read that. :( >>> >>> http://git.infradead.org/users/hch/vfs.git/commitdiff/a106276ca0294be054bc89ce97219933fe543df1 >>> Perhaps unconditionally set USER_DS on exit instead of retaining >>> whatever was there? >> >> I don't like silent fixups. If we want to do this, we should BUG or >> at least WARN, not just change the addr limit. But I'm also not >> convinced it's indicative of an actual bug here. > > Nothing should enter that function with KERNEL_DS set, right? > > BUG_ON(get_fs() != USER_DS); > set_fs(KERNEL_DS); > ... > set_fs(USER_DS); > It's not immediately obvious to me that this shouldn't happen. Why not do it the way Christoph did and then, once the rest of set_fs() is tamed, consider this change (or trying to mass-convert file_operations implementations to get rid of it entirely)? --Andy
On Tue, May 09, 2017 at 09:03:22AM -0700, Christoph Hellwig wrote: > On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote: > > On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote: > > > fs/splice.c has some, ahem, interesting uses that have been the source > > > of nasty exploits in the past. Converting them to use iov_iter > > > properly would be really, really nice. Christoph, I don't suppose > > > you'd like to do that? > > > > I can take care of all the fs code including this one. Oh? > I spent the afternoon hacking up where I'd like this to head. It's > completely untested as of now: > > http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination And just what happens to driver that has no ->read_iter()? Unless I'm seriously misreading that, NAK with extreme prejudice.
On Wed, May 10, 2017 at 03:11:18AM +0100, Al Viro wrote: > On Tue, May 09, 2017 at 09:03:22AM -0700, Christoph Hellwig wrote: > > On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote: > > > On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote: > > > > fs/splice.c has some, ahem, interesting uses that have been the source > > > > of nasty exploits in the past. Converting them to use iov_iter > > > > properly would be really, really nice. Christoph, I don't suppose > > > > you'd like to do that? > > > > > > I can take care of all the fs code including this one. > > Oh? > > > I spent the afternoon hacking up where I'd like this to head. It's > > completely untested as of now: > > > > http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination > > And just what happens to driver that has no ->read_iter()? Unless I'm > seriously misreading that, NAK with extreme prejudice. FWIW, some parts of that queue are obviously sane; it's the conversions of kernel_write() and friends to ->read_iter/->write_iter() that are non-starters. That stuff is used in too many situations; we can't guarantee that all of them will be for files that have those. As for default_file_splice_read(), I seriously suspect that with your change we could as well just make it return -EINVAL and be done with that; places that have ->read_iter() tend to have explicit ->splice_read() and it looks like the ones that do not should simply use generic_file_read_iter(). I hadn't checked that, but there's not a lot of those: arch/s390/hypfs/inode.c:437: .read_iter = hypfs_read_iter, drivers/char/mem.c:798: .read_iter = read_iter_null, drivers/char/mem.c:813: .read_iter = read_iter_zero, drivers/char/mem.c:824: .read_iter = read_iter_zero, drivers/char/raw.c:286: .read_iter = blkdev_read_iter, drivers/net/tap.c:1134: .read_iter = tap_read_iter, drivers/net/tun.c:2423: .read_iter = tun_chr_read_iter, drivers/usb/gadget/function/f_fs.c:1255: .read_iter = ffs_epfile_read_iter, drivers/usb/gadget/legacy/inode.c:703: .read_iter = ep_read_iter, drivers/vhost/net.c:1252: .read_iter = vhost_net_chr_read_iter, fs/9p/vfs_file.c:641: .read_iter = generic_file_read_iter, fs/9p/vfs_file.c:652: .read_iter = generic_file_read_iter, fs/9p/vfs_file.c:664: .read_iter = v9fs_file_read_iter, fs/9p/vfs_file.c:675: .read_iter = v9fs_file_read_iter, fs/9p/vfs_file.c:687: .read_iter = v9fs_mmap_file_read_iter, fs/9p/vfs_file.c:698: .read_iter = v9fs_mmap_file_read_iter, fs/fuse/cuse.c:180: .read_iter = cuse_read_iter, fs/fuse/file.c:3015: .read_iter = fuse_direct_read_iter, fs/hugetlbfs/inode.c:980: .read_iter = hugetlbfs_read_iter, fs/ncpfs/file.c:248: .read_iter = ncp_file_read_iter, fs/orangefs/file.c:742: .read_iter = orangefs_file_read_iter, fs/pipe.c:1011: .read_iter = pipe_read, sound/core/pcm_native.c:3696: .read_iter = snd_pcm_readv, is the full list and I'm fairly certain that most of them will work with generic_file_splice_read() just fine. drivers/char definitely will, so will ncpfs/orangefs/hugetlbfs/most of 9p ones (two of the latter might need some care in p9_client_read(), but that should be doable easily enough). pipe is irrelevant (->splice_read() won't be called for those). fuse ones should be doable, but that might take a bit more infrastructure work in lib/iov_iter.c. vhost, gadgetfs, tun/tap - no idea at the moment.
On Wed, May 10, 2017 at 03:45:24AM +0100, Al Viro wrote: > FWIW, some parts of that queue are obviously sane; it's the conversions of > kernel_write() and friends to ->read_iter/->write_iter() that are non-starters. Egads... OK, I *have* misread what you are doing there. Your vfs_iter_read() works for files sans ->read_iter(). For strange values of "works" - you hardwire "it's either iovec or kvec iterator" into its calling conventions, which is a trouble waiting to happen. What's the point? What's wrong with having kernel_read()/kernel_readv()/etc.? You still have set_fs() in there; doing that one level up in call chain would be just fine... IDGI. Broken commit: "net: don't play with address limits in kernel_recvmsg". It would be OK if it was only about data. Unfortunately, that's not true in one case: svc_udp_recvfrom() wants ->msg_control. Another delicate place: you can't assume that write() always advances file position by its (positive) return value. btrfs stuff is sensitive to that. ashmem probably _is_ OK with demanding ->read_iter(), but I'm not sure about blind asma->file->f_pos += ret. That's begging for races. Actually, scratch that - it *is* racy.
On Wed, May 10, 2017 at 04:12:54AM +0100, Al Viro wrote: > Broken commit: "net: don't play with address limits in kernel_recvmsg". > It would be OK if it was only about data. Unfortunately, that's not > true in one case: svc_udp_recvfrom() wants ->msg_control. > > Another delicate place: you can't assume that write() always advances > file position by its (positive) return value. btrfs stuff is sensitive > to that. > > ashmem probably _is_ OK with demanding ->read_iter(), but I'm not sure > about blind asma->file->f_pos += ret. That's begging for races. Actually, > scratch that - it *is* racy. kvec_length(): please, don't. I would rather have the last remaining iov_length() gone... What do you need it for, anyway? You have only two users and both have the count passed to them (as *count and *cnt resp.)
On Wed, May 10, 2017 at 04:21:37AM +0100, Al Viro wrote: > On Wed, May 10, 2017 at 04:12:54AM +0100, Al Viro wrote: > > > Broken commit: "net: don't play with address limits in kernel_recvmsg". > > It would be OK if it was only about data. Unfortunately, that's not > > true in one case: svc_udp_recvfrom() wants ->msg_control. > > > > Another delicate place: you can't assume that write() always advances > > file position by its (positive) return value. btrfs stuff is sensitive > > to that. > > > > ashmem probably _is_ OK with demanding ->read_iter(), but I'm not sure > > about blind asma->file->f_pos += ret. That's begging for races. Actually, > > scratch that - it *is* racy. > > kvec_length(): please, don't. I would rather have the last remaining > iov_length() gone... What do you need it for, anyway? You have only > two users and both have the count passed to them (as *count and *cnt resp.) fcntl stuff: I've decided not to put something similar into work.compat since I couldn't decide what to do with compat stuff - word-by-word copy from userland converting to struct flock + conversion to posix_lock + actual work + conversion to flock + word-by-word copy to userland... Smells like we might be better off with compat_flock_to_posix_lock() et.al. I'm still not sure; played a bit one way and another and dediced to drop it for now. Hell knows...
On Tue, May 09, 2017 at 09:50:32AM -0700, Kees Cook wrote: > http://git.infradead.org/users/hch/vfs.git/commitdiff/018e0e9030777121fe87e89d43066691e7366587 > This accidentally(?) removes the kernel-doc comments. That was sort of intentional, the kerneldoc boilerplate adds very little value for method instances. > http://git.infradead.org/users/hch/vfs.git/commitdiff/78b62c730254fc39fa888cdbdca08fde6e09a798 > Could this be made defensive? (Return 0 if ret wraps, for example?) I > see what the comment says, but not everyone will read that. :( It's a straight copy of iov_length, and I'd rather keep it that way.
On Wed, May 10, 2017 at 03:45:24AM +0100, Al Viro wrote: > FWIW, some parts of that queue are obviously sane; it's the conversions of > kernel_write() and friends to ->read_iter/->write_iter() that are non-starters. And that part is the main point! > That stuff is used in too many situations; we can't guarantee that all of > them will be for files that have those. That's why this series handles ITER_KVEC for this case, which is all that's really needed for kernel_read/write. If you insiste the bvec and pipe cases are handled as well that couod be added fairly easily. > As for default_file_splice_read(), I seriously suspect that with your change > we could as well just make it return -EINVAL and be done with that; places > that have ->read_iter() tend to have explicit ->splice_read() and it looks > like the ones that do not should simply use generic_file_read_iter(). > I hadn't checked that, but there's not a lot of those: Making ->splice_read to default to the ->read_iter based implementation and returning -EINVAL if neither that nor an explicit ->splice_read is provided is useful, but wasn't the aim of this series. Similar on the write side.
On Wed, May 10, 2017 at 04:12:54AM +0100, Al Viro wrote: > What's the point? What's wrong with having kernel_read()/kernel_readv()/etc.? > You still have set_fs() in there; doing that one level up in call chain would > be just fine... IDGI. The problem is that they modify the address limit, which the whole subthread here wants to get rid of. > Broken commit: "net: don't play with address limits in kernel_recvmsg". > It would be OK if it was only about data. Unfortunately, that's not > true in one case: svc_udp_recvfrom() wants ->msg_control. Dropped, but we'll need to fix that eventually. > Another delicate place: you can't assume that write() always advances > file position by its (positive) return value. btrfs stuff is sensitive > to that. If we don't want to assume that we need to pass pointer to pos to kernel_read/write. Which might be a good idea in general. > ashmem probably _is_ OK with demanding ->read_iter(), but I'm not sure > about blind asma->file->f_pos += ret. That's begging for races. Actually, > scratch that - it *is* racy. I think the proper fix is to not even bother to maintain f_pos of the backing file, as we don't ever use it - all reads from it pass in an explicit position anyway.
On Wed, May 10, 2017 at 04:39:12AM +0100, Al Viro wrote: > fcntl stuff: I've decided not to put something similar into work.compat > since I couldn't decide what to do with compat stuff - word-by-word copy > from userland converting to struct flock + conversion to posix_lock + > actual work + conversion to flock + word-by-word copy to userland... Smells > like we might be better off with compat_flock_to_posix_lock() et.al. > I'm still not sure; played a bit one way and another and dediced to drop > it for now. Hell knows... My version already is an improvement in lines of code alone. Between that and stopping to mess with the address limit I think it's a clear winner. But it's pretty independent of the rest, and I'll just run it through Jeff and Bruce and ask them what they think.
On Tue, May 09, 2017 at 04:31:00PM -0700, Kees Cook wrote: > > I don't like silent fixups. If we want to do this, we should BUG or > > at least WARN, not just change the addr limit. But I'm also not > > convinced it's indicative of an actual bug here. > > Nothing should enter that function with KERNEL_DS set, right? It might very well do. Various drivers or the networking code mess with the address limits for fairly broad sections of code.
On Tue, May 09, 2017 at 11:53:01PM -0700, Christoph Hellwig wrote: > On Wed, May 10, 2017 at 04:12:54AM +0100, Al Viro wrote: > > What's the point? What's wrong with having kernel_read()/kernel_readv()/etc.? > > You still have set_fs() in there; doing that one level up in call chain would > > be just fine... IDGI. > > The problem is that they modify the address limit, which the whole > subthread here wants to get rid of. And you *still* do the same. Christoph, this is ridiculous - the worst part of the area is not a couple of functions in fs/read_write.c, it's a fucking lot of ->read() and ->write() instances in shitty driver code, pardon the redundance. And _that_ is still done under set_fs(KERNEL_DS). Claiming that set_fs() done one function deeper in callchain (both in fs/read_write.c) is somehow better because it reduces the amount of code under that thing... Get real, please - helpers that encapsulate those set_fs() pairs (a-la kernel_read(), etc.) absolutely make sense and converting their open-coded instances to calls of those helpers is clearly a good thing. However, we are not * getting rid of low-quality code run under KERNEL_DS * gettind rid of set_fs() itself * getting a generic kernel_read() variant that would really take an iov_iter. That's what I'm objecting to. Centralized kernel_readv() et.al. - sure, and fs/read_write.c is the right place for those. No arguments here. Conversion to those - absolutely; drivers have no fucking business touching set_fs() at all. But your primitives are trouble waiting to happen. Let them take kvec arrays. And let them, in case when there's no ->read_iter()/->write_iter(), do set_fs(). Statically, without this if (iter->type & ITER_KVEC) ... stuff. > > Another delicate place: you can't assume that write() always advances > > file position by its (positive) return value. btrfs stuff is sensitive > > to that. > > If we don't want to assume that we need to pass pointer to pos to > kernel_read/write. Which might be a good idea in general. Yes. > > ashmem probably _is_ OK with demanding ->read_iter(), but I'm not sure > > about blind asma->file->f_pos += ret. That's begging for races. Actually, > > scratch that - it *is* racy. > > I think the proper fix is to not even bother to maintain f_pos of the > backing file, as we don't ever use it - all reads from it pass in > an explicit position anyway. vfs_llseek() used by ashmem_llseek()...
On Tue, May 9, 2017 at 6:03 PM, Christoph Hellwig <hch@infradead.org> wrote: > On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote: >> On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote: >> > fs/splice.c has some, ahem, interesting uses that have been the source >> > of nasty exploits in the past. Converting them to use iov_iter >> > properly would be really, really nice. Christoph, I don't suppose >> > you'd like to do that? >> >> I can take care of all the fs code including this one. > > I spent the afternoon hacking up where I'd like this to head. It's > completely untested as of now: > > http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination My older time64_t syscall series has the side-effect of doing something like this to the time-related compat handlers in kernel/compat.c. If nobody else has started looking at removing set_fs from those, I can extract the relevant parts from my series. Arnd
On Wed, May 10, 2017 at 08:27:47AM +0100, Al Viro wrote: > And you *still* do the same. Christoph, this is ridiculous - the worst > part of the area is not a couple of functions in fs/read_write.c, it's > a fucking lot of ->read() and ->write() instances in shitty driver code, > pardon the redundance. And _that_ is still done under set_fs(KERNEL_DS). For now yes. But it provides a sane API on top, and then we can start moving the drivers that matter to the iter variants and drop support for the rest soon. Most in-kernel I/O is done to files, and the rest to a very limited set of devices. (not accounting for sockets through their own APIs, thats another story) > That's what I'm objecting to. Centralized kernel_readv() et.al. - sure, > and fs/read_write.c is the right place for those. No arguments here. > Conversion to those - absolutely; drivers have no fucking business touching > set_fs() at all. But your primitives are trouble waiting to happen. > Let them take kvec arrays. And let them, in case when there's no > ->read_iter()/->write_iter(), do set_fs(). Statically, without this > if (iter->type & ITER_KVEC) ... stuff. Oh weĺl. I can do that first, but I think eventually we'll have to get back to what I've done now.
On Wed, May 10, 2017 at 09:28:48AM +0200, Arnd Bergmann wrote: > My older time64_t syscall series has the side-effect of doing something > like this to the time-related compat handlers in kernel/compat.c. If nobody > else has started looking at removing set_fs from those, I can extract > the relevant parts from my series. That would be great.
On Tue, May 9, 2017 at 3:00 PM, Andy Lutomirski <luto@kernel.org> wrote: > On Tue, May 9, 2017 at 1:56 AM, Christoph Hellwig <hch@infradead.org> wrote: >> On Tue, May 09, 2017 at 08:45:22AM +0200, Ingo Molnar wrote: >>> We only have ~115 code blocks in the kernel that set/restore KERNEL_DS, it would >>> be a pity to add a runtime check to every system call ... >> >> I think we should simply strive to remove all of them that aren't >> in core scheduler / arch code. Basically evetyytime we do the >> >> oldfs = get_fs(); >> set_fs(KERNEL_DS); >> .. >> set_fs(oldfs); >> >> trick we're doing something wrong, and there should always be better >> ways to archive it. E.g. using iov_iter with a ITER_KVEC type >> consistently would already remove most of them. > > How about trying to remove all of them? If we could actually get rid > of all of them, we could drop the arch support, and we'd get faster, > simpler, shorter uaccess code throughout the kernel. > > The ones in kernel/compat.c are generally garbage. They should be > using compat_alloc_user_space(). Ditto for kernel/power/user.c. compat_alloc_user_space() has some problems too, it adds complexity to a rarely-tested code path and can add some noticeable overhead in cases where user space access is slow because of extra checks. It's clearly better than set_fs(), but the way I prefer to convert the code is to avoid both and instead move compat handlers next to the native code, and splitting out the common code between native and compat mode into a helper that takes a regular kernel pointer. I think that's what both Al has done in the past on compat_ioctl() and select() and what Christoph does in his latest series, but it seems worth pointing out for others that decide to help out here. Arnd
On Wed, May 10, 2017 at 09:37:04AM +0200, Arnd Bergmann wrote: > > How about trying to remove all of them? If we could actually get rid > > of all of them, we could drop the arch support, and we'd get faster, > > simpler, shorter uaccess code throughout the kernel. BTW, not all get_user() under KERNEL_DS are plain loads. There is an exception - probe_kernel_read(). > > The ones in kernel/compat.c are generally garbage. They should be > > using compat_alloc_user_space(). Ditto for kernel/power/user.c. > > compat_alloc_user_space() has some problems too, it adds > complexity to a rarely-tested code path and can add some noticeable > overhead in cases where user space access is slow because of > extra checks. > > It's clearly better than set_fs(), but the way I prefer to convert the > code is to avoid both and instead move compat handlers next to > the native code, and splitting out the common code between native > and compat mode into a helper that takes a regular kernel pointer. > > I think that's what both Al has done in the past on compat_ioctl() > and select() and what Christoph does in his latest series, but > it seems worth pointing out for others that decide to help out here. Folks, reducing the amount of places where we play with set_fs() is certainly a good thing. Getting rid of them completely is something entirely different; I have tried to plot out patch series in this direction many times during the last 5 years or so, but it's not going to be easy. Tomorrow I can start posting my notes in that direction (and there are tons of those, unfortunately mixed with git grep results, highly unprintable personal comments, etc.); just let me grab some sleep first... BTW, slow userland access is not just due to extra checks; access_ok(), in particular, is pretty much noise. The real PITA comes from the things like STAC/CLAC on recent x86. Or hardware overhead of cross-address-space block copy insn (e.g. on s390, where it's optimized for multi-cacheline blocks). Or things like uml, where it's a matter of walking the page tables for each sodding __get_user(). It's not always just a matter of address space limit...
On Wed, May 10, 2017 at 09:08:41AM +0100, Al Viro wrote: > On Wed, May 10, 2017 at 09:37:04AM +0200, Arnd Bergmann wrote: > > > > How about trying to remove all of them? If we could actually get rid > > > of all of them, we could drop the arch support, and we'd get faster, > > > simpler, shorter uaccess code throughout the kernel. > > BTW, not all get_user() under KERNEL_DS are plain loads. There is an > exception - probe_kernel_read(). And various calls that looks like opencoded versions, e.g. drivers/dio or the ELF loader. But in the long run we'll just need a separate primitive for that, but that can wait until the set_fs calls outside the core code are gone.
On Wed, May 10, 2017 at 1:14 AM, Christoph Hellwig <hch@infradead.org> wrote: > On Wed, May 10, 2017 at 09:08:41AM +0100, Al Viro wrote: >> On Wed, May 10, 2017 at 09:37:04AM +0200, Arnd Bergmann wrote: >> >> > > How about trying to remove all of them? If we could actually get rid >> > > of all of them, we could drop the arch support, and we'd get faster, >> > > simpler, shorter uaccess code throughout the kernel. >> >> BTW, not all get_user() under KERNEL_DS are plain loads. There is an >> exception - probe_kernel_read(). > > And various calls that looks like opencoded versions, e.g. drivers/dio > or the ELF loader. > > But in the long run we'll just need a separate primitive for that, > but that can wait until the set_fs calls outside the core code are > gone. I suspect that, on most arches, the primitive is called __copy_from_user(). We could make the generic code do that except where overridden.
On Tue, May 09, 2017 at 04:31:00PM -0700, Kees Cook wrote: > > I don't like silent fixups. If we want to do this, we should BUG or > > at least WARN, not just change the addr limit. But I'm also not > > convinced it's indicative of an actual bug here. > > Nothing should enter that function with KERNEL_DS set, right? > > BUG_ON(get_fs() != USER_DS); We're feeling triggerhappy, aren't we? A nice juicy WARN-splat along with a fixup looks much better than killing the box, to me.
On Tue, May 9, 2017 at 7:29 AM, Thomas Garnier <thgarnie@google.com> wrote: > > On Tue, May 9, 2017 at 4:10 AM, Greg KH <greg@kroah.com> wrote: > > On Tue, May 09, 2017 at 08:56:19AM +0200, Ingo Molnar wrote: > >> > >> * Kees Cook <keescook@chromium.org> wrote: > >> > >> > > There's the option of using GCC plugins now that the infrastructure was > >> > > upstreamed from grsecurity. It can be used as part of the regular build > >> > > process and as long as the analysis is pretty simple it shouldn't hurt compile > >> > > time much. > >> > > >> > Well, and that the situation may arise due to memory corruption, not from > >> > poorly-matched set_fs() calls, which static analysis won't help solve. We need > >> > to catch this bad kernel state because it is a very bad state to run in. > >> > >> If memory corruption corrupted the task state into having addr_limit set to > >> KERNEL_DS then there's already a fair chance that it's game over: it could also > >> have set *uid to 0, or changed a sensitive PF_ flag, or a number of other > >> things... > >> > >> Furthermore, think about it: there's literally an infinite amount of corrupted > >> task states that could be a security problem and that could be checked after every > >> system call. Do we want to check every one of them? > > > > Ok, I'm all for not checking lots of stuff all the time, just to protect > > from crappy drivers that. Especially as we _can_ audit and run checks > > on the source code for them in the kernel tree. > > > > But, and here's the problem, outside of the desktop/enterprise world, > > there are a ton of out-of-tree code that is crap. The number of > > security/bug fixes and kernel crashes for out-of-tree code in systems > > like Android phones is just so high it's laughable. > > > > When you have a device that is running 3.2 million lines of kernel code, > > yet the diffstat of the tree compared to mainline adds 3 million lines > > of code, there is bound to be a ton of issues/problems there. > > > > So this is an entirely different thing we need to try to protect > > ourselves from. A long time ago I laughed when I saw that Microsoft had > > to do lots of "hardening" of their kernel to protect themselves from > > crappy drivers, as I knew we didn't have to do that because we had the > > source for them and could fix the root issues. But that has changed and > > now we don't all have that option. That code is out-of-tree because the > > vendor doesn't care, and doesn't want to take any time at all to do > > anything resembling a real code review[1]. > > That's a big part of why I thought would be useful. I am less worried > about edge cases upstream right now than forks with custom codes not > using set_fs correctly. > > > > > So, how about options like the ones being proposed here, go behind a new > > config option: > > CONFIG_PROTECT_FROM_CRAPPY_DRIVERS > > that device owners can enable if they do not trust their vendor-provided > > code (hint, I sure don't.) That way the "normal" path that all of us > > are used to running will be fine, but if you want to take the speed hit > > to try to protect yourself, then you can do that as well. > > Maybe another name but why not. Ingo: Do you want the change as-is? Would you like it to be optional? What do you think? > > > > > Anyway, just an idea... > > > > thanks, > > > > greg k-h > > > > [1] I am working really hard with lots of vendors to try to fix their > > broken development model, but that is going to take years to resolve > > as their device pipelines are years long, and changing their > > mindsets takes a long time... > > > > -- > Thomas
On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie@google.com> wrote: > > Ingo: Do you want the change as-is? Would you like it to be optional? > What do you think? I'm not ingo, but I don't like that patch. It's in the wrong place - that system call return code is too timing-critical to add address limit checks. Now what I think you *could* do is: - make "set_fs()" actually set a work flag in the current thread flags - do the test in the slow-path (syscall_return_slowpath). Yes, yes, that ends up being architecture-specific, but it's fairly simple. And it only slows down the system calls that actually use "set_fs()". Sure, it will slow those down a fair amount, but they are hopefully a small subset of all cases. How does that sound to people? Thats' where we currently do that if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax)) local_irq_enable(); check too, which is a fairly similar issue. Linus
On Thu, 11 May 2017 16:44:07 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie@google.com> wrote: > > > > Ingo: Do you want the change as-is? Would you like it to be optional? > > What do you think? > > I'm not ingo, but I don't like that patch. It's in the wrong place - > that system call return code is too timing-critical to add address > limit checks. > > Now what I think you *could* do is: > > - make "set_fs()" actually set a work flag in the current thread flags > > - do the test in the slow-path (syscall_return_slowpath). > > Yes, yes, that ends up being architecture-specific, but it's fairly simple. > > And it only slows down the system calls that actually use "set_fs()". > Sure, it will slow those down a fair amount, but they are hopefully a > small subset of all cases. > > How does that sound to people? Thats' where we currently do that > > if (IS_ENABLED(CONFIG_PROVE_LOCKING) && > WARN(irqs_disabled(), "syscall %ld left IRQs disabled", > regs->orig_ax)) > local_irq_enable(); > > check too, which is a fairly similar issue. This is exactly what Heiko did for the s390 backend as a result of this discussion. See the _CIF_ASCE_SECONDARY bit in arch/s390/kernel/entry.S, for the hot patch the check for the bit is included in the general _CIF_WORK test. Only the slow patch gets a bit slower. git commit b5a882fcf146c87cb6b67c6df353e1c042b8773d "s390: restore address space when returning to user space".
On Thu, May 11, 2017 at 10:28 PM, Martin Schwidefsky <schwidefsky@de.ibm.com> wrote: > On Thu, 11 May 2017 16:44:07 -0700 > Linus Torvalds <torvalds@linux-foundation.org> wrote: > >> On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie@google.com> wrote: >> > >> > Ingo: Do you want the change as-is? Would you like it to be optional? >> > What do you think? >> >> I'm not ingo, but I don't like that patch. It's in the wrong place - >> that system call return code is too timing-critical to add address >> limit checks. >> >> Now what I think you *could* do is: >> >> - make "set_fs()" actually set a work flag in the current thread flags >> >> - do the test in the slow-path (syscall_return_slowpath). >> >> Yes, yes, that ends up being architecture-specific, but it's fairly simple. >> >> And it only slows down the system calls that actually use "set_fs()". >> Sure, it will slow those down a fair amount, but they are hopefully a >> small subset of all cases. >> >> How does that sound to people? Thats' where we currently do that >> >> if (IS_ENABLED(CONFIG_PROVE_LOCKING) && >> WARN(irqs_disabled(), "syscall %ld left IRQs disabled", >> regs->orig_ax)) >> local_irq_enable(); >> >> check too, which is a fairly similar issue. > > This is exactly what Heiko did for the s390 backend as a result of this > discussion. See the _CIF_ASCE_SECONDARY bit in arch/s390/kernel/entry.S, > for the hot patch the check for the bit is included in the general > _CIF_WORK test. Only the slow patch gets a bit slower. > > git commit b5a882fcf146c87cb6b67c6df353e1c042b8773d > "s390: restore address space when returning to user space". If I'm understanding this, it won't catch corruption of addr_limit during fast-path syscalls, though (i.e. addr_limit changed without a call to set_fs()). :( This addr_limit corruption is mostly only a risk archs without THREAD_INFO_IN_TASK, but it would still be nice to catch unbalanced set_fs() code, so I like the idea. I like getting rid of addr_limit entirely even more, but that'll take some time. :) -Kees
On Thu, 11 May 2017 22:34:31 -0700 Kees Cook <keescook@chromium.org> wrote: > On Thu, May 11, 2017 at 10:28 PM, Martin Schwidefsky > <schwidefsky@de.ibm.com> wrote: > > On Thu, 11 May 2017 16:44:07 -0700 > > Linus Torvalds <torvalds@linux-foundation.org> wrote: > > > >> On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie@google.com> wrote: > >> > > >> > Ingo: Do you want the change as-is? Would you like it to be optional? > >> > What do you think? > >> > >> I'm not ingo, but I don't like that patch. It's in the wrong place - > >> that system call return code is too timing-critical to add address > >> limit checks. > >> > >> Now what I think you *could* do is: > >> > >> - make "set_fs()" actually set a work flag in the current thread flags > >> > >> - do the test in the slow-path (syscall_return_slowpath). > >> > >> Yes, yes, that ends up being architecture-specific, but it's fairly simple. > >> > >> And it only slows down the system calls that actually use "set_fs()". > >> Sure, it will slow those down a fair amount, but they are hopefully a > >> small subset of all cases. > >> > >> How does that sound to people? Thats' where we currently do that > >> > >> if (IS_ENABLED(CONFIG_PROVE_LOCKING) && > >> WARN(irqs_disabled(), "syscall %ld left IRQs disabled", > >> regs->orig_ax)) > >> local_irq_enable(); > >> > >> check too, which is a fairly similar issue. > > > > This is exactly what Heiko did for the s390 backend as a result of this > > discussion. See the _CIF_ASCE_SECONDARY bit in arch/s390/kernel/entry.S, > > for the hot patch the check for the bit is included in the general > > _CIF_WORK test. Only the slow patch gets a bit slower. > > > > git commit b5a882fcf146c87cb6b67c6df353e1c042b8773d > > "s390: restore address space when returning to user space". > > If I'm understanding this, it won't catch corruption of addr_limit > during fast-path syscalls, though (i.e. addr_limit changed without a > call to set_fs()). :( This addr_limit corruption is mostly only a risk > archs without THREAD_INFO_IN_TASK, but it would still be nice to catch > unbalanced set_fs() code, so I like the idea. I like getting rid of > addr_limit entirely even more, but that'll take some time. :) Well for s390 there is no addr_limit as we use two separate address space for kernel vs. user. The equivalent to the addr_limit corruption on a fast-path syscall would be changing CR7 outside of set_fs. This boils down to the question what we are protection against? Bad code with unbalanced set_fs or evil code that changes addr_limit/CR7 outside of set_fs
[resending because kernel.org seems to have mangled my SMTP credentials. I wonder if this is a common problem.] On Thu, May 11, 2017 at 4:44 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie@google.com> wrote: >> >> Ingo: Do you want the change as-is? Would you like it to be optional? >> What do you think? > > I'm not ingo, but I don't like that patch. It's in the wrong place - > that system call return code is too timing-critical to add address > limit checks. > > Now what I think you *could* do is: > > - make "set_fs()" actually set a work flag in the current thread flags > > - do the test in the slow-path (syscall_return_slowpath). > > Yes, yes, that ends up being architecture-specific, but it's fairly simple. > > And it only slows down the system calls that actually use "set_fs()". > Sure, it will slow those down a fair amount, but they are hopefully a > small subset of all cases. > > How does that sound to people? Thats' where we currently do that > > if (IS_ENABLED(CONFIG_PROVE_LOCKING) && > WARN(irqs_disabled(), "syscall %ld left IRQs disabled", > regs->orig_ax)) > local_irq_enable(); > > check too, which is a fairly similar issue. > I like this. It wouldn't help the problem that I suspect is a major part of the motivation for this patch: a stack overflow could overwrite addr_limit. But we fixed that for real already. Slightly off-topic: I would *love* to see syscall_return_slowpath() or similar moved or at least mostly moved into generic code. Aside from the fact that it used to be written in asm, there's nothing fundamentally arch-specific about it. > > And it only slows down the system calls that actually use "set_fs()". > Sure, it will slow those down a fair amount, but they are hopefully a > small subset of all cases. It won't even slow them down that much. The slow path is reasonably fast these days.
* Kees Cook <keescook@chromium.org> wrote: > > git commit b5a882fcf146c87cb6b67c6df353e1c042b8773d > > "s390: restore address space when returning to user space". > > If I'm understanding this, it won't catch corruption of addr_limit > during fast-path syscalls, though (i.e. addr_limit changed without a > call to set_fs()). :( Nor does it, or the patch you propose, protect against against something corrupting task->mm pointer, or the task->*uid values, or any of the myriads of security relevant values stored in the task structure! Making sure API (set_fs()) usage is bug-free and protecting against the effects of general data corruption are two unrelated things that should not mixed. Thanks, Ingo
* Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie@google.com> wrote: > > > > Ingo: Do you want the change as-is? Would you like it to be optional? > > What do you think? > > I'm not ingo, but I don't like that patch. It's in the wrong place - > that system call return code is too timing-critical to add address > limit checks. > > Now what I think you *could* do is: > > - make "set_fs()" actually set a work flag in the current thread flags > > - do the test in the slow-path (syscall_return_slowpath). > > Yes, yes, that ends up being architecture-specific, but it's fairly simple. > > And it only slows down the system calls that actually use "set_fs()". > Sure, it will slow those down a fair amount, but they are hopefully a > small subset of all cases. > > How does that sound to people? Thats' where we currently do that > > if (IS_ENABLED(CONFIG_PROVE_LOCKING) && > WARN(irqs_disabled(), "syscall %ld left IRQs disabled", > regs->orig_ax)) > local_irq_enable(); > > check too, which is a fairly similar issue. I really like that idea and I'd be perfectly fine with that solution, because it puts the overhead where the problem comes from, and adds an extra incentive for code to move away from set_fs() facilities. Win-win. Thanks, Ingo
* Andy Lutomirski <luto@kernel.org> wrote: > On Tue, May 9, 2017 at 1:56 AM, Christoph Hellwig <hch@infradead.org> wrote: > > On Tue, May 09, 2017 at 08:45:22AM +0200, Ingo Molnar wrote: > >> We only have ~115 code blocks in the kernel that set/restore KERNEL_DS, it would > >> be a pity to add a runtime check to every system call ... > > > > I think we should simply strive to remove all of them that aren't > > in core scheduler / arch code. Basically evetyytime we do the > > > > oldfs = get_fs(); > > set_fs(KERNEL_DS); > > .. > > set_fs(oldfs); > > > > trick we're doing something wrong, and there should always be better > > ways to archive it. E.g. using iov_iter with a ITER_KVEC type > > consistently would already remove most of them. > > How about trying to remove all of them? If we could actually get rid > of all of them, we could drop the arch support, and we'd get faster, > simpler, shorter uaccess code throughout the kernel. I'm all for that! Thanks, Ingo
On Fri, May 12, 2017 at 09:00:12AM +0200, Ingo Molnar wrote: > > How about trying to remove all of them? If we could actually get rid > > of all of them, we could drop the arch support, and we'd get faster, > > simpler, shorter uaccess code throughout the kernel. > > I'm all for that! Oh, for... Ingo, do you really want to go through all ->write() and ->read() instances, converting all of them to iov_iter? Or, better yet, deal with the patch flood from Nick Krause sock puppet brigade? Folks, seriously, have you even looked through that zoo? I have, and it's really, really not fun. Sure, we can say "fuck 'em, no need to allow splice() on random crap". Would be perfectly reasonable, expect that it's not the only place doing kernel_write() and its ilk... And converting everything to ->read_iter()/->write_iter() means an insane amount of code churn, not to mention coping with random bogosities in semantics. ->read() and ->write() are going to stay around, pretty much indefinitely.
On Fri, May 12, 2017 at 08:15:49AM +0100, Al Viro wrote: > And converting everything to ->read_iter()/->write_iter() means an insane > amount of code churn, not to mention coping with random bogosities in > semantics. ->read() and ->write() are going to stay around, pretty > much indefinitely. But I don't think kernel users of them have to. I've been digging through the calllers and will send an analysis to the list in a bit.
On Fri, May 12, 2017 at 9:15 AM, Al Viro <viro@zeniv.linux.org.uk> wrote: > On Fri, May 12, 2017 at 09:00:12AM +0200, Ingo Molnar wrote: > >> > How about trying to remove all of them? If we could actually get rid >> > of all of them, we could drop the arch support, and we'd get faster, >> > simpler, shorter uaccess code throughout the kernel. >> >> I'm all for that! > > Oh, for... Ingo, do you really want to go through all ->write() and ->read() > instances, converting all of them to iov_iter? Or, better yet, deal with > the patch flood from Nick Krause sock puppet brigade? How realistic and how useful would it be to first completely eliminate the ones that are in loadable modules and then wrapping the definition in #ifndef MODULE (or even make it an extern function)? This should be a fairly complete list of the modular users: drivers/block/drbd/drbd_main.c: set_fs(KERNEL_DS); drivers/input/serio/hp_sdc.c: set_fs(KERNEL_DS); drivers/media/v4l2-core/v4l2-compat-ioctl32.c: set_fs(KERNEL_DS); drivers/misc/lkdtm_bugs.c: set_fs(KERNEL_DS); drivers/s390/crypto/pkey_api.c: set_fs(KERNEL_DS); drivers/staging/comedi/drivers/serial2002.c: set_fs(KERNEL_DS); drivers/staging/lustre/lnet/libcfs/tracefile.c: set_fs(get_ds()); drivers/staging/media/atomisp/pci/atomisp2/atomisp_compat_ioctl32.c: set_fs(KERNEL_DS); drivers/staging/rtl8723bs/os_dep/osdep_service.c: oldfs = get_fs(); set_fs(get_ds()); drivers/usb/gadget/function/f_mass_storage.c: set_fs(get_ds()); drivers/usb/gadget/function/u_uac1.c: set_fs(KERNEL_DS); drivers/vhost/vhost.c: set_fs(USER_DS); drivers/video/fbdev/core/fbmem.c: set_fs(KERNEL_DS); drivers/video/fbdev/hpfb.c: set_fs(KERNEL_DS); fs/autofs4/waitq.c: set_fs(KERNEL_DS); fs/binfmt_aout.c: set_fs(KERNEL_DS); fs/binfmt_elf.c: set_fs(USER_DS); fs/binfmt_elf_fdpic.c: set_fs(KERNEL_DS); fs/btrfs/send.c: set_fs(KERNEL_DS); fs/ext4/ioctl.c: set_fs(KERNEL_DS); fs/nfsd/vfs.c: set_fs(KERNEL_DS); net/9p/trans_fd.c: set_fs(get_ds()); net/ipv6/addrconf.c: set_fs(KERNEL_DS); net/ipv6/exthdrs.c: set_fs(KERNEL_DS); net/sunrpc/svcsock.c: oldfs = get_fs(); set_fs(KERNEL_DS); sound/core/oss/pcm_oss.c: set_fs(get_ds()); sound/core/pcm_native.c: set_fs(get_ds()); sound/drivers/opl3/opl3_oss.c: set_fs(get_ds()); sound/oss/dmabuf.c: set_fs(get_ds()); sound/oss/swarm_cs4297a.c: set_fs(KERNEL_DS); sound/pci/emu10k1/emufx.c: set_fs(get_ds()); sound/pci/hda/hda_codec.c: set_fs(get_ds()); Arnd
On Fri, May 12, 2017 at 12:35:21AM -0700, Christoph Hellwig wrote: > On Fri, May 12, 2017 at 08:15:49AM +0100, Al Viro wrote: > > And converting everything to ->read_iter()/->write_iter() means an insane > > amount of code churn, not to mention coping with random bogosities in > > semantics. ->read() and ->write() are going to stay around, pretty > > much indefinitely. > > But I don't think kernel users of them have to. I've been digging > through the calllers and will send an analysis to the list in a bit. Ok, here is the cleaned up list. The targets for in-kernel I/O are basically regular files on normal fs, block devices, pipes and sockets, with a few narrow exceptions and a few unclear cases. I've added a few maintainers to Cc to clarify, and the list is below: When looking at kernel_read/kernel_readv/kernel_write/__kernel_write instances, or vfs_read/vfs_readv/vfs_write/vfs_writev inststances with set_fs tricks most of them simply require a regular file on a "real" fs or a block device, pipe, socket: - various binary loaders: need to be regular files (+ on a "real" fs) - nandsim: expect a regular file or maybe a block device - code: expects a regular file - ecryptfs: expects a regular file - splice: can be a regular file or device file - security/keys: regular file - cachefiles: needs to be a regular file - coredump: usually a regular file, but possibly a pipe - acct: regular file - target: regular file - nfsd: regular file - lustre: regular file - f_mass_storage: regular file, or maybe block device - autofs: pipe - btrfs: regular file / pipe / socket - ima: regular file on specific file systems Then there are a few interesting ones with specific targets: - sysctl: regular file on procfs - mconsole: procfs as far as I can tell. Might be able to further narrow it down? - ashmem: regular file on shmem The nommu mmap code needs to read everything that wants to support a MAP_PRIVATE mmap, but as far as a I can tell: do a) address limits not matter for nommu, and b) I have no f***cking idea why it doesn't use readpage(s) to start with like the MMU code. And a few that I can't figure: - cx25821: no idea what this opens, need to confirm with the maintainers (on Cc) - 9p: not clear. looks like it might be sockets/pipes Last but not least we have a driver that's a complete mess: - series2002: /dev/tty* (drivers is in staging and needs to be rewritten using the proper in-kernel APIs anyway)
On Fri, May 12, 2017 at 09:43:40AM +0200, Arnd Bergmann wrote: > How realistic and how useful would it be to first completely eliminate > the ones that are in loadable modules and then wrapping the definition > in #ifndef MODULE (or even make it an extern function)? Should be fairly doable and might be a nice step towards cleaning the mess up. In fact with my seres a large part of those are gone, and most of the remaining handler are ioctl handlers or what seems like opencoded versions of probe_kernel_read. But it won't help against exploits modifying addr_limit manually.
On Fri, May 12, 2017 at 09:43:40AM +0200, Arnd Bergmann wrote: > How realistic and how useful would it be to first completely eliminate > the ones that are in loadable modules and then wrapping the definition > in #ifndef MODULE (or even make it an extern function)? Eliminate _what_? ->read() and ->write() instances? > This should be a fairly complete list of the modular users: > > drivers/block/drbd/drbd_main.c: set_fs(KERNEL_DS); Ah, set_fs()... Sure, many of those can be killed off. Wouldn't be a bad idea, but I don't understand what difference does modular/built-in make here... This one: AFAICS doesn't give a damn about set_fs() at all. > drivers/input/serio/hp_sdc.c: set_fs(KERNEL_DS); Open-coded probe_kernel_read(), apparently. > drivers/media/v4l2-core/v4l2-compat-ioctl32.c: set_fs(KERNEL_DS); massive compat ioctl crap. > drivers/misc/lkdtm_bugs.c: set_fs(KERNEL_DS); insane. > drivers/s390/crypto/pkey_api.c: set_fs(KERNEL_DS); No idea. > drivers/staging/comedi/drivers/serial2002.c: set_fs(KERNEL_DS); Open-coded kernel_write(); to some character device, no less... And similar for kernel_read(), apparently. > drivers/staging/lustre/lnet/libcfs/tracefile.c: set_fs(get_ds()); Fuck knows; kernel_write() might do it. Depends upon what it's writing to. You've missed other places in lustre, BTW - including the ioctls on sockets, etc. > drivers/staging/media/atomisp/pci/atomisp2/atomisp_compat_ioctl32.c: > set_fs(KERNEL_DS); Compat ioctl crap, again. > drivers/staging/rtl8723bs/os_dep/osdep_service.c: oldfs > = get_fs(); set_fs(get_ds()); Oh, lovely - reading an arbitrary (as in, specified by pathname) file. Firmware (mis)handling? > drivers/usb/gadget/function/f_mass_storage.c: set_fs(get_ds()); No idea. > drivers/usb/gadget/function/u_uac1.c: set_fs(KERNEL_DS); kernel_write(), by the look of it. Or something similar. > drivers/vhost/vhost.c: set_fs(USER_DS); kernel thread doing use_mm() > drivers/video/fbdev/core/fbmem.c: set_fs(KERNEL_DS); compat ioctl. > drivers/video/fbdev/hpfb.c: set_fs(KERNEL_DS); probe_kernel_read() > fs/autofs4/waitq.c: set_fs(KERNEL_DS); kernel_write() > fs/binfmt_aout.c: set_fs(KERNEL_DS); > fs/binfmt_elf.c: set_fs(USER_DS); > fs/binfmt_elf_fdpic.c: set_fs(KERNEL_DS); coredump stuff. > fs/btrfs/send.c: set_fs(KERNEL_DS); kernel_write() Anyway, what's special about modules? IDGI...
On Fri, May 12, 2017 at 01:11:26AM -0700, Christoph Hellwig wrote:
> But it won't help against exploits modifying addr_limit manually.
Or the ones setting current->cred to that of init. Your point being?
On Fri, May 12, 2017 at 10:11 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> Anyway, what's special about modules? IDGI...
One of the arguments that came up earlier was code in external modules
being mostly unaudited, sometimes without any source code available
at all but still used in devices.
If modules can't do set_fs() any more, this could eliminate bugs with
unpaired set_fs in those modules.
Limiting factors of course are:
- embedded systems that ship come with their own kernels (as opposed
to using whatever users have, or relying on binary distros) can just
make it available to modules again, by reverting the patch
- As Christoph said, they could have an open-coded set_fs in the
driver
- Whatever other method a clueless driver write might come up with
isn't necessarily better than set_fs().
Arnd
On Fri, May 12, 2017 at 01:07:41AM -0700, Christoph Hellwig wrote: > Last but not least we have a driver that's a complete mess: > > - series2002: /dev/tty* (drivers is in staging and needs to be rewritten > using the proper in-kernel APIs anyway) I agree, that's a mess, and it needs to use the new serdev interface at the least. Worse case, we just delete the thing :) thanks, greg k-h
On Thu, May 11, 2017 at 11:58 PM, Ingo Molnar <mingo@kernel.org> wrote: > > * Linus Torvalds <torvalds@linux-foundation.org> wrote: > >> On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie@google.com> wrote: >> > >> > Ingo: Do you want the change as-is? Would you like it to be optional? >> > What do you think? >> >> I'm not ingo, but I don't like that patch. It's in the wrong place - >> that system call return code is too timing-critical to add address >> limit checks. >> >> Now what I think you *could* do is: >> >> - make "set_fs()" actually set a work flag in the current thread flags >> >> - do the test in the slow-path (syscall_return_slowpath). >> >> Yes, yes, that ends up being architecture-specific, but it's fairly simple. >> >> And it only slows down the system calls that actually use "set_fs()". >> Sure, it will slow those down a fair amount, but they are hopefully a >> small subset of all cases. >> >> How does that sound to people? Thats' where we currently do that >> >> if (IS_ENABLED(CONFIG_PROVE_LOCKING) && >> WARN(irqs_disabled(), "syscall %ld left IRQs disabled", >> regs->orig_ax)) >> local_irq_enable(); >> >> check too, which is a fairly similar issue. > > I really like that idea and I'd be perfectly fine with that solution, because it > puts the overhead where the problem comes from, and adds an extra incentive for > code to move away from set_fs() facilities. Win-win. Great, I will adapt the patch for that. > > Thanks, > > Ingo
On Thu, May 11, 2017 at 10:54 PM, Martin Schwidefsky <schwidefsky@de.ibm.com> wrote: > On Thu, 11 May 2017 22:34:31 -0700 > Kees Cook <keescook@chromium.org> wrote: > >> On Thu, May 11, 2017 at 10:28 PM, Martin Schwidefsky >> <schwidefsky@de.ibm.com> wrote: >> > On Thu, 11 May 2017 16:44:07 -0700 >> > Linus Torvalds <torvalds@linux-foundation.org> wrote: >> > >> >> On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie@google.com> wrote: >> >> > >> >> > Ingo: Do you want the change as-is? Would you like it to be optional? >> >> > What do you think? >> >> >> >> I'm not ingo, but I don't like that patch. It's in the wrong place - >> >> that system call return code is too timing-critical to add address >> >> limit checks. >> >> >> >> Now what I think you *could* do is: >> >> >> >> - make "set_fs()" actually set a work flag in the current thread flags >> >> >> >> - do the test in the slow-path (syscall_return_slowpath). >> >> >> >> Yes, yes, that ends up being architecture-specific, but it's fairly simple. >> >> >> >> And it only slows down the system calls that actually use "set_fs()". >> >> Sure, it will slow those down a fair amount, but they are hopefully a >> >> small subset of all cases. >> >> >> >> How does that sound to people? Thats' where we currently do that >> >> >> >> if (IS_ENABLED(CONFIG_PROVE_LOCKING) && >> >> WARN(irqs_disabled(), "syscall %ld left IRQs disabled", >> >> regs->orig_ax)) >> >> local_irq_enable(); >> >> >> >> check too, which is a fairly similar issue. >> > >> > This is exactly what Heiko did for the s390 backend as a result of this >> > discussion. See the _CIF_ASCE_SECONDARY bit in arch/s390/kernel/entry.S, >> > for the hot patch the check for the bit is included in the general >> > _CIF_WORK test. Only the slow patch gets a bit slower. >> > >> > git commit b5a882fcf146c87cb6b67c6df353e1c042b8773d >> > "s390: restore address space when returning to user space". >> >> If I'm understanding this, it won't catch corruption of addr_limit >> during fast-path syscalls, though (i.e. addr_limit changed without a >> call to set_fs()). :( This addr_limit corruption is mostly only a risk >> archs without THREAD_INFO_IN_TASK, but it would still be nice to catch >> unbalanced set_fs() code, so I like the idea. I like getting rid of >> addr_limit entirely even more, but that'll take some time. :) > > Well for s390 there is no addr_limit as we use two separate address space > for kernel vs. user. The equivalent to the addr_limit corruption on a > fast-path syscall would be changing CR7 outside of set_fs. This boils > down to the question what we are protection against? Bad code with > unbalanced set_fs or evil code that changes addr_limit/CR7 outside of > set_fs Yeah, the risk for "corrupted addr_limit" is mainly a concern for archs with addr_limit on the kernel stack. If I'm reading things correctly, that means, from the archs I've been paying closer attention to, it's an issue for arm, mips, and powerpc: arch/arm/include/asm/uaccess.h: current_thread_info()->addr_limit = fs; arch/arm/include/asm/thread_info.h: (current_stack_pointer & ~(THREAD_SIZE - 1)); arch/mips/include/asm/uaccess.h:#define set_fs(x) (current_thread_info()->addr_limit = (x)) arch/mips/kernel/process.c: * task stacks at THREAD_SIZE - 32 arch/powerpc/include/asm/uaccess.h:#define set_fs(val) (current->thread.fs = (val)) arch/powerpc/kernel/process.c: struct pt_regs *regs = task_stack_page(current) + THREAD_SIZE; (s390 uses a register, x86 and arm64 implement THREAD_INFO_IN_TASK.) Targeting addr_limit through arbitrary write attacks isn't too common since ... it's an arbitrary write. The issue with addr_limit was that it can live on the kernel stack, which meant all kinds of stack-related bugs can lead to it getting stomped on. So, two goals to protect addr_limit: - get it off the stack to make the difficulty of corruption on par with other sensitive things that would require an arbitrary write flaw. - detect/block unbalanced set_fs() calls. If we can get the former addressed by the remaining architectures, then that class of attack will go away. For the latter, it sounds like Linus's slowpath-exit will work nicely. To me it looks like he architectures with addr_limit still on the stack would still benefit from always-check-addr_limit on syscall exit, but that would be arch-specific anyway. And then, of course, we've got the parallel task of just removing set_fs() entirely. :) -Kees
On Fri, May 12, 2017 at 12:01:59PM -0700, Kees Cook wrote: > Yeah, the risk for "corrupted addr_limit" is mainly a concern for > archs with addr_limit on the kernel stack. If I'm reading things > correctly, that means, from the archs I've been paying closer > attention to, it's an issue for arm, mips, and powerpc: I'd first want to uninline everything in uaccess.h first that makes use of access_ok() - which I think is something that needs to happen anyway.
On Fri, May 12, 2017 at 12:01 PM, Kees Cook <keescook@chromium.org> wrote: > > Yeah, the risk for "corrupted addr_limit" is mainly a concern for > archs with addr_limit on the kernel stack. If I'm reading things > correctly, that means, from the archs I've been paying closer > attention to, it's an issue for arm, mips, and powerpc: I don't understand why people are looking at addr_limit as some kind of special thing. If somebody is smashing the stack and corrupting thread info data, the game is over. addr_limit is the *least* of your problems, and it's not even all that likely that it will be increasing (it's much more likely that it would be overwritten with a smaller value). Quite frankly, this kind of idiotic discussion just makes me question the whole idea of the patch. Any "security" that is this specific is not real security, it's just masturbatory garbage. It may be worth checking that people use "set_fs()" properly. But stop this idiotic crap. It just makes the kernel security people look like the crazies. There are enough incompetent crazy security people, don't go there. The kinds of things it is worth protecting against are the big class of generic issues, not the kind of "oh, but imagine if a cosmic ray flips this particular word in memory" kind of crap that ignores all the other words of memory. Seriously, Kees. You are just making security people look bad. Stop it. Linus
On Fri, May 12, 2017 at 12:08 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Fri, May 12, 2017 at 12:01 PM, Kees Cook <keescook@chromium.org> wrote: >> Yeah, the risk for "corrupted addr_limit" is mainly a concern for >> archs with addr_limit on the kernel stack. If I'm reading things >> correctly, that means, from the archs I've been paying closer >> attention to, it's an issue for arm, mips, and powerpc: > > I don't understand why people are looking at addr_limit as some kind > of special thing. > > If somebody is smashing the stack and corrupting thread info data, the > game is over. addr_limit is the *least* of your problems, and it's not > even all that likely that it will be increasing (it's much more likely > that it would be overwritten with a smaller value). > > Quite frankly, this kind of idiotic discussion just makes me question > the whole idea of the patch. > > Any "security" that is this specific is not real security, it's just > masturbatory garbage. > > It may be worth checking that people use "set_fs()" properly. But stop > this idiotic crap. It just makes the kernel security people look like > the crazies. > > There are enough incompetent crazy security people, don't go there. > The kinds of things it is worth protecting against are the big class > of generic issues, not the kind of "oh, but imagine if a cosmic ray > flips this particular word in memory" kind of crap that ignores all > the other words of memory. > > Seriously, Kees. You are just making security people look bad. Stop it. I'm clearly not explaining things well enough. I shouldn't say "corruption", I should say "malicious manipulation". The methodology of attacks against the stack are quite different from the other kinds of attacks like use-after-free, heap overflow, etc. Being able to exhaust the kernel stack (either due to deep recursion or unbounded alloca()) means attackers can control a write to addr_limit, and then leverage that into an actual arbitrary write via subsequent calls to copy_to_user() pointed at kernel memory. This isn't theoretical, this is how those attacks are performed. It may sound crazy, but it's real. With thread_info off the stack, the whole problem goes away. It's wonderful that this has happened for x86, arm64, and s390. There are always going to be new methods of attack for everything, but while we slowly address the design weakness (set_fs()), we can fix the low hanging fruit too. -Kees
On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote: > I'm clearly not explaining things well enough. I shouldn't say > "corruption", I should say "malicious manipulation". The methodology > of attacks against the stack are quite different from the other kinds > of attacks like use-after-free, heap overflow, etc. Being able to > exhaust the kernel stack (either due to deep recursion or unbounded > alloca()) I really hope we don't have alloca() use in the kernel. Do you have evidence to support that assertion? IMHO alloca() (or similar) should not be present in any kernel code because we have a limited stack - we have kmalloc() etc for that kind of thing.
On Fri, May 12, 2017 at 09:21:06PM +0100, Russell King - ARM Linux wrote: > On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote: > > I'm clearly not explaining things well enough. I shouldn't say > > "corruption", I should say "malicious manipulation". The methodology > > of attacks against the stack are quite different from the other kinds > > of attacks like use-after-free, heap overflow, etc. Being able to > > exhaust the kernel stack (either due to deep recursion or unbounded > > alloca()) > > I really hope we don't have alloca() use in the kernel. Do you have > evidence to support that assertion? > > IMHO alloca() (or similar) should not be present in any kernel code > because we have a limited stack - we have kmalloc() etc for that kind > of thing. On stack variable length arrays get implemented by the compiler doing alloca(), and we sadly have a few of those around. But yes, fully agreed on the desirability of alloca() and things.
On Fri, May 12, 2017 at 10:30:44PM +0200, Peter Zijlstra wrote: > On Fri, May 12, 2017 at 09:21:06PM +0100, Russell King - ARM Linux wrote: > > On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote: > > > I'm clearly not explaining things well enough. I shouldn't say > > > "corruption", I should say "malicious manipulation". The methodology > > > of attacks against the stack are quite different from the other kinds > > > of attacks like use-after-free, heap overflow, etc. Being able to > > > exhaust the kernel stack (either due to deep recursion or unbounded > > > alloca()) > > > > I really hope we don't have alloca() use in the kernel. Do you have > > evidence to support that assertion? > > > > IMHO alloca() (or similar) should not be present in any kernel code > > because we have a limited stack - we have kmalloc() etc for that kind > > of thing. > > On stack variable length arrays get implemented by the compiler doing > alloca(), and we sadly have a few of those around. I hope their size is appropriately limited, but something tells me it would be foolish to assume that. > But yes, fully agreed on the desirability of alloca() and things. Hmm, I wonder if -fno-builtin-alloca would prevent those... it looks like it certainly would prevent an explicit alloca() call.
On Fri, May 12, 2017 at 1:45 PM, Russell King - ARM Linux <linux@armlinux.org.uk> wrote: > On Fri, May 12, 2017 at 10:30:44PM +0200, Peter Zijlstra wrote: >> On Fri, May 12, 2017 at 09:21:06PM +0100, Russell King - ARM Linux wrote: >> > On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote: >> > > I'm clearly not explaining things well enough. I shouldn't say >> > > "corruption", I should say "malicious manipulation". The methodology >> > > of attacks against the stack are quite different from the other kinds >> > > of attacks like use-after-free, heap overflow, etc. Being able to >> > > exhaust the kernel stack (either due to deep recursion or unbounded >> > > alloca()) >> > >> > I really hope we don't have alloca() use in the kernel. Do you have >> > evidence to support that assertion? >> > >> > IMHO alloca() (or similar) should not be present in any kernel code >> > because we have a limited stack - we have kmalloc() etc for that kind >> > of thing. >> >> On stack variable length arrays get implemented by the compiler doing >> alloca(), and we sadly have a few of those around. > > I hope their size is appropriately limited, but something tells me it > would be foolish to assume that. > >> But yes, fully agreed on the desirability of alloca() and things. > > Hmm, I wonder if -fno-builtin-alloca would prevent those... it looks > like it certainly would prevent an explicit alloca() call. Building with -Werror=vla is exciting. :) A lot of it is in crypto (which are relatively static sizes, just using function callbacks), but there is plenty more. -Kees
On Fri, May 12, 2017 at 2:00 PM, Kees Cook <keescook@chromium.org> wrote: > On Fri, May 12, 2017 at 1:45 PM, Russell King - ARM Linux > <linux@armlinux.org.uk> wrote: >> On Fri, May 12, 2017 at 10:30:44PM +0200, Peter Zijlstra wrote: >>> On Fri, May 12, 2017 at 09:21:06PM +0100, Russell King - ARM Linux wrote: >>> > On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote: >>> > > I'm clearly not explaining things well enough. I shouldn't say >>> > > "corruption", I should say "malicious manipulation". The methodology >>> > > of attacks against the stack are quite different from the other kinds >>> > > of attacks like use-after-free, heap overflow, etc. Being able to >>> > > exhaust the kernel stack (either due to deep recursion or unbounded >>> > > alloca()) >>> > >>> > I really hope we don't have alloca() use in the kernel. Do you have >>> > evidence to support that assertion? >>> > >>> > IMHO alloca() (or similar) should not be present in any kernel code >>> > because we have a limited stack - we have kmalloc() etc for that kind >>> > of thing. >>> >>> On stack variable length arrays get implemented by the compiler doing >>> alloca(), and we sadly have a few of those around. >> >> I hope their size is appropriately limited, but something tells me it >> would be foolish to assume that. >> >>> But yes, fully agreed on the desirability of alloca() and things. >> >> Hmm, I wonder if -fno-builtin-alloca would prevent those... it looks >> like it certainly would prevent an explicit alloca() call. > > Building with -Werror=vla is exciting. :) > > A lot of it is in crypto (which are relatively static sizes, just > using function callbacks), but there is plenty more. I meant to also paste an example (which is harmless, I haven't looked extensively at other examples): unsigned long alignmask = crypto_tfm_alg_alignmask(tfm); unsigned int size = crypto_tfm_alg_blocksize(tfm); u8 buffer[size + alignmask]; Looking at all the places (and having tried to remove a few of these in pstore), I think it might be quite frustrating to eliminate them all and then declare VLAs dead. I'm not against trying, though. :) -Kees
On Fri, May 12, 2017 at 09:21:06PM +0100, Russell King - ARM Linux wrote: > On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote: > > I'm clearly not explaining things well enough. I shouldn't say > > "corruption", I should say "malicious manipulation". The methodology > > of attacks against the stack are quite different from the other kinds > > of attacks like use-after-free, heap overflow, etc. Being able to > > exhaust the kernel stack (either due to deep recursion or unbounded > > alloca()) > > I really hope we don't have alloca() use in the kernel. Do you have > evidence to support that assertion? > > IMHO alloca() (or similar) should not be present in any kernel code > because we have a limited stack - we have kmalloc() etc for that kind > of thing. No alloca(), but there are VLAs. Said that, the whole "what if they can bugger thread_info and/or task_struct and go after set_fs() state" is idiocy, of course - in that case the box is fucked, no matter what.
On Fri, 2017-05-12 at 22:06 +0100, Al Viro wrote: > On Fri, May 12, 2017 at 09:21:06PM +0100, Russell King - ARM Linux > wrote: > > On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote: > > > I'm clearly not explaining things well enough. I shouldn't say > > > "corruption", I should say "malicious manipulation". The > > > methodology > > > of attacks against the stack are quite different from the other > > > kinds > > > of attacks like use-after-free, heap overflow, etc. Being able to > > > exhaust the kernel stack (either due to deep recursion or > > > unbounded > > > alloca()) > > > > I really hope we don't have alloca() use in the kernel. Do you have > > evidence to support that assertion? > > > > IMHO alloca() (or similar) should not be present in any kernel code > > because we have a limited stack - we have kmalloc() etc for that > > kind > > of thing. > > No alloca(), but there are VLAs. Said that, the whole "what if they > can bugger thread_info and/or task_struct and go after set_fs() state" > is idiocy, of course - in that case the box is fucked, no matter what. VMAP_STACK + -fstack-check would prevent exploiting even an unbounded VLA / alloca size vs. it being an arbitrary write. -fstack-check guarantees that there's one byte per page as the stack grows, although there are some unfortunate GCC bugs making it less than perfect right now... but they recently started caring about it more including making it near zero overhead as it was always supposed to be.
On Fri, May 12, 2017 at 2:06 PM, Al Viro <viro@zeniv.linux.org.uk> wrote: > On Fri, May 12, 2017 at 09:21:06PM +0100, Russell King - ARM Linux wrote: >> On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote: >> > I'm clearly not explaining things well enough. I shouldn't say >> > "corruption", I should say "malicious manipulation". The methodology >> > of attacks against the stack are quite different from the other kinds >> > of attacks like use-after-free, heap overflow, etc. Being able to >> > exhaust the kernel stack (either due to deep recursion or unbounded >> > alloca()) >> >> I really hope we don't have alloca() use in the kernel. Do you have >> evidence to support that assertion? >> >> IMHO alloca() (or similar) should not be present in any kernel code >> because we have a limited stack - we have kmalloc() etc for that kind >> of thing. > > No alloca(), but there are VLAs. Said that, the whole "what if they > can bugger thread_info and/or task_struct and go after set_fs() state" > is idiocy, of course - in that case the box is fucked, no matter what. Two things are at risk from stack exhaustion: thread_info (mainly addr_limit) when on the stack (fixed by THREAD_INFO_IN_TASK), and overflow into adjacent allocations (fixed by VMAP_STACK). The latter is fundamentally a heap overflow. -Kees
> overflow into adjacent allocations (fixed by VMAP_STACK).
99% fixed, but it's possible to skip over the guard page without
-fstack-check enabled (plus some edge cases need to be fixed in GCC),
unless VLAs were forbidden in addition to the existing large frame size
warning.
I'm not sure about in-tree code, but Qualcomm had some of these
improperly bounded VLA vulnerabilities in their MSM kernel...
On Fri, May 12, 2017 at 02:17:19PM -0700, Kees Cook wrote: > Two things are at risk from stack exhaustion: thread_info (mainly > addr_limit) when on the stack (fixed by THREAD_INFO_IN_TASK), and Really? Let's take a look at arm, for example: struct thread_info { unsigned long flags; /* low level flags */ int preempt_count; /* 0 => preemptable, <0 => bug */ mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ and current() is defined as current_thread_info()->task. Seriously, look at these beasts. Overwriting ->addr_limit is nowhere near the top threat. If attacker can overwrite thread_info, you have lost.
On Fri, 2017-05-12 at 22:41 +0100, Al Viro wrote: > On Fri, May 12, 2017 at 02:17:19PM -0700, Kees Cook wrote: > > > Two things are at risk from stack exhaustion: thread_info (mainly > > addr_limit) when on the stack (fixed by THREAD_INFO_IN_TASK), and > > Really? Let's take a look at arm, for example: > > struct thread_info { > unsigned long flags; /* low level flags */ > int preempt_count; /* 0 => preemptable, > <0 => bug */ > mm_segment_t addr_limit; /* address limit */ > struct task_struct *task; /* main task > structure */ > > and current() is defined as current_thread_info()->task. > > Seriously, look at these beasts. Overwriting ->addr_limit is nowhere > near > the top threat. If attacker can overwrite thread_info, you have > lost. That is why THREAD_INFO_IN_TASK exists. It moves the struct thread_info to a location away from the stack, which means a stack overflow will not overwrite the thread_info.
On Fri, May 12, 2017 at 2:41 PM, Al Viro <viro@zeniv.linux.org.uk> wrote: > On Fri, May 12, 2017 at 02:17:19PM -0700, Kees Cook wrote: > >> Two things are at risk from stack exhaustion: thread_info (mainly >> addr_limit) when on the stack (fixed by THREAD_INFO_IN_TASK), and > > Really? Let's take a look at arm, for example: > > struct thread_info { > unsigned long flags; /* low level flags */ > int preempt_count; /* 0 => preemptable, <0 => bug */ > mm_segment_t addr_limit; /* address limit */ > struct task_struct *task; /* main task structure */ > > and current() is defined as current_thread_info()->task. > > Seriously, look at these beasts. Overwriting ->addr_limit is nowhere near > the top threat. If attacker can overwrite thread_info, you have lost. I don't disagree, but the type of attack is different. If the attacker overwrites task_struct pointer, then they need to have built an false one, and that may be made difficult by PAN, or need to know more about kernel memory layout (rather than only stack depth), etc. Attacking addr_limit makes it very very easy to upgrade attack capabilities. I'm not say thread_info shouldn't be moved off the stack. -Kees
On Fri, May 12, 2017 at 05:47:55PM -0400, Rik van Riel wrote: > > Seriously, look at these beasts. Overwriting ->addr_limit is nowhere > > near > > the top threat. If attacker can overwrite thread_info, you have > > lost. > > That is why THREAD_INFO_IN_TASK exists. It moves > the struct thread_info to a location away from the > stack, which means a stack overflow will not overwrite > the thread_info. ... in which case such attacks on ->addr_limit also become a non-issue. AFAICS, we are mixing several unrelated issues here: * amount of places where set_fs() is called. Sure, reducing it is a good idea and we want to move to primitives like kernel_write() et.al. Fewer users => lower odds of screwing it up. * making sure that remaining callers are properly paired. Ditto. * switching to ->read_iter()/->write_iter() where it makes sense. Again, no problem with that. * providing sane environment for places like perf/oprofile. Again, a good idea, and set_fs(USER_DS) is only a part of what's needed there. * switching _everything_ to ->read_iter()/->write_iter(). Flat-out insane and AFAICS nobody is signing up for that. * getting rid of set_fs() entirely. I'm afraid that it's not feasible without the previous one and frankly, I don't see much point. * sanity-checking on return to userland. Maybe useful, maybe not. * taking thread_info out of the way of stack overflows. Reasonable, but has very little to do with the rest of that. * protecting against Lovecraftian horrors slithering in from the outer space only to commit unspeakable acts against ->addr_limit and ignoring much tastier targets next to it, but then what do you expect from degenerate spawn of Great Old Ones - sanity?
On Mon, May 8, 2017 at 1:48 PM, Al Viro <viro@zeniv.linux.org.uk> wrote: > On Mon, May 08, 2017 at 04:06:35PM +0200, Jann Horn wrote: > >> I think Kees might be talking about >> https://bugs.chromium.org/p/project-zero/issues/detail?id=822, fixed in >> commit e6978e4bf181fb3b5f8cb6f71b4fe30fbf1b655c. The issue was that >> perf code that can run in pretty much any context called access_ok(). > > And that commit has *NOT* solved the problem. perf_callchain_user() > can be called synchronously, without passing through that code. > Tracepoint shite... > > That set_fs() should be done in get_perf_callchain(), just around the call of > perf_callchain_user(). Along with pagefault_disable(), actually. > Even that's not quite enough because of a different issue: perf nmis can hit during scheduling or when we're in lazy mm, leading to the entirely wrong set of page tables being used. We need nmi_uaccess_begin() and nmi_uaccess_end(), and the former needs to be allowed to fail. AFAIK this isn't presently a security problem because it mainly affects kernel threads, and you need to be root to profile them, but maybe there's some race where it does matter.
On Fri, May 12, 2017 at 12:15 AM, Al Viro <viro@zeniv.linux.org.uk> wrote: > Folks, seriously, have you even looked through that zoo? I have, and it's > really, really not fun. Sure, we can say "fuck 'em, no need to allow > splice() on random crap". Would be perfectly reasonable, expect that > it's not the only place doing kernel_write() and its ilk... Can you clarify this? I think we really may be able to do exactly this. From Christoph's list, there are only two things that need kernel_read/kernel_write to user-supplied fds that may come from a variety of sources: splice and exec. If you're execing a chardev from a crappy driver, something is seriously wrong. And returning -EINVAL from splice() to or from files that use ->read and ->write seems find (and splice(2) even documents -EINVAL as meaning that the target doesn't support splicing). --Andy
On Fri, May 12, 2017 at 10:30:44PM +0200, Peter Zijlstra wrote: > On stack variable length arrays get implemented by the compiler doing > alloca(), and we sadly have a few of those around. I've just got rid of one of those and I wish they would appear entirely as they are horrible in so many different ways. Sparse warns about them, btw.
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index d25435d94b6e..3d2ec084d5fc 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -64,6 +64,7 @@ config ARCH_SUPPORTS_UPROBES config S390 def_bool y + select ADDR_LIMIT_CHECK select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 980c3c9b06f8..e534b93ce43a 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -191,6 +191,28 @@ extern struct trace_event_functions exit_syscall_print_funcs; SYSCALL_METADATA(sname, x, __VA_ARGS__) \ __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) + +/* + * Called before coming back to user-mode. Returning to user-mode with an + * address limit different than USER_DS can allow to overwrite kernel memory. + */ +static inline void addr_limit_check_syscall(void) +{ + BUG_ON(!segment_eq(get_fs(), USER_DS)); +} + +#ifndef CONFIG_ADDR_LIMIT_CHECK +#define ADDR_LIMIT_CHECK_PRE() \ + bool user_caller = segment_eq(get_fs(), USER_DS) +#define ADDR_LIMIT_CHECK_POST() \ + if (user_caller) addr_limit_check_syscall() +#else +#define ADDR_LIMIT_CHECK_PRE() +#define ADDR_LIMIT_CHECK_POST() +asmlinkage void addr_limit_check_failed(void) __noreturn; +#endif + + #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__) #define __SYSCALL_DEFINEx(x, name, ...) \ asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ @@ -199,7 +221,10 @@ extern struct trace_event_functions exit_syscall_print_funcs; asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ { \ - long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ + long ret; \ + ADDR_LIMIT_CHECK_PRE(); \ + ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ + ADDR_LIMIT_CHECK_POST(); \ __MAP(x,__SC_TEST,__VA_ARGS__); \ __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \ return ret; \ diff --git a/init/Kconfig b/init/Kconfig index 42a346b0df43..599d9fe30703 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1961,6 +1961,12 @@ config PROFILING config TRACEPOINTS bool +config ADDR_LIMIT_CHECK + bool + help + Disable the generic address limit check. Allow each architecture to + optimize how and when the verification is done. + source "arch/Kconfig" endmenu # General setup diff --git a/kernel/sys.c b/kernel/sys.c index 8a94b4eabcaa..a1cbcd715d62 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2458,3 +2458,16 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) return 0; } #endif /* CONFIG_COMPAT */ + +#ifdef CONFIG_ADDR_LIMIT_CHECK +/* + * Used when an architecture specific implementation detects an invalid address + * limit. This function does not return. + */ +asmlinkage void addr_limit_check_failed(void) +{ + /* Try to fail on the generic address limit check */ + addr_limit_check_syscall(); + panic("Invalid address limit before returning to user-mode"); +} +#endif