[v3,2/2] modules:capabilities: add a per-task modules autoload restriction
diff mbox

Message ID 1492640420-27345-3-git-send-email-tixxdz@gmail.com
State New
Headers show

Commit Message

Djalal Harouni April 19, 2017, 10:20 p.m. UTC
Previous patches added the global "modules_autoload" restriction. This patch
make it possible to support process trees, containers, and sandboxes by
providing an inherited per-task "modules_autoload" flag that cannot be
re-enabled once disabled. This allows to restrict automatic module
loading without affecting the rest of the system.

Any task can set its "modules_autoload". Once set, this setting is inherited
across fork, clone and execve. With "modules_autoload" set, automatic
module loading will have first to satisfy the per-task access permissions
before attempting to implicitly load the module. For example, automatic
loading of modules that contain bugs or vulnerabilities can be
restricted and untrusted users can no longer abuse such interfaces

To set modules_autoload, use prctl(PR_SET_MODULES_AUTOLOAD, value, 0, 0, 0).

When value is (0), the default, automatic modules loading is allowed.

When value is (1), task must have CAP_SYS_MODULE to be able to trigger a
module auto-load operation, or CAP_NET_ADMIN for modules with a
'netdev-%s' alias.

When value is (2), automatic modules loading is disabled for the current
task.

The 'modules_autoload' value may only be increased, never decreased, thus
ensuring that once applied, processes can never relax their setting.

When a request to a kernel module is denied, the module name with the
corresponding process name and its pid are logged. Administrators can use
such information to explicitly load the appropriate modules.

The per-task "modules_autoload" restriction:

Before:
$ lsmod | grep ipip -
$ sudo ip tunnel add mytun mode ipip remote 10.0.2.100 local 10.0.2.15 ttl 255
$ lsmod | grep ipip -
ipip                   16384  0
tunnel4                16384  1 ipip
ip_tunnel              28672  1 ipip

After:
$ lsmod | grep ipip -
$ ./pr_modules_autoload
$ grep "Modules" /proc/self/status
ModulesAutoload:        2
$ cat /proc/sys/kernel/modules_autoload
0
$ sudo ip tunnel add mytun mode ipip remote 10.0.2.100 local 10.0.2.15 ttl 255
add tunnel "tunl0" failed: No such device
$ lsmod | grep ipip
$ dmesg | tail -3
[   16.363903] virbr0: port 1(virbr0-nic) entered disabled state
[  823.565958] Automatic module loading of netdev-tunl0 by "ip"[1362] was denied
[  823.565967] Automatic module loading of tunl0 by "ip"[1362] was denied

Cc: Serge Hallyn <serge@hallyn.com>
Cc: Andy Lutomirski <luto@kernel.org>
Suggested-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Djalal Harouni <tixxdz@gmail.com>
---
 Documentation/filesystems/proc.txt       |  3 ++
 Documentation/prctl/modules_autoload.txt | 49 +++++++++++++++++++++++++++++++
 fs/proc/array.c                          |  6 ++++
 include/linux/module.h                   | 48 ++++++++++++++++++++++++++++--
 include/linux/sched.h                    |  5 ++++
 include/linux/security.h                 |  2 +-
 include/uapi/linux/prctl.h               |  8 +++++
 kernel/fork.c                            |  4 +++
 kernel/module.c                          | 17 +++++++----
 security/commoncap.c                     | 50 ++++++++++++++++++++++++++++----
 10 files changed, 178 insertions(+), 14 deletions(-)
 create mode 100644 Documentation/prctl/modules_autoload.txt

Comments

Djalal Harouni April 19, 2017, 10:38 p.m. UTC | #1
On Thu, Apr 20, 2017 at 12:20 AM, Djalal Harouni <tixxdz@gmail.com> wrote:
[...]
> +/* Returns task's modules_autoload */
> +static inline void task_copy_modules_autoload(struct task_struct *dest,
> +                                             struct task_struct *src)
> +{
> +       dest->modules_autoload = src->modules_autoload;
> +}

Kees Cook just pointed out that this hook is not needed since we
already dup everything from parent to child.


[...]
>
> diff --git a/include/linux/security.h b/include/linux/security.h
> index e274bb11..9581cc5 100644
> --- a/include/linux/security.h
> +++ b/include/linux/security.h
> @@ -866,7 +866,7 @@ static inline int security_task_create(unsigned long clone_flags)
>  static inline int security_task_alloc(struct task_struct *task,
>                                       unsigned long clone_flags)
>  {
> -       return 0;
> +       return cap_task_alloc(task, clone_flags);
>  }

Will remove it in next iteration.


>  static inline void security_task_free(struct task_struct *task)
> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> index a8d0759..0244264 100644
> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -197,4 +197,12 @@ struct prctl_mm_map {
>  # define PR_CAP_AMBIENT_LOWER          3
>  # define PR_CAP_AMBIENT_CLEAR_ALL      4
>
> +/*
> + * Control the per-task "modules_autoload" access.
> + *
> + * See Documentation/prctl/modules_autoload.txt for more details.
> + */
> +#define PR_SET_MODULES_AUTOLOAD                48
> +#define PR_GET_MODULES_AUTOLOAD                49
> +
>  #endif /* _LINUX_PRCTL_H */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 81347bd..141e06b 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1695,6 +1695,10 @@ static __latent_entropy struct task_struct *copy_process(
>         p->sequential_io_avg    = 0;
>  #endif
>
> +#ifdef CONFIG_MODULES
> +       p->modules_autoload     = 0;
> +#endif
> +
>         /* Perform scheduler related setup. Assign this task to a CPU. */
>         retval = sched_fork(clone_flags, p);
>         if (retval)
> diff --git a/kernel/module.c b/kernel/module.c
> index 54cb6e0..e1eca74 100644
> --- a/kernel/module.c
> +++ b/kernel/module.c
> @@ -4313,19 +4313,24 @@ static int modules_autoload_privileged_access(const char *name)
>  }
>
>  /**
> - * modules_autoload_access - Determine whether a module auto-load is permitted
> + * modules_autoload_access - Determine whether the task is allowed to perform a
> + *                          module auto-load request
> + * @task: The task performing the request
>   * @kmod_name: The module name
>   *
> - * Determine whether a module should be automatically loaded or not. The check
> - * uses the sysctl "modules_autoload" value.
> + * Determine whether the task is allowed to perform a module auto-load request.
> + * This checks the per-task "modules_autoload" flag, if the access is not denied,
> + * then the global sysctl "modules_autoload" is evaluated.
>   *
>   * Returns 0 if the module request is allowed or -EPERM if not.
>   */
> -int modules_autoload_access(char *kmod_name)
> +int modules_autoload_access(struct task_struct *task, char *kmod_name)
>  {
> -       if (modules_autoload == MODULES_AUTOLOAD_ALLOWED)
> +       unsigned int autoload = max_t(unsigned int,
> +                                     modules_autoload, task->modules_autoload);
> +       if (autoload == MODULES_AUTOLOAD_ALLOWED)
>                 return 0;
> -       else if (modules_autoload == MODULES_AUTOLOAD_PRIVILEGED)
> +       else if (autoload == MODULES_AUTOLOAD_PRIVILEGED)
>                 return modules_autoload_privileged_access(kmod_name);
>
>         /* MODULES_AUTOLOAD_DISABLED */
> diff --git a/security/commoncap.c b/security/commoncap.c
> index 67a6cfe..bcc1e09 100644
> --- a/security/commoncap.c
> +++ b/security/commoncap.c
> @@ -886,6 +886,40 @@ static int cap_prctl_drop(unsigned long cap)
>         return commit_creds(new);
>  }
>
> +static int pr_set_mod_autoload(unsigned long arg2, unsigned long arg3,
> +                              unsigned long arg4, unsigned long arg5)
> +{
> +       if (arg3 || arg4 || arg5)
> +               return -EINVAL;
> +
> +       return task_set_modules_autoload(current, arg2);
> +}
> +
> +static inline int pr_get_mod_autoload(unsigned long arg2, unsigned long arg3,
> +                                     unsigned long arg4, unsigned long arg5)
> +{
> +       if (arg2 || arg3 || arg4 || arg5)
> +               return -EINVAL;
> +
> +       return task_modules_autoload(current);
> +}
> +
> +/**
> + * cap_task_alloc - Implement process context allocation for this security module
> + * @task: task being allocated
> + * @clone_flags: contains the clone flags indicating what should be shared.
> + *
> + * Allocate or initialize the task context for this security module.
> + *
> + * Returns 0.
> + */
> +int cap_task_alloc(struct task_struct *task, unsigned long clone_flags)
> +{
> +       task_copy_modules_autoload(task, current);
> +
> +       return 0;
> +}

All the calls to initialize task->modules_autoload or dup its value
using the task_alloc will be removed in next iteration as pointed out
by Kees.

Thanks!
Andy Lutomirski April 19, 2017, 11:15 p.m. UTC | #2
On Wed, Apr 19, 2017 at 3:20 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
> Previous patches added the global "modules_autoload" restriction. This patch
> make it possible to support process trees, containers, and sandboxes by
> providing an inherited per-task "modules_autoload" flag that cannot be
> re-enabled once disabled. This allows to restrict automatic module
> loading without affecting the rest of the system.
>
> Any task can set its "modules_autoload". Once set, this setting is inherited
> across fork, clone and execve. With "modules_autoload" set, automatic
> module loading will have first to satisfy the per-task access permissions
> before attempting to implicitly load the module. For example, automatic
> loading of modules that contain bugs or vulnerabilities can be
> restricted and untrusted users can no longer abuse such interfaces
>
> To set modules_autoload, use prctl(PR_SET_MODULES_AUTOLOAD, value, 0, 0, 0).
>
> When value is (0), the default, automatic modules loading is allowed.
>
> When value is (1), task must have CAP_SYS_MODULE to be able to trigger a
> module auto-load operation, or CAP_NET_ADMIN for modules with a
> 'netdev-%s' alias.
>
> When value is (2), automatic modules loading is disabled for the current
> task.
>
> The 'modules_autoload' value may only be increased, never decreased, thus
> ensuring that once applied, processes can never relax their setting.
>
> When a request to a kernel module is denied, the module name with the
> corresponding process name and its pid are logged. Administrators can use
> such information to explicitly load the appropriate modules.
>
> The per-task "modules_autoload" restriction:
>
> Before:
> $ lsmod | grep ipip -
> $ sudo ip tunnel add mytun mode ipip remote 10.0.2.100 local 10.0.2.15 ttl 255
> $ lsmod | grep ipip -
> ipip                   16384  0
> tunnel4                16384  1 ipip
> ip_tunnel              28672  1 ipip
>
> After:
> $ lsmod | grep ipip -
> $ ./pr_modules_autoload
> $ grep "Modules" /proc/self/status
> ModulesAutoload:        2
> $ cat /proc/sys/kernel/modules_autoload
> 0
> $ sudo ip tunnel add mytun mode ipip remote 10.0.2.100 local 10.0.2.15 ttl 255
> add tunnel "tunl0" failed: No such device
> $ lsmod | grep ipip
> $ dmesg | tail -3
> [   16.363903] virbr0: port 1(virbr0-nic) entered disabled state
> [  823.565958] Automatic module loading of netdev-tunl0 by "ip"[1362] was denied
> [  823.565967] Automatic module loading of tunl0 by "ip"[1362] was denied
>
> Cc: Serge Hallyn <serge@hallyn.com>
> Cc: Andy Lutomirski <luto@kernel.org>
> Suggested-by: Kees Cook <keescook@chromium.org>
> Signed-off-by: Djalal Harouni <tixxdz@gmail.com>
> ---
>  Documentation/filesystems/proc.txt       |  3 ++
>  Documentation/prctl/modules_autoload.txt | 49 +++++++++++++++++++++++++++++++
>  fs/proc/array.c                          |  6 ++++
>  include/linux/module.h                   | 48 ++++++++++++++++++++++++++++--
>  include/linux/sched.h                    |  5 ++++
>  include/linux/security.h                 |  2 +-
>  include/uapi/linux/prctl.h               |  8 +++++
>  kernel/fork.c                            |  4 +++
>  kernel/module.c                          | 17 +++++++----
>  security/commoncap.c                     | 50 ++++++++++++++++++++++++++++----
>  10 files changed, 178 insertions(+), 14 deletions(-)
>  create mode 100644 Documentation/prctl/modules_autoload.txt
>
> diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
> index 4cddbce..df4d145 100644
> --- a/Documentation/filesystems/proc.txt
> +++ b/Documentation/filesystems/proc.txt
> @@ -194,6 +194,7 @@ read the file /proc/PID/status:
>    CapBnd: ffffffffffffffff
>    NoNewPrivs:     0
>    Seccomp:        0
> +  ModulesAutoload:      0
>    voluntary_ctxt_switches:        0
>    nonvoluntary_ctxt_switches:     1
>
> @@ -267,6 +268,8 @@ Table 1-2: Contents of the status files (as of 4.8)
>   CapBnd                      bitmap of capabilities bounding set
>   NoNewPrivs                  no_new_privs, like prctl(PR_GET_NO_NEW_PRIV, ...)
>   Seccomp                     seccomp mode, like prctl(PR_GET_SECCOMP, ...)
> + ModulesAutoload             modules autoload, like
> +                             prctl(PR_GET_MODULES_AUTOLOAD, ...)
>   Cpus_allowed                mask of CPUs on which this process may run
>   Cpus_allowed_list           Same as previous, but in "list format"
>   Mems_allowed                mask of memory nodes allowed to this process
> diff --git a/Documentation/prctl/modules_autoload.txt b/Documentation/prctl/modules_autoload.txt
> new file mode 100644
> index 0000000..242852e
> --- /dev/null
> +++ b/Documentation/prctl/modules_autoload.txt
> @@ -0,0 +1,49 @@
> +A request to a kernel feature that is implemented by a module that is
> +not loaded may trigger the module auto-load feature, allowing to
> +transparently satisfy userspace. In this case an implicit kernel module
> +load operation happens.
> +
> +Usually to load or unload a kernel module, an explicit operation happens
> +where programs are required to have some capabilities in order to perform
> +such operations. However, with the implicit module loading, no
> +capabilities are required, anyone who is able to request a certain kernel
> +feature, may also implicitly load its corresponding kernel module. This
> +operation can be abused by unprivileged users to expose kernel interfaces
> +that maybe privileged users did not want to be made available for various
> +reasons: resources, bugs, vulnerabilties, etc. The DCCP vulnerability is
> +(CVE-2017-6074) is one real example.
> +
> +The new per-task "modules_autoload" flag, is a new way to restrict
> +automatic module loading, preventing the kernel from exposing more of
> +its interface. This particularly useful for containers and sandboxes
> +where sandboxed processes should affect the rest of the system.
> +
> +Any task can set "modules_autoload". Once set, this setting is inherited
> +across fork, clone and execve. With "modules_autoload" set, automatic
> +module loading will have first to satisfy the per-task access permissions
> +before attempting to implicitly load the module. For example, automatic
> +loading of modules that contain bugs or vulnerabilities can be
> +restricted and imprivileged users can no longer abuse such interfaces.
> +
> +To set modules_autoload, use prctl(PR_SET_MODULES_AUTOLOAD, value, 0, 0, 0).
> +
> +When value is (0), the default, automatic modules loading is allowed.
> +
> +When value is (1), task must have CAP_SYS_MODULE to be able to trigger a
> +module auto-load operation, or CAP_NET_ADMIN for modules with a
> +'netdev-%s' alias.
> +
> +When value is (2), automatic modules loading is disabled for the current
> +task.
> +
> +The 'modules_autoload' value may only be increased, never decreased, thus
> +ensuring that once applied, processes can never relax their setting.
> +
> +When a request to a kernel module is denied, the module name with the
> +corresponding process name and its pid are logged. Administrators can use
> +such information to explicitly load the appropriate modules.
> +
> +Please note that even if the per-task "modules_autoload" value allows to
> +auto-load the corresponding module, automatic module loading may still
> +fail due to the global "modules_autoload" sysctl. For more details please
> +see "modules_autoload" in Documentation/sysctl/kernel.txt
> diff --git a/fs/proc/array.c b/fs/proc/array.c
> index 88c3555..cbcf087 100644
> --- a/fs/proc/array.c
> +++ b/fs/proc/array.c
> @@ -88,6 +88,7 @@
>  #include <linux/string_helpers.h>
>  #include <linux/user_namespace.h>
>  #include <linux/fs_struct.h>
> +#include <linux/module.h>
>
>  #include <asm/pgtable.h>
>  #include <asm/processor.h>
> @@ -346,10 +347,15 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
>
>  static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
>  {
> +       int autoload = task_modules_autoload(p);
> +
>         seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p));
>  #ifdef CONFIG_SECCOMP
>         seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
>  #endif
> +       if (autoload != -ENOSYS)
> +               seq_put_decimal_ull(m, "\nModulesAutoload:\t", autoload);
> +
>         seq_putc(m, '\n');
>  }
>
> diff --git a/include/linux/module.h b/include/linux/module.h
> index 4b96c10..595800f 100644
> --- a/include/linux/module.h
> +++ b/include/linux/module.h
> @@ -13,6 +13,7 @@
>  #include <linux/kmod.h>
>  #include <linux/init.h>
>  #include <linux/elf.h>
> +#include <linux/sched.h>
>  #include <linux/stringify.h>
>  #include <linux/kobject.h>
>  #include <linux/moduleparam.h>
> @@ -506,7 +507,33 @@ bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr);
>  bool is_module_percpu_address(unsigned long addr);
>  bool is_module_text_address(unsigned long addr);
>
> -int modules_autoload_access(char *kmod_name);
> +int modules_autoload_access(struct task_struct *task, char *kmod_name);
> +
> +/* Sets task's modules_autoload */
> +static inline int task_set_modules_autoload(struct task_struct *task,
> +                                           unsigned long value)
> +{
> +       if (value > MODULES_AUTOLOAD_DISABLED)
> +               return -EINVAL;
> +       else if (task->modules_autoload > value)
> +               return -EPERM;
> +       else if (task->modules_autoload < value)
> +               task->modules_autoload = value;
> +
> +       return 0;
> +}

This needs to be more locked down.  Otherwise someone could set this
and then run a setuid program.  Admittedly, it would be quite odd if
this particular thing causes a problem, but the issue exists
nonetheless.

More generally, I think this feature would fit in fairly nicely with
my "implicit rights" idea.  Unfortunately, Linus hated it, but maybe
if I actually implemented it, he wouldn't hate it so much.
Kees Cook April 19, 2017, 11:43 p.m. UTC | #3
On Wed, Apr 19, 2017 at 4:15 PM, Andy Lutomirski <luto@kernel.org> wrote:
> On Wed, Apr 19, 2017 at 3:20 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>> +/* Sets task's modules_autoload */
>> +static inline int task_set_modules_autoload(struct task_struct *task,
>> +                                           unsigned long value)
>> +{
>> +       if (value > MODULES_AUTOLOAD_DISABLED)
>> +               return -EINVAL;
>> +       else if (task->modules_autoload > value)
>> +               return -EPERM;
>> +       else if (task->modules_autoload < value)
>> +               task->modules_autoload = value;
>> +
>> +       return 0;
>> +}
>
> This needs to be more locked down.  Otherwise someone could set this
> and then run a setuid program.  Admittedly, it would be quite odd if
> this particular thing causes a problem, but the issue exists
> nonetheless.

Eeeh, I don't agree this needs to be changed. APIs provided by modules
are different than the existing privilege-manipulation syscalls this
concern stems from. Applications are already forced to deal with
things being missing like this in the face of it simply not being
built into the kernel.

Having to hide this behind nnp seems like it'd reduce its utility...

-Kees
kernel test robot April 20, 2017, 1:57 a.m. UTC | #4
Hi Djalal,

[auto build test ERROR on security/next]
[also build test ERROR on next-20170419]
[cannot apply to linus/master v4.11-rc7]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Djalal-Harouni/modules-capabilities-automatic-module-loading-restrictions/20170420-093603
base:   https://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security.git next
config: x86_64-randconfig-x017-201716 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

   In file included from include/net/scm.h:7:0,
                    from include/linux/netlink.h:8,
                    from include/uapi/linux/neighbour.h:5,
                    from include/linux/netdevice.h:51,
                    from include/net/sock.h:51,
                    from fs//ocfs2/cluster/tcp.h:32,
                    from fs//ocfs2/cluster/heartbeat.c:41:
   include/linux/security.h: In function 'security_task_alloc':
>> include/linux/security.h:869:9: error: implicit declaration of function 'cap_task_alloc' [-Werror=implicit-function-declaration]
     return cap_task_alloc(task, clone_flags);
            ^~~~~~~~~~~~~~
   cc1: some warnings being treated as errors

vim +/cap_task_alloc +869 include/linux/security.h

   863		return 0;
   864	}
   865	
   866	static inline int security_task_alloc(struct task_struct *task,
   867					      unsigned long clone_flags)
   868	{
 > 869		return cap_task_alloc(task, clone_flags);
   870	}
   871	
   872	static inline void security_task_free(struct task_struct *task)

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Andy Lutomirski April 20, 2017, 2:41 a.m. UTC | #5
On Wed, Apr 19, 2017 at 4:43 PM, Kees Cook <keescook@chromium.org> wrote:
> On Wed, Apr 19, 2017 at 4:15 PM, Andy Lutomirski <luto@kernel.org> wrote:
>> On Wed, Apr 19, 2017 at 3:20 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>>> +/* Sets task's modules_autoload */
>>> +static inline int task_set_modules_autoload(struct task_struct *task,
>>> +                                           unsigned long value)
>>> +{
>>> +       if (value > MODULES_AUTOLOAD_DISABLED)
>>> +               return -EINVAL;
>>> +       else if (task->modules_autoload > value)
>>> +               return -EPERM;
>>> +       else if (task->modules_autoload < value)
>>> +               task->modules_autoload = value;
>>> +
>>> +       return 0;
>>> +}
>>
>> This needs to be more locked down.  Otherwise someone could set this
>> and then run a setuid program.  Admittedly, it would be quite odd if
>> this particular thing causes a problem, but the issue exists
>> nonetheless.
>
> Eeeh, I don't agree this needs to be changed. APIs provided by modules
> are different than the existing privilege-manipulation syscalls this
> concern stems from. Applications are already forced to deal with
> things being missing like this in the face of it simply not being
> built into the kernel.
>
> Having to hide this behind nnp seems like it'd reduce its utility...
>

I think that adding an inherited boolean to task_struct that can be
set by unprivileged tasks and passed to privileged tasks is a terrible
precedent.  Ideally someone would try to find all the existing things
like this and kill them off.

I agree that I don't see how one would exploit this particular
feature, but I still think I dislike the approach.  This is a slippery
slope to adding a boolean for perf_event_open(), unshare(), etc, and
we should solve these for real rather than half-arsing them IMO.
Kees Cook April 21, 2017, 11:19 p.m. UTC | #6
On Wed, Apr 19, 2017 at 7:41 PM, Andy Lutomirski <luto@kernel.org> wrote:
> On Wed, Apr 19, 2017 at 4:43 PM, Kees Cook <keescook@chromium.org> wrote:
>> On Wed, Apr 19, 2017 at 4:15 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>> On Wed, Apr 19, 2017 at 3:20 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>>>> +/* Sets task's modules_autoload */
>>>> +static inline int task_set_modules_autoload(struct task_struct *task,
>>>> +                                           unsigned long value)
>>>> +{
>>>> +       if (value > MODULES_AUTOLOAD_DISABLED)
>>>> +               return -EINVAL;
>>>> +       else if (task->modules_autoload > value)
>>>> +               return -EPERM;
>>>> +       else if (task->modules_autoload < value)
>>>> +               task->modules_autoload = value;
>>>> +
>>>> +       return 0;
>>>> +}
>>>
>>> This needs to be more locked down.  Otherwise someone could set this
>>> and then run a setuid program.  Admittedly, it would be quite odd if
>>> this particular thing causes a problem, but the issue exists
>>> nonetheless.
>>
>> Eeeh, I don't agree this needs to be changed. APIs provided by modules
>> are different than the existing privilege-manipulation syscalls this
>> concern stems from. Applications are already forced to deal with
>> things being missing like this in the face of it simply not being
>> built into the kernel.
>>
>> Having to hide this behind nnp seems like it'd reduce its utility...
>>
>
> I think that adding an inherited boolean to task_struct that can be
> set by unprivileged tasks and passed to privileged tasks is a terrible
> precedent.  Ideally someone would try to find all the existing things
> like this and kill them off.

(Tristate, not boolean, but yeah.)

I see two others besides seccomp and nnp:

PR_MCE_KILL
PR_SET_THP_DISABLE

I really don't think this needs nnp protection.

> I agree that I don't see how one would exploit this particular
> feature, but I still think I dislike the approach.  This is a slippery
> slope to adding a boolean for perf_event_open(), unshare(), etc, and
> we should solve these for real rather than half-arsing them IMO.

I disagree (obviously); this would be protecting the entire module
autoload attack surface. That's hardly a specific control, and it's a
demonstrably needed flag.

-Kees
Andy Lutomirski April 21, 2017, 11:28 p.m. UTC | #7
On Fri, Apr 21, 2017 at 4:19 PM, Kees Cook <keescook@chromium.org> wrote:
> On Wed, Apr 19, 2017 at 7:41 PM, Andy Lutomirski <luto@kernel.org> wrote:
>> On Wed, Apr 19, 2017 at 4:43 PM, Kees Cook <keescook@chromium.org> wrote:
>>> On Wed, Apr 19, 2017 at 4:15 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>> On Wed, Apr 19, 2017 at 3:20 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>>>>> +/* Sets task's modules_autoload */
>>>>> +static inline int task_set_modules_autoload(struct task_struct *task,
>>>>> +                                           unsigned long value)
>>>>> +{
>>>>> +       if (value > MODULES_AUTOLOAD_DISABLED)
>>>>> +               return -EINVAL;
>>>>> +       else if (task->modules_autoload > value)
>>>>> +               return -EPERM;
>>>>> +       else if (task->modules_autoload < value)
>>>>> +               task->modules_autoload = value;
>>>>> +
>>>>> +       return 0;
>>>>> +}
>>>>
>>>> This needs to be more locked down.  Otherwise someone could set this
>>>> and then run a setuid program.  Admittedly, it would be quite odd if
>>>> this particular thing causes a problem, but the issue exists
>>>> nonetheless.
>>>
>>> Eeeh, I don't agree this needs to be changed. APIs provided by modules
>>> are different than the existing privilege-manipulation syscalls this
>>> concern stems from. Applications are already forced to deal with
>>> things being missing like this in the face of it simply not being
>>> built into the kernel.
>>>
>>> Having to hide this behind nnp seems like it'd reduce its utility...
>>>
>>
>> I think that adding an inherited boolean to task_struct that can be
>> set by unprivileged tasks and passed to privileged tasks is a terrible
>> precedent.  Ideally someone would try to find all the existing things
>> like this and kill them off.
>
> (Tristate, not boolean, but yeah.)
>
> I see two others besides seccomp and nnp:
>
> PR_MCE_KILL

Well, that's interesting.  That should presumably be reset on setuid
exec or something.

> PR_SET_THP_DISABLE

Um.  At least that's just a performance issue.

>
> I really don't think this needs nnp protection.
>
>> I agree that I don't see how one would exploit this particular
>> feature, but I still think I dislike the approach.  This is a slippery
>> slope to adding a boolean for perf_event_open(), unshare(), etc, and
>> we should solve these for real rather than half-arsing them IMO.
>
> I disagree (obviously); this would be protecting the entire module
> autoload attack surface. That's hardly a specific control, and it's a
> demonstrably needed flag.
>

The list is just going to get longer.  We should probably have controls for:

 - Use of perf.  Unclear how fine grained they should be.

 - Creation of new user namespaces.  Possibly also use of things like
iptables without global privilege.

 - Ability to look up tasks owned by different uids (or maybe other
tasks *at all*) by pid/tid.  Conceptually, this is easy.  The API is
the only hard part, I think.

 - Ability to bind ports, maybe?

My point is that all of these need some way to handle configuration
and inheritance, and I don't think that a bunch of per-task prctls is
the right way.  As just an example, saying that interactive users can
autoload modules but other users can't, or that certain systemd
services can, etc, might be nice.  Linus already complained that he
(i.e. user "torvalds" or whatever) should be able to profile the
kernel but that other uids should not be able to.

I personally like my implicit_rights idea, and it might be interesting
to prototype it.
Kees Cook April 21, 2017, 11:40 p.m. UTC | #8
On Fri, Apr 21, 2017 at 4:28 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Fri, Apr 21, 2017 at 4:19 PM, Kees Cook <keescook@chromium.org> wrote:
>> On Wed, Apr 19, 2017 at 7:41 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>> On Wed, Apr 19, 2017 at 4:43 PM, Kees Cook <keescook@chromium.org> wrote:
>>>> On Wed, Apr 19, 2017 at 4:15 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>> On Wed, Apr 19, 2017 at 3:20 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>>>>>> +/* Sets task's modules_autoload */
>>>>>> +static inline int task_set_modules_autoload(struct task_struct *task,
>>>>>> +                                           unsigned long value)
>>>>>> +{
>>>>>> +       if (value > MODULES_AUTOLOAD_DISABLED)
>>>>>> +               return -EINVAL;
>>>>>> +       else if (task->modules_autoload > value)
>>>>>> +               return -EPERM;
>>>>>> +       else if (task->modules_autoload < value)
>>>>>> +               task->modules_autoload = value;
>>>>>> +
>>>>>> +       return 0;
>>>>>> +}
>>>>>
>>>>> This needs to be more locked down.  Otherwise someone could set this
>>>>> and then run a setuid program.  Admittedly, it would be quite odd if
>>>>> this particular thing causes a problem, but the issue exists
>>>>> nonetheless.
>>>>
>>>> Eeeh, I don't agree this needs to be changed. APIs provided by modules
>>>> are different than the existing privilege-manipulation syscalls this
>>>> concern stems from. Applications are already forced to deal with
>>>> things being missing like this in the face of it simply not being
>>>> built into the kernel.
>>>>
>>>> Having to hide this behind nnp seems like it'd reduce its utility...
>>>>
>>>
>>> I think that adding an inherited boolean to task_struct that can be
>>> set by unprivileged tasks and passed to privileged tasks is a terrible
>>> precedent.  Ideally someone would try to find all the existing things
>>> like this and kill them off.
>>
>> (Tristate, not boolean, but yeah.)
>>
>> I see two others besides seccomp and nnp:
>>
>> PR_MCE_KILL
>
> Well, that's interesting.  That should presumably be reset on setuid
> exec or something.
>
>> PR_SET_THP_DISABLE
>
> Um.  At least that's just a performance issue.
>
>>
>> I really don't think this needs nnp protection.
>>
>>> I agree that I don't see how one would exploit this particular
>>> feature, but I still think I dislike the approach.  This is a slippery
>>> slope to adding a boolean for perf_event_open(), unshare(), etc, and
>>> we should solve these for real rather than half-arsing them IMO.
>>
>> I disagree (obviously); this would be protecting the entire module
>> autoload attack surface. That's hardly a specific control, and it's a
>> demonstrably needed flag.
>>
>
> The list is just going to get longer.  We should probably have controls for:
>
>  - Use of perf.  Unclear how fine grained they should be.

This can already be "given up" by a process by using seccomp. The
system-wide setting is what's missing here, and that's a whole other
thread already even though basically every distro has implemented the
= 3 sysctl knob level.

>  - Creation of new user namespaces.  Possibly also use of things like
> iptables without global privilege.

This is another one that can be controlled by seccomp. The system-wide
setting already exists in /proc/sys/user/max_user_namespaces.

>  - Ability to look up tasks owned by different uids (or maybe other
> tasks *at all*) by pid/tid.  Conceptually, this is easy.  The API is
> the only hard part, I think.

The attack surface here is relatively small compared to the other examples.

>  - Ability to bind ports, maybe?

seccomp and maybe a sysctl? I'd have to look at that more carefully,
but again, this isn't a comparable attack-surface/confinement issue.

> My point is that all of these need some way to handle configuration
> and inheritance, and I don't think that a bunch of per-task prctls is
> the right way.  As just an example, saying that interactive users can
> autoload modules but other users can't, or that certain systemd
> services can, etc, might be nice.  Linus already complained that he
> (i.e. user "torvalds" or whatever) should be able to profile the
> kernel but that other uids should not be able to.
>
> I personally like my implicit_rights idea, and it might be interesting
> to prototype it.

I don't like blocking a needed feature behind a large super-feature
that doesn't exist yet. We'd be able to refactor this code into using
such a thing in the future, so I'd prefer to move ahead with this
since it would stop actual exploits.

-Kees
Andy Lutomirski April 21, 2017, 11:51 p.m. UTC | #9
On Fri, Apr 21, 2017 at 4:40 PM, Kees Cook <keescook@chromium.org> wrote:
> On Fri, Apr 21, 2017 at 4:28 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Fri, Apr 21, 2017 at 4:19 PM, Kees Cook <keescook@chromium.org> wrote:
>>> On Wed, Apr 19, 2017 at 7:41 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>> On Wed, Apr 19, 2017 at 4:43 PM, Kees Cook <keescook@chromium.org> wrote:
>>>>> On Wed, Apr 19, 2017 at 4:15 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>>> On Wed, Apr 19, 2017 at 3:20 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>>>>>>> +/* Sets task's modules_autoload */
>>>>>>> +static inline int task_set_modules_autoload(struct task_struct *task,
>>>>>>> +                                           unsigned long value)
>>>>>>> +{
>>>>>>> +       if (value > MODULES_AUTOLOAD_DISABLED)
>>>>>>> +               return -EINVAL;
>>>>>>> +       else if (task->modules_autoload > value)
>>>>>>> +               return -EPERM;
>>>>>>> +       else if (task->modules_autoload < value)
>>>>>>> +               task->modules_autoload = value;
>>>>>>> +
>>>>>>> +       return 0;
>>>>>>> +}
>>>>>>
>>>>>> This needs to be more locked down.  Otherwise someone could set this
>>>>>> and then run a setuid program.  Admittedly, it would be quite odd if
>>>>>> this particular thing causes a problem, but the issue exists
>>>>>> nonetheless.
>>>>>
>>>>> Eeeh, I don't agree this needs to be changed. APIs provided by modules
>>>>> are different than the existing privilege-manipulation syscalls this
>>>>> concern stems from. Applications are already forced to deal with
>>>>> things being missing like this in the face of it simply not being
>>>>> built into the kernel.
>>>>>
>>>>> Having to hide this behind nnp seems like it'd reduce its utility...
>>>>>
>>>>
>>>> I think that adding an inherited boolean to task_struct that can be
>>>> set by unprivileged tasks and passed to privileged tasks is a terrible
>>>> precedent.  Ideally someone would try to find all the existing things
>>>> like this and kill them off.
>>>
>>> (Tristate, not boolean, but yeah.)
>>>
>>> I see two others besides seccomp and nnp:
>>>
>>> PR_MCE_KILL
>>
>> Well, that's interesting.  That should presumably be reset on setuid
>> exec or something.
>>
>>> PR_SET_THP_DISABLE
>>
>> Um.  At least that's just a performance issue.
>>
>>>
>>> I really don't think this needs nnp protection.
>>>
>>>> I agree that I don't see how one would exploit this particular
>>>> feature, but I still think I dislike the approach.  This is a slippery
>>>> slope to adding a boolean for perf_event_open(), unshare(), etc, and
>>>> we should solve these for real rather than half-arsing them IMO.
>>>
>>> I disagree (obviously); this would be protecting the entire module
>>> autoload attack surface. That's hardly a specific control, and it's a
>>> demonstrably needed flag.
>>>
>>
>> The list is just going to get longer.  We should probably have controls for:
>>
>>  - Use of perf.  Unclear how fine grained they should be.
>
> This can already be "given up" by a process by using seccomp. The
> system-wide setting is what's missing here, and that's a whole other
> thread already even though basically every distro has implemented the
> = 3 sysctl knob level.

But it can't be done the way Linus wants it, and I don't blame him for
complaining.

>
>>  - Creation of new user namespaces.  Possibly also use of things like
>> iptables without global privilege.
>
> This is another one that can be controlled by seccomp. The system-wide
> setting already exists in /proc/sys/user/max_user_namespaces.

Awkwardly, though.

>
>>  - Ability to look up tasks owned by different uids (or maybe other
>> tasks *at all*) by pid/tid.  Conceptually, this is easy.  The API is
>> the only hard part, I think.
>
> The attack surface here is relatively small compared to the other examples.
>
>>  - Ability to bind ports, maybe?
>
> seccomp and maybe a sysctl? I'd have to look at that more carefully,
> but again, this isn't a comparable attack-surface/confinement issue.
>
>> My point is that all of these need some way to handle configuration
>> and inheritance, and I don't think that a bunch of per-task prctls is
>> the right way.  As just an example, saying that interactive users can
>> autoload modules but other users can't, or that certain systemd
>> services can, etc, might be nice.  Linus already complained that he
>> (i.e. user "torvalds" or whatever) should be able to profile the
>> kernel but that other uids should not be able to.
>>
>> I personally like my implicit_rights idea, and it might be interesting
>> to prototype it.
>
> I don't like blocking a needed feature behind a large super-feature
> that doesn't exist yet. We'd be able to refactor this code into using
> such a thing in the future, so I'd prefer to move ahead with this
> since it would stop actual exploits.

I don't think the super-feature is so hard, and I think we should not
add the per-task thing the way it's done in this patch.  Let's not add
per-task things where the best argument for their security is "not
sure how it would be exploited".

Anyway, I think the sysctl is really the important bit.  The per-task
setting is icing on the cake IMO.  One upon a time autoload was more
important, but these days modaliases are supposed to do most of the
work.  I bet that modern distros don't need unprivileged autoload at
all.

--Andy
Casey Schaufler April 21, 2017, 11:52 p.m. UTC | #10
On 4/21/2017 4:28 PM, Andy Lutomirski wrote:
> On Fri, Apr 21, 2017 at 4:19 PM, Kees Cook <keescook@chromium.org> wrote:
>> On Wed, Apr 19, 2017 at 7:41 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>> On Wed, Apr 19, 2017 at 4:43 PM, Kees Cook <keescook@chromium.org> wrote:
>>>> On Wed, Apr 19, 2017 at 4:15 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>> On Wed, Apr 19, 2017 at 3:20 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>>>>>> +/* Sets task's modules_autoload */
>>>>>> +static inline int task_set_modules_autoload(struct task_struct *task,
>>>>>> +                                           unsigned long value)
>>>>>> +{
>>>>>> +       if (value > MODULES_AUTOLOAD_DISABLED)
>>>>>> +               return -EINVAL;
>>>>>> +       else if (task->modules_autoload > value)
>>>>>> +               return -EPERM;
>>>>>> +       else if (task->modules_autoload < value)
>>>>>> +               task->modules_autoload = value;
>>>>>> +
>>>>>> +       return 0;
>>>>>> +}
>>>>> This needs to be more locked down.  Otherwise someone could set this
>>>>> and then run a setuid program.  Admittedly, it would be quite odd if
>>>>> this particular thing causes a problem, but the issue exists
>>>>> nonetheless.
>>>> Eeeh, I don't agree this needs to be changed. APIs provided by modules
>>>> are different than the existing privilege-manipulation syscalls this
>>>> concern stems from. Applications are already forced to deal with
>>>> things being missing like this in the face of it simply not being
>>>> built into the kernel.
>>>>
>>>> Having to hide this behind nnp seems like it'd reduce its utility...
>>>>
>>> I think that adding an inherited boolean to task_struct that can be
>>> set by unprivileged tasks and passed to privileged tasks is a terrible
>>> precedent.  Ideally someone would try to find all the existing things
>>> like this and kill them off.
>> (Tristate, not boolean, but yeah.)
>>
>> I see two others besides seccomp and nnp:
>>
>> PR_MCE_KILL
> Well, that's interesting.  That should presumably be reset on setuid
> exec or something.
>
>> PR_SET_THP_DISABLE
> Um.  At least that's just a performance issue.
>
>> I really don't think this needs nnp protection.
>>
>>> I agree that I don't see how one would exploit this particular
>>> feature, but I still think I dislike the approach.  This is a slippery
>>> slope to adding a boolean for perf_event_open(), unshare(), etc, and
>>> we should solve these for real rather than half-arsing them IMO.
>> I disagree (obviously); this would be protecting the entire module
>> autoload attack surface. That's hardly a specific control, and it's a
>> demonstrably needed flag.
>>
> The list is just going to get longer.  We should probably have controls for:
>
>  - Use of perf.  Unclear how fine grained they should be.
>
>  - Creation of new user namespaces.  Possibly also use of things like
> iptables without global privilege.
>
>  - Ability to look up tasks owned by different uids (or maybe other
> tasks *at all*) by pid/tid.  Conceptually, this is easy.  The API is
> the only hard part, I think.
>
>  - Ability to bind ports, maybe?

One of my longer term (i.e. after stacking) projects
is to create sensible access control on ports. Why shouldn't
they have owners and mode bits (or ACLs, if you prefer)
or real names. I kind of think we should be able to eliminate
the need for dbus without resorting to kdbus.

So I don't like the idea of treating that as a special case.
I'd rather see ports controlled properly. (Of course, the
SELinux crowd will point out they have this handled, but I
remain unconvinced of the overall solution)

>
> My point is that all of these need some way to handle configuration
> and inheritance, and I don't think that a bunch of per-task prctls is
> the right way.  As just an example, saying that interactive users can
> autoload modules but other users can't, or that certain systemd
> services can, etc, might be nice.  Linus already complained that he
> (i.e. user "torvalds" or whatever) should be able to profile the
> kernel but that other uids should not be able to.
>
> I personally like my implicit_rights idea, and it might be interesting
> to prototype it.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Andy Lutomirski April 22, 2017, midnight UTC | #11
On Fri, Apr 21, 2017 at 4:52 PM, Casey Schaufler <casey@schaufler-ca.com> wrote:
> On 4/21/2017 4:28 PM, Andy Lutomirski wrote:
>> On Fri, Apr 21, 2017 at 4:19 PM, Kees Cook <keescook@chromium.org> wrote:
>>> On Wed, Apr 19, 2017 at 7:41 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>> On Wed, Apr 19, 2017 at 4:43 PM, Kees Cook <keescook@chromium.org> wrote:
>>>>> On Wed, Apr 19, 2017 at 4:15 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>>> On Wed, Apr 19, 2017 at 3:20 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>>>>>>> +/* Sets task's modules_autoload */
>>>>>>> +static inline int task_set_modules_autoload(struct task_struct *task,
>>>>>>> +                                           unsigned long value)
>>>>>>> +{
>>>>>>> +       if (value > MODULES_AUTOLOAD_DISABLED)
>>>>>>> +               return -EINVAL;
>>>>>>> +       else if (task->modules_autoload > value)
>>>>>>> +               return -EPERM;
>>>>>>> +       else if (task->modules_autoload < value)
>>>>>>> +               task->modules_autoload = value;
>>>>>>> +
>>>>>>> +       return 0;
>>>>>>> +}
>>>>>> This needs to be more locked down.  Otherwise someone could set this
>>>>>> and then run a setuid program.  Admittedly, it would be quite odd if
>>>>>> this particular thing causes a problem, but the issue exists
>>>>>> nonetheless.
>>>>> Eeeh, I don't agree this needs to be changed. APIs provided by modules
>>>>> are different than the existing privilege-manipulation syscalls this
>>>>> concern stems from. Applications are already forced to deal with
>>>>> things being missing like this in the face of it simply not being
>>>>> built into the kernel.
>>>>>
>>>>> Having to hide this behind nnp seems like it'd reduce its utility...
>>>>>
>>>> I think that adding an inherited boolean to task_struct that can be
>>>> set by unprivileged tasks and passed to privileged tasks is a terrible
>>>> precedent.  Ideally someone would try to find all the existing things
>>>> like this and kill them off.
>>> (Tristate, not boolean, but yeah.)
>>>
>>> I see two others besides seccomp and nnp:
>>>
>>> PR_MCE_KILL
>> Well, that's interesting.  That should presumably be reset on setuid
>> exec or something.
>>
>>> PR_SET_THP_DISABLE
>> Um.  At least that's just a performance issue.
>>
>>> I really don't think this needs nnp protection.
>>>
>>>> I agree that I don't see how one would exploit this particular
>>>> feature, but I still think I dislike the approach.  This is a slippery
>>>> slope to adding a boolean for perf_event_open(), unshare(), etc, and
>>>> we should solve these for real rather than half-arsing them IMO.
>>> I disagree (obviously); this would be protecting the entire module
>>> autoload attack surface. That's hardly a specific control, and it's a
>>> demonstrably needed flag.
>>>
>> The list is just going to get longer.  We should probably have controls for:
>>
>>  - Use of perf.  Unclear how fine grained they should be.
>>
>>  - Creation of new user namespaces.  Possibly also use of things like
>> iptables without global privilege.
>>
>>  - Ability to look up tasks owned by different uids (or maybe other
>> tasks *at all*) by pid/tid.  Conceptually, this is easy.  The API is
>> the only hard part, I think.
>>
>>  - Ability to bind ports, maybe?
>
> One of my longer term (i.e. after stacking) projects
> is to create sensible access control on ports. Why shouldn't
> they have owners and mode bits (or ACLs, if you prefer)
> or real names. I kind of think we should be able to eliminate
> the need for dbus without resorting to kdbus.

My implicit_rights concept gives any type of access control you can
use on inodes because they *are* inodes.  So you get ACLs, etc.

Brief summary for those who didn't read my old email: We add a new
kind of filesystem object called a "right".  It's a special kind of
socket inode that can't be bound or connected but is instead created
by a new syscall.  It has a name, so "port:1234" might be a name of a
right.

To use an implicit right, you do whatever syscall you would do
normally.  The kernel looks for a right object at
/dev/implicit_rights/<name>.  If that object exists, is a right of the
correct type (i.e. the right's name matches <name>) and you have
execute access, you win.  Otherwise you lose.

To avoid breaking existing distros, for things like modules_autoload,
you would set a sysctl
/proc/sys/kernel/required_implicit_rights/modules_autoload=1.  With
that set, to autoload a module without CAP_SYS_MODULE, you need the
/dev/implicit_rights/modules_autoload.

>
> So I don't like the idea of treating that as a special case.
> I'd rather see ports controlled properly. (Of course, the
> SELinux crowd will point out they have this handled, but I
> remain unconvinced of the overall solution)

Agreed.  But I think we should address all of these things together.
Djalal Harouni April 22, 2017, 12:12 a.m. UTC | #12
On Sat, Apr 22, 2017 at 1:51 AM, Andy Lutomirski <luto@kernel.org> wrote:
[...]
>>> I personally like my implicit_rights idea, and it might be interesting
>>> to prototype it.
>>
>> I don't like blocking a needed feature behind a large super-feature
>> that doesn't exist yet. We'd be able to refactor this code into using
>> such a thing in the future, so I'd prefer to move ahead with this
>> since it would stop actual exploits.
>
> I don't think the super-feature is so hard, and I think we should not
> add the per-task thing the way it's done in this patch.  Let's not add
> per-task things where the best argument for their security is "not
> sure how it would be exploited".

Actually the XFRM framework CVE-2017-7184 [1] is one real example, of
course there are others. The exploit was used on a generic distro
during a security contest that distro is Ubuntu. That distro will
never provide a module autoloading restriction by default to not harm
it's users. Consumers or containers/sandboxes then can run their
confined apps using such facilities.

These bugs will stay in embedded devices that use these generic
distros for ever.

> Anyway, I think the sysctl is really the important bit.  The per-task
> setting is icing on the cake IMO.  One upon a time autoload was more
> important, but these days modaliases are supposed to do most of the
> work.  I bet that modern distros don't need unprivileged autoload at
> all.

Actually I think they do and we can't just change that. Users may
depend on it, it is a well established facility.

Now the other problem is CAP_NET_ADMIN which does lot of things, it is
more like the CAP_SYS_ADMIN.

This is a quick list that I got from only the past months, I'm pretty
sure there are more:

* DCCP use after free CVE-2017-6074
* n_hldc CVE-2017-2636
* XFRM framework CVE-2017-7184
* L2TPv3 CVE-2016-10200

Most of these need CAP_NET_ADMIN to be autoloaded, however we also
need CAP_NET_ADMIN for other things... therefore it is better to have
an extra facility that could coexist with CAP_NET_ADMIN and other
sandbox features.


[1] http://www.openwall.com/lists/oss-security/2017/03/29/2
Casey Schaufler April 22, 2017, 12:13 a.m. UTC | #13
On 4/21/2017 5:00 PM, Andy Lutomirski wrote:
> On Fri, Apr 21, 2017 at 4:52 PM, Casey Schaufler <casey@schaufler-ca.com> wrote:
>> On 4/21/2017 4:28 PM, Andy Lutomirski wrote:
>>> On Fri, Apr 21, 2017 at 4:19 PM, Kees Cook <keescook@chromium.org> wrote:
>>>> On Wed, Apr 19, 2017 at 7:41 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>> On Wed, Apr 19, 2017 at 4:43 PM, Kees Cook <keescook@chromium.org> wrote:
>>>>>> On Wed, Apr 19, 2017 at 4:15 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>>>> On Wed, Apr 19, 2017 at 3:20 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>>>>>>>> +/* Sets task's modules_autoload */
>>>>>>>> +static inline int task_set_modules_autoload(struct task_struct *task,
>>>>>>>> +                                           unsigned long value)
>>>>>>>> +{
>>>>>>>> +       if (value > MODULES_AUTOLOAD_DISABLED)
>>>>>>>> +               return -EINVAL;
>>>>>>>> +       else if (task->modules_autoload > value)
>>>>>>>> +               return -EPERM;
>>>>>>>> +       else if (task->modules_autoload < value)
>>>>>>>> +               task->modules_autoload = value;
>>>>>>>> +
>>>>>>>> +       return 0;
>>>>>>>> +}
>>>>>>> This needs to be more locked down.  Otherwise someone could set this
>>>>>>> and then run a setuid program.  Admittedly, it would be quite odd if
>>>>>>> this particular thing causes a problem, but the issue exists
>>>>>>> nonetheless.
>>>>>> Eeeh, I don't agree this needs to be changed. APIs provided by modules
>>>>>> are different than the existing privilege-manipulation syscalls this
>>>>>> concern stems from. Applications are already forced to deal with
>>>>>> things being missing like this in the face of it simply not being
>>>>>> built into the kernel.
>>>>>>
>>>>>> Having to hide this behind nnp seems like it'd reduce its utility...
>>>>>>
>>>>> I think that adding an inherited boolean to task_struct that can be
>>>>> set by unprivileged tasks and passed to privileged tasks is a terrible
>>>>> precedent.  Ideally someone would try to find all the existing things
>>>>> like this and kill them off.
>>>> (Tristate, not boolean, but yeah.)
>>>>
>>>> I see two others besides seccomp and nnp:
>>>>
>>>> PR_MCE_KILL
>>> Well, that's interesting.  That should presumably be reset on setuid
>>> exec or something.
>>>
>>>> PR_SET_THP_DISABLE
>>> Um.  At least that's just a performance issue.
>>>
>>>> I really don't think this needs nnp protection.
>>>>
>>>>> I agree that I don't see how one would exploit this particular
>>>>> feature, but I still think I dislike the approach.  This is a slippery
>>>>> slope to adding a boolean for perf_event_open(), unshare(), etc, and
>>>>> we should solve these for real rather than half-arsing them IMO.
>>>> I disagree (obviously); this would be protecting the entire module
>>>> autoload attack surface. That's hardly a specific control, and it's a
>>>> demonstrably needed flag.
>>>>
>>> The list is just going to get longer.  We should probably have controls for:
>>>
>>>  - Use of perf.  Unclear how fine grained they should be.
>>>
>>>  - Creation of new user namespaces.  Possibly also use of things like
>>> iptables without global privilege.
>>>
>>>  - Ability to look up tasks owned by different uids (or maybe other
>>> tasks *at all*) by pid/tid.  Conceptually, this is easy.  The API is
>>> the only hard part, I think.
>>>
>>>  - Ability to bind ports, maybe?
>> One of my longer term (i.e. after stacking) projects
>> is to create sensible access control on ports. Why shouldn't
>> they have owners and mode bits (or ACLs, if you prefer)
>> or real names. I kind of think we should be able to eliminate
>> the need for dbus without resorting to kdbus.
> My implicit_rights concept gives any type of access control you can
> use on inodes because they *are* inodes.  So you get ACLs, etc.
>
> Brief summary for those who didn't read my old email: We add a new
> kind of filesystem object called a "right".  It's a special kind of
> socket inode that can't be bound or connected but is instead created
> by a new syscall.  It has a name, so "port:1234" might be a name of a
> right.
>
> To use an implicit right, you do whatever syscall you would do
> normally.  The kernel looks for a right object at
> /dev/implicit_rights/<name>.  If that object exists, is a right of the
> correct type (i.e. the right's name matches <name>) and you have
> execute access, you win.  Otherwise you lose.
>
> To avoid breaking existing distros, for things like modules_autoload,
> you would set a sysctl
> /proc/sys/kernel/required_implicit_rights/modules_autoload=1.  With
> that set, to autoload a module without CAP_SYS_MODULE, you need the
> /dev/implicit_rights/modules_autoload.

Sounds good.

>> So I don't like the idea of treating that as a special case.
>> I'd rather see ports controlled properly. (Of course, the
>> SELinux crowd will point out they have this handled, but I
>> remain unconvinced of the overall solution)
> Agreed.  But I think we should address all of these things together.

What I don't want is to have to buy into a hundred things I
don't want in order to get the one thing I do. A General mechanism
is dandy, but I don't want to have to write a gazillion policy
lines for features I don't want in order to get a simple control.
The problem with SELinux is not the effort required to protect
what you care about, it's the effort required to do everything else.
Djalal Harouni April 22, 2017, 1:19 a.m. UTC | #14
On Sat, Apr 22, 2017 at 2:12 AM, Djalal Harouni <tixxdz@gmail.com> wrote:
> On Sat, Apr 22, 2017 at 1:51 AM, Andy Lutomirski <luto@kernel.org> wrote:
> [...]
>>>> I personally like my implicit_rights idea, and it might be interesting
>>>> to prototype it.
>>>
>>> I don't like blocking a needed feature behind a large super-feature
>>> that doesn't exist yet. We'd be able to refactor this code into using
>>> such a thing in the future, so I'd prefer to move ahead with this
>>> since it would stop actual exploits.
>>
>> I don't think the super-feature is so hard, and I think we should not
>> add the per-task thing the way it's done in this patch.  Let's not add
>> per-task things where the best argument for their security is "not
>> sure how it would be exploited".
>
> Actually the XFRM framework CVE-2017-7184 [1] is one real example, of
> course there are others. The exploit was used on a generic distro
> during a security contest that distro is Ubuntu. That distro will
> never provide a module autoloading restriction by default to not harm
> it's users. Consumers or containers/sandboxes then can run their
> confined apps using such facilities.
>
> These bugs will stay in embedded devices that use these generic
> distros for ever.

The DCCP CVE-2017-6074 exploit:
http://seclists.org/oss-sec/2017/q1/503

Well, pretty sure there is more... the bugs are real, as their
exploits. Anyway I think these features can coexist as they are
optional, and most process trees protections can get along by design.
Andy Lutomirski April 22, 2017, 6:45 a.m. UTC | #15
On Fri, Apr 21, 2017 at 5:13 PM, Casey Schaufler <casey@schaufler-ca.com> wrote:
> On 4/21/2017 5:00 PM, Andy Lutomirski wrote:
>> On Fri, Apr 21, 2017 at 4:52 PM, Casey Schaufler <casey@schaufler-ca.com> wrote:
>>> On 4/21/2017 4:28 PM, Andy Lutomirski wrote:
>>>> On Fri, Apr 21, 2017 at 4:19 PM, Kees Cook <keescook@chromium.org> wrote:
>>>>> On Wed, Apr 19, 2017 at 7:41 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>>> On Wed, Apr 19, 2017 at 4:43 PM, Kees Cook <keescook@chromium.org> wrote:
>>>>>>> On Wed, Apr 19, 2017 at 4:15 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>>>>> On Wed, Apr 19, 2017 at 3:20 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>>>>>>>>> +/* Sets task's modules_autoload */
>>>>>>>>> +static inline int task_set_modules_autoload(struct task_struct *task,
>>>>>>>>> +                                           unsigned long value)
>>>>>>>>> +{
>>>>>>>>> +       if (value > MODULES_AUTOLOAD_DISABLED)
>>>>>>>>> +               return -EINVAL;
>>>>>>>>> +       else if (task->modules_autoload > value)
>>>>>>>>> +               return -EPERM;
>>>>>>>>> +       else if (task->modules_autoload < value)
>>>>>>>>> +               task->modules_autoload = value;
>>>>>>>>> +
>>>>>>>>> +       return 0;
>>>>>>>>> +}
>>>>>>>> This needs to be more locked down.  Otherwise someone could set this
>>>>>>>> and then run a setuid program.  Admittedly, it would be quite odd if
>>>>>>>> this particular thing causes a problem, but the issue exists
>>>>>>>> nonetheless.
>>>>>>> Eeeh, I don't agree this needs to be changed. APIs provided by modules
>>>>>>> are different than the existing privilege-manipulation syscalls this
>>>>>>> concern stems from. Applications are already forced to deal with
>>>>>>> things being missing like this in the face of it simply not being
>>>>>>> built into the kernel.
>>>>>>>
>>>>>>> Having to hide this behind nnp seems like it'd reduce its utility...
>>>>>>>
>>>>>> I think that adding an inherited boolean to task_struct that can be
>>>>>> set by unprivileged tasks and passed to privileged tasks is a terrible
>>>>>> precedent.  Ideally someone would try to find all the existing things
>>>>>> like this and kill them off.
>>>>> (Tristate, not boolean, but yeah.)
>>>>>
>>>>> I see two others besides seccomp and nnp:
>>>>>
>>>>> PR_MCE_KILL
>>>> Well, that's interesting.  That should presumably be reset on setuid
>>>> exec or something.
>>>>
>>>>> PR_SET_THP_DISABLE
>>>> Um.  At least that's just a performance issue.
>>>>
>>>>> I really don't think this needs nnp protection.
>>>>>
>>>>>> I agree that I don't see how one would exploit this particular
>>>>>> feature, but I still think I dislike the approach.  This is a slippery
>>>>>> slope to adding a boolean for perf_event_open(), unshare(), etc, and
>>>>>> we should solve these for real rather than half-arsing them IMO.
>>>>> I disagree (obviously); this would be protecting the entire module
>>>>> autoload attack surface. That's hardly a specific control, and it's a
>>>>> demonstrably needed flag.
>>>>>
>>>> The list is just going to get longer.  We should probably have controls for:
>>>>
>>>>  - Use of perf.  Unclear how fine grained they should be.
>>>>
>>>>  - Creation of new user namespaces.  Possibly also use of things like
>>>> iptables without global privilege.
>>>>
>>>>  - Ability to look up tasks owned by different uids (or maybe other
>>>> tasks *at all*) by pid/tid.  Conceptually, this is easy.  The API is
>>>> the only hard part, I think.
>>>>
>>>>  - Ability to bind ports, maybe?
>>> One of my longer term (i.e. after stacking) projects
>>> is to create sensible access control on ports. Why shouldn't
>>> they have owners and mode bits (or ACLs, if you prefer)
>>> or real names. I kind of think we should be able to eliminate
>>> the need for dbus without resorting to kdbus.
>> My implicit_rights concept gives any type of access control you can
>> use on inodes because they *are* inodes.  So you get ACLs, etc.
>>
>> Brief summary for those who didn't read my old email: We add a new
>> kind of filesystem object called a "right".  It's a special kind of
>> socket inode that can't be bound or connected but is instead created
>> by a new syscall.  It has a name, so "port:1234" might be a name of a
>> right.
>>
>> To use an implicit right, you do whatever syscall you would do
>> normally.  The kernel looks for a right object at
>> /dev/implicit_rights/<name>.  If that object exists, is a right of the
>> correct type (i.e. the right's name matches <name>) and you have
>> execute access, you win.  Otherwise you lose.
>>
>> To avoid breaking existing distros, for things like modules_autoload,
>> you would set a sysctl
>> /proc/sys/kernel/required_implicit_rights/modules_autoload=1.  With
>> that set, to autoload a module without CAP_SYS_MODULE, you need the
>> /dev/implicit_rights/modules_autoload.
>
> Sounds good.
>
>>> So I don't like the idea of treating that as a special case.
>>> I'd rather see ports controlled properly. (Of course, the
>>> SELinux crowd will point out they have this handled, but I
>>> remain unconvinced of the overall solution)
>> Agreed.  But I think we should address all of these things together.
>
> What I don't want is to have to buy into a hundred things I
> don't want in order to get the one thing I do. A General mechanism
> is dandy, but I don't want to have to write a gazillion policy
> lines for features I don't want in order to get a simple control.
> The problem with SELinux is not the effort required to protect
> what you care about, it's the effort required to do everything else.
>

The point is to make it super simple.  chown, chmod and, if you want
to get fancy, setfacl.  You'll need a mkright tool, but that's
trivial.
Andy Lutomirski April 22, 2017, 6:51 a.m. UTC | #16
On Fri, Apr 21, 2017 at 5:12 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
> On Sat, Apr 22, 2017 at 1:51 AM, Andy Lutomirski <luto@kernel.org> wrote:
> [...]
>>>> I personally like my implicit_rights idea, and it might be interesting
>>>> to prototype it.
>>>
>>> I don't like blocking a needed feature behind a large super-feature
>>> that doesn't exist yet. We'd be able to refactor this code into using
>>> such a thing in the future, so I'd prefer to move ahead with this
>>> since it would stop actual exploits.
>>
>> I don't think the super-feature is so hard, and I think we should not
>> add the per-task thing the way it's done in this patch.  Let's not add
>> per-task things where the best argument for their security is "not
>> sure how it would be exploited".
>
> Actually the XFRM framework CVE-2017-7184 [1] is one real example, of
> course there are others. The exploit was used on a generic distro
> during a security contest that distro is Ubuntu. That distro will
> never provide a module autoloading restriction by default to not harm
> it's users. Consumers or containers/sandboxes then can run their
> confined apps using such facilities.
>
> These bugs will stay in embedded devices that use these generic
> distros for ever.
>
>> Anyway, I think the sysctl is really the important bit.  The per-task
>> setting is icing on the cake IMO.  One upon a time autoload was more
>> important, but these days modaliases are supposed to do most of the
>> work.  I bet that modern distros don't need unprivileged autoload at
>> all.
>
> Actually I think they do and we can't just change that. Users may
> depend on it, it is a well established facility.
>
> Now the other problem is CAP_NET_ADMIN which does lot of things, it is
> more like the CAP_SYS_ADMIN.
>
> This is a quick list that I got from only the past months, I'm pretty
> sure there are more:
>
> * DCCP use after free CVE-2017-6074
> * n_hldc CVE-2017-2636
> * XFRM framework CVE-2017-7184
> * L2TPv3 CVE-2016-10200
>
> Most of these need CAP_NET_ADMIN to be autoloaded, however we also
> need CAP_NET_ADMIN for other things... therefore it is better to have
> an extra facility that could coexist with CAP_NET_ADMIN and other
> sandbox features.
>

I agree that the feature is important, but I think your implementation
is needlessly dangerous.  I imagine that the main uses that you care
about involve containers.  How about doing it in a safer way that
works for containers?  I can think of a few.  For example:

1. A sysctl that, if set, prevents autoloading outside the root
userns.  This isn't very flexible at all, but it might work.

2. Your patch, but require privilege within the calling namespace to
set the prctl.

3. A per-user-ns sysctl.
Djalal Harouni April 22, 2017, 12:17 p.m. UTC | #17
On Sat, Apr 22, 2017 at 1:28 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Fri, Apr 21, 2017 at 4:19 PM, Kees Cook <keescook@chromium.org> wrote:
>> On Wed, Apr 19, 2017 at 7:41 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>> On Wed, Apr 19, 2017 at 4:43 PM, Kees Cook <keescook@chromium.org> wrote:
>>>> On Wed, Apr 19, 2017 at 4:15 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>> On Wed, Apr 19, 2017 at 3:20 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>>>>>> +/* Sets task's modules_autoload */
>>>>>> +static inline int task_set_modules_autoload(struct task_struct *task,
>>>>>> +                                           unsigned long value)
>>>>>> +{
>>>>>> +       if (value > MODULES_AUTOLOAD_DISABLED)
>>>>>> +               return -EINVAL;
>>>>>> +       else if (task->modules_autoload > value)
>>>>>> +               return -EPERM;
>>>>>> +       else if (task->modules_autoload < value)
>>>>>> +               task->modules_autoload = value;
>>>>>> +
>>>>>> +       return 0;
>>>>>> +}
>>>>>
>>>>> This needs to be more locked down.  Otherwise someone could set this
>>>>> and then run a setuid program.  Admittedly, it would be quite odd if
>>>>> this particular thing causes a problem, but the issue exists
>>>>> nonetheless.
>>>>
>>>> Eeeh, I don't agree this needs to be changed. APIs provided by modules
>>>> are different than the existing privilege-manipulation syscalls this
>>>> concern stems from. Applications are already forced to deal with
>>>> things being missing like this in the face of it simply not being
>>>> built into the kernel.
>>>>
>>>> Having to hide this behind nnp seems like it'd reduce its utility...
>>>>
>>>
>>> I think that adding an inherited boolean to task_struct that can be
>>> set by unprivileged tasks and passed to privileged tasks is a terrible
>>> precedent.  Ideally someone would try to find all the existing things
>>> like this and kill them off.
>>
>> (Tristate, not boolean, but yeah.)
>>
>> I see two others besides seccomp and nnp:
>>
>> PR_MCE_KILL
>
> Well, that's interesting.  That should presumably be reset on setuid
> exec or something.
>
>> PR_SET_THP_DISABLE
>
> Um.  At least that's just a performance issue.
>
>>
>> I really don't think this needs nnp protection.
>>
>>> I agree that I don't see how one would exploit this particular
>>> feature, but I still think I dislike the approach.  This is a slippery
>>> slope to adding a boolean for perf_event_open(), unshare(), etc, and
>>> we should solve these for real rather than half-arsing them IMO.
>>
>> I disagree (obviously); this would be protecting the entire module
>> autoload attack surface. That's hardly a specific control, and it's a
>> demonstrably needed flag.
>>
>
> The list is just going to get longer.  We should probably have controls for:
>
>  - Use of perf.  Unclear how fine grained they should be.
>
>  - Creation of new user namespaces.  Possibly also use of things like
> iptables without global privilege.
>
>  - Ability to look up tasks owned by different uids (or maybe other
> tasks *at all*) by pid/tid.  Conceptually, this is easy.  The API is
> the only hard part, I think.
>
>  - Ability to bind ports, maybe?
>
> My point is that all of these need some way to handle configuration
> and inheritance, and I don't think that a bunch of per-task prctls is
> the right way.  As just an example, saying that interactive users can
> autoload modules but other users can't, or that certain systemd
> services can, etc, might be nice.  Linus already complained that he
> (i.e. user "torvalds" or whatever) should be able to profile the
> kernel but that other uids should not be able to.

Neat, maybe this could already be achieved with this interface and
systemd-logind,  "ModulesAutoloadUsers=andy" in logind.conf where
"andy" is the only logged-in user able to trigger and autoload kernel
modules. However maybe we should not restrict too much other bits or
functionality of the other users, please let me will follow up later
on it.

> I personally like my implicit_rights idea, and it might be interesting
> to prototype it.

Yes I would use that if it is available, in mean time there was
several bugs and 2 public exploits last months.. and we should use
available features.
Kees Cook April 22, 2017, 7:29 p.m. UTC | #18
On Fri, Apr 21, 2017 at 11:51 PM, Andy Lutomirski <luto@kernel.org> wrote:
> On Fri, Apr 21, 2017 at 5:12 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>> On Sat, Apr 22, 2017 at 1:51 AM, Andy Lutomirski <luto@kernel.org> wrote:
>> [...]
>>>>> I personally like my implicit_rights idea, and it might be interesting
>>>>> to prototype it.
>>>>
>>>> I don't like blocking a needed feature behind a large super-feature
>>>> that doesn't exist yet. We'd be able to refactor this code into using
>>>> such a thing in the future, so I'd prefer to move ahead with this
>>>> since it would stop actual exploits.
>>>
>>> I don't think the super-feature is so hard, and I think we should not
>>> add the per-task thing the way it's done in this patch.  Let's not add
>>> per-task things where the best argument for their security is "not
>>> sure how it would be exploited".
>>
>> Actually the XFRM framework CVE-2017-7184 [1] is one real example, of
>> course there are others. The exploit was used on a generic distro
>> during a security contest that distro is Ubuntu. That distro will
>> never provide a module autoloading restriction by default to not harm
>> it's users. Consumers or containers/sandboxes then can run their
>> confined apps using such facilities.
>>
>> These bugs will stay in embedded devices that use these generic
>> distros for ever.
>>
>>> Anyway, I think the sysctl is really the important bit.  The per-task
>>> setting is icing on the cake IMO.  One upon a time autoload was more
>>> important, but these days modaliases are supposed to do most of the
>>> work.  I bet that modern distros don't need unprivileged autoload at
>>> all.
>>
>> Actually I think they do and we can't just change that. Users may
>> depend on it, it is a well established facility.
>>
>> Now the other problem is CAP_NET_ADMIN which does lot of things, it is
>> more like the CAP_SYS_ADMIN.
>>
>> This is a quick list that I got from only the past months, I'm pretty
>> sure there are more:
>>
>> * DCCP use after free CVE-2017-6074
>> * n_hldc CVE-2017-2636
>> * XFRM framework CVE-2017-7184
>> * L2TPv3 CVE-2016-10200
>>
>> Most of these need CAP_NET_ADMIN to be autoloaded, however we also
>> need CAP_NET_ADMIN for other things... therefore it is better to have
>> an extra facility that could coexist with CAP_NET_ADMIN and other
>> sandbox features.
>>
>
> I agree that the feature is important, but I think your implementation
> is needlessly dangerous.  I imagine that the main uses that you care
> about involve containers.  How about doing it in a safer way that
> works for containers?  I can think of a few.  For example:
>
> 1. A sysctl that, if set, prevents autoloading outside the root
> userns.  This isn't very flexible at all, but it might work.
>
> 2. Your patch, but require privilege within the calling namespace to
> set the prctl.

How about CAP_SYS_ADMIN || no_new_privs?

-Kees

>
> 3. A per-user-ns sysctl.
Rusty Russell April 24, 2017, 4:29 a.m. UTC | #19
Djalal Harouni <tixxdz@gmail.com> writes:
> When value is (1), task must have CAP_SYS_MODULE to be able to trigger a
> module auto-load operation, or CAP_NET_ADMIN for modules with a
> 'netdev-%s' alias.

Sorry, the magic 'netdev-' prefix is a crawling horror.  To do this
properly, you need to hand the capability (if any) from the
request_module() call.  Probably by adding a new request_module_cap and
making request_module() call that, then fixing up the callers.

Cheers,
Rusty.
Djalal Harouni April 24, 2017, 2:25 p.m. UTC | #20
On Sat, Apr 22, 2017 at 9:29 PM, Kees Cook <keescook@chromium.org> wrote:
> On Fri, Apr 21, 2017 at 11:51 PM, Andy Lutomirski <luto@kernel.org> wrote:
>> On Fri, Apr 21, 2017 at 5:12 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>>> On Sat, Apr 22, 2017 at 1:51 AM, Andy Lutomirski <luto@kernel.org> wrote:
>>>
[...]
>>> * DCCP use after free CVE-2017-6074
>>> * n_hldc CVE-2017-2636
>>> * XFRM framework CVE-2017-7184
>>> * L2TPv3 CVE-2016-10200
>>>
>>> Most of these need CAP_NET_ADMIN to be autoloaded, however we also
>>> need CAP_NET_ADMIN for other things... therefore it is better to have
>>> an extra facility that could coexist with CAP_NET_ADMIN and other
>>> sandbox features.
>>>
>>
>> I agree that the feature is important, but I think your implementation
>> is needlessly dangerous.  I imagine that the main uses that you care
>> about involve containers.  How about doing it in a safer way that
>> works for containers?  I can think of a few.  For example:
>>
>> 1. A sysctl that, if set, prevents autoloading outside the root
>> userns.  This isn't very flexible at all, but it might work.
>>
>> 2. Your patch, but require privilege within the calling namespace to
>> set the prctl.
>
> How about CAP_SYS_ADMIN || no_new_privs?
>
> -Kees
>

Yes I can update as per Andy suggestion to require privileges inside
the calling namespace to set prctl. Other options that are not prctl
based have more variants, that make them hard to use.

So I would got with CAP_SYS_ADMIN in the calling userns ||
no_new_privs , I would have said CAP_SYS_MODULE in the userns but it
seems better to standardize on CAP_SYS_ADMIN to set the prctl.
Kees Cook April 24, 2017, 6:02 p.m. UTC | #21
On Mon, Apr 24, 2017 at 7:25 AM, Djalal Harouni <tixxdz@gmail.com> wrote:
> On Sat, Apr 22, 2017 at 9:29 PM, Kees Cook <keescook@chromium.org> wrote:
>> On Fri, Apr 21, 2017 at 11:51 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>> On Fri, Apr 21, 2017 at 5:12 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>>>> On Sat, Apr 22, 2017 at 1:51 AM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>
> [...]
>>>> * DCCP use after free CVE-2017-6074
>>>> * n_hldc CVE-2017-2636
>>>> * XFRM framework CVE-2017-7184
>>>> * L2TPv3 CVE-2016-10200
>>>>
>>>> Most of these need CAP_NET_ADMIN to be autoloaded, however we also
>>>> need CAP_NET_ADMIN for other things... therefore it is better to have
>>>> an extra facility that could coexist with CAP_NET_ADMIN and other
>>>> sandbox features.
>>>>
>>>
>>> I agree that the feature is important, but I think your implementation
>>> is needlessly dangerous.  I imagine that the main uses that you care
>>> about involve containers.  How about doing it in a safer way that
>>> works for containers?  I can think of a few.  For example:
>>>
>>> 1. A sysctl that, if set, prevents autoloading outside the root
>>> userns.  This isn't very flexible at all, but it might work.
>>>
>>> 2. Your patch, but require privilege within the calling namespace to
>>> set the prctl.
>>
>> How about CAP_SYS_ADMIN || no_new_privs?
>>
>> -Kees
>>
>
> Yes I can update as per Andy suggestion to require privileges inside
> the calling namespace to set prctl. Other options that are not prctl
> based have more variants, that make them hard to use.
>
> So I would got with CAP_SYS_ADMIN in the calling userns ||
> no_new_privs , I would have said CAP_SYS_MODULE in the userns but it
> seems better to standardize on CAP_SYS_ADMIN to set the prctl.

Andy's concern is that it would provide an escalation from SYS_MODULE
to SYS_ADMIN through some privileged process being tricked through a
missing API provided by modules, so we have to use either SYS_ADMIN ||
nnp.

-Kees
Djalal Harouni April 24, 2017, 6:35 p.m. UTC | #22
On Mon, Apr 24, 2017 at 8:02 PM, Kees Cook <keescook@chromium.org> wrote:
> On Mon, Apr 24, 2017 at 7:25 AM, Djalal Harouni <tixxdz@gmail.com> wrote:
>> On Sat, Apr 22, 2017 at 9:29 PM, Kees Cook <keescook@chromium.org> wrote:
>>> On Fri, Apr 21, 2017 at 11:51 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>> On Fri, Apr 21, 2017 at 5:12 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>>>>> On Sat, Apr 22, 2017 at 1:51 AM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>>
>> [...]
>>>>> * DCCP use after free CVE-2017-6074
>>>>> * n_hldc CVE-2017-2636
>>>>> * XFRM framework CVE-2017-7184
>>>>> * L2TPv3 CVE-2016-10200
>>>>>
>>>>> Most of these need CAP_NET_ADMIN to be autoloaded, however we also
>>>>> need CAP_NET_ADMIN for other things... therefore it is better to have
>>>>> an extra facility that could coexist with CAP_NET_ADMIN and other
>>>>> sandbox features.
>>>>>
>>>>
>>>> I agree that the feature is important, but I think your implementation
>>>> is needlessly dangerous.  I imagine that the main uses that you care
>>>> about involve containers.  How about doing it in a safer way that
>>>> works for containers?  I can think of a few.  For example:
>>>>
>>>> 1. A sysctl that, if set, prevents autoloading outside the root
>>>> userns.  This isn't very flexible at all, but it might work.
>>>>
>>>> 2. Your patch, but require privilege within the calling namespace to
>>>> set the prctl.
>>>
>>> How about CAP_SYS_ADMIN || no_new_privs?
>>>
>>> -Kees
>>>
>>
>> Yes I can update as per Andy suggestion to require privileges inside
>> the calling namespace to set prctl. Other options that are not prctl
>> based have more variants, that make them hard to use.
>>
>> So I would got with CAP_SYS_ADMIN in the calling userns ||
>> no_new_privs , I would have said CAP_SYS_MODULE in the userns but it
>> seems better to standardize on CAP_SYS_ADMIN to set the prctl.
>
> Andy's concern is that it would provide an escalation from SYS_MODULE
> to SYS_ADMIN through some privileged process being tricked through a
> missing API provided by modules, so we have to use either SYS_ADMIN ||
> nnp.

Yes, I would say that programs expect that maybe such functionality is
not provided, but we don't know. I will update.

Thanks!
Djalal Harouni April 26, 2017, 9:06 a.m. UTC | #23
Hi Rusty,

On Mon, Apr 24, 2017 at 6:29 AM, Rusty Russell <rusty@rustcorp.com.au> wrote:
> Djalal Harouni <tixxdz@gmail.com> writes:
>> When value is (1), task must have CAP_SYS_MODULE to be able to trigger a
>> module auto-load operation, or CAP_NET_ADMIN for modules with a
>> 'netdev-%s' alias.
>
> Sorry, the magic 'netdev-' prefix is a crawling horror.  To do this

Yes I agree, that's the not the best part. I added it for backward
compatibility since I did notice that some network daemon retain
CAP_NET_ADMIN for this case. The aim of the patches is to get modules
autoload covered with CAP_SYS_MODULE and make it more like explicit
modules loading.

> properly, you need to hand the capability (if any) from the
> request_module() call.  Probably by adding a new request_module_cap and
> making request_module() call that, then fixing up the callers.

Hmm, sorry Rusty I'm a bit confused, when you refer to "callers", you
mean request_module() callers ?

If so, I'm not sure that the best thing here since it may defeat the
purpose of this feature if we allow to pass capabilities.

Right now we have modules autoload that can be triggered without
privileges, or with CAP_SYS_ADMIN, CAP_NET_ADMIN, CAP_SYS_MODULE...
and some caps may allow to load other modules to resolve symbols etc.

The public exploits did target CAP_NET_ADMIN, if we offer a way to
pass in capabilities it will be used it as it is the case now without
it, not to mention that some capabilities are overloaded, exploits
will continue for these ones.

The goal is to narrow modules autoload only to CAP_SYS_MODULE or
disable it completely for process trees. Later you can give
CAP_SYS_MODULE and you are sure that only /lib/modules/ will be
autoloaded, no arbitrary loads by using seccomp filter on module
syscalls, or separate the process in two one with CAP_SYS_MODULE and
the other with its own capabilities and feature use.

I also don't like that 'netdev-%s' but it is for compatibility for
specific cases, maybe there are others that we may have to add. But I
would prefer if we narrow it down to only CAP_SYS_MODULE.

How about I move all permission checks to security/commoncap.c
helpers, the module logic part will only contain set and read sysctl
"modules_autoload" and "task->modules_autoload" ?

I already added cap_kernel_module_request() which is called by
request_module(), so instead of counting on module to do the
permission check I will open code it inside
security/commoncap.c:cap_kernel_module_request() like other capability
helpers and note that CAP_NET_ADMIN 'netdev-%s' alias is only for
compatibility. This way we prevent that other capabilities get exposed
or targeted for exploitation. Do you think that this will work ?

Thanks!

> Cheers,
> Rusty.
Djalal Harouni May 4, 2017, 1:07 p.m. UTC | #24
On Sat, Apr 22, 2017 at 2:17 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
> On Sat, Apr 22, 2017 at 1:28 AM, Andy Lutomirski <luto@amacapital.net> wrote:
[...]
>>
>> My point is that all of these need some way to handle configuration
>> and inheritance, and I don't think that a bunch of per-task prctls is
>> the right way.  As just an example, saying that interactive users can
>> autoload modules but other users can't, or that certain systemd
>> services can, etc, might be nice.  Linus already complained that he
>> (i.e. user "torvalds" or whatever) should be able to profile the
>> kernel but that other uids should not be able to.
>
> Neat, maybe this could already be achieved with this interface and
> systemd-logind,  "ModulesAutoloadUsers=andy" in logind.conf where
> "andy" is the only logged-in user able to trigger and autoload kernel
> modules. However maybe we should not restrict too much other bits or
> functionality of the other users, please let me will follow up later
> on it.
>
>> I personally like my implicit_rights idea, and it might be interesting
>> to prototype it.
>

Andy following on the idea of per user settings, I'm curious did you
manage to make some advance on how to store the user settings ? the
user database format is old and not extensible, there was cgmanager or
other libcgroup but for resources, and no simple thing for such
restrictions example: "RestrictLinuxModules=user" that will prevent
such users from making/loading extra Linux features/modules that are
not already available...

Thanks!
Serge E. Hallyn May 4, 2017, 2:58 p.m. UTC | #25
On Thu, May 04, 2017 at 03:07:49PM +0200, Djalal Harouni wrote:
> On Sat, Apr 22, 2017 at 2:17 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
> > On Sat, Apr 22, 2017 at 1:28 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> [...]
> >>
> >> My point is that all of these need some way to handle configuration
> >> and inheritance, and I don't think that a bunch of per-task prctls is
> >> the right way.  As just an example, saying that interactive users can
> >> autoload modules but other users can't, or that certain systemd
> >> services can, etc, might be nice.  Linus already complained that he
> >> (i.e. user "torvalds" or whatever) should be able to profile the
> >> kernel but that other uids should not be able to.
> >
> > Neat, maybe this could already be achieved with this interface and
> > systemd-logind,  "ModulesAutoloadUsers=andy" in logind.conf where
> > "andy" is the only logged-in user able to trigger and autoload kernel
> > modules. However maybe we should not restrict too much other bits or
> > functionality of the other users, please let me will follow up later
> > on it.
> >
> >> I personally like my implicit_rights idea, and it might be interesting
> >> to prototype it.
> >
> 
> Andy following on the idea of per user settings, I'm curious did you
> manage to make some advance on how to store the user settings ? the
> user database format is old and not extensible, there was cgmanager or

(v2 is under very very early consideration, would love to stay in the loop
as this is considered)

> other libcgroup but for resources, and no simple thing for such
> restrictions example: "RestrictLinuxModules=user" that will prevent
> such users from making/loading extra Linux features/modules that are
> not already available...
> 
> Thanks!
> 
> -- 
> tixxdz
Djalal Harouni May 5, 2017, 1:06 p.m. UTC | #26
Hi Serge,

On Thu, May 4, 2017 at 4:58 PM, Serge E. Hallyn <serge@hallyn.com> wrote:
> On Thu, May 04, 2017 at 03:07:49PM +0200, Djalal Harouni wrote:
>> On Sat, Apr 22, 2017 at 2:17 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>> > On Sat, Apr 22, 2017 at 1:28 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>> [...]
>> >>
>> >> My point is that all of these need some way to handle configuration
>> >> and inheritance, and I don't think that a bunch of per-task prctls is
>> >> the right way.  As just an example, saying that interactive users can
>> >> autoload modules but other users can't, or that certain systemd
>> >> services can, etc, might be nice.  Linus already complained that he
>> >> (i.e. user "torvalds" or whatever) should be able to profile the
>> >> kernel but that other uids should not be able to.
>> >
>> > Neat, maybe this could already be achieved with this interface and
>> > systemd-logind,  "ModulesAutoloadUsers=andy" in logind.conf where
>> > "andy" is the only logged-in user able to trigger and autoload kernel
>> > modules. However maybe we should not restrict too much other bits or
>> > functionality of the other users, please let me will follow up later
>> > on it.
>> >
>> >> I personally like my implicit_rights idea, and it might be interesting
>> >> to prototype it.
>> >
>>
>> Andy following on the idea of per user settings, I'm curious did you
>> manage to make some advance on how to store the user settings ? the
>> user database format is old and not extensible, there was cgmanager or
>
> (v2 is under very very early consideration, would love to stay in the loop
> as this is considered)
>

For user database I'm not aware of any code, but it seems that if the
database or settings are adapted to today use cases then
systemd-logind may use them. Yes if there is something I'll Cc'you.

Thanks!
Andy Lutomirski May 5, 2017, 4:18 p.m. UTC | #27
On Thu, May 4, 2017 at 6:07 AM, Djalal Harouni <tixxdz@gmail.com> wrote:
> On Sat, Apr 22, 2017 at 2:17 PM, Djalal Harouni <tixxdz@gmail.com> wrote:
>> On Sat, Apr 22, 2017 at 1:28 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> [...]
>>>
>>> My point is that all of these need some way to handle configuration
>>> and inheritance, and I don't think that a bunch of per-task prctls is
>>> the right way.  As just an example, saying that interactive users can
>>> autoload modules but other users can't, or that certain systemd
>>> services can, etc, might be nice.  Linus already complained that he
>>> (i.e. user "torvalds" or whatever) should be able to profile the
>>> kernel but that other uids should not be able to.
>>
>> Neat, maybe this could already be achieved with this interface and
>> systemd-logind,  "ModulesAutoloadUsers=andy" in logind.conf where
>> "andy" is the only logged-in user able to trigger and autoload kernel
>> modules. However maybe we should not restrict too much other bits or
>> functionality of the other users, please let me will follow up later
>> on it.
>>
>>> I personally like my implicit_rights idea, and it might be interesting
>>> to prototype it.
>>
>
> Andy following on the idea of per user settings, I'm curious did you
> manage to make some advance on how to store the user settings ? the
> user database format is old and not extensible, there was cgmanager or
> other libcgroup but for resources, and no simple thing for such
> restrictions example: "RestrictLinuxModules=user" that will prevent
> such users from making/loading extra Linux features/modules that are
> not already available...
>

I figured that user code would figure it out somehow.  Text config file?

There is another odd way it could be configured: just leave the inodes
around in /dev/rights with appropriate permissions.  Some startup
script could re-instantiate them with the same permissions (via a
syscall that does that atomically).

Patch
diff mbox

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 4cddbce..df4d145 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -194,6 +194,7 @@  read the file /proc/PID/status:
   CapBnd: ffffffffffffffff
   NoNewPrivs:     0
   Seccomp:        0
+  ModulesAutoload:      0
   voluntary_ctxt_switches:        0
   nonvoluntary_ctxt_switches:     1
 
@@ -267,6 +268,8 @@  Table 1-2: Contents of the status files (as of 4.8)
  CapBnd                      bitmap of capabilities bounding set
  NoNewPrivs                  no_new_privs, like prctl(PR_GET_NO_NEW_PRIV, ...)
  Seccomp                     seccomp mode, like prctl(PR_GET_SECCOMP, ...)
+ ModulesAutoload             modules autoload, like
+                             prctl(PR_GET_MODULES_AUTOLOAD, ...)
  Cpus_allowed                mask of CPUs on which this process may run
  Cpus_allowed_list           Same as previous, but in "list format"
  Mems_allowed                mask of memory nodes allowed to this process
diff --git a/Documentation/prctl/modules_autoload.txt b/Documentation/prctl/modules_autoload.txt
new file mode 100644
index 0000000..242852e
--- /dev/null
+++ b/Documentation/prctl/modules_autoload.txt
@@ -0,0 +1,49 @@ 
+A request to a kernel feature that is implemented by a module that is
+not loaded may trigger the module auto-load feature, allowing to
+transparently satisfy userspace. In this case an implicit kernel module
+load operation happens.
+
+Usually to load or unload a kernel module, an explicit operation happens
+where programs are required to have some capabilities in order to perform
+such operations. However, with the implicit module loading, no
+capabilities are required, anyone who is able to request a certain kernel
+feature, may also implicitly load its corresponding kernel module. This
+operation can be abused by unprivileged users to expose kernel interfaces
+that maybe privileged users did not want to be made available for various
+reasons: resources, bugs, vulnerabilties, etc. The DCCP vulnerability is
+(CVE-2017-6074) is one real example.
+
+The new per-task "modules_autoload" flag, is a new way to restrict
+automatic module loading, preventing the kernel from exposing more of
+its interface. This particularly useful for containers and sandboxes
+where sandboxed processes should affect the rest of the system.
+
+Any task can set "modules_autoload". Once set, this setting is inherited
+across fork, clone and execve. With "modules_autoload" set, automatic
+module loading will have first to satisfy the per-task access permissions
+before attempting to implicitly load the module. For example, automatic
+loading of modules that contain bugs or vulnerabilities can be
+restricted and imprivileged users can no longer abuse such interfaces.
+
+To set modules_autoload, use prctl(PR_SET_MODULES_AUTOLOAD, value, 0, 0, 0).
+
+When value is (0), the default, automatic modules loading is allowed.
+
+When value is (1), task must have CAP_SYS_MODULE to be able to trigger a
+module auto-load operation, or CAP_NET_ADMIN for modules with a
+'netdev-%s' alias.
+
+When value is (2), automatic modules loading is disabled for the current
+task.
+
+The 'modules_autoload' value may only be increased, never decreased, thus
+ensuring that once applied, processes can never relax their setting.
+
+When a request to a kernel module is denied, the module name with the
+corresponding process name and its pid are logged. Administrators can use
+such information to explicitly load the appropriate modules.
+
+Please note that even if the per-task "modules_autoload" value allows to
+auto-load the corresponding module, automatic module loading may still
+fail due to the global "modules_autoload" sysctl. For more details please
+see "modules_autoload" in Documentation/sysctl/kernel.txt
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 88c3555..cbcf087 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -88,6 +88,7 @@ 
 #include <linux/string_helpers.h>
 #include <linux/user_namespace.h>
 #include <linux/fs_struct.h>
+#include <linux/module.h>
 
 #include <asm/pgtable.h>
 #include <asm/processor.h>
@@ -346,10 +347,15 @@  static inline void task_cap(struct seq_file *m, struct task_struct *p)
 
 static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
 {
+	int autoload = task_modules_autoload(p);
+
 	seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p));
 #ifdef CONFIG_SECCOMP
 	seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
 #endif
+	if (autoload != -ENOSYS)
+		seq_put_decimal_ull(m, "\nModulesAutoload:\t", autoload);
+
 	seq_putc(m, '\n');
 }
 
diff --git a/include/linux/module.h b/include/linux/module.h
index 4b96c10..595800f 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -13,6 +13,7 @@ 
 #include <linux/kmod.h>
 #include <linux/init.h>
 #include <linux/elf.h>
+#include <linux/sched.h>
 #include <linux/stringify.h>
 #include <linux/kobject.h>
 #include <linux/moduleparam.h>
@@ -506,7 +507,33 @@  bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr);
 bool is_module_percpu_address(unsigned long addr);
 bool is_module_text_address(unsigned long addr);
 
-int modules_autoload_access(char *kmod_name);
+int modules_autoload_access(struct task_struct *task, char *kmod_name);
+
+/* Sets task's modules_autoload */
+static inline int task_set_modules_autoload(struct task_struct *task,
+					    unsigned long value)
+{
+	if (value > MODULES_AUTOLOAD_DISABLED)
+		return -EINVAL;
+	else if (task->modules_autoload > value)
+		return -EPERM;
+	else if (task->modules_autoload < value)
+		task->modules_autoload = value;
+
+	return 0;
+}
+
+/* Returns task's modules_autoload */
+static inline void task_copy_modules_autoload(struct task_struct *dest,
+					      struct task_struct *src)
+{
+	dest->modules_autoload = src->modules_autoload;
+}
+
+static inline int task_modules_autoload(struct task_struct *task)
+{
+	return task->modules_autoload;
+}
 
 static inline bool within_module_core(unsigned long addr,
 				      const struct module *mod)
@@ -652,11 +679,28 @@  static inline bool is_livepatch_module(struct module *mod)
 
 #else /* !CONFIG_MODULES... */
 
-static inline int modules_autoload_access(char *kmod_name)
+static inline int modules_autoload_access(struct task_struct *task,
+					  char *kmod_name)
 {
 	return 0;
 }
 
+static inline int task_set_modules_autoload(struct task_struct *task,
+					    unsigned long value)
+{
+	return -ENOSYS;
+}
+
+static inline void task_copy_modules_autoload(struct task_struct *dest,
+					      struct task_struct *src)
+{
+}
+
+static inline int task_modules_autoload(struct task_struct *task)
+{
+	return -ENOSYS;
+}
+
 static inline struct module *__module_address(unsigned long addr)
 {
 	return NULL;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 48fb8bc..7264e62 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -613,6 +613,11 @@  struct task_struct {
 
 	struct restart_block		restart_block;
 
+#ifdef CONFIG_MODULES
+	/* per-task modules autoload access */
+	unsigned			modules_autoload:2;
+#endif
+
 	pid_t				pid;
 	pid_t				tgid;
 
diff --git a/include/linux/security.h b/include/linux/security.h
index e274bb11..9581cc5 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -866,7 +866,7 @@  static inline int security_task_create(unsigned long clone_flags)
 static inline int security_task_alloc(struct task_struct *task,
 				      unsigned long clone_flags)
 {
-	return 0;
+	return cap_task_alloc(task, clone_flags);
 }
 
 static inline void security_task_free(struct task_struct *task)
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index a8d0759..0244264 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -197,4 +197,12 @@  struct prctl_mm_map {
 # define PR_CAP_AMBIENT_LOWER		3
 # define PR_CAP_AMBIENT_CLEAR_ALL	4
 
+/*
+ * Control the per-task "modules_autoload" access.
+ *
+ * See Documentation/prctl/modules_autoload.txt for more details.
+ */
+#define PR_SET_MODULES_AUTOLOAD		48
+#define PR_GET_MODULES_AUTOLOAD		49
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 81347bd..141e06b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1695,6 +1695,10 @@  static __latent_entropy struct task_struct *copy_process(
 	p->sequential_io_avg	= 0;
 #endif
 
+#ifdef CONFIG_MODULES
+	p->modules_autoload	= 0;
+#endif
+
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	retval = sched_fork(clone_flags, p);
 	if (retval)
diff --git a/kernel/module.c b/kernel/module.c
index 54cb6e0..e1eca74 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -4313,19 +4313,24 @@  static int modules_autoload_privileged_access(const char *name)
 }
 
 /**
- * modules_autoload_access - Determine whether a module auto-load is permitted
+ * modules_autoload_access - Determine whether the task is allowed to perform a
+ *			     module auto-load request
+ * @task: The task performing the request
  * @kmod_name: The module name
  *
- * Determine whether a module should be automatically loaded or not. The check
- * uses the sysctl "modules_autoload" value.
+ * Determine whether the task is allowed to perform a module auto-load request.
+ * This checks the per-task "modules_autoload" flag, if the access is not denied,
+ * then the global sysctl "modules_autoload" is evaluated.
  *
  * Returns 0 if the module request is allowed or -EPERM if not.
  */
-int modules_autoload_access(char *kmod_name)
+int modules_autoload_access(struct task_struct *task, char *kmod_name)
 {
-	if (modules_autoload == MODULES_AUTOLOAD_ALLOWED)
+	unsigned int autoload = max_t(unsigned int,
+				      modules_autoload, task->modules_autoload);
+	if (autoload == MODULES_AUTOLOAD_ALLOWED)
 		return 0;
-	else if (modules_autoload == MODULES_AUTOLOAD_PRIVILEGED)
+	else if (autoload == MODULES_AUTOLOAD_PRIVILEGED)
 		return modules_autoload_privileged_access(kmod_name);
 
 	/* MODULES_AUTOLOAD_DISABLED */
diff --git a/security/commoncap.c b/security/commoncap.c
index 67a6cfe..bcc1e09 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -886,6 +886,40 @@  static int cap_prctl_drop(unsigned long cap)
 	return commit_creds(new);
 }
 
+static int pr_set_mod_autoload(unsigned long arg2, unsigned long arg3,
+			       unsigned long arg4, unsigned long arg5)
+{
+	if (arg3 || arg4 || arg5)
+		return -EINVAL;
+
+	return task_set_modules_autoload(current, arg2);
+}
+
+static inline int pr_get_mod_autoload(unsigned long arg2, unsigned long arg3,
+				      unsigned long arg4, unsigned long arg5)
+{
+	if (arg2 || arg3 || arg4 || arg5)
+		return -EINVAL;
+
+	return task_modules_autoload(current);
+}
+
+/**
+ * cap_task_alloc - Implement process context allocation for this security module
+ * @task: task being allocated
+ * @clone_flags: contains the clone flags indicating what should be shared.
+ *
+ * Allocate or initialize the task context for this security module.
+ *
+ * Returns 0.
+ */
+int cap_task_alloc(struct task_struct *task, unsigned long clone_flags)
+{
+	task_copy_modules_autoload(task, current);
+
+	return 0;
+}
+
 /**
  * cap_task_prctl - Implement process control functions for this security module
  * @option: The process control function requested
@@ -1015,6 +1049,11 @@  int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 				cap_lower(new->cap_ambient, arg3);
 			return commit_creds(new);
 		}
+	case PR_SET_MODULES_AUTOLOAD:
+		return pr_set_mod_autoload(arg2, arg3, arg4, arg5);
+
+	case PR_GET_MODULES_AUTOLOAD:
+		return pr_get_mod_autoload(arg2, arg3, arg4, arg5);
 
 	default:
 		/* No functionality available - continue with default */
@@ -1070,19 +1109,19 @@  int cap_mmap_file(struct file *file, unsigned long reqprot,
 }
 
 /**
- * cap_kernel_module_request - Determine whether a module auto-load is permitted
+ * cap_kernel_module_request - Determine whether current task is allowed to
+ *			       automatically load the specified module.
  * @kmod_name: The module name
  *
- * Determine whether a module should be automatically loaded due to a request
- * by the current task. Returns 0 if the module request should be allowed
- * -EPERM if not.
+ * Determine whether current task is allowed to automatically load the module.
+ * Returns 0 if current task is allowed to auto-load the module, -EPERM if not.
  */
 int cap_kernel_module_request(char *kmod_name)
 {
 	int ret;
 	char comm[sizeof(current->comm)];
 
-	ret = modules_autoload_access(kmod_name);
+	ret = modules_autoload_access(current, kmod_name);
 	if (ret < 0)
 		pr_notice_ratelimited(
 			"Automatic module loading of %.64s by \"%s\"[%d] was denied\n",
@@ -1106,6 +1145,7 @@  struct security_hook_list capability_hooks[] __lsm_ro_after_init = {
 	LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
 	LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
 	LSM_HOOK_INIT(mmap_file, cap_mmap_file),
+	LSM_HOOK_INIT(task_alloc, cap_task_alloc),
 	LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
 	LSM_HOOK_INIT(task_prctl, cap_task_prctl),
 	LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler),