diff mbox series

[v2] LSM: add SafeSetID module that gates setid calls

Message ID 20190111171302.26152-1-mortonm@chromium.org (mailing list archive)
State New, archived
Headers show
Series [v2] LSM: add SafeSetID module that gates setid calls | expand

Commit Message

Micah Morton Jan. 11, 2019, 5:13 p.m. UTC
From: Micah Morton <mortonm@chromium.org>

SafeSetID gates the setid family of syscalls to restrict UID/GID
transitions from a given UID/GID to only those approved by a
system-wide whitelist. These restrictions also prohibit the given
UIDs/GIDs from obtaining auxiliary privileges associated with
CAP_SET{U/G}ID, such as allowing a user to set up user namespace UID
mappings. For now, only gating the set*uid family of syscalls is
supported, with support for set*gid coming in a future patch set.

Signed-off-by: Micah Morton <mortonm@chromium.org>
---
Changes since the last patch set: Rebase after commit
a35ce66b801631823fc78c8a78d104f9c0976867 got applied to next-general.
As a result of that commit, we can remove the changes in arch/ and the
setuid_syscall function in lsm.c, since this code no longer needs to do
arch-specific operations to see if security_capable is being called from
a setid syscall. Instead, we add the ns_capable_insetid function and
call it from the setid syscalls in kernel/sys.c (rather than calling the
original ns_capable function), which allows us to signal to the
security_capable hooks whether the hook is being called from within a
setid syscall.

 Documentation/admin-guide/LSM/SafeSetID.rst | 107 ++++++++
 Documentation/admin-guide/LSM/index.rst     |   1 +
 include/linux/capability.h                  |   5 +
 kernel/capability.c                         |  19 ++
 kernel/sys.c                                |  10 +-
 security/Kconfig                            |   1 +
 security/Makefile                           |   2 +
 security/safesetid/Kconfig                  |  12 +
 security/safesetid/Makefile                 |   7 +
 security/safesetid/lsm.c                    | 272 ++++++++++++++++++++
 security/safesetid/lsm.h                    |  30 +++
 security/safesetid/securityfs.c             | 189 ++++++++++++++
 12 files changed, 650 insertions(+), 5 deletions(-)
 create mode 100644 Documentation/admin-guide/LSM/SafeSetID.rst
 create mode 100644 security/safesetid/Kconfig
 create mode 100644 security/safesetid/Makefile
 create mode 100644 security/safesetid/lsm.c
 create mode 100644 security/safesetid/lsm.h
 create mode 100644 security/safesetid/securityfs.c

Comments

Kees Cook Jan. 15, 2019, 12:38 a.m. UTC | #1
On Fri, Jan 11, 2019 at 9:13 AM <mortonm@chromium.org> wrote:
>
> From: Micah Morton <mortonm@chromium.org>
>
> SafeSetID gates the setid family of syscalls to restrict UID/GID
> transitions from a given UID/GID to only those approved by a
> system-wide whitelist. These restrictions also prohibit the given
> UIDs/GIDs from obtaining auxiliary privileges associated with
> CAP_SET{U/G}ID, such as allowing a user to set up user namespace UID
> mappings. For now, only gating the set*uid family of syscalls is
> supported, with support for set*gid coming in a future patch set.
>
> Signed-off-by: Micah Morton <mortonm@chromium.org>
> ---
> Changes since the last patch set: Rebase after commit
> a35ce66b801631823fc78c8a78d104f9c0976867 got applied to next-general.
> As a result of that commit, we can remove the changes in arch/ and the
> setuid_syscall function in lsm.c, since this code no longer needs to do
> arch-specific operations to see if security_capable is being called from
> a setid syscall. Instead, we add the ns_capable_insetid function and
> call it from the setid syscalls in kernel/sys.c (rather than calling the
> original ns_capable function), which allows us to signal to the
> security_capable hooks whether the hook is being called from within a
> setid syscall.

I would split this patch into two halfs: the "no op" change that
"marks" all the setid call sites in the first patch, then the LSM
itself in the second patch.

> +bool ns_capable_insetid(struct user_namespace *ns, int cap)
> +{
> +       return ns_capable_common(ns, cap, CAP_OPT_INSETID);
> +}
> +EXPORT_SYMBOL(ns_capable_insetid);

Since we have the noaudit helper still, using this one seems fine to
me. I might bikeshed the name to "ns_capable_setid()". If others don't
want a new helper, then it should be fine to just change the callsites
to the direct ns_capable_common(ns, cap, CAP_OPT_INSETID).

> +static int safesetid_security_capable(const struct cred *cred,
> +                                     struct user_namespace *ns,
> +                                     int cap,
> +                                     unsigned int opts)
> +{
> +       if (cap == CAP_SETUID &&
> +           check_setuid_policy_hashtable_key(cred->uid)) {
> +               if (!(opts & CAP_OPT_INSETID)) {
> +                       /*
> +                        * Deny if we're not in a set*uid() syscall to avoid
> +                        * giving powers gated by CAP_SETUID that are related
> +                        * to functionality other than calling set*uid() (e.g.
> +                        * allowing user to set up userns uid mappings).
> +                        */
> +                       pr_warn("Operation requires CAP_SETUID, which is not available to UID %u for operations besides approved set*uid transitions",
> +                               __kuid_val(cred->uid));
> +                       return -1;
> +                }
> +       }
> +       return 0;
> +}

Much cleaner than the per-arch syscall tests. :)

> +static void setuid_policy_violation(kuid_t parent, kuid_t child)
> +{
> +       pr_warn("UID transition (%d -> %d) blocked",
> +               __kuid_val(parent),
> +               __kuid_val(child));
> +        /*
> +         * Kill this process to avoid potential security vulnerabilities
> +         * that could arise from a missing whitelist entry preventing a
> +         * privileged process from dropping to a lesser-privileged one.
> +         */
> +        do_exit(SIGKILL);

I think I asked earlier if this should be an unblockable signal raise
instead of a do_exit(). I don't remember if that got answered?

> +}
> +
> +static int check_uid_transition(kuid_t parent, kuid_t child)
> +{
> +       if (check_setuid_policy_hashtable_key_value(parent, child))
> +               return 0;
> +       setuid_policy_violation(parent, child);
> +       return -1;
> +}

Any reason not to just collapse setuid_policy_violation() into this function?

> +
> +/*
> + * Check whether there is either an exception for user under old cred struct to
> + * set*uid to user under new cred struct, or the UID transition is allowed (by
> + * Linux set*uid rules) even without CAP_SETUID.
> + */
> +static int safesetid_task_fix_setuid(struct cred *new,
> +                                    const struct cred *old,
> +                                    int flags)
> +{
> +
> +       /* Do nothing if there are no setuid restrictions for this UID. */
> +       if (!check_setuid_policy_hashtable_key(old->uid))
> +               return 0;
> +
> +       switch (flags) {
> +       case LSM_SETID_RE:
> +               /*
> +                * Users for which setuid restrictions exist can only set the
> +                * real UID to the real UID or the effective UID, unless an
> +                * explicit whitelist policy allows the transition.
> +                */
> +               if (!uid_eq(old->uid, new->uid) &&
> +                       !uid_eq(old->euid, new->uid)) {
> +                       return check_uid_transition(old->uid, new->uid);
> +               }
> +               /*
> +                * Users for which setuid restrictions exist can only set the
> +                * effective UID to the real UID, the effective UID, or the
> +                * saved set-UID, unless an explicit whitelist policy allows
> +                * the transition.
> +                */
> +               if (!uid_eq(old->uid, new->euid) &&
> +                       !uid_eq(old->euid, new->euid) &&
> +                       !uid_eq(old->suid, new->euid)) {
> +                       return check_uid_transition(old->euid, new->euid);
> +               }
> +               break;
> +       case LSM_SETID_ID:
> +               /*
> +                * Users for which setuid restrictions exist cannot change the
> +                * real UID or saved set-UID unless an explicit whitelist
> +                * policy allows the transition.
> +                */
> +               if (!uid_eq(old->uid, new->uid))
> +                       return check_uid_transition(old->uid, new->uid);
> +               if (!uid_eq(old->suid, new->suid))
> +                       return check_uid_transition(old->suid, new->suid);
> +               break;
> +       case LSM_SETID_RES:
> +               /*
> +                * Users for which setuid restrictions exist cannot change the
> +                * real UID, effective UID, or saved set-UID to anything but
> +                * one of: the current real UID, the current effective UID or
> +                * the current saved set-user-ID unless an explicit whitelist
> +                * policy allows the transition.
> +                */
> +               if (!uid_eq(new->uid, old->uid) &&
> +                       !uid_eq(new->uid, old->euid) &&
> +                       !uid_eq(new->uid, old->suid)) {
> +                       return check_uid_transition(old->uid, new->uid);
> +               }
> +               if (!uid_eq(new->euid, old->uid) &&
> +                       !uid_eq(new->euid, old->euid) &&
> +                       !uid_eq(new->euid, old->suid)) {
> +                       return check_uid_transition(old->euid, new->euid);
> +               }
> +               if (!uid_eq(new->suid, old->uid) &&
> +                       !uid_eq(new->suid, old->euid) &&
> +                       !uid_eq(new->suid, old->suid)) {
> +                       return check_uid_transition(old->suid, new->suid);
> +               }
> +               break;
> +       case LSM_SETID_FS:
> +               /*
> +                * Users for which setuid restrictions exist cannot change the
> +                * filesystem UID to anything but one of: the current real UID,
> +                * the current effective UID or the current saved set-UID
> +                * unless an explicit whitelist policy allows the transition.
> +                */
> +               if (!uid_eq(new->fsuid, old->uid)  &&
> +                       !uid_eq(new->fsuid, old->euid)  &&
> +                       !uid_eq(new->fsuid, old->suid) &&
> +                       !uid_eq(new->fsuid, old->fsuid)) {
> +                       return check_uid_transition(old->fsuid, new->fsuid);
> +               }
> +               break;
> +       }
> +       return 0;
> +}
> +
> +int add_safesetid_whitelist_entry(kuid_t parent, kuid_t child)
> +{
> +       struct entry *new;
> +
> +       /* Return if entry already exists */
> +       if (check_setuid_policy_hashtable_key_value(parent, child))
> +               return 0;
> +
> +       new = kzalloc(sizeof(struct entry), GFP_KERNEL);
> +       if (!new)
> +               return -ENOMEM;
> +       new->parent_kuid = __kuid_val(parent);
> +       new->child_kuid = __kuid_val(child);
> +       spin_lock(&safesetid_whitelist_hashtable_spinlock);
> +       hash_add_rcu(safesetid_whitelist_hashtable,
> +                    &new->next,
> +                    __kuid_val(parent));
> +       spin_unlock(&safesetid_whitelist_hashtable_spinlock);
> +       return 0;
> +}
> +
> +void flush_safesetid_whitelist_entries(void)
> +{
> +       struct entry *entry;
> +       struct hlist_node *hlist_node;
> +       unsigned int bkt_loop_cursor;
> +       HLIST_HEAD(free_list);
> +
> +       /*
> +        * Could probably use hash_for_each_rcu here instead, but this should
> +        * be fine as well.
> +        */
> +       spin_lock(&safesetid_whitelist_hashtable_spinlock);
> +       hash_for_each_safe(safesetid_whitelist_hashtable, bkt_loop_cursor,
> +                          hlist_node, entry, next) {
> +               hash_del_rcu(&entry->next);
> +               hlist_add_head(&entry->dlist, &free_list);
> +       }
> +       spin_unlock(&safesetid_whitelist_hashtable_spinlock);
> +       synchronize_rcu();
> +       hlist_for_each_entry_safe(entry, hlist_node, &free_list, dlist) {
> +               hlist_del(&entry->dlist);
> +               kfree(entry);
> +       }
> +}
> +
> +static struct security_hook_list safesetid_security_hooks[] = {
> +       LSM_HOOK_INIT(task_fix_setuid, safesetid_task_fix_setuid),
> +       LSM_HOOK_INIT(capable, safesetid_security_capable)
> +};
> +
> +static int __init safesetid_security_init(void)
> +{
> +       security_add_hooks(safesetid_security_hooks,
> +                          ARRAY_SIZE(safesetid_security_hooks), "safesetid");
> +
> +       return 0;
> +}
> +
> +DEFINE_LSM(safesetid_security_init) = {
> +       .init = safesetid_security_init,
> +};
> diff --git a/security/safesetid/lsm.h b/security/safesetid/lsm.h
> new file mode 100644
> index 000000000000..bf78af9bf314
> --- /dev/null
> +++ b/security/safesetid/lsm.h
> @@ -0,0 +1,30 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * SafeSetID Linux Security Module
> + *
> + * Author: Micah Morton <mortonm@chromium.org>
> + *
> + * Copyright (C) 2018 The Chromium OS Authors.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2, as
> + * published by the Free Software Foundation.
> + *
> + */
> +#ifndef _SAFESETID_H
> +#define _SAFESETID_H
> +
> +#include <linux/types.h>
> +
> +/* Function type. */
> +enum safesetid_whitelist_file_write_type {
> +       SAFESETID_WHITELIST_ADD, /* Add whitelist policy. */
> +       SAFESETID_WHITELIST_FLUSH, /* Flush whitelist policies. */
> +};
> +
> +/* Add entry to safesetid whitelist to allow 'parent' to setid to 'child'. */
> +int add_safesetid_whitelist_entry(kuid_t parent, kuid_t child);
> +
> +void flush_safesetid_whitelist_entries(void);
> +
> +#endif /* _SAFESETID_H */
> diff --git a/security/safesetid/securityfs.c b/security/safesetid/securityfs.c
> new file mode 100644
> index 000000000000..ff5fcf2c1b37
> --- /dev/null
> +++ b/security/safesetid/securityfs.c
> @@ -0,0 +1,189 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * SafeSetID Linux Security Module
> + *
> + * Author: Micah Morton <mortonm@chromium.org>
> + *
> + * Copyright (C) 2018 The Chromium OS Authors.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2, as
> + * published by the Free Software Foundation.
> + *
> + */
> +#include <linux/security.h>
> +#include <linux/cred.h>
> +
> +#include "lsm.h"
> +
> +static struct dentry *safesetid_policy_dir;
> +
> +struct safesetid_file_entry {
> +       const char *name;
> +       enum safesetid_whitelist_file_write_type type;
> +       struct dentry *dentry;
> +};
> +
> +static struct safesetid_file_entry safesetid_files[] = {
> +       {.name = "add_whitelist_policy",
> +        .type = SAFESETID_WHITELIST_ADD},
> +       {.name = "flush_whitelist_policies",
> +        .type = SAFESETID_WHITELIST_FLUSH},
> +};
> +
> +/*
> + * In the case the input buffer contains one or more invalid UIDs, the kuid_t
> + * variables pointed to by 'parent' and 'child' will get updated but this
> + * function will return an error.
> + */
> +static int parse_safesetid_whitelist_policy(const char __user *buf,
> +                                           size_t len,
> +                                           kuid_t *parent,
> +                                           kuid_t *child)
> +{
> +       char *kern_buf;
> +       char *parent_buf;
> +       char *child_buf;
> +       const char separator[] = ":";
> +       int ret;
> +       size_t first_substring_length;
> +       long parsed_parent;
> +       long parsed_child;
> +
> +       /* Duplicate string from user memory and NULL-terminate */
> +       kern_buf = memdup_user_nul(buf, len);
> +       if (IS_ERR(kern_buf))
> +               return PTR_ERR(kern_buf);
> +
> +       /*
> +        * Format of |buf| string should be <UID>:<UID>.
> +        * Find location of ":" in kern_buf (copied from |buf|).
> +        */
> +       first_substring_length = strcspn(kern_buf, separator);
> +       if (first_substring_length == 0 || first_substring_length == len) {
> +               ret = -EINVAL;
> +               goto free_kern;
> +       }
> +
> +       parent_buf = kmemdup_nul(kern_buf, first_substring_length, GFP_KERNEL);
> +       if (!parent_buf) {
> +               ret = -ENOMEM;
> +               goto free_kern;
> +       }
> +
> +       ret = kstrtol(parent_buf, 0, &parsed_parent);
> +       if (ret)
> +               goto free_both;
> +
> +       child_buf = kern_buf + first_substring_length + 1;
> +       ret = kstrtol(child_buf, 0, &parsed_child);
> +       if (ret)
> +               goto free_both;
> +
> +       *parent = make_kuid(current_user_ns(), parsed_parent);
> +       if (!uid_valid(*parent)) {
> +               ret = -EINVAL;
> +               goto free_both;
> +       }
> +
> +       *child = make_kuid(current_user_ns(), parsed_child);
> +       if (!uid_valid(*child)) {
> +               ret = -EINVAL;
> +               goto free_both;
> +       }
> +
> +free_both:
> +       kfree(parent_buf);
> +free_kern:
> +       kfree(kern_buf);
> +       return ret;
> +}
> +
> +static ssize_t safesetid_file_write(struct file *file,
> +                                   const char __user *buf,
> +                                   size_t len,
> +                                   loff_t *ppos)
> +{
> +       struct safesetid_file_entry *file_entry =
> +               file->f_inode->i_private;
> +       kuid_t parent;
> +       kuid_t child;
> +       int ret;
> +
> +       if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))

Maybe CAP_MAC_ADMIN instead of (the overloaded) CAP_SYS_ADMIN?

> +               return -EPERM;
> +
> +       if (*ppos != 0)
> +               return -EINVAL;
> +
> +       if (file_entry->type == SAFESETID_WHITELIST_FLUSH) {
> +               flush_safesetid_whitelist_entries();
> +               return len;
> +       }
> +
> +       /*
> +        * If we get to here, must be the case that file_entry->type equals
> +        * SAFESETID_WHITELIST_ADD

It seems a bit silly with only two options here, but it'll change for
gids, yes? How about just building a switch() around file_entry->type
instead and avoid needing to refactor this later?

> +        */
> +       ret = parse_safesetid_whitelist_policy(buf, len, &parent,
> +                                                        &child);
> +       if (ret)
> +               return ret;
> +
> +       ret = add_safesetid_whitelist_entry(parent, child);
> +       if (ret)
> +               return ret;
> +
> +       /* Return len on success so caller won't keep trying to write */
> +       return len;
> +}
> +
> +static const struct file_operations safesetid_file_fops = {
> +       .write = safesetid_file_write,
> +};
> +
> +static void safesetid_shutdown_securityfs(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < ARRAY_SIZE(safesetid_files); ++i) {
> +               struct safesetid_file_entry *entry =
> +                       &safesetid_files[i];
> +               securityfs_remove(entry->dentry);
> +               entry->dentry = NULL;
> +       }
> +
> +       securityfs_remove(safesetid_policy_dir);
> +       safesetid_policy_dir = NULL;
> +}
> +
> +static int __init safesetid_init_securityfs(void)
> +{
> +       int i;
> +       int ret;
> +
> +       safesetid_policy_dir = securityfs_create_dir("safesetid", NULL);
> +       if (!safesetid_policy_dir) {
> +               ret = PTR_ERR(safesetid_policy_dir);
> +               goto error;
> +       }
> +
> +       for (i = 0; i < ARRAY_SIZE(safesetid_files); ++i) {
> +               struct safesetid_file_entry *entry =
> +                       &safesetid_files[i];
> +               entry->dentry = securityfs_create_file(
> +                       entry->name, 0200, safesetid_policy_dir,
> +                       entry, &safesetid_file_fops);
> +               if (IS_ERR(entry->dentry)) {
> +                       ret = PTR_ERR(entry->dentry);
> +                       goto error;
> +               }
> +       }
> +
> +       return 0;
> +
> +error:
> +       safesetid_shutdown_securityfs();
> +       return ret;
> +}
> +fs_initcall(safesetid_init_securityfs);
> --
> 2.20.1.97.g81188d93c3-goog
>

But overall, it looks good to me. :)
James Morris Jan. 15, 2019, 4:07 a.m. UTC | #2
On Fri, 11 Jan 2019, mortonm@chromium.org wrote:

> From: Micah Morton <mortonm@chromium.org>
> 
> SafeSetID gates the setid family of syscalls to restrict UID/GID
> transitions from a given UID/GID to only those approved by a
> system-wide whitelist. These restrictions also prohibit the given
> UIDs/GIDs from obtaining auxiliary privileges associated with
> CAP_SET{U/G}ID, such as allowing a user to set up user namespace UID
> mappings. For now, only gating the set*uid family of syscalls is
> supported, with support for set*gid coming in a future patch set.
> 

I can't recall if this has been mentioned, but is this code already 
shipping in any distros or products, and are any distros planning on 
enabling this feature?



- James
Micah Morton Jan. 15, 2019, 7:42 p.m. UTC | #3
On Mon, Jan 14, 2019 at 8:07 PM James Morris <jmorris@namei.org> wrote:
>
> On Fri, 11 Jan 2019, mortonm@chromium.org wrote:
>
> > From: Micah Morton <mortonm@chromium.org>
> >
> > SafeSetID gates the setid family of syscalls to restrict UID/GID
> > transitions from a given UID/GID to only those approved by a
> > system-wide whitelist. These restrictions also prohibit the given
> > UIDs/GIDs from obtaining auxiliary privileges associated with
> > CAP_SET{U/G}ID, such as allowing a user to set up user namespace UID
> > mappings. For now, only gating the set*uid family of syscalls is
> > supported, with support for set*gid coming in a future patch set.
> >
>
> I can't recall if this has been mentioned, but is this code already
> shipping in any distros or products, and are any distros planning on
> enabling this feature?

It is shipping on ChromeOS (the hooking is done in our own LSM that we
maintain, but everything else is the same, and we have integration
tests for it). We use it to lock down a handful of system daemons that
need to switch to certain, predetermined UIDs on the system (but not
root). There look to be a few use cases for this LSM in Android as
well, which is a possibility in the future.

>
>
>
> - James
> --
> James Morris
> <jmorris@namei.org>
>
Micah Morton Jan. 15, 2019, 7:49 p.m. UTC | #4
On Mon, Jan 14, 2019 at 4:38 PM Kees Cook <keescook@chromium.org> wrote:
>
> On Fri, Jan 11, 2019 at 9:13 AM <mortonm@chromium.org> wrote:
> >
> > From: Micah Morton <mortonm@chromium.org>
> >
> > SafeSetID gates the setid family of syscalls to restrict UID/GID
> > transitions from a given UID/GID to only those approved by a
> > system-wide whitelist. These restrictions also prohibit the given
> > UIDs/GIDs from obtaining auxiliary privileges associated with
> > CAP_SET{U/G}ID, such as allowing a user to set up user namespace UID
> > mappings. For now, only gating the set*uid family of syscalls is
> > supported, with support for set*gid coming in a future patch set.
> >
> > Signed-off-by: Micah Morton <mortonm@chromium.org>
> > ---
> > Changes since the last patch set: Rebase after commit
> > a35ce66b801631823fc78c8a78d104f9c0976867 got applied to next-general.
> > As a result of that commit, we can remove the changes in arch/ and the
> > setuid_syscall function in lsm.c, since this code no longer needs to do
> > arch-specific operations to see if security_capable is being called from
> > a setid syscall. Instead, we add the ns_capable_insetid function and
> > call it from the setid syscalls in kernel/sys.c (rather than calling the
> > original ns_capable function), which allows us to signal to the
> > security_capable hooks whether the hook is being called from within a
> > setid syscall.
>
> I would split this patch into two halfs: the "no op" change that
> "marks" all the setid call sites in the first patch, then the LSM
> itself in the second patch.

Done.

>
> > +bool ns_capable_insetid(struct user_namespace *ns, int cap)
> > +{
> > +       return ns_capable_common(ns, cap, CAP_OPT_INSETID);
> > +}
> > +EXPORT_SYMBOL(ns_capable_insetid);
>
> Since we have the noaudit helper still, using this one seems fine to
> me. I might bikeshed the name to "ns_capable_setid()". If others don't
> want a new helper, then it should be fine to just change the callsites
> to the direct ns_capable_common(ns, cap, CAP_OPT_INSETID).

Done.

>
> > +static int safesetid_security_capable(const struct cred *cred,
> > +                                     struct user_namespace *ns,
> > +                                     int cap,
> > +                                     unsigned int opts)
> > +{
> > +       if (cap == CAP_SETUID &&
> > +           check_setuid_policy_hashtable_key(cred->uid)) {
> > +               if (!(opts & CAP_OPT_INSETID)) {
> > +                       /*
> > +                        * Deny if we're not in a set*uid() syscall to avoid
> > +                        * giving powers gated by CAP_SETUID that are related
> > +                        * to functionality other than calling set*uid() (e.g.
> > +                        * allowing user to set up userns uid mappings).
> > +                        */
> > +                       pr_warn("Operation requires CAP_SETUID, which is not available to UID %u for operations besides approved set*uid transitions",
> > +                               __kuid_val(cred->uid));
> > +                       return -1;
> > +                }
> > +       }
> > +       return 0;
> > +}
>
> Much cleaner than the per-arch syscall tests. :)
>
> > +static void setuid_policy_violation(kuid_t parent, kuid_t child)
> > +{
> > +       pr_warn("UID transition (%d -> %d) blocked",
> > +               __kuid_val(parent),
> > +               __kuid_val(child));
> > +        /*
> > +         * Kill this process to avoid potential security vulnerabilities
> > +         * that could arise from a missing whitelist entry preventing a
> > +         * privileged process from dropping to a lesser-privileged one.
> > +         */
> > +        do_exit(SIGKILL);
>
> I think I asked earlier if this should be an unblockable signal raise
> instead of a do_exit(). I don't remember if that got answered?

Could you elaborate on this a bit, or share a pointer to some code/doc
that explains the difference? I don't recall this point being raised
before (might have missed it), and I'm no expert on the different
approaches to killing a process in this way.

>
> > +}
> > +
> > +static int check_uid_transition(kuid_t parent, kuid_t child)
> > +{
> > +       if (check_setuid_policy_hashtable_key_value(parent, child))
> > +               return 0;
> > +       setuid_policy_violation(parent, child);
> > +       return -1;
> > +}
>
> Any reason not to just collapse setuid_policy_violation() into this function?

Done.

>
> > +
> > +/*
> > + * Check whether there is either an exception for user under old cred struct to
> > + * set*uid to user under new cred struct, or the UID transition is allowed (by
> > + * Linux set*uid rules) even without CAP_SETUID.
> > + */
> > +static int safesetid_task_fix_setuid(struct cred *new,
> > +                                    const struct cred *old,
> > +                                    int flags)
> > +{
> > +
> > +       /* Do nothing if there are no setuid restrictions for this UID. */
> > +       if (!check_setuid_policy_hashtable_key(old->uid))
> > +               return 0;
> > +
> > +       switch (flags) {
> > +       case LSM_SETID_RE:
> > +               /*
> > +                * Users for which setuid restrictions exist can only set the
> > +                * real UID to the real UID or the effective UID, unless an
> > +                * explicit whitelist policy allows the transition.
> > +                */
> > +               if (!uid_eq(old->uid, new->uid) &&
> > +                       !uid_eq(old->euid, new->uid)) {
> > +                       return check_uid_transition(old->uid, new->uid);
> > +               }
> > +               /*
> > +                * Users for which setuid restrictions exist can only set the
> > +                * effective UID to the real UID, the effective UID, or the
> > +                * saved set-UID, unless an explicit whitelist policy allows
> > +                * the transition.
> > +                */
> > +               if (!uid_eq(old->uid, new->euid) &&
> > +                       !uid_eq(old->euid, new->euid) &&
> > +                       !uid_eq(old->suid, new->euid)) {
> > +                       return check_uid_transition(old->euid, new->euid);
> > +               }
> > +               break;
> > +       case LSM_SETID_ID:
> > +               /*
> > +                * Users for which setuid restrictions exist cannot change the
> > +                * real UID or saved set-UID unless an explicit whitelist
> > +                * policy allows the transition.
> > +                */
> > +               if (!uid_eq(old->uid, new->uid))
> > +                       return check_uid_transition(old->uid, new->uid);
> > +               if (!uid_eq(old->suid, new->suid))
> > +                       return check_uid_transition(old->suid, new->suid);
> > +               break;
> > +       case LSM_SETID_RES:
> > +               /*
> > +                * Users for which setuid restrictions exist cannot change the
> > +                * real UID, effective UID, or saved set-UID to anything but
> > +                * one of: the current real UID, the current effective UID or
> > +                * the current saved set-user-ID unless an explicit whitelist
> > +                * policy allows the transition.
> > +                */
> > +               if (!uid_eq(new->uid, old->uid) &&
> > +                       !uid_eq(new->uid, old->euid) &&
> > +                       !uid_eq(new->uid, old->suid)) {
> > +                       return check_uid_transition(old->uid, new->uid);
> > +               }
> > +               if (!uid_eq(new->euid, old->uid) &&
> > +                       !uid_eq(new->euid, old->euid) &&
> > +                       !uid_eq(new->euid, old->suid)) {
> > +                       return check_uid_transition(old->euid, new->euid);
> > +               }
> > +               if (!uid_eq(new->suid, old->uid) &&
> > +                       !uid_eq(new->suid, old->euid) &&
> > +                       !uid_eq(new->suid, old->suid)) {
> > +                       return check_uid_transition(old->suid, new->suid);
> > +               }
> > +               break;
> > +       case LSM_SETID_FS:
> > +               /*
> > +                * Users for which setuid restrictions exist cannot change the
> > +                * filesystem UID to anything but one of: the current real UID,
> > +                * the current effective UID or the current saved set-UID
> > +                * unless an explicit whitelist policy allows the transition.
> > +                */
> > +               if (!uid_eq(new->fsuid, old->uid)  &&
> > +                       !uid_eq(new->fsuid, old->euid)  &&
> > +                       !uid_eq(new->fsuid, old->suid) &&
> > +                       !uid_eq(new->fsuid, old->fsuid)) {
> > +                       return check_uid_transition(old->fsuid, new->fsuid);
> > +               }
> > +               break;
> > +       }
> > +       return 0;
> > +}
> > +
> > +int add_safesetid_whitelist_entry(kuid_t parent, kuid_t child)
> > +{
> > +       struct entry *new;
> > +
> > +       /* Return if entry already exists */
> > +       if (check_setuid_policy_hashtable_key_value(parent, child))
> > +               return 0;
> > +
> > +       new = kzalloc(sizeof(struct entry), GFP_KERNEL);
> > +       if (!new)
> > +               return -ENOMEM;
> > +       new->parent_kuid = __kuid_val(parent);
> > +       new->child_kuid = __kuid_val(child);
> > +       spin_lock(&safesetid_whitelist_hashtable_spinlock);
> > +       hash_add_rcu(safesetid_whitelist_hashtable,
> > +                    &new->next,
> > +                    __kuid_val(parent));
> > +       spin_unlock(&safesetid_whitelist_hashtable_spinlock);
> > +       return 0;
> > +}
> > +
> > +void flush_safesetid_whitelist_entries(void)
> > +{
> > +       struct entry *entry;
> > +       struct hlist_node *hlist_node;
> > +       unsigned int bkt_loop_cursor;
> > +       HLIST_HEAD(free_list);
> > +
> > +       /*
> > +        * Could probably use hash_for_each_rcu here instead, but this should
> > +        * be fine as well.
> > +        */
> > +       spin_lock(&safesetid_whitelist_hashtable_spinlock);
> > +       hash_for_each_safe(safesetid_whitelist_hashtable, bkt_loop_cursor,
> > +                          hlist_node, entry, next) {
> > +               hash_del_rcu(&entry->next);
> > +               hlist_add_head(&entry->dlist, &free_list);
> > +       }
> > +       spin_unlock(&safesetid_whitelist_hashtable_spinlock);
> > +       synchronize_rcu();
> > +       hlist_for_each_entry_safe(entry, hlist_node, &free_list, dlist) {
> > +               hlist_del(&entry->dlist);
> > +               kfree(entry);
> > +       }
> > +}
> > +
> > +static struct security_hook_list safesetid_security_hooks[] = {
> > +       LSM_HOOK_INIT(task_fix_setuid, safesetid_task_fix_setuid),
> > +       LSM_HOOK_INIT(capable, safesetid_security_capable)
> > +};
> > +
> > +static int __init safesetid_security_init(void)
> > +{
> > +       security_add_hooks(safesetid_security_hooks,
> > +                          ARRAY_SIZE(safesetid_security_hooks), "safesetid");
> > +
> > +       return 0;
> > +}
> > +
> > +DEFINE_LSM(safesetid_security_init) = {
> > +       .init = safesetid_security_init,
> > +};
> > diff --git a/security/safesetid/lsm.h b/security/safesetid/lsm.h
> > new file mode 100644
> > index 000000000000..bf78af9bf314
> > --- /dev/null
> > +++ b/security/safesetid/lsm.h
> > @@ -0,0 +1,30 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +/*
> > + * SafeSetID Linux Security Module
> > + *
> > + * Author: Micah Morton <mortonm@chromium.org>
> > + *
> > + * Copyright (C) 2018 The Chromium OS Authors.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2, as
> > + * published by the Free Software Foundation.
> > + *
> > + */
> > +#ifndef _SAFESETID_H
> > +#define _SAFESETID_H
> > +
> > +#include <linux/types.h>
> > +
> > +/* Function type. */
> > +enum safesetid_whitelist_file_write_type {
> > +       SAFESETID_WHITELIST_ADD, /* Add whitelist policy. */
> > +       SAFESETID_WHITELIST_FLUSH, /* Flush whitelist policies. */
> > +};
> > +
> > +/* Add entry to safesetid whitelist to allow 'parent' to setid to 'child'. */
> > +int add_safesetid_whitelist_entry(kuid_t parent, kuid_t child);
> > +
> > +void flush_safesetid_whitelist_entries(void);
> > +
> > +#endif /* _SAFESETID_H */
> > diff --git a/security/safesetid/securityfs.c b/security/safesetid/securityfs.c
> > new file mode 100644
> > index 000000000000..ff5fcf2c1b37
> > --- /dev/null
> > +++ b/security/safesetid/securityfs.c
> > @@ -0,0 +1,189 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * SafeSetID Linux Security Module
> > + *
> > + * Author: Micah Morton <mortonm@chromium.org>
> > + *
> > + * Copyright (C) 2018 The Chromium OS Authors.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2, as
> > + * published by the Free Software Foundation.
> > + *
> > + */
> > +#include <linux/security.h>
> > +#include <linux/cred.h>
> > +
> > +#include "lsm.h"
> > +
> > +static struct dentry *safesetid_policy_dir;
> > +
> > +struct safesetid_file_entry {
> > +       const char *name;
> > +       enum safesetid_whitelist_file_write_type type;
> > +       struct dentry *dentry;
> > +};
> > +
> > +static struct safesetid_file_entry safesetid_files[] = {
> > +       {.name = "add_whitelist_policy",
> > +        .type = SAFESETID_WHITELIST_ADD},
> > +       {.name = "flush_whitelist_policies",
> > +        .type = SAFESETID_WHITELIST_FLUSH},
> > +};
> > +
> > +/*
> > + * In the case the input buffer contains one or more invalid UIDs, the kuid_t
> > + * variables pointed to by 'parent' and 'child' will get updated but this
> > + * function will return an error.
> > + */
> > +static int parse_safesetid_whitelist_policy(const char __user *buf,
> > +                                           size_t len,
> > +                                           kuid_t *parent,
> > +                                           kuid_t *child)
> > +{
> > +       char *kern_buf;
> > +       char *parent_buf;
> > +       char *child_buf;
> > +       const char separator[] = ":";
> > +       int ret;
> > +       size_t first_substring_length;
> > +       long parsed_parent;
> > +       long parsed_child;
> > +
> > +       /* Duplicate string from user memory and NULL-terminate */
> > +       kern_buf = memdup_user_nul(buf, len);
> > +       if (IS_ERR(kern_buf))
> > +               return PTR_ERR(kern_buf);
> > +
> > +       /*
> > +        * Format of |buf| string should be <UID>:<UID>.
> > +        * Find location of ":" in kern_buf (copied from |buf|).
> > +        */
> > +       first_substring_length = strcspn(kern_buf, separator);
> > +       if (first_substring_length == 0 || first_substring_length == len) {
> > +               ret = -EINVAL;
> > +               goto free_kern;
> > +       }
> > +
> > +       parent_buf = kmemdup_nul(kern_buf, first_substring_length, GFP_KERNEL);
> > +       if (!parent_buf) {
> > +               ret = -ENOMEM;
> > +               goto free_kern;
> > +       }
> > +
> > +       ret = kstrtol(parent_buf, 0, &parsed_parent);
> > +       if (ret)
> > +               goto free_both;
> > +
> > +       child_buf = kern_buf + first_substring_length + 1;
> > +       ret = kstrtol(child_buf, 0, &parsed_child);
> > +       if (ret)
> > +               goto free_both;
> > +
> > +       *parent = make_kuid(current_user_ns(), parsed_parent);
> > +       if (!uid_valid(*parent)) {
> > +               ret = -EINVAL;
> > +               goto free_both;
> > +       }
> > +
> > +       *child = make_kuid(current_user_ns(), parsed_child);
> > +       if (!uid_valid(*child)) {
> > +               ret = -EINVAL;
> > +               goto free_both;
> > +       }
> > +
> > +free_both:
> > +       kfree(parent_buf);
> > +free_kern:
> > +       kfree(kern_buf);
> > +       return ret;
> > +}
> > +
> > +static ssize_t safesetid_file_write(struct file *file,
> > +                                   const char __user *buf,
> > +                                   size_t len,
> > +                                   loff_t *ppos)
> > +{
> > +       struct safesetid_file_entry *file_entry =
> > +               file->f_inode->i_private;
> > +       kuid_t parent;
> > +       kuid_t child;
> > +       int ret;
> > +
> > +       if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
>
> Maybe CAP_MAC_ADMIN instead of (the overloaded) CAP_SYS_ADMIN?

Makes sense. Done.

>
> > +               return -EPERM;
> > +
> > +       if (*ppos != 0)
> > +               return -EINVAL;
> > +
> > +       if (file_entry->type == SAFESETID_WHITELIST_FLUSH) {
> > +               flush_safesetid_whitelist_entries();
> > +               return len;
> > +       }
> > +
> > +       /*
> > +        * If we get to here, must be the case that file_entry->type equals
> > +        * SAFESETID_WHITELIST_ADD
>
> It seems a bit silly with only two options here, but it'll change for
> gids, yes? How about just building a switch() around file_entry->type
> instead and avoid needing to refactor this later?

Yes, there will likely be more entries when gids are introduced. Done.

>
> > +        */
> > +       ret = parse_safesetid_whitelist_policy(buf, len, &parent,
> > +                                                        &child);
> > +       if (ret)
> > +               return ret;
> > +
> > +       ret = add_safesetid_whitelist_entry(parent, child);
> > +       if (ret)
> > +               return ret;
> > +
> > +       /* Return len on success so caller won't keep trying to write */
> > +       return len;
> > +}
> > +
> > +static const struct file_operations safesetid_file_fops = {
> > +       .write = safesetid_file_write,
> > +};
> > +
> > +static void safesetid_shutdown_securityfs(void)
> > +{
> > +       int i;
> > +
> > +       for (i = 0; i < ARRAY_SIZE(safesetid_files); ++i) {
> > +               struct safesetid_file_entry *entry =
> > +                       &safesetid_files[i];
> > +               securityfs_remove(entry->dentry);
> > +               entry->dentry = NULL;
> > +       }
> > +
> > +       securityfs_remove(safesetid_policy_dir);
> > +       safesetid_policy_dir = NULL;
> > +}
> > +
> > +static int __init safesetid_init_securityfs(void)
> > +{
> > +       int i;
> > +       int ret;
> > +
> > +       safesetid_policy_dir = securityfs_create_dir("safesetid", NULL);
> > +       if (!safesetid_policy_dir) {
> > +               ret = PTR_ERR(safesetid_policy_dir);
> > +               goto error;
> > +       }
> > +
> > +       for (i = 0; i < ARRAY_SIZE(safesetid_files); ++i) {
> > +               struct safesetid_file_entry *entry =
> > +                       &safesetid_files[i];
> > +               entry->dentry = securityfs_create_file(
> > +                       entry->name, 0200, safesetid_policy_dir,
> > +                       entry, &safesetid_file_fops);
> > +               if (IS_ERR(entry->dentry)) {
> > +                       ret = PTR_ERR(entry->dentry);
> > +                       goto error;
> > +               }
> > +       }
> > +
> > +       return 0;
> > +
> > +error:
> > +       safesetid_shutdown_securityfs();
> > +       return ret;
> > +}
> > +fs_initcall(safesetid_init_securityfs);
> > --
> > 2.20.1.97.g81188d93c3-goog
> >
>
> But overall, it looks good to me. :)
>
> --
> Kees Cook
Kees Cook Jan. 15, 2019, 7:53 p.m. UTC | #5
On Tue, Jan 15, 2019 at 11:49 AM Micah Morton <mortonm@chromium.org> wrote:
> On Mon, Jan 14, 2019 at 4:38 PM Kees Cook <keescook@chromium.org> wrote:
> > On Fri, Jan 11, 2019 at 9:13 AM <mortonm@chromium.org> wrote:
> > > From: Micah Morton <mortonm@chromium.org>
> > > +static void setuid_policy_violation(kuid_t parent, kuid_t child)
> > > +{
> > > +       pr_warn("UID transition (%d -> %d) blocked",
> > > +               __kuid_val(parent),
> > > +               __kuid_val(child));
> > > +        /*
> > > +         * Kill this process to avoid potential security vulnerabilities
> > > +         * that could arise from a missing whitelist entry preventing a
> > > +         * privileged process from dropping to a lesser-privileged one.
> > > +         */
> > > +        do_exit(SIGKILL);
> >
> > I think I asked earlier if this should be an unblockable signal raise
> > instead of a do_exit(). I don't remember if that got answered?
>
> Could you elaborate on this a bit, or share a pointer to some code/doc
> that explains the difference? I don't recall this point being raised
> before (might have missed it), and I'm no expert on the different
> approaches to killing a process in this way.

Sure! So, do_exit() is a big hammer, and skips a lot of clean-up-like
things (for example, this will skip core dumping, if it was desired by
the process, etc). It certainly has its place (and this may be it),
but I think it would be more sensible to use:

force_sig(SIGKILL, current);

and then the regular processing continues after this, and the kernel
will check for pending signals before returning to userspace. Though
please check this with strace to make sure the bad setuid() call never
returns... if it does, then I've got this wrong and do_exit() really
is appropriate here.
diff mbox series

Patch

diff --git a/Documentation/admin-guide/LSM/SafeSetID.rst b/Documentation/admin-guide/LSM/SafeSetID.rst
new file mode 100644
index 000000000000..ffb64be67f7a
--- /dev/null
+++ b/Documentation/admin-guide/LSM/SafeSetID.rst
@@ -0,0 +1,107 @@ 
+=========
+SafeSetID
+=========
+SafeSetID is an LSM module that gates the setid family of syscalls to restrict
+UID/GID transitions from a given UID/GID to only those approved by a
+system-wide whitelist. These restrictions also prohibit the given UIDs/GIDs
+from obtaining auxiliary privileges associated with CAP_SET{U/G}ID, such as
+allowing a user to set up user namespace UID mappings.
+
+
+Background
+==========
+In absence of file capabilities, processes spawned on a Linux system that need
+to switch to a different user must be spawned with CAP_SETUID privileges.
+CAP_SETUID is granted to programs running as root or those running as a non-root
+user that have been explicitly given the CAP_SETUID runtime capability. It is
+often preferable to use Linux runtime capabilities rather than file
+capabilities, since using file capabilities to run a program with elevated
+privileges opens up possible security holes since any user with access to the
+file can exec() that program to gain the elevated privileges.
+
+While it is possible to implement a tree of processes by giving full
+CAP_SET{U/G}ID capabilities, this is often at odds with the goals of running a
+tree of processes under non-root user(s) in the first place. Specifically,
+since CAP_SETUID allows changing to any user on the system, including the root
+user, it is an overpowered capability for what is needed in this scenario,
+especially since programs often only call setuid() to drop privileges to a
+lesser-privileged user -- not elevate privileges. Unfortunately, there is no
+generally feasible way in Linux to restrict the potential UIDs that a user can
+switch to through setuid() beyond allowing a switch to any user on the system.
+This SafeSetID LSM seeks to provide a solution for restricting setid
+capabilities in such a way.
+
+The main use case for this LSM is to allow a non-root program to transition to
+other untrusted uids without full blown CAP_SETUID capabilities. The non-root
+program would still need CAP_SETUID to do any kind of transition, but the
+additional restrictions imposed by this LSM would mean it is a "safer" version
+of CAP_SETUID since the non-root program cannot take advantage of CAP_SETUID to
+do any unapproved actions (e.g. setuid to uid 0 or create/enter new user
+namespace). The higher level goal is to allow for uid-based sandboxing of system
+services without having to give out CAP_SETUID all over the place just so that
+non-root programs can drop to even-lesser-privileged uids. This is especially
+relevant when one non-root daemon on the system should be allowed to spawn other
+processes as different uids, but its undesirable to give the daemon a
+basically-root-equivalent CAP_SETUID.
+
+
+Other Approaches Considered
+===========================
+
+Solve this problem in userspace
+-------------------------------
+For candidate applications that would like to have restricted setid capabilities
+as implemented in this LSM, an alternative option would be to simply take away
+setid capabilities from the application completely and refactor the process
+spawning semantics in the application (e.g. by using a privileged helper program
+to do process spawning and UID/GID transitions). Unfortunately, there are a
+number of semantics around process spawning that would be affected by this, such
+as fork() calls where the program doesn’t immediately call exec() after the
+fork(), parent processes specifying custom environment variables or command line
+args for spawned child processes, or inheritance of file handles across a
+fork()/exec(). Because of this, as solution that uses a privileged helper in
+userspace would likely be less appealing to incorporate into existing projects
+that rely on certain process-spawning semantics in Linux.
+
+Use user namespaces
+-------------------
+Another possible approach would be to run a given process tree in its own user
+namespace and give programs in the tree setid capabilities. In this way,
+programs in the tree could change to any desired UID/GID in the context of their
+own user namespace, and only approved UIDs/GIDs could be mapped back to the
+initial system user namespace, affectively preventing privilege escalation.
+Unfortunately, it is not generally feasible to use user namespaces in isolation,
+without pairing them with other namespace types, which is not always an option.
+Linux checks for capabilities based off of the user namespace that “owns” some
+entity. For example, Linux has the notion that network namespaces are owned by
+the user namespace in which they were created. A consequence of this is that
+capability checks for access to a given network namespace are done by checking
+whether a task has the given capability in the context of the user namespace
+that owns the network namespace -- not necessarily the user namespace under
+which the given task runs. Therefore spawning a process in a new user namespace
+effectively prevents it from accessing the network namespace owned by the
+initial namespace. This is a deal-breaker for any application that expects to
+retain the CAP_NET_ADMIN capability for the purpose of adjusting network
+configurations. Using user namespaces in isolation causes problems regarding
+other system interactions, including use of pid namespaces and device creation.
+
+Use an existing LSM
+-------------------
+None of the other in-tree LSMs have the capability to gate setid transitions, or
+even employ the security_task_fix_setuid hook at all. SELinux says of that hook:
+"Since setuid only affects the current process, and since the SELinux controls
+are not based on the Linux identity attributes, SELinux does not need to control
+this operation."
+
+
+Directions for use
+==================
+This LSM hooks the setid syscalls to make sure transitions are allowed if an
+applicable restriction policy is in place. Policies are configured through
+securityfs by writing to the safesetid/add_whitelist_policy and
+safesetid/flush_whitelist_policies files at the location where securityfs is
+mounted. The format for adding a policy is '<UID>:<UID>', using literal
+numbers, such as '123:456'. To flush the policies, any write to the file is
+sufficient. Again, configuring a policy for a UID will prevent that UID from
+obtaining auxiliary setid privileges, such as allowing a user to set up user
+namespace UID mappings.
diff --git a/Documentation/admin-guide/LSM/index.rst b/Documentation/admin-guide/LSM/index.rst
index 9842e21afd4a..a6ba95fbaa9f 100644
--- a/Documentation/admin-guide/LSM/index.rst
+++ b/Documentation/admin-guide/LSM/index.rst
@@ -46,3 +46,4 @@  subdirectories.
    Smack
    tomoyo
    Yama
+   SafeSetID
diff --git a/include/linux/capability.h b/include/linux/capability.h
index f640dcbc880c..f4771974dcfc 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -209,6 +209,7 @@  extern bool has_ns_capability_noaudit(struct task_struct *t,
 extern bool capable(int cap);
 extern bool ns_capable(struct user_namespace *ns, int cap);
 extern bool ns_capable_noaudit(struct user_namespace *ns, int cap);
+extern bool ns_capable_insetid(struct user_namespace *ns, int cap);
 #else
 static inline bool has_capability(struct task_struct *t, int cap)
 {
@@ -240,6 +241,10 @@  static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap)
 {
 	return true;
 }
+static inline bool ns_capable_insetid(struct user_namespace *ns, int cap)
+{
+	return true;
+}
 #endif /* CONFIG_MULTIUSER */
 extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode);
 extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
diff --git a/kernel/capability.c b/kernel/capability.c
index 7718d7dcadc7..184f544fb7b1 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -417,6 +417,25 @@  bool ns_capable_noaudit(struct user_namespace *ns, int cap)
 }
 EXPORT_SYMBOL(ns_capable_noaudit);
 
+/**
+ * ns_capable_insetid - Determine if the current task has a superior capability
+ * in effect, while signalling that this check is being done from within a
+ * setid syscall.
+ * @ns:  The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool ns_capable_insetid(struct user_namespace *ns, int cap)
+{
+	return ns_capable_common(ns, cap, CAP_OPT_INSETID);
+}
+EXPORT_SYMBOL(ns_capable_insetid);
+
 /**
  * capable - Determine if the current task has a superior capability in effect
  * @cap: The capability to be tested for
diff --git a/kernel/sys.c b/kernel/sys.c
index a48cbf1414b8..d31bf3a2afc2 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -516,7 +516,7 @@  long __sys_setreuid(uid_t ruid, uid_t euid)
 		new->uid = kruid;
 		if (!uid_eq(old->uid, kruid) &&
 		    !uid_eq(old->euid, kruid) &&
-		    !ns_capable(old->user_ns, CAP_SETUID))
+		    !ns_capable_insetid(old->user_ns, CAP_SETUID))
 			goto error;
 	}
 
@@ -525,7 +525,7 @@  long __sys_setreuid(uid_t ruid, uid_t euid)
 		if (!uid_eq(old->uid, keuid) &&
 		    !uid_eq(old->euid, keuid) &&
 		    !uid_eq(old->suid, keuid) &&
-		    !ns_capable(old->user_ns, CAP_SETUID))
+		    !ns_capable_insetid(old->user_ns, CAP_SETUID))
 			goto error;
 	}
 
@@ -584,7 +584,7 @@  long __sys_setuid(uid_t uid)
 	old = current_cred();
 
 	retval = -EPERM;
-	if (ns_capable(old->user_ns, CAP_SETUID)) {
+	if (ns_capable_insetid(old->user_ns, CAP_SETUID)) {
 		new->suid = new->uid = kuid;
 		if (!uid_eq(kuid, old->uid)) {
 			retval = set_user(new);
@@ -646,7 +646,7 @@  long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 	old = current_cred();
 
 	retval = -EPERM;
-	if (!ns_capable(old->user_ns, CAP_SETUID)) {
+	if (!ns_capable_insetid(old->user_ns, CAP_SETUID)) {
 		if (ruid != (uid_t) -1        && !uid_eq(kruid, old->uid) &&
 		    !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
 			goto error;
@@ -814,7 +814,7 @@  long __sys_setfsuid(uid_t uid)
 
 	if (uid_eq(kuid, old->uid)  || uid_eq(kuid, old->euid)  ||
 	    uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
-	    ns_capable(old->user_ns, CAP_SETUID)) {
+	    ns_capable_insetid(old->user_ns, CAP_SETUID)) {
 		if (!uid_eq(kuid, old->fsuid)) {
 			new->fsuid = kuid;
 			if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
diff --git a/security/Kconfig b/security/Kconfig
index 78dc12b7eeb3..9efc7a5e3280 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -236,6 +236,7 @@  source "security/tomoyo/Kconfig"
 source "security/apparmor/Kconfig"
 source "security/loadpin/Kconfig"
 source "security/yama/Kconfig"
+source "security/safesetid/Kconfig"
 
 source "security/integrity/Kconfig"
 
diff --git a/security/Makefile b/security/Makefile
index 4d2d3782ddef..c598b904938f 100644
--- a/security/Makefile
+++ b/security/Makefile
@@ -10,6 +10,7 @@  subdir-$(CONFIG_SECURITY_TOMOYO)        += tomoyo
 subdir-$(CONFIG_SECURITY_APPARMOR)	+= apparmor
 subdir-$(CONFIG_SECURITY_YAMA)		+= yama
 subdir-$(CONFIG_SECURITY_LOADPIN)	+= loadpin
+subdir-$(CONFIG_SECURITY_SAFESETID)    += safesetid
 
 # always enable default capabilities
 obj-y					+= commoncap.o
@@ -25,6 +26,7 @@  obj-$(CONFIG_SECURITY_TOMOYO)		+= tomoyo/
 obj-$(CONFIG_SECURITY_APPARMOR)		+= apparmor/
 obj-$(CONFIG_SECURITY_YAMA)		+= yama/
 obj-$(CONFIG_SECURITY_LOADPIN)		+= loadpin/
+obj-$(CONFIG_SECURITY_SAFESETID)       += safesetid/
 obj-$(CONFIG_CGROUP_DEVICE)		+= device_cgroup.o
 
 # Object integrity file lists
diff --git a/security/safesetid/Kconfig b/security/safesetid/Kconfig
new file mode 100644
index 000000000000..bf89a47ffcc8
--- /dev/null
+++ b/security/safesetid/Kconfig
@@ -0,0 +1,12 @@ 
+config SECURITY_SAFESETID
+        bool "Gate setid transitions to limit CAP_SET{U/G}ID capabilities"
+        default n
+        help
+          SafeSetID is an LSM module that gates the setid family of syscalls to
+          restrict UID/GID transitions from a given UID/GID to only those
+          approved by a system-wide whitelist. These restrictions also prohibit
+          the given UIDs/GIDs from obtaining auxiliary privileges associated
+          with CAP_SET{U/G}ID, such as allowing a user to set up user namespace
+          UID mappings.
+
+          If you are unsure how to answer this question, answer N.
diff --git a/security/safesetid/Makefile b/security/safesetid/Makefile
new file mode 100644
index 000000000000..6b0660321164
--- /dev/null
+++ b/security/safesetid/Makefile
@@ -0,0 +1,7 @@ 
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the safesetid LSM.
+#
+
+obj-$(CONFIG_SECURITY_SAFESETID) := safesetid.o
+safesetid-y := lsm.o securityfs.o
diff --git a/security/safesetid/lsm.c b/security/safesetid/lsm.c
new file mode 100644
index 000000000000..a1721ed85544
--- /dev/null
+++ b/security/safesetid/lsm.c
@@ -0,0 +1,272 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * SafeSetID Linux Security Module
+ *
+ * Author: Micah Morton <mortonm@chromium.org>
+ *
+ * Copyright (C) 2018 The Chromium OS Authors.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2, as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#define pr_fmt(fmt) "SafeSetID: " fmt
+
+#include <asm/syscall.h>
+#include <linux/hashtable.h>
+#include <linux/lsm_hooks.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/sched/task_stack.h>
+#include <linux/security.h>
+
+#define NUM_BITS 8 /* 128 buckets in hash table */
+
+static DEFINE_HASHTABLE(safesetid_whitelist_hashtable, NUM_BITS);
+
+/*
+ * Hash table entry to store safesetid policy signifying that 'parent' user
+ * can setid to 'child' user.
+ */
+struct entry {
+	struct hlist_node next;
+	struct hlist_node dlist; /* for deletion cleanup */
+	uint64_t parent_kuid;
+	uint64_t child_kuid;
+};
+
+static DEFINE_SPINLOCK(safesetid_whitelist_hashtable_spinlock);
+
+static bool check_setuid_policy_hashtable_key(kuid_t parent)
+{
+	struct entry *entry;
+
+	rcu_read_lock();
+	hash_for_each_possible_rcu(safesetid_whitelist_hashtable,
+				   entry, next, __kuid_val(parent)) {
+		if (entry->parent_kuid == __kuid_val(parent)) {
+			rcu_read_unlock();
+			return true;
+		}
+	}
+	rcu_read_unlock();
+
+	return false;
+}
+
+static bool check_setuid_policy_hashtable_key_value(kuid_t parent,
+						    kuid_t child)
+{
+	struct entry *entry;
+
+	rcu_read_lock();
+	hash_for_each_possible_rcu(safesetid_whitelist_hashtable,
+				   entry, next, __kuid_val(parent)) {
+		if (entry->parent_kuid == __kuid_val(parent) &&
+		    entry->child_kuid == __kuid_val(child)) {
+			rcu_read_unlock();
+			return true;
+		}
+	}
+	rcu_read_unlock();
+
+	return false;
+}
+
+static int safesetid_security_capable(const struct cred *cred,
+				      struct user_namespace *ns,
+				      int cap,
+				      unsigned int opts)
+{
+	if (cap == CAP_SETUID &&
+	    check_setuid_policy_hashtable_key(cred->uid)) {
+		if (!(opts & CAP_OPT_INSETID)) {
+			/*
+			 * Deny if we're not in a set*uid() syscall to avoid
+			 * giving powers gated by CAP_SETUID that are related
+			 * to functionality other than calling set*uid() (e.g.
+			 * allowing user to set up userns uid mappings).
+			 */
+			pr_warn("Operation requires CAP_SETUID, which is not available to UID %u for operations besides approved set*uid transitions",
+				__kuid_val(cred->uid));
+			return -1;
+                }
+	}
+	return 0;
+}
+
+static void setuid_policy_violation(kuid_t parent, kuid_t child)
+{
+	pr_warn("UID transition (%d -> %d) blocked",
+		__kuid_val(parent),
+		__kuid_val(child));
+        /*
+         * Kill this process to avoid potential security vulnerabilities
+         * that could arise from a missing whitelist entry preventing a
+         * privileged process from dropping to a lesser-privileged one.
+         */
+        do_exit(SIGKILL);
+}
+
+static int check_uid_transition(kuid_t parent, kuid_t child)
+{
+	if (check_setuid_policy_hashtable_key_value(parent, child))
+		return 0;
+	setuid_policy_violation(parent, child);
+	return -1;
+}
+
+/*
+ * Check whether there is either an exception for user under old cred struct to
+ * set*uid to user under new cred struct, or the UID transition is allowed (by
+ * Linux set*uid rules) even without CAP_SETUID.
+ */
+static int safesetid_task_fix_setuid(struct cred *new,
+				     const struct cred *old,
+				     int flags)
+{
+
+	/* Do nothing if there are no setuid restrictions for this UID. */
+	if (!check_setuid_policy_hashtable_key(old->uid))
+		return 0;
+
+	switch (flags) {
+	case LSM_SETID_RE:
+		/*
+		 * Users for which setuid restrictions exist can only set the
+		 * real UID to the real UID or the effective UID, unless an
+		 * explicit whitelist policy allows the transition.
+		 */
+		if (!uid_eq(old->uid, new->uid) &&
+			!uid_eq(old->euid, new->uid)) {
+			return check_uid_transition(old->uid, new->uid);
+		}
+		/*
+		 * Users for which setuid restrictions exist can only set the
+		 * effective UID to the real UID, the effective UID, or the
+		 * saved set-UID, unless an explicit whitelist policy allows
+		 * the transition.
+		 */
+		if (!uid_eq(old->uid, new->euid) &&
+			!uid_eq(old->euid, new->euid) &&
+			!uid_eq(old->suid, new->euid)) {
+			return check_uid_transition(old->euid, new->euid);
+		}
+		break;
+	case LSM_SETID_ID:
+		/*
+		 * Users for which setuid restrictions exist cannot change the
+		 * real UID or saved set-UID unless an explicit whitelist
+		 * policy allows the transition.
+		 */
+		if (!uid_eq(old->uid, new->uid))
+			return check_uid_transition(old->uid, new->uid);
+		if (!uid_eq(old->suid, new->suid))
+			return check_uid_transition(old->suid, new->suid);
+		break;
+	case LSM_SETID_RES:
+		/*
+		 * Users for which setuid restrictions exist cannot change the
+		 * real UID, effective UID, or saved set-UID to anything but
+		 * one of: the current real UID, the current effective UID or
+		 * the current saved set-user-ID unless an explicit whitelist
+		 * policy allows the transition.
+		 */
+		if (!uid_eq(new->uid, old->uid) &&
+			!uid_eq(new->uid, old->euid) &&
+			!uid_eq(new->uid, old->suid)) {
+			return check_uid_transition(old->uid, new->uid);
+		}
+		if (!uid_eq(new->euid, old->uid) &&
+			!uid_eq(new->euid, old->euid) &&
+			!uid_eq(new->euid, old->suid)) {
+			return check_uid_transition(old->euid, new->euid);
+		}
+		if (!uid_eq(new->suid, old->uid) &&
+			!uid_eq(new->suid, old->euid) &&
+			!uid_eq(new->suid, old->suid)) {
+			return check_uid_transition(old->suid, new->suid);
+		}
+		break;
+	case LSM_SETID_FS:
+		/*
+		 * Users for which setuid restrictions exist cannot change the
+		 * filesystem UID to anything but one of: the current real UID,
+		 * the current effective UID or the current saved set-UID
+		 * unless an explicit whitelist policy allows the transition.
+		 */
+		if (!uid_eq(new->fsuid, old->uid)  &&
+			!uid_eq(new->fsuid, old->euid)  &&
+			!uid_eq(new->fsuid, old->suid) &&
+			!uid_eq(new->fsuid, old->fsuid)) {
+			return check_uid_transition(old->fsuid, new->fsuid);
+		}
+		break;
+	}
+	return 0;
+}
+
+int add_safesetid_whitelist_entry(kuid_t parent, kuid_t child)
+{
+	struct entry *new;
+
+	/* Return if entry already exists */
+	if (check_setuid_policy_hashtable_key_value(parent, child))
+		return 0;
+
+	new = kzalloc(sizeof(struct entry), GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+	new->parent_kuid = __kuid_val(parent);
+	new->child_kuid = __kuid_val(child);
+	spin_lock(&safesetid_whitelist_hashtable_spinlock);
+	hash_add_rcu(safesetid_whitelist_hashtable,
+		     &new->next,
+		     __kuid_val(parent));
+	spin_unlock(&safesetid_whitelist_hashtable_spinlock);
+	return 0;
+}
+
+void flush_safesetid_whitelist_entries(void)
+{
+	struct entry *entry;
+	struct hlist_node *hlist_node;
+	unsigned int bkt_loop_cursor;
+	HLIST_HEAD(free_list);
+
+	/*
+	 * Could probably use hash_for_each_rcu here instead, but this should
+	 * be fine as well.
+	 */
+	spin_lock(&safesetid_whitelist_hashtable_spinlock);
+	hash_for_each_safe(safesetid_whitelist_hashtable, bkt_loop_cursor,
+			   hlist_node, entry, next) {
+		hash_del_rcu(&entry->next);
+		hlist_add_head(&entry->dlist, &free_list);
+	}
+	spin_unlock(&safesetid_whitelist_hashtable_spinlock);
+	synchronize_rcu();
+	hlist_for_each_entry_safe(entry, hlist_node, &free_list, dlist) {
+		hlist_del(&entry->dlist);
+		kfree(entry);
+	}
+}
+
+static struct security_hook_list safesetid_security_hooks[] = {
+	LSM_HOOK_INIT(task_fix_setuid, safesetid_task_fix_setuid),
+	LSM_HOOK_INIT(capable, safesetid_security_capable)
+};
+
+static int __init safesetid_security_init(void)
+{
+	security_add_hooks(safesetid_security_hooks,
+			   ARRAY_SIZE(safesetid_security_hooks), "safesetid");
+
+	return 0;
+}
+
+DEFINE_LSM(safesetid_security_init) = {
+	.init = safesetid_security_init,
+};
diff --git a/security/safesetid/lsm.h b/security/safesetid/lsm.h
new file mode 100644
index 000000000000..bf78af9bf314
--- /dev/null
+++ b/security/safesetid/lsm.h
@@ -0,0 +1,30 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * SafeSetID Linux Security Module
+ *
+ * Author: Micah Morton <mortonm@chromium.org>
+ *
+ * Copyright (C) 2018 The Chromium OS Authors.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2, as
+ * published by the Free Software Foundation.
+ *
+ */
+#ifndef _SAFESETID_H
+#define _SAFESETID_H
+
+#include <linux/types.h>
+
+/* Function type. */
+enum safesetid_whitelist_file_write_type {
+	SAFESETID_WHITELIST_ADD, /* Add whitelist policy. */
+	SAFESETID_WHITELIST_FLUSH, /* Flush whitelist policies. */
+};
+
+/* Add entry to safesetid whitelist to allow 'parent' to setid to 'child'. */
+int add_safesetid_whitelist_entry(kuid_t parent, kuid_t child);
+
+void flush_safesetid_whitelist_entries(void);
+
+#endif /* _SAFESETID_H */
diff --git a/security/safesetid/securityfs.c b/security/safesetid/securityfs.c
new file mode 100644
index 000000000000..ff5fcf2c1b37
--- /dev/null
+++ b/security/safesetid/securityfs.c
@@ -0,0 +1,189 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * SafeSetID Linux Security Module
+ *
+ * Author: Micah Morton <mortonm@chromium.org>
+ *
+ * Copyright (C) 2018 The Chromium OS Authors.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2, as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/security.h>
+#include <linux/cred.h>
+
+#include "lsm.h"
+
+static struct dentry *safesetid_policy_dir;
+
+struct safesetid_file_entry {
+	const char *name;
+	enum safesetid_whitelist_file_write_type type;
+	struct dentry *dentry;
+};
+
+static struct safesetid_file_entry safesetid_files[] = {
+	{.name = "add_whitelist_policy",
+	 .type = SAFESETID_WHITELIST_ADD},
+	{.name = "flush_whitelist_policies",
+	 .type = SAFESETID_WHITELIST_FLUSH},
+};
+
+/*
+ * In the case the input buffer contains one or more invalid UIDs, the kuid_t
+ * variables pointed to by 'parent' and 'child' will get updated but this
+ * function will return an error.
+ */
+static int parse_safesetid_whitelist_policy(const char __user *buf,
+					    size_t len,
+					    kuid_t *parent,
+					    kuid_t *child)
+{
+	char *kern_buf;
+	char *parent_buf;
+	char *child_buf;
+	const char separator[] = ":";
+	int ret;
+	size_t first_substring_length;
+	long parsed_parent;
+	long parsed_child;
+
+	/* Duplicate string from user memory and NULL-terminate */
+	kern_buf = memdup_user_nul(buf, len);
+	if (IS_ERR(kern_buf))
+		return PTR_ERR(kern_buf);
+
+	/*
+	 * Format of |buf| string should be <UID>:<UID>.
+	 * Find location of ":" in kern_buf (copied from |buf|).
+	 */
+	first_substring_length = strcspn(kern_buf, separator);
+	if (first_substring_length == 0 || first_substring_length == len) {
+		ret = -EINVAL;
+		goto free_kern;
+	}
+
+	parent_buf = kmemdup_nul(kern_buf, first_substring_length, GFP_KERNEL);
+	if (!parent_buf) {
+		ret = -ENOMEM;
+		goto free_kern;
+	}
+
+	ret = kstrtol(parent_buf, 0, &parsed_parent);
+	if (ret)
+		goto free_both;
+
+	child_buf = kern_buf + first_substring_length + 1;
+	ret = kstrtol(child_buf, 0, &parsed_child);
+	if (ret)
+		goto free_both;
+
+	*parent = make_kuid(current_user_ns(), parsed_parent);
+	if (!uid_valid(*parent)) {
+		ret = -EINVAL;
+		goto free_both;
+	}
+
+	*child = make_kuid(current_user_ns(), parsed_child);
+	if (!uid_valid(*child)) {
+		ret = -EINVAL;
+		goto free_both;
+	}
+
+free_both:
+	kfree(parent_buf);
+free_kern:
+	kfree(kern_buf);
+	return ret;
+}
+
+static ssize_t safesetid_file_write(struct file *file,
+				    const char __user *buf,
+				    size_t len,
+				    loff_t *ppos)
+{
+	struct safesetid_file_entry *file_entry =
+		file->f_inode->i_private;
+	kuid_t parent;
+	kuid_t child;
+	int ret;
+
+	if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (*ppos != 0)
+		return -EINVAL;
+
+	if (file_entry->type == SAFESETID_WHITELIST_FLUSH) {
+		flush_safesetid_whitelist_entries();
+		return len;
+	}
+
+	/*
+	 * If we get to here, must be the case that file_entry->type equals
+	 * SAFESETID_WHITELIST_ADD
+	 */
+	ret = parse_safesetid_whitelist_policy(buf, len, &parent,
+							 &child);
+	if (ret)
+		return ret;
+
+	ret = add_safesetid_whitelist_entry(parent, child);
+	if (ret)
+		return ret;
+
+	/* Return len on success so caller won't keep trying to write */
+	return len;
+}
+
+static const struct file_operations safesetid_file_fops = {
+	.write = safesetid_file_write,
+};
+
+static void safesetid_shutdown_securityfs(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(safesetid_files); ++i) {
+		struct safesetid_file_entry *entry =
+			&safesetid_files[i];
+		securityfs_remove(entry->dentry);
+		entry->dentry = NULL;
+	}
+
+	securityfs_remove(safesetid_policy_dir);
+	safesetid_policy_dir = NULL;
+}
+
+static int __init safesetid_init_securityfs(void)
+{
+	int i;
+	int ret;
+
+	safesetid_policy_dir = securityfs_create_dir("safesetid", NULL);
+	if (!safesetid_policy_dir) {
+		ret = PTR_ERR(safesetid_policy_dir);
+		goto error;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(safesetid_files); ++i) {
+		struct safesetid_file_entry *entry =
+			&safesetid_files[i];
+		entry->dentry = securityfs_create_file(
+			entry->name, 0200, safesetid_policy_dir,
+			entry, &safesetid_file_fops);
+		if (IS_ERR(entry->dentry)) {
+			ret = PTR_ERR(entry->dentry);
+			goto error;
+		}
+	}
+
+	return 0;
+
+error:
+	safesetid_shutdown_securityfs();
+	return ret;
+}
+fs_initcall(safesetid_init_securityfs);