diff mbox series

[ghak90,V8,16/16] audit: add capcontid to set contid outside init_user_ns

Message ID 5941671b6b6b5de28ab2cc80e72f288cf83291d5.1577736799.git.rgb@redhat.com (mailing list archive)
State New, archived
Headers show
Series audit: implement container identifier | expand

Commit Message

Richard Guy Briggs Dec. 31, 2019, 7:48 p.m. UTC
Provide a mechanism similar to CAP_AUDIT_CONTROL to explicitly give a
process in a non-init user namespace the capability to set audit
container identifiers.

Provide /proc/$PID/audit_capcontid interface to capcontid.
Valid values are: 1==enabled, 0==disabled

Report this action in message type AUDIT_SET_CAPCONTID 1022 with fields
opid= capcontid= old-capcontid=

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
---
 fs/proc/base.c             | 55 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/audit.h      | 14 ++++++++++++
 include/uapi/linux/audit.h |  1 +
 kernel/audit.c             | 35 +++++++++++++++++++++++++++++
 4 files changed, 105 insertions(+)

Comments

Paul Moore Jan. 22, 2020, 9:29 p.m. UTC | #1
On Tue, Dec 31, 2019 at 2:51 PM Richard Guy Briggs <rgb@redhat.com> wrote:
>
> Provide a mechanism similar to CAP_AUDIT_CONTROL to explicitly give a
> process in a non-init user namespace the capability to set audit
> container identifiers.
>
> Provide /proc/$PID/audit_capcontid interface to capcontid.
> Valid values are: 1==enabled, 0==disabled

It would be good to be more explicit about "enabled" and "disabled" in
the commit description.  For example, which setting allows the target
task to set audit container IDs of its child processes?

> Report this action in message type AUDIT_SET_CAPCONTID 1022 with fields
> opid= capcontid= old-capcontid=
>
> Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> ---
>  fs/proc/base.c             | 55 ++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/audit.h      | 14 ++++++++++++
>  include/uapi/linux/audit.h |  1 +
>  kernel/audit.c             | 35 +++++++++++++++++++++++++++++
>  4 files changed, 105 insertions(+)

...

> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 26091800180c..283ef8e006e7 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -1360,6 +1360,59 @@ static ssize_t proc_contid_write(struct file *file, const char __user *buf,
>         .write          = proc_contid_write,
>         .llseek         = generic_file_llseek,
>  };
> +
> +static ssize_t proc_capcontid_read(struct file *file, char __user *buf,
> +                                 size_t count, loff_t *ppos)
> +{
> +       struct inode *inode = file_inode(file);
> +       struct task_struct *task = get_proc_task(inode);
> +       ssize_t length;
> +       char tmpbuf[TMPBUFLEN];
> +
> +       if (!task)
> +               return -ESRCH;
> +       /* if we don't have caps, reject */
> +       if (!capable(CAP_AUDIT_CONTROL) && !audit_get_capcontid(current))
> +               return -EPERM;
> +       length = scnprintf(tmpbuf, TMPBUFLEN, "%u", audit_get_capcontid(task));
> +       put_task_struct(task);
> +       return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
> +}
> +
> +static ssize_t proc_capcontid_write(struct file *file, const char __user *buf,
> +                                  size_t count, loff_t *ppos)
> +{
> +       struct inode *inode = file_inode(file);
> +       u32 capcontid;
> +       int rv;
> +       struct task_struct *task = get_proc_task(inode);
> +
> +       if (!task)
> +               return -ESRCH;
> +       if (*ppos != 0) {
> +               /* No partial writes. */
> +               put_task_struct(task);
> +               return -EINVAL;
> +       }
> +
> +       rv = kstrtou32_from_user(buf, count, 10, &capcontid);
> +       if (rv < 0) {
> +               put_task_struct(task);
> +               return rv;
> +       }
> +
> +       rv = audit_set_capcontid(task, capcontid);
> +       put_task_struct(task);
> +       if (rv < 0)
> +               return rv;
> +       return count;
> +}
> +
> +static const struct file_operations proc_capcontid_operations = {
> +       .read           = proc_capcontid_read,
> +       .write          = proc_capcontid_write,
> +       .llseek         = generic_file_llseek,
> +};
>  #endif
>
>  #ifdef CONFIG_FAULT_INJECTION
> @@ -3121,6 +3174,7 @@ static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
>         REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
>         REG("sessionid",  S_IRUGO, proc_sessionid_operations),
>         REG("audit_containerid", S_IWUSR|S_IRUSR, proc_contid_operations),
> +       REG("audit_capcontainerid", S_IWUSR|S_IRUSR|S_IRUSR, proc_capcontid_operations),
>  #endif
>  #ifdef CONFIG_FAULT_INJECTION
>         REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
> @@ -3522,6 +3576,7 @@ static int proc_tid_comm_permission(struct inode *inode, int mask)
>         REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
>         REG("sessionid",  S_IRUGO, proc_sessionid_operations),
>         REG("audit_containerid", S_IWUSR|S_IRUSR, proc_contid_operations),
> +       REG("audit_capcontainerid", S_IWUSR|S_IRUSR|S_IRUSR, proc_capcontid_operations),
>  #endif
>  #ifdef CONFIG_FAULT_INJECTION
>         REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
> diff --git a/include/linux/audit.h b/include/linux/audit.h
> index 28b9c7cd86a6..62c453306c2a 100644
> --- a/include/linux/audit.h
> +++ b/include/linux/audit.h
> @@ -116,6 +116,7 @@ struct audit_task_info {
>         kuid_t                  loginuid;
>         unsigned int            sessionid;
>         struct audit_contobj    *cont;
> +       u32                     capcontid;

Where is the code change that actually uses this to enforce the
described policy on setting an audit container ID?

> diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
> index 2844d78cd7af..01251e6dcec0 100644
> --- a/include/uapi/linux/audit.h
> +++ b/include/uapi/linux/audit.h
> @@ -73,6 +73,7 @@
>  #define AUDIT_GET_FEATURE      1019    /* Get which features are enabled */
>  #define AUDIT_CONTAINER_OP     1020    /* Define the container id and info */
>  #define AUDIT_SIGNAL_INFO2     1021    /* Get info auditd signal sender */
> +#define AUDIT_SET_CAPCONTID    1022    /* Set cap_contid of a task */
>
>  #define AUDIT_FIRST_USER_MSG   1100    /* Userspace messages mostly uninteresting to kernel */
>  #define AUDIT_USER_AVC         1107    /* We filter this differently */
> diff --git a/kernel/audit.c b/kernel/audit.c
> index 1287f0b63757..1c22dd084ae8 100644
> --- a/kernel/audit.c
> +++ b/kernel/audit.c
> @@ -2698,6 +2698,41 @@ static bool audit_contid_isowner(struct task_struct *tsk)
>         return false;
>  }
>
> +int audit_set_capcontid(struct task_struct *task, u32 enable)
> +{
> +       u32 oldcapcontid;
> +       int rc = 0;
> +       struct audit_buffer *ab;
> +
> +       if (!task->audit)
> +               return -ENOPROTOOPT;
> +       oldcapcontid = audit_get_capcontid(task);
> +       /* if task is not descendant, block */
> +       if (task == current)
> +               rc = -EBADSLT;
> +       else if (!task_is_descendant(current, task))
> +               rc = -EXDEV;

See my previous comments about error code sanity.

> +       else if (current_user_ns() == &init_user_ns) {
> +               if (!capable(CAP_AUDIT_CONTROL) && !audit_get_capcontid(current))
> +                       rc = -EPERM;

I think we just want to use ns_capable() in the context of the current
userns to check CAP_AUDIT_CONTROL, yes?  Something like this ...

  if (current_user_ns() != &init_user_ns) {
    if (!ns_capable(CAP_AUDIT_CONTROL) || !audit_get_capcontid())
      rc = -EPERM;
  } else if (!capable(CAP_AUDIT_CONTROL))
    rc = -EPERM;

> +       }
> +       if (!rc)
> +               task->audit->capcontid = enable;
> +
> +       if (!audit_enabled)
> +               return rc;
> +
> +       ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_SET_CAPCONTID);
> +       if (!ab)
> +               return rc;
> +
> +       audit_log_format(ab,
> +                        "opid=%d capcontid=%u old-capcontid=%u",
> +                        task_tgid_nr(task), enable, oldcapcontid);
> +       audit_log_end(ab);

My prior comments about recording the success/failure, or not emitting
the record on failure, seem relevant here too.

> +       return rc;
> +}

--
paul moore
www.paul-moore.com
Richard Guy Briggs Feb. 5, 2020, 12:39 a.m. UTC | #2
On 2020-01-22 16:29, Paul Moore wrote:
> On Tue, Dec 31, 2019 at 2:51 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> >
> > Provide a mechanism similar to CAP_AUDIT_CONTROL to explicitly give a
> > process in a non-init user namespace the capability to set audit
> > container identifiers.
> >
> > Provide /proc/$PID/audit_capcontid interface to capcontid.
> > Valid values are: 1==enabled, 0==disabled
> 
> It would be good to be more explicit about "enabled" and "disabled" in
> the commit description.  For example, which setting allows the target
> task to set audit container IDs of it's children processes?

Ok...

> > Report this action in message type AUDIT_SET_CAPCONTID 1022 with fields
> > opid= capcontid= old-capcontid=
> >
> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > ---
> >  fs/proc/base.c             | 55 ++++++++++++++++++++++++++++++++++++++++++++++
> >  include/linux/audit.h      | 14 ++++++++++++
> >  include/uapi/linux/audit.h |  1 +
> >  kernel/audit.c             | 35 +++++++++++++++++++++++++++++
> >  4 files changed, 105 insertions(+)
> 
> ...
> 
> > diff --git a/fs/proc/base.c b/fs/proc/base.c
> > index 26091800180c..283ef8e006e7 100644
> > --- a/fs/proc/base.c
> > +++ b/fs/proc/base.c
> > @@ -1360,6 +1360,59 @@ static ssize_t proc_contid_write(struct file *file, const char __user *buf,
> >         .write          = proc_contid_write,
> >         .llseek         = generic_file_llseek,
> >  };
> > +
> > +static ssize_t proc_capcontid_read(struct file *file, char __user *buf,
> > +                                 size_t count, loff_t *ppos)
> > +{
> > +       struct inode *inode = file_inode(file);
> > +       struct task_struct *task = get_proc_task(inode);
> > +       ssize_t length;
> > +       char tmpbuf[TMPBUFLEN];
> > +
> > +       if (!task)
> > +               return -ESRCH;
> > +       /* if we don't have caps, reject */
> > +       if (!capable(CAP_AUDIT_CONTROL) && !audit_get_capcontid(current))
> > +               return -EPERM;
> > +       length = scnprintf(tmpbuf, TMPBUFLEN, "%u", audit_get_capcontid(task));
> > +       put_task_struct(task);
> > +       return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
> > +}
> > +
> > +static ssize_t proc_capcontid_write(struct file *file, const char __user *buf,
> > +                                  size_t count, loff_t *ppos)
> > +{
> > +       struct inode *inode = file_inode(file);
> > +       u32 capcontid;
> > +       int rv;
> > +       struct task_struct *task = get_proc_task(inode);
> > +
> > +       if (!task)
> > +               return -ESRCH;
> > +       if (*ppos != 0) {
> > +               /* No partial writes. */
> > +               put_task_struct(task);
> > +               return -EINVAL;
> > +       }
> > +
> > +       rv = kstrtou32_from_user(buf, count, 10, &capcontid);
> > +       if (rv < 0) {
> > +               put_task_struct(task);
> > +               return rv;
> > +       }
> > +
> > +       rv = audit_set_capcontid(task, capcontid);
> > +       put_task_struct(task);
> > +       if (rv < 0)
> > +               return rv;
> > +       return count;
> > +}
> > +
> > +static const struct file_operations proc_capcontid_operations = {
> > +       .read           = proc_capcontid_read,
> > +       .write          = proc_capcontid_write,
> > +       .llseek         = generic_file_llseek,
> > +};
> >  #endif
> >
> >  #ifdef CONFIG_FAULT_INJECTION
> > @@ -3121,6 +3174,7 @@ static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
> >         REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
> >         REG("sessionid",  S_IRUGO, proc_sessionid_operations),
> >         REG("audit_containerid", S_IWUSR|S_IRUSR, proc_contid_operations),
> > +       REG("audit_capcontainerid", S_IWUSR|S_IRUSR|S_IRUSR, proc_capcontid_operations),
> >  #endif
> >  #ifdef CONFIG_FAULT_INJECTION
> >         REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
> > @@ -3522,6 +3576,7 @@ static int proc_tid_comm_permission(struct inode *inode, int mask)
> >         REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
> >         REG("sessionid",  S_IRUGO, proc_sessionid_operations),
> >         REG("audit_containerid", S_IWUSR|S_IRUSR, proc_contid_operations),
> > +       REG("audit_capcontainerid", S_IWUSR|S_IRUSR|S_IRUSR, proc_capcontid_operations),
> >  #endif
> >  #ifdef CONFIG_FAULT_INJECTION
> >         REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
> > diff --git a/include/linux/audit.h b/include/linux/audit.h
> > index 28b9c7cd86a6..62c453306c2a 100644
> > --- a/include/linux/audit.h
> > +++ b/include/linux/audit.h
> > @@ -116,6 +116,7 @@ struct audit_task_info {
> >         kuid_t                  loginuid;
> >         unsigned int            sessionid;
> >         struct audit_contobj    *cont;
> > +       u32                     capcontid;
> 
> Where is the code change that actually uses this to enforce the
> described policy on setting an audit container ID?

Oops, lost in shuffle of refactorisation when dumping the netlink code in
favour of /proc.

> > diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
> > index 2844d78cd7af..01251e6dcec0 100644
> > --- a/include/uapi/linux/audit.h
> > +++ b/include/uapi/linux/audit.h
> > @@ -73,6 +73,7 @@
> >  #define AUDIT_GET_FEATURE      1019    /* Get which features are enabled */
> >  #define AUDIT_CONTAINER_OP     1020    /* Define the container id and info */
> >  #define AUDIT_SIGNAL_INFO2     1021    /* Get info auditd signal sender */
> > +#define AUDIT_SET_CAPCONTID    1022    /* Set cap_contid of a task */
> >
> >  #define AUDIT_FIRST_USER_MSG   1100    /* Userspace messages mostly uninteresting to kernel */
> >  #define AUDIT_USER_AVC         1107    /* We filter this differently */
> > diff --git a/kernel/audit.c b/kernel/audit.c
> > index 1287f0b63757..1c22dd084ae8 100644
> > --- a/kernel/audit.c
> > +++ b/kernel/audit.c
> > @@ -2698,6 +2698,41 @@ static bool audit_contid_isowner(struct task_struct *tsk)
> >         return false;
> >  }
> >
> > +int audit_set_capcontid(struct task_struct *task, u32 enable)
> > +{
> > +       u32 oldcapcontid;
> > +       int rc = 0;
> > +       struct audit_buffer *ab;
> > +
> > +       if (!task->audit)
> > +               return -ENOPROTOOPT;
> > +       oldcapcontid = audit_get_capcontid(task);
> > +       /* if task is not descendant, block */
> > +       if (task == current)
> > +               rc = -EBADSLT;
> > +       else if (!task_is_descendant(current, task))
> > +               rc = -EXDEV;
> 
> See my previous comments about error code sanity.

I'll go with EXDEV.

> > +       else if (current_user_ns() == &init_user_ns) {
> > +               if (!capable(CAP_AUDIT_CONTROL) && !audit_get_capcontid(current))
> > +                       rc = -EPERM;
> 
> I think we just want to use ns_capable() in the context of the current
> userns to check CAP_AUDIT_CONTROL, yes?  Something like this ...

I thought we had firmly established in previous discussion that
CAP_AUDIT_CONTROL in anything other than init_user_ns was completely irrelevant
and untrustable.

>   if (current_user_ns() != &init_user_ns) {
>     if (!ns_capable(CAP_AUDIT_CONTROL) || !audit_get_capcontid())
>       rc = -EPERM;
>   } else if (!capable(CAP_AUDIT_CONTROL))
>     rc = -EPERM;
> 
> > +       }
> > +       if (!rc)
> > +               task->audit->capcontid = enable;
> > +
> > +       if (!audit_enabled)
> > +               return rc;
> > +
> > +       ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_SET_CAPCONTID);
> > +       if (!ab)
> > +               return rc;
> > +
> > +       audit_log_format(ab,
> > +                        "opid=%d capcontid=%u old-capcontid=%u",
> > +                        task_tgid_nr(task), enable, oldcapcontid);
> > +       audit_log_end(ab);
> 
> My prior comments about recording the success/failure, or not emitting
> the record on failure, seem relevant here too.

It should be recorded in the syscall record.

> > +       return rc;
> > +}
> 
> paul moore

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Paul Moore Feb. 5, 2020, 10:56 p.m. UTC | #3
On Tue, Feb 4, 2020 at 7:39 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> On 2020-01-22 16:29, Paul Moore wrote:
> > On Tue, Dec 31, 2019 at 2:51 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > >
> > > Provide a mechanism similar to CAP_AUDIT_CONTROL to explicitly give a
> > > process in a non-init user namespace the capability to set audit
> > > container identifiers.
> > >
> > > Provide /proc/$PID/audit_capcontid interface to capcontid.
> > > Valid values are: 1==enabled, 0==disabled
> >
> > It would be good to be more explicit about "enabled" and "disabled" in
> > the commit description.  For example, which setting allows the target
> > task to set audit container IDs of it's children processes?
>
> Ok...
>
> > > Report this action in message type AUDIT_SET_CAPCONTID 1022 with fields
> > > opid= capcontid= old-capcontid=
> > >
> > > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > > ---
> > >  fs/proc/base.c             | 55 ++++++++++++++++++++++++++++++++++++++++++++++
> > >  include/linux/audit.h      | 14 ++++++++++++
> > >  include/uapi/linux/audit.h |  1 +
> > >  kernel/audit.c             | 35 +++++++++++++++++++++++++++++
> > >  4 files changed, 105 insertions(+)

...

> > > diff --git a/kernel/audit.c b/kernel/audit.c
> > > index 1287f0b63757..1c22dd084ae8 100644
> > > --- a/kernel/audit.c
> > > +++ b/kernel/audit.c
> > > @@ -2698,6 +2698,41 @@ static bool audit_contid_isowner(struct task_struct *tsk)
> > >         return false;
> > >  }
> > >
> > > +int audit_set_capcontid(struct task_struct *task, u32 enable)
> > > +{
> > > +       u32 oldcapcontid;
> > > +       int rc = 0;
> > > +       struct audit_buffer *ab;
> > > +
> > > +       if (!task->audit)
> > > +               return -ENOPROTOOPT;
> > > +       oldcapcontid = audit_get_capcontid(task);
> > > +       /* if task is not descendant, block */
> > > +       if (task == current)
> > > +               rc = -EBADSLT;
> > > +       else if (!task_is_descendant(current, task))
> > > +               rc = -EXDEV;
> >
> > See my previous comments about error code sanity.
>
> I'll go with EXDEV.
>
> > > +       else if (current_user_ns() == &init_user_ns) {
> > > +               if (!capable(CAP_AUDIT_CONTROL) && !audit_get_capcontid(current))
> > > +                       rc = -EPERM;
> >
> > I think we just want to use ns_capable() in the context of the current
> > userns to check CAP_AUDIT_CONTROL, yes?  Something like this ...
>
> I thought we had firmly established in previous discussion that
> CAP_AUDIT_CONTROL in anything other than init_user_ns was completely irrelevant
> and untrustable.

In the case of a container with multiple users, and multiple
applications, one being a nested orchestrator, it seems relevant to
allow that container to control which of its processes are able to
exercise CAP_AUDIT_CONTROL.  Granted, we still want to control it
within the overall host, e.g. the container in question must be
allowed to run a nested orchestrator, but allowing the container
itself to provide its own granularity seems like the right thing to
do.

> >   if (current_user_ns() != &init_user_ns) {
> >     if (!ns_capable(CAP_AUDIT_CONTROL) || !audit_get_capcontid())
> >       rc = -EPERM;
> >   } else if (!capable(CAP_AUDIT_CONTROL))
> >     rc = -EPERM;
> >
Richard Guy Briggs Feb. 6, 2020, 12:51 p.m. UTC | #4
On 2020-02-05 17:56, Paul Moore wrote:
> On Tue, Feb 4, 2020 at 7:39 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > On 2020-01-22 16:29, Paul Moore wrote:
> > > On Tue, Dec 31, 2019 at 2:51 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > >
> > > > Provide a mechanism similar to CAP_AUDIT_CONTROL to explicitly give a
> > > > process in a non-init user namespace the capability to set audit
> > > > container identifiers.
> > > >
> > > > Provide /proc/$PID/audit_capcontid interface to capcontid.
> > > > Valid values are: 1==enabled, 0==disabled
> > >
> > > It would be good to be more explicit about "enabled" and "disabled" in
> > > the commit description.  For example, which setting allows the target
> > > task to set audit container IDs of it's children processes?
> >
> > Ok...
> >
> > > > Report this action in message type AUDIT_SET_CAPCONTID 1022 with fields
> > > > opid= capcontid= old-capcontid=
> > > >
> > > > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > > > ---
> > > >  fs/proc/base.c             | 55 ++++++++++++++++++++++++++++++++++++++++++++++
> > > >  include/linux/audit.h      | 14 ++++++++++++
> > > >  include/uapi/linux/audit.h |  1 +
> > > >  kernel/audit.c             | 35 +++++++++++++++++++++++++++++
> > > >  4 files changed, 105 insertions(+)
> 
> ...
> 
> > > > diff --git a/kernel/audit.c b/kernel/audit.c
> > > > index 1287f0b63757..1c22dd084ae8 100644
> > > > --- a/kernel/audit.c
> > > > +++ b/kernel/audit.c
> > > > @@ -2698,6 +2698,41 @@ static bool audit_contid_isowner(struct task_struct *tsk)
> > > >         return false;
> > > >  }
> > > >
> > > > +int audit_set_capcontid(struct task_struct *task, u32 enable)
> > > > +{
> > > > +       u32 oldcapcontid;
> > > > +       int rc = 0;
> > > > +       struct audit_buffer *ab;
> > > > +
> > > > +       if (!task->audit)
> > > > +               return -ENOPROTOOPT;
> > > > +       oldcapcontid = audit_get_capcontid(task);
> > > > +       /* if task is not descendant, block */
> > > > +       if (task == current)
> > > > +               rc = -EBADSLT;
> > > > +       else if (!task_is_descendant(current, task))
> > > > +               rc = -EXDEV;
> > >
> > > See my previous comments about error code sanity.
> >
> > I'll go with EXDEV.
> >
> > > > +       else if (current_user_ns() == &init_user_ns) {
> > > > +               if (!capable(CAP_AUDIT_CONTROL) && !audit_get_capcontid(current))
> > > > +                       rc = -EPERM;
> > >
> > > I think we just want to use ns_capable() in the context of the current
> > > userns to check CAP_AUDIT_CONTROL, yes?  Something like this ...
> >
> > I thought we had firmly established in previous discussion that
> > CAP_AUDIT_CONTROL in anything other than init_user_ns was completely irrelevant
> > and untrustable.
> 
> In the case of a container with multiple users, and multiple
> applications, one being a nested orchestrator, it seems relevant to
> allow that container to control which of it's processes are able to
> exercise CAP_AUDIT_CONTROL.  Granted, we still want to control it
> within the overall host, e.g. the container in question must be
> allowed to run a nested orchestrator, but allowing the container
> itself to provide it's own granularity seems like the right thing to
> do.

Looking back to the discussion on the v6 patch 2/10 (2019-05-30 15:29 Paul
Moore[1], 2019-07-08 14:05 RGB[2]), it occurs to me that the
ns_capable(CAP_AUDIT_CONTROL) application was dangerous since there was
no parental accountability in storage or reporting.  Now that is in
place, it does seem a bit more reasonable to allow it, but I'm still not
clear on why we would want both mechanisms now.  I don't understand what
the last line in that email meant: "We would probably still want a
ns_capable(CAP_AUDIT_CONTROL) restriction in this case."  Allow
ns_capable(CAP_AUDIT_CONTROL) to govern these actions, or restrict
ns_capable(CAP_AUDIT_CONTROL) from being used to govern these actions?

If an unprivileged user has been given capcontid to be able to run their
own container orchestrator/engine and spawns a user namespace with
CAP_AUDIT_CONTROL, what matters is capcontid, and not CAP_AUDIT_CONTROL.
I could see needing CAP_AUDIT_CONTROL *in addition* to capcontid to give
it finer grained control, but since capcontid would have to be given to
each process explicitly anyway, I don't see the point.

If that unprivileged user had not been given capcontid,
giving itself or one of its descendants CAP_AUDIT_CONTROL should not let
it jump into the game all of a sudden unless the now chained audit
container identifiers are deemed accountable enough.  And then now we
need those hard limits on container depth and network namespace
container membership.

> > >   if (current_user_ns() != &init_user_ns) {
> > >     if (!ns_capable(CAP_AUDIT_CONTROL) || !audit_get_capcontid())
> > >       rc = -EPERM;
> > >   } else if (!capable(CAP_AUDIT_CONTROL))
> > >     rc = -EPERM;
> > >
> 
> paul moore

[1] https://www.redhat.com/archives/linux-audit/2019-May/msg00085.html
	https://lkml.org/lkml/2019/5/30/1380
[2] https://www.redhat.com/archives/linux-audit/2019-July/msg00003.html
	https://lkml.org/lkml/2019/7/8/1051

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Paul Moore Feb. 13, 2020, 9:58 p.m. UTC | #5
On Thu, Feb 6, 2020 at 7:52 AM Richard Guy Briggs <rgb@redhat.com> wrote:
> On 2020-02-05 17:56, Paul Moore wrote:
> > On Tue, Feb 4, 2020 at 7:39 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > On 2020-01-22 16:29, Paul Moore wrote:
> > > > On Tue, Dec 31, 2019 at 2:51 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > >
> > > > > Provide a mechanism similar to CAP_AUDIT_CONTROL to explicitly give a
> > > > > process in a non-init user namespace the capability to set audit
> > > > > container identifiers.
> > > > >
> > > > > Provide /proc/$PID/audit_capcontid interface to capcontid.
> > > > > Valid values are: 1==enabled, 0==disabled
> > > >
> > > > It would be good to be more explicit about "enabled" and "disabled" in
> > > > the commit description.  For example, which setting allows the target
> > > > task to set audit container IDs of it's children processes?
> > >
> > > Ok...
> > >
> > > > > Report this action in message type AUDIT_SET_CAPCONTID 1022 with fields
> > > > > opid= capcontid= old-capcontid=
> > > > >
> > > > > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > > > > ---
> > > > >  fs/proc/base.c             | 55 ++++++++++++++++++++++++++++++++++++++++++++++
> > > > >  include/linux/audit.h      | 14 ++++++++++++
> > > > >  include/uapi/linux/audit.h |  1 +
> > > > >  kernel/audit.c             | 35 +++++++++++++++++++++++++++++
> > > > >  4 files changed, 105 insertions(+)
> >
> > ...
> >
> > > > > diff --git a/kernel/audit.c b/kernel/audit.c
> > > > > index 1287f0b63757..1c22dd084ae8 100644
> > > > > --- a/kernel/audit.c
> > > > > +++ b/kernel/audit.c
> > > > > @@ -2698,6 +2698,41 @@ static bool audit_contid_isowner(struct task_struct *tsk)
> > > > >         return false;
> > > > >  }
> > > > >
> > > > > +int audit_set_capcontid(struct task_struct *task, u32 enable)
> > > > > +{
> > > > > +       u32 oldcapcontid;
> > > > > +       int rc = 0;
> > > > > +       struct audit_buffer *ab;
> > > > > +
> > > > > +       if (!task->audit)
> > > > > +               return -ENOPROTOOPT;
> > > > > +       oldcapcontid = audit_get_capcontid(task);
> > > > > +       /* if task is not descendant, block */
> > > > > +       if (task == current)
> > > > > +               rc = -EBADSLT;
> > > > > +       else if (!task_is_descendant(current, task))
> > > > > +               rc = -EXDEV;
> > > >
> > > > See my previous comments about error code sanity.
> > >
> > > I'll go with EXDEV.
> > >
> > > > > +       else if (current_user_ns() == &init_user_ns) {
> > > > > +               if (!capable(CAP_AUDIT_CONTROL) && !audit_get_capcontid(current))
> > > > > +                       rc = -EPERM;
> > > >
> > > > I think we just want to use ns_capable() in the context of the current
> > > > userns to check CAP_AUDIT_CONTROL, yes?  Something like this ...
> > >
> > > I thought we had firmly established in previous discussion that
> > > CAP_AUDIT_CONTROL in anything other than init_user_ns was completely irrelevant
> > > and untrustable.
> >
> > In the case of a container with multiple users, and multiple
> > applications, one being a nested orchestrator, it seems relevant to
> > allow that container to control which of it's processes are able to
> > exercise CAP_AUDIT_CONTROL.  Granted, we still want to control it
> > within the overall host, e.g. the container in question must be
> > allowed to run a nested orchestrator, but allowing the container
> > itself to provide it's own granularity seems like the right thing to
> > do.
>
> Looking back to discussion on the v6 patch 2/10 (2019-05-30 15:29 Paul
> Moore[1], 2019-07-08 14:05 RGB[2]) , it occurs to me that the
> ns_capable(CAP_AUDIT_CONTROL) application was dangerous since there was
> no parental accountability in storage or reporting.  Now that is in
> place, it does seem a bit more reasonable to allow it, but I'm still not
> clear on why we would want both mechanisms now.  I don't understand what
> the last line in that email meant: "We would probably still want a
> ns_capable(CAP_AUDIT_CONTROL) restriction in this case."  Allow
> ns_capable(CAP_AUDIT_CONTROL) to govern these actions, or restrict
> ns_capable(CAP_AUDIT_CONTROL) from being used to govern these actions?
>
> If an unprivileged user has been given capcontid to be able run their
> own container orchestrator/engine and spawns a user namespace with
> CAP_AUDIT_CONTROL, what matters is capcontid, and not CAP_AUDIT_CONTROL.
> I could see needing CAP_AUDIT_CONTROL *in addition* to capcontid to give
> it finer grained control, but since capcontid would have to be given to
> each process explicitly anways, I don't see the point.
>
> If that unprivileged user had not been given capcontid,
> giving itself or one of its descendants CAP_AUDIT_CONTROL should not let
> it jump into the game all of a sudden unless the now chained audit
> container identifiers are deemed accountable enough.  And then now we
> need those hard limits on container depth and network namespace
> container membership.

Perhaps I'm not correctly understanding what you are trying to do with
this patchset, but my current understanding is that you are trying to
use capcontid to control which child audit container IDs (ACIDs) are
allowed to manage their own ACIDs.  Further, I believe that the
capcontid setting operates at a per-ACID level, meaning there is no
provision for the associated container to further restrict that
ability, i.e. no access control granularity below the ACID level.  My
thinking is that ns_capable(CAP_AUDIT_CONTROL) could be used within an
ACID to increase the granularity of the access controls so that only
privileged processes running inside the ACID would be able to manage
the ACIDs.  Does that make sense?
Richard Guy Briggs March 12, 2020, 9:58 p.m. UTC | #6
On 2020-02-13 16:58, Paul Moore wrote:
> On Thu, Feb 6, 2020 at 7:52 AM Richard Guy Briggs <rgb@redhat.com> wrote:
> > On 2020-02-05 17:56, Paul Moore wrote:
> > > On Tue, Feb 4, 2020 at 7:39 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > On 2020-01-22 16:29, Paul Moore wrote:
> > > > > On Tue, Dec 31, 2019 at 2:51 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > > >
> > > > > > Provide a mechanism similar to CAP_AUDIT_CONTROL to explicitly give a
> > > > > > process in a non-init user namespace the capability to set audit
> > > > > > container identifiers.
> > > > > >
> > > > > > Provide /proc/$PID/audit_capcontid interface to capcontid.
> > > > > > Valid values are: 1==enabled, 0==disabled
> > > > >
> > > > > It would be good to be more explicit about "enabled" and "disabled" in
> > > > > the commit description.  For example, which setting allows the target
> > > > > task to set audit container IDs of its children processes?
> > > >
> > > > Ok...
> > > >
> > > > > > Report this action in message type AUDIT_SET_CAPCONTID 1022 with fields
> > > > > > opid= capcontid= old-capcontid=
> > > > > >
> > > > > > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > > > > > ---
> > > > > >  fs/proc/base.c             | 55 ++++++++++++++++++++++++++++++++++++++++++++++
> > > > > >  include/linux/audit.h      | 14 ++++++++++++
> > > > > >  include/uapi/linux/audit.h |  1 +
> > > > > >  kernel/audit.c             | 35 +++++++++++++++++++++++++++++
> > > > > >  4 files changed, 105 insertions(+)
> > >
> > > ...
> > >
> > > > > > diff --git a/kernel/audit.c b/kernel/audit.c
> > > > > > index 1287f0b63757..1c22dd084ae8 100644
> > > > > > --- a/kernel/audit.c
> > > > > > +++ b/kernel/audit.c
> > > > > > @@ -2698,6 +2698,41 @@ static bool audit_contid_isowner(struct task_struct *tsk)
> > > > > >         return false;
> > > > > >  }
> > > > > >
> > > > > > +int audit_set_capcontid(struct task_struct *task, u32 enable)
> > > > > > +{
> > > > > > +       u32 oldcapcontid;
> > > > > > +       int rc = 0;
> > > > > > +       struct audit_buffer *ab;
> > > > > > +
> > > > > > +       if (!task->audit)
> > > > > > +               return -ENOPROTOOPT;
> > > > > > +       oldcapcontid = audit_get_capcontid(task);
> > > > > > +       /* if task is not descendant, block */
> > > > > > +       if (task == current)
> > > > > > +               rc = -EBADSLT;
> > > > > > +       else if (!task_is_descendant(current, task))
> > > > > > +               rc = -EXDEV;
> > > > >
> > > > > See my previous comments about error code sanity.
> > > >
> > > > I'll go with EXDEV.
> > > >
> > > > > > +       else if (current_user_ns() == &init_user_ns) {
> > > > > > +               if (!capable(CAP_AUDIT_CONTROL) && !audit_get_capcontid(current))
> > > > > > +                       rc = -EPERM;
> > > > >
> > > > > I think we just want to use ns_capable() in the context of the current
> > > > > userns to check CAP_AUDIT_CONTROL, yes?  Something like this ...
> > > >
> > > > I thought we had firmly established in previous discussion that
> > > > CAP_AUDIT_CONTROL in anything other than init_user_ns was completely irrelevant
> > > > and untrustable.
> > >
> > > In the case of a container with multiple users, and multiple
> > > applications, one being a nested orchestrator, it seems relevant to
> > > allow that container to control which of its processes are able to
> > > exercise CAP_AUDIT_CONTROL.  Granted, we still want to control it
> > > within the overall host, e.g. the container in question must be
> > > allowed to run a nested orchestrator, but allowing the container
> > > itself to provide its own granularity seems like the right thing to
> > > do.
> >
> > Looking back to discussion on the v6 patch 2/10 (2019-05-30 15:29 Paul
> > Moore[1], 2019-07-08 14:05 RGB[2]) , it occurs to me that the
> > ns_capable(CAP_AUDIT_CONTROL) application was dangerous since there was
> > no parental accountability in storage or reporting.  Now that is in
> > place, it does seem a bit more reasonable to allow it, but I'm still not
> > clear on why we would want both mechanisms now.  I don't understand what
> > the last line in that email meant: "We would probably still want a
> > ns_capable(CAP_AUDIT_CONTROL) restriction in this case."  Allow
> > ns_capable(CAP_AUDIT_CONTROL) to govern these actions, or restrict
> > ns_capable(CAP_AUDIT_CONTROL) from being used to govern these actions?
> >
> > If an unprivileged user has been given capcontid to be able to run their
> > own container orchestrator/engine and spawns a user namespace with
> > CAP_AUDIT_CONTROL, what matters is capcontid, and not CAP_AUDIT_CONTROL.
> > I could see needing CAP_AUDIT_CONTROL *in addition* to capcontid to give
> > it finer grained control, but since capcontid would have to be given to
> > each process explicitly anyways, I don't see the point.
> >
> > If that unprivileged user had not been given capcontid,
> > giving itself or one of its descendants CAP_AUDIT_CONTROL should not let
> > it jump into the game all of a sudden unless the now chained audit
> > container identifiers are deemed accountable enough.  And then now we
> > need those hard limits on container depth and network namespace
> > container membership.
> 
> Perhaps I'm not correctly understanding what you are trying to do with
> this patchset, but my current understanding is that you are trying to
> use capcontid to control which child audit container IDs (ACIDs) are
> allowed to manage their own ACIDs.  Further, I believe that the
> capcontid setting operates at a per-ACID level, meaning there is no
> provision for the associated container to further restrict that
> ability, i.e. no access control granularity below the ACID level.  My
> thinking is that ns_capable(CAP_AUDIT_CONTROL) could be used within an
> ACID to increase the granularity of the access controls so that only
> privileged processes running inside the ACID would be able to manage
> the ACIDs.  Does that make sense?

The capcontid is not inherited like the contid (or contobj) in
audit_alloc(), so it stops at that process that was granted capcontid.
That process that was granted capcontid can then explicitly further
grant capcontid to any of its children should it deem necessary.

Since it is a boolean, it defaults to unset in init_struct_audit which
isn't relevant anyways since that is in the initial user namespace.
It isn't set in audit_alloc() and would default to false.
I can set them explicitly both to false to be certain if that makes
things clearer and more certain.

I still believe ns_capable() is irrelevant here.

> paul moore

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
diff mbox series

Patch

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 26091800180c..283ef8e006e7 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1360,6 +1360,59 @@  static ssize_t proc_contid_write(struct file *file, const char __user *buf,
 	.write		= proc_contid_write,
 	.llseek		= generic_file_llseek,
 };
+
+/*
+ * proc_capcontid_read - report a task's capcontid flag through procfs
+ * @file: open proc file identifying the target task
+ * @buf: user buffer receiving the value formatted as "%u"
+ * @count: size of @buf
+ * @ppos: file position, passed through to simple_read_from_buffer()
+ *
+ * Returns the result of simple_read_from_buffer(), -ESRCH if get_proc_task()
+ * finds no task, or -EPERM if the reader has neither CAP_AUDIT_CONTROL nor
+ * capcontid itself.
+ */
+static ssize_t proc_capcontid_read(struct file *file, char __user *buf,
+				  size_t count, loff_t *ppos)
+{
+	struct inode *inode = file_inode(file);
+	struct task_struct *task = get_proc_task(inode);
+	ssize_t length;
+	char tmpbuf[TMPBUFLEN];
+
+	if (!task)
+		return -ESRCH;
+	/* if we don't have caps, reject */
+	if (!capable(CAP_AUDIT_CONTROL) && !audit_get_capcontid(current)) {
+		/* drop the reference taken by get_proc_task() before bailing */
+		put_task_struct(task);
+		return -EPERM;
+	}
+	length = scnprintf(tmpbuf, TMPBUFLEN, "%u", audit_get_capcontid(task));
+	put_task_struct(task);
+	return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
+}
+
+/*
+ * proc_capcontid_write - parse a decimal u32 from userspace and hand it to
+ * audit_set_capcontid() for the task behind this proc file.
+ *
+ * Returns @count on success, -ESRCH if the task cannot be found, -EINVAL for
+ * a write at a non-zero offset, or the error from parsing/setting the value.
+ */
+static ssize_t proc_capcontid_write(struct file *file, const char __user *buf,
+				   size_t count, loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file_inode(file));
+	u32 capcontid;
+	ssize_t ret;
+
+	if (!task)
+		return -ESRCH;
+
+	/* No partial writes. */
+	if (*ppos != 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = kstrtou32_from_user(buf, count, 10, &capcontid);
+	if (ret < 0)
+		goto out;
+
+	ret = audit_set_capcontid(task, capcontid);
+	if (ret >= 0)
+		ret = count;
+out:
+	/* single exit: release the reference from get_proc_task() */
+	put_task_struct(task);
+	return ret;
+}
+
+/* File operations backing the "audit_capcontainerid" proc entries. */
+static const struct file_operations proc_capcontid_operations = {
+	.read		= proc_capcontid_read,
+	.write		= proc_capcontid_write,
+	.llseek		= generic_file_llseek,
+};
 #endif
 
 #ifdef CONFIG_FAULT_INJECTION
@@ -3121,6 +3174,7 @@  static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
 	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
 	REG("audit_containerid", S_IWUSR|S_IRUSR, proc_contid_operations),
+	REG("audit_capcontainerid", S_IWUSR|S_IRUSR|S_IRUSR, proc_capcontid_operations),
 #endif
 #ifdef CONFIG_FAULT_INJECTION
 	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
@@ -3522,6 +3576,7 @@  static int proc_tid_comm_permission(struct inode *inode, int mask)
 	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
 	REG("audit_containerid", S_IWUSR|S_IRUSR, proc_contid_operations),
+	REG("audit_capcontainerid", S_IWUSR|S_IRUSR|S_IRUSR, proc_capcontid_operations),
 #endif
 #ifdef CONFIG_FAULT_INJECTION
 	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
diff --git a/include/linux/audit.h b/include/linux/audit.h
index 28b9c7cd86a6..62c453306c2a 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -116,6 +116,7 @@  struct audit_task_info {
 	kuid_t			loginuid;
 	unsigned int		sessionid;
 	struct audit_contobj	*cont;
+	u32			capcontid;
 #ifdef CONFIG_AUDITSYSCALL
 	struct audit_context	*ctx;
 #endif
@@ -224,6 +225,14 @@  static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
 	return tsk->audit->sessionid;
 }
 
+/* Return @tsk's capcontid flag, or 0 when the task has no audit state. */
+static inline u32 audit_get_capcontid(struct task_struct *tsk)
+{
+	return tsk->audit ? tsk->audit->capcontid : 0;
+}
+
+extern int audit_set_capcontid(struct task_struct *tsk, u32 enable);
 extern int audit_set_contid(struct task_struct *tsk, u64 contid);
 
 static inline u64 audit_get_contid(struct task_struct *tsk)
@@ -305,6 +314,11 @@  static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
 	return AUDIT_SID_UNSET;
 }
 
+/* NOTE(review): presumably the audit-disabled stub; it always reports 0. */
+static inline u32 audit_get_capcontid(struct task_struct *tsk)
+{
+	return 0;
+}
 static inline u64 audit_get_contid(struct task_struct *tsk)
 {
 	return AUDIT_CID_UNSET;
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 2844d78cd7af..01251e6dcec0 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -73,6 +73,7 @@ 
 #define AUDIT_GET_FEATURE	1019	/* Get which features are enabled */
 #define AUDIT_CONTAINER_OP	1020	/* Define the container id and info */
 #define AUDIT_SIGNAL_INFO2	1021	/* Get info auditd signal sender */
+#define AUDIT_SET_CAPCONTID	1022	/* Set cap_contid of a task */
 
 #define AUDIT_FIRST_USER_MSG	1100	/* Userspace messages mostly uninteresting to kernel */
 #define AUDIT_USER_AVC		1107	/* We filter this differently */
diff --git a/kernel/audit.c b/kernel/audit.c
index 1287f0b63757..1c22dd084ae8 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -2698,6 +2698,41 @@  static bool audit_contid_isowner(struct task_struct *tsk)
 	return false;
 }
 
+/*
+ * audit_set_capcontid - set the capcontid flag of a task
+ * @task: target task
+ * @enable: new capcontid value; stored as given, without range checking
+ *
+ * Fails with -ENOPROTOOPT if @task has no audit state, -EBADSLT if @task is
+ * current, -EXDEV if task_is_descendant(current, task) is false, and -EPERM
+ * if current is in init_user_ns with neither CAP_AUDIT_CONTROL nor capcontid.
+ * When audit is enabled, an AUDIT_SET_CAPCONTID record is emitted for failed
+ * attempts as well as successful ones.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int audit_set_capcontid(struct task_struct *task, u32 enable)
+{
+	u32 oldcapcontid;
+	int rc = 0;
+	struct audit_buffer *ab;
+
+	if (!task->audit)
+		return -ENOPROTOOPT;
+	oldcapcontid = audit_get_capcontid(task);
+	/* if task is not descendant, block */
+	if (task == current)
+		rc = -EBADSLT;
+	else if (!task_is_descendant(current, task))
+		rc = -EXDEV;
+	/*
+	 * NOTE(review): the capability test below only runs when current is in
+	 * init_user_ns; outside it no permission check is applied in this
+	 * function. Confirm that is intended.
+	 */
+	else if (current_user_ns() == &init_user_ns) {
+		if (!capable(CAP_AUDIT_CONTROL) && !audit_get_capcontid(current))
+			rc = -EPERM;
+	}
+	if (!rc)
+		task->audit->capcontid = enable;
+
+	if (!audit_enabled)
+		return rc;
+
+	/* the record is written even when rc != 0; it carries no result field */
+	ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_SET_CAPCONTID);
+	if (!ab)
+		return rc;
+
+	audit_log_format(ab,
+			 "opid=%d capcontid=%u old-capcontid=%u",
+			 task_tgid_nr(task), enable, oldcapcontid);
+	audit_log_end(ab);
+	return rc;
+}
+
 /*
  * audit_set_contid - set current task's audit contid
  * @task: target task