diff mbox series

[ghak90,V7,20/21] audit: add capcontid to set contid outside init_user_ns

Message ID 214163d11a75126f610bcedfad67a4d89575dc77.1568834525.git.rgb@redhat.com (mailing list archive)
State New, archived
Headers show
Series audit: implement container identifier | expand

Commit Message

Richard Guy Briggs Sept. 19, 2019, 1:22 a.m. UTC
Provide a mechanism similar to CAP_AUDIT_CONTROL to explicitly give a
process in a non-init user namespace the capability to set audit
container identifiers.

Use audit netlink message types AUDIT_GET_CAPCONTID 1027 and
AUDIT_SET_CAPCONTID 1028.  The message format includes the data
structure:
struct audit_capcontid_status {
        pid_t   pid;
        u32     enable;
};

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
---
 include/linux/audit.h      | 14 +++++++
 include/uapi/linux/audit.h |  2 +
 kernel/audit.c             | 98 +++++++++++++++++++++++++++++++++++++++++++++-
 kernel/audit.h             |  5 +++
 4 files changed, 117 insertions(+), 2 deletions(-)

Comments

Richard Guy Briggs Oct. 19, 2019, 1:39 a.m. UTC | #1
On 2019-09-18 21:22, Richard Guy Briggs wrote:
> Provide a mechanism similar to CAP_AUDIT_CONTROL to explicitly give a
> process in a non-init user namespace the capability to set audit
> container identifiers.
> 
> Use audit netlink message types AUDIT_GET_CAPCONTID 1027 and
> AUDIT_SET_CAPCONTID 1028.  The message format includes the data
> structure:
> struct audit_capcontid_status {
>         pid_t   pid;
>         u32     enable;
> };

Paul, can I get a review of the general idea here to see if you're ok
with this way of effectively extending CAP_AUDIT_CONTROL for the sake of
setting contid from beyond the init user namespace where capable() can't
reach and ns_capable() is meaningless for these purposes?

Last weekend was Canadian Thanksgiving where I took an extra day for an
annual bike trip and I'm buried to my neck in a complete kitchen gut
(down to 1920 structural double brick and knob/tube wiring), but I've
got fixes or responses to almost everything else you've raised which
I'll post shortly.

Thanks!

> Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> ---
>  include/linux/audit.h      | 14 +++++++
>  include/uapi/linux/audit.h |  2 +
>  kernel/audit.c             | 98 +++++++++++++++++++++++++++++++++++++++++++++-
>  kernel/audit.h             |  5 +++
>  4 files changed, 117 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/audit.h b/include/linux/audit.h
> index 1ce27af686ea..dcc53e62e266 100644
> --- a/include/linux/audit.h
> +++ b/include/linux/audit.h
> @@ -117,6 +117,7 @@ struct audit_task_info {
>  	kuid_t			loginuid;
>  	unsigned int		sessionid;
>  	struct audit_cont	*cont;
> +	u32			capcontid;
>  #ifdef CONFIG_AUDITSYSCALL
>  	struct audit_context	*ctx;
>  #endif
> @@ -224,6 +225,14 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
>  	return tsk->audit->sessionid;
>  }
>  
> +static inline u32 audit_get_capcontid(struct task_struct *tsk)
> +{
> +	if (!tsk->audit)
> +		return 0;
> +	return tsk->audit->capcontid;
> +}
> +
> +extern int audit_set_capcontid(struct task_struct *tsk, u32 enable);
>  extern int audit_set_contid(struct task_struct *tsk, u64 contid);
>  
>  static inline u64 audit_get_contid(struct task_struct *tsk)
> @@ -309,6 +318,11 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
>  	return AUDIT_SID_UNSET;
>  }
>  
> +static inline u32 audit_get_capcontid(struct task_struct *tsk)
> +{
> +	return 0;
> +}
> +
>  static inline u64 audit_get_contid(struct task_struct *tsk)
>  {
>  	return AUDIT_CID_UNSET;
> diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
> index eef42c8eea77..011b0a8ee9b2 100644
> --- a/include/uapi/linux/audit.h
> +++ b/include/uapi/linux/audit.h
> @@ -78,6 +78,8 @@
>  #define AUDIT_GET_LOGINUID	1024	/* Get loginuid of a task */
>  #define AUDIT_SET_LOGINUID	1025	/* Set loginuid of a task */
>  #define AUDIT_GET_SESSIONID	1026	/* Set sessionid of a task */
> +#define AUDIT_GET_CAPCONTID	1027	/* Get cap_contid of a task */
> +#define AUDIT_SET_CAPCONTID	1028	/* Set cap_contid of a task */
>  
>  #define AUDIT_FIRST_USER_MSG	1100	/* Userspace messages mostly uninteresting to kernel */
>  #define AUDIT_USER_AVC		1107	/* We filter this differently */
> diff --git a/kernel/audit.c b/kernel/audit.c
> index a70c9184e5d9..7160da464849 100644
> --- a/kernel/audit.c
> +++ b/kernel/audit.c
> @@ -1192,6 +1192,14 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
>  	case AUDIT_GET_SESSIONID:
>  		return 0;
>  		break;
> +	case AUDIT_GET_CAPCONTID:
> +	case AUDIT_SET_CAPCONTID:
> +	case AUDIT_GET_CONTID:
> +	case AUDIT_SET_CONTID:
> +		if (!netlink_capable(skb, CAP_AUDIT_CONTROL) && !audit_get_capcontid(current))
> +			return -EPERM;
> +		return 0;
> +		break;
>  	default:  /* do more checks below */
>  		break;
>  	}
> @@ -1227,8 +1235,6 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
>  	case AUDIT_TTY_SET:
>  	case AUDIT_TRIM:
>  	case AUDIT_MAKE_EQUIV:
> -	case AUDIT_GET_CONTID:
> -	case AUDIT_SET_CONTID:
>  	case AUDIT_SET_LOGINUID:
>  		/* Only support auditd and auditctl in initial pid namespace
>  		 * for now. */
> @@ -1304,6 +1310,23 @@ static int audit_get_contid_status(struct sk_buff *skb)
>  	return 0;
>  }
>  
> +static int audit_get_capcontid_status(struct sk_buff *skb)
> +{
> +	struct nlmsghdr *nlh = nlmsg_hdr(skb);
> +	u32 seq = nlh->nlmsg_seq;
> +	void *data = nlmsg_data(nlh);
> +	struct audit_capcontid_status cs;
> +
> +	cs.pid = ((struct audit_capcontid_status *)data)->pid;
> +	if (!cs.pid)
> +		cs.pid = task_tgid_nr(current);
> +	rcu_read_lock();
> +	cs.enable = audit_get_capcontid(find_task_by_vpid(cs.pid));
> +	rcu_read_unlock();
> +	audit_send_reply(skb, seq, AUDIT_GET_CAPCONTID, 0, 0, &cs, sizeof(cs));
> +	return 0;
> +}
> +
>  struct audit_loginuid_status { uid_t loginuid; };
>  
>  static int audit_get_loginuid_status(struct sk_buff *skb)
> @@ -1779,6 +1802,27 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
>  		if (err)
>  			return err;
>  		break;
> +	case AUDIT_SET_CAPCONTID: {
> +		struct audit_capcontid_status *s = data;
> +		struct task_struct *tsk;
> +
> +		/* check if new data is valid */
> +		if (nlmsg_len(nlh) < sizeof(*s))
> +			return -EINVAL;
> +		tsk = find_get_task_by_vpid(s->pid);
> +		if (!tsk)
> +			return -EINVAL;
> +
> +		err = audit_set_capcontid(tsk, s->enable);
> +		put_task_struct(tsk);
> +		return err;
> +		break;
> +	}
> +	case AUDIT_GET_CAPCONTID:
> +		err = audit_get_capcontid_status(skb);
> +		if (err)
> +			return err;
> +		break;
>  	case AUDIT_SET_LOGINUID: {
>  		uid_t *loginuid = data;
>  		kuid_t kloginuid;
> @@ -2711,6 +2755,56 @@ static struct task_struct *audit_cont_owner(struct task_struct *tsk)
>  	return NULL;
>  }
>  
> +int audit_set_capcontid(struct task_struct *task, u32 enable)
> +{
> +	u32 oldcapcontid;
> +	int rc = 0;
> +	struct audit_buffer *ab;
> +	uid_t uid;
> +	struct tty_struct *tty;
> +	char comm[sizeof(current->comm)];
> +
> +	if (!task->audit)
> +		return -ENOPROTOOPT;
> +	oldcapcontid = audit_get_capcontid(task);
> +	/* if task is not descendant, block */
> +	if (task == current)
> +		rc = -EBADSLT;
> +	else if (!task_is_descendant(current, task))
> +		rc = -EXDEV;
> +	else if (current_user_ns() == &init_user_ns) {
> +		if (!capable(CAP_AUDIT_CONTROL) && !audit_get_capcontid(current))
> +			rc = -EPERM;
> +	}
> +	if (!rc)
> +		task->audit->capcontid = enable;
> +
> +	if (!audit_enabled)
> +		return rc;
> +
> +	ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_SET_CAPCONTID);
> +	if (!ab)
> +		return rc;
> +
> +	uid = from_kuid(&init_user_ns, task_uid(current));
> +	tty = audit_get_tty();
> +	audit_log_format(ab,
> +			 "opid=%d capcontid=%u old-capcontid=%u pid=%d uid=%u auid=%u tty=%s ses=%u",
> +			 task_tgid_nr(task), enable, oldcapcontid,
> +			 task_tgid_nr(current), uid,
> +			 from_kuid(&init_user_ns, audit_get_loginuid(current)),
> +			 tty ? tty_name(tty) : "(none)",
> +			 audit_get_sessionid(current));
> +	audit_put_tty(tty);
> +	audit_log_task_context(ab);
> +	audit_log_format(ab, " comm=");
> +	audit_log_untrustedstring(ab, get_task_comm(comm, current));
> +	audit_log_d_path_exe(ab, current->mm);
> +	audit_log_format(ab, " res=%d", !rc);
> +	audit_log_end(ab);
> +	return rc;
> +}
> +
>  /*
>   * audit_set_contid - set current task's audit contid
>   * @task: target task
> diff --git a/kernel/audit.h b/kernel/audit.h
> index cb25341c1a0f..ac4694e88485 100644
> --- a/kernel/audit.h
> +++ b/kernel/audit.h
> @@ -231,6 +231,11 @@ struct audit_contid_status {
>  	u64	id;
>  };
>  
> +struct audit_capcontid_status {
> +	pid_t	pid;
> +	u32	enable;
> +};
> +
>  #define AUDIT_CONTID_DEPTH	5
>  
>  /* Indicates that audit should log the full pathname. */
> -- 
> 1.8.3.1
> 

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Paul Moore Oct. 21, 2019, 7:53 p.m. UTC | #2
On Fri, Oct 18, 2019 at 9:39 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> On 2019-09-18 21:22, Richard Guy Briggs wrote:
> > Provide a mechanism similar to CAP_AUDIT_CONTROL to explicitly give a
> > process in a non-init user namespace the capability to set audit
> > container identifiers.
> >
> > Use audit netlink message types AUDIT_GET_CAPCONTID 1027 and
> > AUDIT_SET_CAPCONTID 1028.  The message format includes the data
> > structure:
> > struct audit_capcontid_status {
> >         pid_t   pid;
> >         u32     enable;
> > };
>
> Paul, can I get a review of the general idea here to see if you're ok
> with this way of effectively extending CAP_AUDIT_CONTROL for the sake of
> setting contid from beyond the init user namespace where capable() can't
> reach and ns_capable() is meaningless for these purposes?

I think my previous comment about having both the procfs and netlink
interfaces apply here.  I don't see why we need two different APIs at
the start; explain to me why procfs isn't sufficient.  If the argument
is simply the desire to avoid mounting procfs in the container, how
many container orchestrators can function today without a valid /proc?
Richard Guy Briggs Oct. 21, 2019, 9:38 p.m. UTC | #3
On 2019-10-21 15:53, Paul Moore wrote:
> On Fri, Oct 18, 2019 at 9:39 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > On 2019-09-18 21:22, Richard Guy Briggs wrote:
> > > Provide a mechanism similar to CAP_AUDIT_CONTROL to explicitly give a
> > > process in a non-init user namespace the capability to set audit
> > > container identifiers.
> > >
> > > Use audit netlink message types AUDIT_GET_CAPCONTID 1027 and
> > > AUDIT_SET_CAPCONTID 1028.  The message format includes the data
> > > structure:
> > > struct audit_capcontid_status {
> > >         pid_t   pid;
> > >         u32     enable;
> > > };
> >
> > Paul, can I get a review of the general idea here to see if you're ok
> > with this way of effectively extending CAP_AUDIT_CONTROL for the sake of
> > setting contid from beyond the init user namespace where capable() can't
> > reach and ns_capable() is meaningless for these purposes?
> 
> I think my previous comment about having both the procfs and netlink
> interfaces apply here.  I don't see why we need two different APIs at
> the start; explain to me why procfs isn't sufficient.  If the argument
> is simply the desire to avoid mounting procfs in the container, how
> many container orchestrators can function today without a valid /proc?

Ok, sorry, I meant to address that question from a previous patch
comment at the same time.

It was raised by Eric Biederman that the proc filesystem interface for
audit had its limitations and he had suggested an audit netlink
interface made more sense.

The intent was to switch to the audit netlink interface for contid,
capcontid and to add the audit netlink interface for loginuid and
sessionid while deprecating the proc interface for loginuid and
sessionid.  This was alluded to in the cover letter, but not very clear,
I'm afraid.  I have patches to remove the contid and loginuid/sessionid
interfaces in another tree which is why I had forgotten to outline that
plan more explicitly in the cover letter.

> paul moore

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Paul Moore Oct. 21, 2019, 9:43 p.m. UTC | #4
On Mon, Oct 21, 2019 at 5:38 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> On 2019-10-21 15:53, Paul Moore wrote:
> > On Fri, Oct 18, 2019 at 9:39 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > On 2019-09-18 21:22, Richard Guy Briggs wrote:
> > > > Provide a mechanism similar to CAP_AUDIT_CONTROL to explicitly give a
> > > > process in a non-init user namespace the capability to set audit
> > > > container identifiers.
> > > >
> > > > Use audit netlink message types AUDIT_GET_CAPCONTID 1027 and
> > > > AUDIT_SET_CAPCONTID 1028.  The message format includes the data
> > > > structure:
> > > > struct audit_capcontid_status {
> > > >         pid_t   pid;
> > > >         u32     enable;
> > > > };
> > >
> > > Paul, can I get a review of the general idea here to see if you're ok
> > > with this way of effectively extending CAP_AUDIT_CONTROL for the sake of
> > > setting contid from beyond the init user namespace where capable() can't
> > > reach and ns_capable() is meaningless for these purposes?
> >
> > I think my previous comment about having both the procfs and netlink
> > interfaces apply here.  I don't see why we need two different APIs at
> > the start; explain to me why procfs isn't sufficient.  If the argument
> > is simply the desire to avoid mounting procfs in the container, how
> > many container orchestrators can function today without a valid /proc?
>
> Ok, sorry, I meant to address that question from a previous patch
> comment at the same time.
>
> It was raised by Eric Biederman that the proc filesystem interface for
> audit had its limitations and he had suggested an audit netlink
> interface made more sense.

I'm sure you've got it handy, so I'm going to be lazy and ask: archive
pointer to Eric's comments?  Just a heads-up, I'm really *not* a fan
of using the netlink interface for this, so unless Eric presents a
super compelling reason for why we shouldn't use procfs I'm inclined
to stick with /proc.

> The intent was to switch to the audit netlink interface for contid,
> capcontid and to add the audit netlink interface for loginuid and
> sessionid while deprecating the proc interface for loginuid and
> sessionid.  This was alluded to in the cover letter, but not very clear,
> I'm afraid.  I have patches to remove the contid and loginuid/sessionid
> interfaces in another tree which is why I had forgotten to outline that
> plan more explicitly in the cover letter.
Richard Guy Briggs Oct. 21, 2019, 11:57 p.m. UTC | #5
On 2019-10-21 17:43, Paul Moore wrote:
> On Mon, Oct 21, 2019 at 5:38 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > On 2019-10-21 15:53, Paul Moore wrote:
> > > On Fri, Oct 18, 2019 at 9:39 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > On 2019-09-18 21:22, Richard Guy Briggs wrote:
> > > > > Provide a mechanism similar to CAP_AUDIT_CONTROL to explicitly give a
> > > > > process in a non-init user namespace the capability to set audit
> > > > > container identifiers.
> > > > >
> > > > > Use audit netlink message types AUDIT_GET_CAPCONTID 1027 and
> > > > > AUDIT_SET_CAPCONTID 1028.  The message format includes the data
> > > > > structure:
> > > > > struct audit_capcontid_status {
> > > > >         pid_t   pid;
> > > > >         u32     enable;
> > > > > };
> > > >
> > > > Paul, can I get a review of the general idea here to see if you're ok
> > > > with this way of effectively extending CAP_AUDIT_CONTROL for the sake of
> > > > setting contid from beyond the init user namespace where capable() can't
> > > > reach and ns_capable() is meaningless for these purposes?
> > >
> > > I think my previous comment about having both the procfs and netlink
> > > interfaces apply here.  I don't see why we need two different APIs at
> > > the start; explain to me why procfs isn't sufficient.  If the argument
> > > is simply the desire to avoid mounting procfs in the container, how
> > > many container orchestrators can function today without a valid /proc?
> >
> > Ok, sorry, I meant to address that question from a previous patch
> > comment at the same time.
> >
> > It was raised by Eric Biederman that the proc filesystem interface for
> > audit had its limitations and he had suggested an audit netlink
> > interface made more sense.
> 
> I'm sure you've got it handy, so I'm going to be lazy and ask: archive
> pointer to Eric's comments?  Just a heads-up, I'm really *not* a fan
> of using the netlink interface for this, so unless Eric presents a
> super compelling reason for why we shouldn't use procfs I'm inclined
> to stick with /proc.

It was actually a video call with Eric and Steve where that was
recommended, so I can't provide you with any first-hand communication
about it.  I'll get more details...

So, with that out of the way, could you please comment on the general
idea of what was intended to be the central idea of this mechanism to be
able to nest containers beyond the initial user namespace (knowing that
a /proc interface is available and the audit netlink interface isn't
necessary for it to work and the latter can be easily removed)?

> > The intent was to switch to the audit netlink interface for contid,
> > capcontid and to add the audit netlink interface for loginuid and
> > sessionid while deprecating the proc interface for loginuid and
> > sessionid.  This was alluded to in the cover letter, but not very clear,
> > I'm afraid.  I have patches to remove the contid and loginuid/sessionid
> > interfaces in another tree which is why I had forgotten to outline that
> > plan more explicitly in the cover letter.
> 
> paul moore

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Paul Moore Oct. 22, 2019, 12:31 a.m. UTC | #6
On Mon, Oct 21, 2019 at 7:58 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> On 2019-10-21 17:43, Paul Moore wrote:
> > On Mon, Oct 21, 2019 at 5:38 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > On 2019-10-21 15:53, Paul Moore wrote:
> > > > On Fri, Oct 18, 2019 at 9:39 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > > On 2019-09-18 21:22, Richard Guy Briggs wrote:
> > > > > > Provide a mechanism similar to CAP_AUDIT_CONTROL to explicitly give a
> > > > > > process in a non-init user namespace the capability to set audit
> > > > > > container identifiers.
> > > > > >
> > > > > > Use audit netlink message types AUDIT_GET_CAPCONTID 1027 and
> > > > > > AUDIT_SET_CAPCONTID 1028.  The message format includes the data
> > > > > > structure:
> > > > > > struct audit_capcontid_status {
> > > > > >         pid_t   pid;
> > > > > >         u32     enable;
> > > > > > };
> > > > >
> > > > > Paul, can I get a review of the general idea here to see if you're ok
> > > > > with this way of effectively extending CAP_AUDIT_CONTROL for the sake of
> > > > > setting contid from beyond the init user namespace where capable() can't
> > > > > reach and ns_capable() is meaningless for these purposes?
> > > >
> > > > I think my previous comment about having both the procfs and netlink
> > > > interfaces apply here.  I don't see why we need two different APIs at
> > > > the start; explain to me why procfs isn't sufficient.  If the argument
> > > > is simply the desire to avoid mounting procfs in the container, how
> > > > many container orchestrators can function today without a valid /proc?
> > >
> > > Ok, sorry, I meant to address that question from a previous patch
> > > comment at the same time.
> > >
> > > It was raised by Eric Biederman that the proc filesystem interface for
> > > audit had its limitations and he had suggested an audit netlink
> > > interface made more sense.
> >
> > I'm sure you've got it handy, so I'm going to be lazy and ask: archive
> > pointer to Eric's comments?  Just a heads-up, I'm really *not* a fan
> > of using the netlink interface for this, so unless Eric presents a
> > super compelling reason for why we shouldn't use procfs I'm inclined
> > to stick with /proc.
>
> It was actually a video call with Eric and Steve where that was
> recommended, so I can't provide you with any first-hand communication
> about it.  I'll get more details...

Yeah, that sort of information really needs to be on the list.

> So, with that out of the way, could you please comment on the general
> idea of what was intended to be the central idea of this mechanism to be
> able to nest containers beyond the initial user namespace (knowing that
> a /proc interface is available and the audit netlink interface isn't
> necessary for it to work and the latter can be easily removed)?

I'm not entirely clear what you are asking about, are you asking why I
care about nesting container orchestrators?  Simply put, it is not
uncommon for the LXC/LXD folks to see nested container orchestrators,
so I felt it was important to support that use case.  When we
originally started this effort we probably should have done a better
job reaching out to the LXC/LXD folks, we may have caught this
earlier.  Regardless, we caught it, and it looks like we are on our
way to supporting it (that's good).

Are you asking why I prefer the procfs approach to setting/getting the
audit container ID?  For one, it makes it easier for a LSM to enforce
the audit container ID operations independent of the other audit
control APIs.  It also provides a simpler interface for container
orchestrators.  Both seem like desirable traits as far as I'm
concerned.

> > > The intent was to switch to the audit netlink interface for contid,
> > > capcontid and to add the audit netlink interface for loginuid and
> > > sessionid while deprecating the proc interface for loginuid and
> > > sessionid.  This was alluded to in the cover letter, but not very clear,
> > > I'm afraid.  I have patches to remove the contid and loginuid/sessionid
> > > interfaces in another tree which is why I had forgotten to outline that
> > > plan more explicitly in the cover letter.
Neil Horman Oct. 22, 2019, 12:13 p.m. UTC | #7
On Mon, Oct 21, 2019 at 08:31:37PM -0400, Paul Moore wrote:
> On Mon, Oct 21, 2019 at 7:58 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > On 2019-10-21 17:43, Paul Moore wrote:
> > > On Mon, Oct 21, 2019 at 5:38 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > On 2019-10-21 15:53, Paul Moore wrote:
> > > > > On Fri, Oct 18, 2019 at 9:39 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > > > On 2019-09-18 21:22, Richard Guy Briggs wrote:
> > > > > > > Provide a mechanism similar to CAP_AUDIT_CONTROL to explicitly give a
> > > > > > > process in a non-init user namespace the capability to set audit
> > > > > > > container identifiers.
> > > > > > >
> > > > > > > Use audit netlink message types AUDIT_GET_CAPCONTID 1027 and
> > > > > > > AUDIT_SET_CAPCONTID 1028.  The message format includes the data
> > > > > > > structure:
> > > > > > > struct audit_capcontid_status {
> > > > > > >         pid_t   pid;
> > > > > > >         u32     enable;
> > > > > > > };
> > > > > >
> > > > > > Paul, can I get a review of the general idea here to see if you're ok
> > > > > > with this way of effectively extending CAP_AUDIT_CONTROL for the sake of
> > > > > > setting contid from beyond the init user namespace where capable() can't
> > > > > > reach and ns_capable() is meaningless for these purposes?
> > > > >
> > > > > I think my previous comment about having both the procfs and netlink
> > > > > interfaces apply here.  I don't see why we need two different APIs at
> > > > > the start; explain to me why procfs isn't sufficient.  If the argument
> > > > > is simply the desire to avoid mounting procfs in the container, how
> > > > > many container orchestrators can function today without a valid /proc?
> > > >
> > > > Ok, sorry, I meant to address that question from a previous patch
> > > > comment at the same time.
> > > >
> > > > It was raised by Eric Biederman that the proc filesystem interface for
> > > > audit had its limitations and he had suggested an audit netlink
> > > > interface made more sense.
> > >
> > > I'm sure you've got it handy, so I'm going to be lazy and ask: archive
> > > pointer to Eric's comments?  Just a heads-up, I'm really *not* a fan
> > > of using the netlink interface for this, so unless Eric presents a
> > > super compelling reason for why we shouldn't use procfs I'm inclined
> > > to stick with /proc.
> >
> > It was actually a video call with Eric and Steve where that was
> > recommended, so I can't provide you with any first-hand communication
> > about it.  I'll get more details...
> 
> Yeah, that sort of information really needs to be on the list.
> 
> > So, with that out of the way, could you please comment on the general
> > idea of what was intended to be the central idea of this mechanism to be
> > able to nest containers beyond the initial user namespace (knowing that
> > a /proc interface is available and the audit netlink interface isn't
> > necessary for it to work and the latter can be easily removed)?
> 
> I'm not entirely clear what you are asking about, are you asking why I
> care about nesting container orchestrators?  Simply put, it is not
> uncommon for the LXC/LXD folks to see nested container orchestrators,
> so I felt it was important to support that use case.  When we
> originally started this effort we probably should have done a better
> job reaching out to the LXC/LXD folks, we may have caught this
> earlier.  Regardless, we caught it, and it looks like we are on our
> way to supporting it (that's good).
> 
> Are you asking why I prefer the procfs approach to setting/getting the
> audit container ID?  For one, it makes it easier for a LSM to enforce
> the audit container ID operations independent of the other audit
> control APIs.  It also provides a simpler interface for container
> orchestrators.  Both seem like desirable traits as far as I'm
> concerned.
> 
I agree that one api is probably the best approach here, but I actually
think that the netlink interface is the more flexible approach.  Its a
little more work for userspace (you have to marshal your data into a
netlink message before sending it, and wait for an async response), but
thats a well known pattern, and it provides significantly more
flexibility for the kernel.  LSM already has a hook to audit netlink
messages in sock_sendmsg, so thats not a problem, and if you use
netlink, you get the advantage of being able to broadcast messages
within your network namespaces, facilitating any needed orchestrator
co-ordination.  To do the same thing with a filesystem api, you need to
use the fanotify api, which IIRC doesn't work on proc.

Neil

> > > > The intent was to switch to the audit netlink interface for contid,
> > > > capcontid and to add the audit netlink interface for loginuid and
> > > > sessionid while deprecating the proc interface for loginuid and
> > > > sessionid.  This was alluded to in the cover letter, but not very clear,
> > > > I'm afraid.  I have patches to remove the contid and loginuid/sessionid
> > > > interfaces in another tree which is why I had forgotten to outline that
> > > > plan more explicitly in the cover letter.
> 
> -- 
> paul moore
> www.paul-moore.com
>
Paul Moore Oct. 22, 2019, 2:04 p.m. UTC | #8
On Tue, Oct 22, 2019 at 8:13 AM Neil Horman <nhorman@tuxdriver.com> wrote:
> On Mon, Oct 21, 2019 at 08:31:37PM -0400, Paul Moore wrote:
> > On Mon, Oct 21, 2019 at 7:58 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > On 2019-10-21 17:43, Paul Moore wrote:
> > > > On Mon, Oct 21, 2019 at 5:38 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > > On 2019-10-21 15:53, Paul Moore wrote:
> > > > > > On Fri, Oct 18, 2019 at 9:39 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > > > > On 2019-09-18 21:22, Richard Guy Briggs wrote:
> > > > > > > > Provide a mechanism similar to CAP_AUDIT_CONTROL to explicitly give a
> > > > > > > > process in a non-init user namespace the capability to set audit
> > > > > > > > container identifiers.
> > > > > > > >
> > > > > > > > Use audit netlink message types AUDIT_GET_CAPCONTID 1027 and
> > > > > > > > AUDIT_SET_CAPCONTID 1028.  The message format includes the data
> > > > > > > > structure:
> > > > > > > > struct audit_capcontid_status {
> > > > > > > >         pid_t   pid;
> > > > > > > >         u32     enable;
> > > > > > > > };
> > > > > > >
> > > > > > > Paul, can I get a review of the general idea here to see if you're ok
> > > > > > > with this way of effectively extending CAP_AUDIT_CONTROL for the sake of
> > > > > > > setting contid from beyond the init user namespace where capable() can't
> > > > > > > reach and ns_capable() is meaningless for these purposes?
> > > > > >
> > > > > > I think my previous comment about having both the procfs and netlink
> > > > > > interfaces apply here.  I don't see why we need two different APIs at
> > > > > > the start; explain to me why procfs isn't sufficient.  If the argument
> > > > > > is simply the desire to avoid mounting procfs in the container, how
> > > > > > many container orchestrators can function today without a valid /proc?
> > > > >
> > > > > Ok, sorry, I meant to address that question from a previous patch
> > > > > comment at the same time.
> > > > >
> > > > > It was raised by Eric Biederman that the proc filesystem interface for
> > > > > audit had its limitations and he had suggested an audit netlink
> > > > > interface made more sense.
> > > >
> > > > I'm sure you've got it handy, so I'm going to be lazy and ask: archive
> > > > pointer to Eric's comments?  Just a heads-up, I'm really *not* a fan
> > > > of using the netlink interface for this, so unless Eric presents a
> > > > super compelling reason for why we shouldn't use procfs I'm inclined
> > > > to stick with /proc.
> > >
> > > It was actually a video call with Eric and Steve where that was
> > > recommended, so I can't provide you with any first-hand communication
> > > about it.  I'll get more details...
> >
> > Yeah, that sort of information really needs to be on the list.
> >
> > > So, with that out of the way, could you please comment on the general
> > > idea of what was intended to be the central idea of this mechanism to be
> > > able to nest containers beyond the initial user namespace (knowing that
> > > a /proc interface is available and the audit netlink interface isn't
> > > necessary for it to work and the latter can be easily removed)?
> >
> > I'm not entirely clear what you are asking about, are you asking why I
> > care about nesting container orchestrators?  Simply put, it is not
> > uncommon for the LXC/LXD folks to see nested container orchestrators,
> > so I felt it was important to support that use case.  When we
> > originally started this effort we probably should have done a better
> > job reaching out to the LXC/LXD folks, we may have caught this
> > earlier.  Regardless, we caught it, and it looks like we are on our
> > way to supporting it (that's good).
> >
> > Are you asking why I prefer the procfs approach to setting/getting the
> > audit container ID?  For one, it makes it easier for a LSM to enforce
> > the audit container ID operations independent of the other audit
> > control APIs.  It also provides a simpler interface for container
> > orchestrators.  Both seem like desirable traits as far as I'm
> > concerned.
> >
> I agree that one api is probably the best approach here, but I actually
> think that the netlink interface is the more flexible approach.  Its a
> little more work for userspace (you have to marshal your data into a
> netlink message before sending it, and wait for an async response), but
> thats a well known pattern, and it provides significantly more
> flexibility for the kernel.  LSM already has a hook to audit netlink
> messages in sock_sendmsg, so thats not a problem ...

Look closely at how the LSM controls for netlink work and you'll see a
number of problems; basically command level granularity it hard.  On
the other hand, per-file granularity it easy.

> ... and if you use
> netlink, you get the advantage of being able to broadcast messages
> within your network namespaces, facilitating any needed orchestrator
> co-ordination.

Please don't read this comment as support of the netlink approach, but
I don't think we want to use the multicast netlink; we would want it
to be more of client/server model so that we could enforce access
controls a bit easier.  Besides, is this even a use case?

> To do the same thing with a filesystem api, you need to
> use the fanotify api, which IIRC doesn't work on proc.

Once again, I'm not sure this is a problem we are trying to solve
(broadcasting audit container ID across multiple tasks), is it?
Access to the audit container ID in userspace is something I've always
thought needs to be tightly controlled to prevent abuse.
Richard Guy Briggs Oct. 22, 2019, 2:27 p.m. UTC | #9
On 2019-10-21 20:31, Paul Moore wrote:
> On Mon, Oct 21, 2019 at 7:58 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > On 2019-10-21 17:43, Paul Moore wrote:
> > > On Mon, Oct 21, 2019 at 5:38 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > On 2019-10-21 15:53, Paul Moore wrote:
> > > > > On Fri, Oct 18, 2019 at 9:39 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > > > On 2019-09-18 21:22, Richard Guy Briggs wrote:
> > > > > > > Provide a mechanism similar to CAP_AUDIT_CONTROL to explicitly give a
> > > > > > > process in a non-init user namespace the capability to set audit
> > > > > > > container identifiers.
> > > > > > >
> > > > > > > Use audit netlink message types AUDIT_GET_CAPCONTID 1027 and
> > > > > > > AUDIT_SET_CAPCONTID 1028.  The message format includes the data
> > > > > > > structure:
> > > > > > > struct audit_capcontid_status {
> > > > > > >         pid_t   pid;
> > > > > > >         u32     enable;
> > > > > > > };
> > > > > >
> > > > > > Paul, can I get a review of the general idea here to see if you're ok
> > > > > > with this way of effectively extending CAP_AUDIT_CONTROL for the sake of
> > > > > > setting contid from beyond the init user namespace where capable() can't
> > > > > > reach and ns_capable() is meaningless for these purposes?
> > > > >
> > > > > I think my previous comment about having both the procfs and netlink
> > > > > interfaces apply here.  I don't see why we need two different APIs at
> > > > > the start; explain to me why procfs isn't sufficient.  If the argument
> > > > > is simply the desire to avoid mounting procfs in the container, how
> > > > > many container orchestrators can function today without a valid /proc?
> > > >
> > > > Ok, sorry, I meant to address that question from a previous patch
> > > > comment at the same time.
> > > >
> > > > It was raised by Eric Biederman that the proc filesystem interface for
> > > > audit had its limitations and he had suggested an audit netlink
> > > > interface made more sense.
> > >
> > > I'm sure you've got it handy, so I'm going to be lazy and ask: archive
> > > pointer to Eric's comments?  Just a heads-up, I'm really *not* a fan
> > > of using the netlink interface for this, so unless Eric presents a
> > > super compelling reason for why we shouldn't use procfs I'm inclined
> > > to stick with /proc.
> >
> > It was actually a video call with Eric and Steve where that was
> > recommended, so I can't provide you with any first-hand communication
> > about it.  I'll get more details...
> 
> Yeah, that sort of information really needs to be on the list.
> 
> > So, with that out of the way, could you please comment on the general
> > idea of what was intended to be the central idea of this mechanism to be
> > able to nest containers beyond the initial user namespace (knowing that
> > a /proc interface is available and the audit netlink interface isn't
> > necessary for it to work and the latter can be easily removed)?
> 
> I'm not entirely clear what you are asking about, are you asking why I
> care about nesting container orchestrators?  Simply put, it is not
> uncommon for the LXC/LXD folks to see nested container orchestrators,
> so I felt it was important to support that use case.  When we
> originally started this effort we probably should have done a better
> job reaching out to the LXC/LXD folks, we may have caught this
> earlier.  Regardless, we caught it, and it looks like we are on our
> way to supporting it (that's good).

I'm not asking why you care about container orchestrators.

> Are you asking why I prefer the procfs approach to setting/getting the
> audit container ID?  For one, it makes it easier for a LSM to enforce
> the audit container ID operations independent of the other audit
> control APIs.  It also provides a simpler interface for container
> orchestrators.  Both seem like desirable traits as far as I'm
> concerned.

I'd like to leave the proc/netlink decision/debate out of this
discussion, though it does need to happen and I was hoping that would
happen on the loginuid/sessionid proc/netlink patch thread.

I'd like your perspective on how the capcontid feature was implemented
(aside from the proc/netlink api issue which was intended to be
consistent across loginuid/sessionid/contid/capcontid).  Do you see this
feature as potentially solving the nested container issue in child user
namespaces?

> > > > The intent was to switch to the audit netlink interface for contid,
> > > > capcontid and to add the audit netlink interface for loginuid and
> > > > sessionid while deprecating the proc interface for loginuid and
> > > > sessionid.  This was alluded to in the cover letter, but not very clear,
> > > > I'm afraid.  I have patches to remove the contid and loginuid/sessionid
> > > > interfaces in another tree which is why I had forgotten to outline that
> > > > plan more explicitly in the cover letter.
> 
> paul moore

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Paul Moore Oct. 22, 2019, 2:34 p.m. UTC | #10
On Tue, Oct 22, 2019 at 10:27 AM Richard Guy Briggs <rgb@redhat.com> wrote:
> I'd like your perspective on how the capcontid feature was implemented
> (aside from the proc/netlink api issue which was intended to be
> consistent across loginuid/sessionid/contid/capcontid).  Do you see this
> feature as potentially solving the nested container issue in child user
> namespaces?

The patchset is a bit messy at this point in the stack due to the
"fixup!" confusion and a few other things which I already mentioned so
I don't really want to comment too much on that until I can see
everything in a reasonable patch stack.  Let's leave that for the next
draft.
Richard Guy Briggs Oct. 22, 2019, 8:06 p.m. UTC | #11
On 2019-10-22 08:13, Neil Horman wrote:
> On Mon, Oct 21, 2019 at 08:31:37PM -0400, Paul Moore wrote:
> > On Mon, Oct 21, 2019 at 7:58 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > On 2019-10-21 17:43, Paul Moore wrote:
> > > > On Mon, Oct 21, 2019 at 5:38 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > > On 2019-10-21 15:53, Paul Moore wrote:
> > > > > > On Fri, Oct 18, 2019 at 9:39 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > > > > On 2019-09-18 21:22, Richard Guy Briggs wrote:
> > > > > > > > Provide a mechanism similar to CAP_AUDIT_CONTROL to explicitly give a
> > > > > > > > process in a non-init user namespace the capability to set audit
> > > > > > > > container identifiers.
> > > > > > > >
> > > > > > > > Use audit netlink message types AUDIT_GET_CAPCONTID 1027 and
> > > > > > > > AUDIT_SET_CAPCONTID 1028.  The message format includes the data
> > > > > > > > structure:
> > > > > > > > struct audit_capcontid_status {
> > > > > > > >         pid_t   pid;
> > > > > > > >         u32     enable;
> > > > > > > > };
> > > > > > >
> > > > > > > Paul, can I get a review of the general idea here to see if you're ok
> > > > > > > with this way of effectively extending CAP_AUDIT_CONTROL for the sake of
> > > > > > > setting contid from beyond the init user namespace where capable() can't
> > > > > > > reach and ns_capable() is meaningless for these purposes?
> > > > > >
> > > > > > I think my previous comment about having both the procfs and netlink
> > > > > > interfaces apply here.  I don't see why we need two different APIs at
> > > > > > the start; explain to me why procfs isn't sufficient.  If the argument
> > > > > > is simply the desire to avoid mounting procfs in the container, how
> > > > > > many container orchestrators can function today without a valid /proc?
> > > > >
> > > > > Ok, sorry, I meant to address that question from a previous patch
> > > > > comment at the same time.
> > > > >
> > > > > It was raised by Eric Biederman that the proc filesystem interface for
> > > > > audit had its limitations and he had suggested an audit netlink
> > > > > interface made more sense.
> > > >
> > > > I'm sure you've got it handy, so I'm going to be lazy and ask: archive
> > > > pointer to Eric's comments?  Just a heads-up, I'm really *not* a fan
> > > > of using the netlink interface for this, so unless Eric presents a
> > > > super compelling reason for why we shouldn't use procfs I'm inclined
> > > > to stick with /proc.
> > >
> > > It was actually a video call with Eric and Steve where that was
> > > recommended, so I can't provide you with any first-hand communication
> > > about it.  I'll get more details...
> > 
> > Yeah, that sort of information really needs to be on the list.
> > 
> > > So, with that out of the way, could you please comment on the general
> > > idea of what was intended to be the central idea of this mechanism to be
> > > able to nest containers beyond the initial user namespace (knowing that
> > > a /proc interface is available and the audit netlink interface isn't
> > > necessary for it to work and the latter can be easily removed)?
> > 
> > I'm not entirely clear what you are asking about, are you asking why I
> > care about nesting container orchestrators?  Simply put, it is not
> > uncommon for the LXC/LXD folks to see nested container orchestrators,
> > so I felt it was important to support that use case.  When we
> > originally started this effort we probably should have done a better
> > job reaching out to the LXC/LXD folks, we may have caught this
> > earlier.  Regardless, we caught it, and it looks like we are on our
> > way to supporting it (that's good).
> > 
> > Are you asking why I prefer the procfs approach to setting/getting the
> > audit container ID?  For one, it makes it easier for a LSM to enforce
> > the audit container ID operations independent of the other audit
> > control APIs.  It also provides a simpler interface for container
> > orchestrators.  Both seem like desirable traits as far as I'm
> > concerned.
> 
> I agree that one api is probably the best approach here, but I actually
> think that the netlink interface is the more flexible approach.  Its a
> little more work for userspace (you have to marshal your data into a
> netlink message before sending it, and wait for an async response), but
> thats a well known pattern, and it provides significantly more
> flexibility for the kernel.  LSM already has a hook to audit netlink
> messages in sock_sendmsg, so thats not a problem, and if you use
> netlink, you get the advantage of being able to broadcast messages
> within your network namespaces, facilitating any needed orchestrator
> co-ordination.  To do the same thing with a filesystem api, you need to
> use the fanotify api, which IIRC doesn't work on proc.

One api was the intent, deprecating proc for loginuid and sessionid if
netlink was the chosen way to go.

I don't think we had discussed the possibility or need to use netlink
multicast for this purpose and see it as a liability to limiting access
to only those processes that need it.

> Neil
> 
> > > > > The intent was to switch to the audit netlink interface for contid,
> > > > > capcontid and to add the audit netlink interface for loginuid and
> > > > > sessionid while deprecating the proc interface for loginuid and
> > > > > sessionid.  This was alluded to in the cover letter, but not very clear,
> > > > > I'm afraid.  I have patches to remove the contid and loginuid/sessionid
> > > > > interfaces in another tree which is why I had forgotten to outline that
> > > > > plan more explicitly in the cover letter.
> > 
> > paul moore

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Richard Guy Briggs Oct. 24, 2019, 9 p.m. UTC | #12
On 2019-10-21 20:31, Paul Moore wrote:
> On Mon, Oct 21, 2019 at 7:58 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > On 2019-10-21 17:43, Paul Moore wrote:
> > > On Mon, Oct 21, 2019 at 5:38 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > On 2019-10-21 15:53, Paul Moore wrote:
> > > > > On Fri, Oct 18, 2019 at 9:39 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > > > On 2019-09-18 21:22, Richard Guy Briggs wrote:
> > > > > > > Provide a mechanism similar to CAP_AUDIT_CONTROL to explicitly give a
> > > > > > > process in a non-init user namespace the capability to set audit
> > > > > > > container identifiers.
> > > > > > >
> > > > > > > Use audit netlink message types AUDIT_GET_CAPCONTID 1027 and
> > > > > > > AUDIT_SET_CAPCONTID 1028.  The message format includes the data
> > > > > > > structure:
> > > > > > > struct audit_capcontid_status {
> > > > > > >         pid_t   pid;
> > > > > > >         u32     enable;
> > > > > > > };
> > > > > >
> > > > > > Paul, can I get a review of the general idea here to see if you're ok
> > > > > > with this way of effectively extending CAP_AUDIT_CONTROL for the sake of
> > > > > > setting contid from beyond the init user namespace where capable() can't
> > > > > > reach and ns_capable() is meaningless for these purposes?
> > > > >
> > > > > I think my previous comment about having both the procfs and netlink
> > > > > interfaces apply here.  I don't see why we need two different APIs at
> > > > > the start; explain to me why procfs isn't sufficient.  If the argument
> > > > > is simply the desire to avoid mounting procfs in the container, how
> > > > > many container orchestrators can function today without a valid /proc?
> > > >
> > > > Ok, sorry, I meant to address that question from a previous patch
> > > > comment at the same time.
> > > >
> > > > It was raised by Eric Biederman that the proc filesystem interface for
> > > > audit had its limitations and he had suggested an audit netlink
> > > > interface made more sense.
> > >
> > > I'm sure you've got it handy, so I'm going to be lazy and ask: archive
> > > pointer to Eric's comments?  Just a heads-up, I'm really *not* a fan
> > > of using the netlink interface for this, so unless Eric presents a
> > > super compelling reason for why we shouldn't use procfs I'm inclined
> > > to stick with /proc.
> >
> > It was actually a video call with Eric and Steve where that was
> > recommended, so I can't provide you with any first-hand communication
> > about it.  I'll get more details...
> 
> Yeah, that sort of information really needs to be on the list.

Here's the note I had from that meeting:

- Eric raised the issue that using /proc is likely to get more and more
  hoary due to mount namespaces and suggested that we use a netlink
audit message (or a new syscall) to set the audit container identifier
and since the loginuid is a similar type of operation, that it should be
migrated over to a similar mechanism to get it away from /proc.  Get
could be done with a netlink audit message that triggers an audit log
message to deliver the information.  I'm reluctant to further pollute
the syscall space if we can find another method.  The netlink audit
message makes sense since any audit-enabled service is likely to already
have an audit socket open.

I don't have more detailed notes about what Eric said specifically.

> > So, with that out of the way, could you please comment on the general
> > idea of what was intended to be the central idea of this mechanism to be
> > able to nest containers beyond the initial user namespace (knowing that
> > a /proc interface is available and the audit netlink interface isn't
> > necessary for it to work and the latter can be easily removed)?
> 
> I'm not entirely clear what you are asking about, are you asking why I
> care about nesting container orchestrators?  Simply put, it is not
> uncommon for the LXC/LXD folks to see nested container orchestrators,
> so I felt it was important to support that use case.  When we
> originally started this effort we probably should have done a better
> job reaching out to the LXC/LXD folks, we may have caught this
> earlier.  Regardless, we caught it, and it looks like we are on our
> way to supporting it (that's good).
> 
> Are you asking why I prefer the procfs approach to setting/getting the
> audit container ID?  For one, it makes it easier for a LSM to enforce
> the audit container ID operations independent of the other audit
> control APIs.  It also provides a simpler interface for container
> orchestrators.  Both seem like desirable traits as far as I'm
> concerned.
> 
> > > > The intent was to switch to the audit netlink interface for contid,
> > > > capcontid and to add the audit netlink interface for loginuid and
> > > > sessionid while deprecating the proc interface for loginuid and
> > > > sessionid.  This was alluded to in the cover letter, but not very clear,
> > > > I'm afraid.  I have patches to remove the contid and loginuid/sessionid
> > > > interfaces in another tree which is why I had forgotten to outline that
> > > > plan more explicitly in the cover letter.
> 
> -- 
> paul moore
> www.paul-moore.com

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Paul Moore Oct. 30, 2019, 8:27 p.m. UTC | #13
On Thu, Oct 24, 2019 at 5:00 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> Here's the note I had from that meeting:
>
> - Eric raised the issue that using /proc is likely to get more and more
>   hoary due to mount namespaces and suggested that we use a netlink
> audit message (or a new syscall) to set the audit container identifier
> and since the loginuid is a similar type of operation, that it should be
> migrated over to a similar mechanism to get it away from /proc.  Get
> could be done with a netlink audit message that triggers an audit log
> message to deliver the information.  I'm reluctant to further pollute
> the syscall space if we can find another method.  The netlink audit
> message makes sense since any audit-enabled service is likely to already
> have an audit socket open.

Thanks for the background info on the off-list meeting.  I would
encourage you to have discussions like this on-list in the future; if
that isn't possible, hosting a public call would okay-ish, but a
distant second.

At this point in time I'm not overly concerned about /proc completely
going away in namespaces/containers that are full featured enough to
host a container orchestrator.  If/when reliance on procfs becomes an
issue, we can look at alternate APIs, but given the importance of
/proc to userspace (including to audit) I suspect we are going to see
it persist for some time.  I would prefer to see you to drop the audit
container ID netlink API portions of this patchset and focus on the
procfs API.

Also, for the record, removing the audit loginuid from procfs is not
something to take lightly, if at all; like it or not, it's part of the
kernel API.
Richard Guy Briggs Oct. 30, 2019, 10:03 p.m. UTC | #14
On 2019-10-30 16:27, Paul Moore wrote:
> On Thu, Oct 24, 2019 at 5:00 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > Here's the note I had from that meeting:
> >
> > - Eric raised the issue that using /proc is likely to get more and more
> >   hoary due to mount namespaces and suggested that we use a netlink
> > audit message (or a new syscall) to set the audit container identifier
> > and since the loginuid is a similar type of operation, that it should be
> > migrated over to a similar mechanism to get it away from /proc.  Get
> > could be done with a netlink audit message that triggers an audit log
> > message to deliver the information.  I'm reluctant to further pollute
> > the syscall space if we can find another method.  The netlink audit
> > message makes sense since any audit-enabled service is likely to already
> > have an audit socket open.
> 
> Thanks for the background info on the off-list meeting.  I would
> encourage you to have discussions like this on-list in the future; if
> that isn't possible, hosting a public call would okay-ish, but a
> distant second.

I'm still trying to get Eric's attention to get him to weigh in here and
provide a more eloquent representation of his ideas and concerns.  Some
of it was related to CRIU(sp?) issues which we've already of which we've
already seen similar concerns in namespace identifiers including the
device identity to qualify it.

> At this point in time I'm not overly concerned about /proc completely
> going away in namespaces/containers that are full featured enough to
> host a container orchestrator.  If/when reliance on procfs becomes an
> issue, we can look at alternate APIs, but given the importance of
> /proc to userspace (including to audit) I suspect we are going to see
> it persist for some time.  I would prefer to see you to drop the audit
> container ID netlink API portions of this patchset and focus on the
> procfs API.

I've already refactored the code to put the netlink bits at the end as
completely optional pieces for completeness so they won't get in the way
of the real substance of this patchset.  The nesting depth and total
number of containers checks have also been punted to the end of the
patchset to get them out of the way of discussion.

> Also, for the record, removing the audit loginuid from procfs is not
> something to take lightly, if at all; like it or not, it's part of the
> kernel API.

Oh, I'm quite aware of how important this change is and it was discussed
with Steve Grubb who saw the concern and value of considering such a
disruptive change.  Removing proc support for auid/ses would be a
long-term deprecation if accepted.

Really, I should have labelled the v7 patchset as RFC since there were
so many new and disruptive ideas presented in it.

> paul moore
> www.paul-moore.com

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Paul Moore Oct. 31, 2019, 1:59 p.m. UTC | #15
On Wed, Oct 30, 2019 at 6:04 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> On 2019-10-30 16:27, Paul Moore wrote:
> > On Thu, Oct 24, 2019 at 5:00 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > Here's the note I had from that meeting:
> > >
> > > - Eric raised the issue that using /proc is likely to get more and more
> > >   hoary due to mount namespaces and suggested that we use a netlink
> > > audit message (or a new syscall) to set the audit container identifier
> > > and since the loginuid is a similar type of operation, that it should be
> > > migrated over to a similar mechanism to get it away from /proc.  Get
> > > could be done with a netlink audit message that triggers an audit log
> > > message to deliver the information.  I'm reluctant to further pollute
> > > the syscall space if we can find another method.  The netlink audit
> > > message makes sense since any audit-enabled service is likely to already
> > > have an audit socket open.
> >
> > Thanks for the background info on the off-list meeting.  I would
> > encourage you to have discussions like this on-list in the future; if
> > that isn't possible, hosting a public call would okay-ish, but a
> > distant second.
>
> I'm still trying to get Eric's attention to get him to weigh in here and
> provide a more eloquent representation of his ideas and concerns.  Some
> of it was related to CRIU(sp?) issues which we've already of which we've
> already seen similar concerns in namespace identifiers including the
> device identity to qualify it.

Okay, let's leave this open until we hear from Eric to see if he has
any additional information, but it's going to need to be pretty
compelling.

> > At this point in time I'm not overly concerned about /proc completely
> > going away in namespaces/containers that are full featured enough to
> > host a container orchestrator.  If/when reliance on procfs becomes an
> > issue, we can look at alternate APIs, but given the importance of
> > /proc to userspace (including to audit) I suspect we are going to see
> > it persist for some time.  I would prefer to see you to drop the audit
> > container ID netlink API portions of this patchset and focus on the
> > procfs API.
>
> I've already refactored the code to put the netlink bits at the end as
> completely optional pieces for completeness so they won't get in the way
> of the real substance of this patchset.  The nesting depth and total
> number of containers checks have also been punted to the end of the
> patchset to get them out of the way of discussion.

That's fine, but if we do decide to drop the netlink API after hearing
from Eric, please drop those from the patchset.  Keeping the patchset
small and focused should be a goal, and including rejected/dead
patches (even at the end) doesn't help move towards that goal.

> > Also, for the record, removing the audit loginuid from procfs is not
> > something to take lightly, if at all; like it or not, it's part of the
> > kernel API.
>
> Oh, I'm quite aware of how important this change is and it was discussed
> with Steve Grubb who saw the concern and value of considering such a
> disruptive change.  Removing proc support for auid/ses would be a
> long-term deprecation if accepted.

As I mentioned, that comment was more "for the record" than you in
particular; I know we've talked a lot over the years about kernel API
stability and I'm confident you are aware of the pitfalls there. :)
Steve Grubb Oct. 31, 2019, 2:50 p.m. UTC | #16
Hello,

TLDR;  I see a lot of benefit to switching away from procfs for setting auid & 
sessionid.

On Wednesday, October 30, 2019 6:03:20 PM EDT Richard Guy Briggs wrote:
> > Also, for the record, removing the audit loginuid from procfs is not
> > something to take lightly, if at all; like it or not, it's part of the
> > kernel API.

It can also be used by tools to iterate processes related to one user or 
session. I use this in my Intrusion Prevention System which will land in 
audit user space at some point in the future.


> Oh, I'm quite aware of how important this change is and it was discussed
> with Steve Grubb who saw the concern and value of considering such a
> disruptive change.

Actually, I advocated for syscall. I think the gist of Eric's idea was that /
proc is the intersection of many nasty problems. By relying on it, you can't 
simplify the API to reduce the complexity. Almost no program actually needs 
access to /proc. ps does. But almost everything else is happy without it. For 
example, when you setup chroot jails, you may have to add /dev/random or /
dev/null, but almost never /proc. What does force you to add /proc is any 
entry point daemon like sshd because it needs to set the loginuid. If we 
switch away from /proc, then sshd or crond will no longer /require/ procfs to 
be available which again simplifies the system design.


> Removing proc support for auid/ses would be a
> long-term deprecation if accepted.

It might need to just be turned into readonly for a while. But then again, 
perhaps auid and session should be part of /proc/<pid>/status? Maybe this can 
be done independently and ahead of the container work so there is a migration 
path for things that read auid or session. TBH, maybe this should have been 
done from the beginning.

-Steve
Paul Moore Oct. 31, 2019, 11:37 p.m. UTC | #17
On Thu, Oct 31, 2019 at 10:51 AM Steve Grubb <sgrubb@redhat.com> wrote:
> On Wednesday, October 30, 2019 6:03:20 PM EDT Richard Guy Briggs wrote:
> > > Also, for the record, removing the audit loginuid from procfs is not
> > > something to take lightly, if at all; like it or not, it's part of the
> > > kernel API.
>
> It can also be used by tools to iterate processes related to one user or
> session. I use this in my Intrusion Prevention System which will land in
> audit user space at some point in the future.

Let's try to stay focused on the audit container ID functionality; I
fear if we start bringing in other unrelated issues we are never going
to land these patches.

> > Oh, I'm quite aware of how important this change is and it was discussed
> > with Steve Grubb who saw the concern and value of considering such a
> > disruptive change.
>
> Actually, I advocated for syscall. I think the gist of Eric's idea was that /
> proc is the intersection of many nasty problems. By relying on it, you can't
> simplify the API to reduce the complexity.

I guess complexity is relative in a sense, but reading and writing a
number from a file in procfs seems awfully simple to me.

> Almost no program actually needs
> access to /proc. ps does. But almost everything else is happy without it. For
> example, when you setup chroot jails, you may have to add /dev/random or /
> dev/null, but almost never /proc. What does force you to add /proc is any
> entry point daemon like sshd because it needs to set the loginuid. If we
> switch away from /proc, then sshd or crond will no longer /require/ procfs to
> be available which again simplifies the system design.

It's not that simple, there are plenty of container use cases beyond
ps which require procfs:

Most LSM aware applications require procfs to view and manage some LSM
state (e.g. /proc/self/attr).

System containers, containers that run their own init/systemd/etc.,
require a working procfs.

Nested container orchestrators often run in system containers, which
require a working procfs (see above).

I'm sure there are plenty others, but these are the ones that came
immediately to mind.
Duncan Roe Nov. 1, 2019, 1:02 a.m. UTC | #18
On Thu, Oct 31, 2019 at 10:50:57AM -0400, Steve Grubb wrote:
> Hello,
>
> TLDR;  I see a lot of benefit to switching away from procfs for setting auid &
> sessionid.
>
> On Wednesday, October 30, 2019 6:03:20 PM EDT Richard Guy Briggs wrote:
> > > Also, for the record, removing the audit loginuid from procfs is not
> > > something to take lightly, if at all; like it or not, it's part of the
> > > kernel API.
>
> It can also be used by tools to iterate processes related to one user or
> session. I use this in my Intrusion Prevention System which will land in
> audit user space at some point in the future.
>
>
> > Oh, I'm quite aware of how important this change is and it was discussed
> > with Steve Grubb who saw the concern and value of considering such a
> > disruptive change.
>
> Actually, I advocated for syscall. I think the gist of Eric's idea was that /
> proc is the intersection of many nasty problems. By relying on it, you can't
> simplify the API to reduce the complexity. Almost no program actually needs
                                             ^^^^^^ ^^ ^^^^^^^ ^^^^^^^^ ^^^^^
> access to /proc. ps does. But almost everything else is happy without it. For
> ^^^^^^ ^^ ^^^^^^ ^^ ^^^^^

Eh?? *top* needs /proc/ps, as do most of the programs in package procps-ng.
Then there's lsof, pgrep (which doesn't fail but can't find anything) and even
lilo (for Slackware ;)

> example, when you setup chroot jails, you may have to add /dev/random or /
> dev/null, but almost never /proc. What does force you to add /proc is any
> entry point daemon like sshd because it needs to set the loginuid. If we
> switch away from /proc, then sshd or crond will no longer /require/ procfs to
> be available which again simplifies the system design.
>
>
> > Removing proc support for auid/ses would be a
> > long-term deprecation if accepted.
>
> It might need to just be turned into readonly for a while. But then again,
> perhaps auid and session should be part of /proc/<pid>/status? Maybe this can
> be done independently and ahead of the container work so there is a migration
> path for things that read auid or session. TBH, maybe this should have been
> done from the beginning.
>
> -Steve
>
Cheers ... Duncan.
Richard Guy Briggs Nov. 1, 2019, 3:09 p.m. UTC | #19
On 2019-10-31 10:50, Steve Grubb wrote:
> Hello,
> 
> TLDR;  I see a lot of benefit to switching away from procfs for setting auid & 
> sessionid.
> 
> On Wednesday, October 30, 2019 6:03:20 PM EDT Richard Guy Briggs wrote:
> > > Also, for the record, removing the audit loginuid from procfs is not
> > > something to take lightly, if at all; like it or not, it's part of the
> > > kernel API.
> 
> It can also be used by tools to iterate processes related to one user or 
> session. I use this in my Intrusion Prevention System which will land in 
> audit user space at some point in the future.
> 
> > Oh, I'm quite aware of how important this change is and it was discussed
> > with Steve Grubb who saw the concern and value of considering such a
> > disruptive change.
> 
> Actually, I advocated for syscall. I think the gist of Eric's idea was that /
> proc is the intersection of many nasty problems. By relying on it, you can't 
> simplify the API to reduce the complexity. Almost no program actually needs 
> access to /proc. ps does. But almost everything else is happy without it. For 
> example, when you setup chroot jails, you may have to add /dev/random or /
> dev/null, but almost never /proc. What does force you to add /proc is any 
> entry point daemon like sshd because it needs to set the loginuid. If we 
> switch away from /proc, then sshd or crond will no longer /require/ procfs to 
> be available which again simplifies the system design.
> 
> > Removing proc support for auid/ses would be a
> > long-term deprecation if accepted.
> 
> It might need to just be turned into readonly for a while. But then again, 
> perhaps auid and session should be part of /proc/<pid>/status? Maybe this can 
> be done independently and ahead of the container work so there is a migration 
> path for things that read auid or session. TBH, maybe this should have been 
> done from the beginning.

How about making loginuid/contid/capcontid writable only via netlink but
still provide the /proc interface for reading?  Deprecation of proc can
be left as a decision for later.  This way sshd/crond/getty don't need
/proc, but the info is still there for tools that want to read it.

> -Steve

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Steve Grubb Nov. 1, 2019, 3:13 p.m. UTC | #20
On Friday, November 1, 2019 11:09:27 AM EDT Richard Guy Briggs wrote:
> On 2019-10-31 10:50, Steve Grubb wrote:
> > Hello,
> > 
> > TLDR;  I see a lot of benefit to switching away from procfs for setting
> > auid & sessionid.
> > 
> > On Wednesday, October 30, 2019 6:03:20 PM EDT Richard Guy Briggs wrote:
> > > > Also, for the record, removing the audit loginuid from procfs is not
> > > > something to take lightly, if at all; like it or not, it's part of
> > > > the
> > > > kernel API.
> > 
> > It can also be used by tools to iterate processes related to one user or
> > session. I use this in my Intrusion Prevention System which will land in
> > audit user space at some point in the future.
> > 
> > > Oh, I'm quite aware of how important this change is and it was
> > > discussed
> > > with Steve Grubb who saw the concern and value of considering such a
> > > disruptive change.
> > 
> > Actually, I advocated for syscall. I think the gist of Eric's idea was
> > that / proc is the intersection of many nasty problems. By relying on
> > it, you can't simplify the API to reduce the complexity. Almost no
> > program actually needs access to /proc. ps does. But almost everything
> > else is happy without it. For example, when you setup chroot jails, you
> > may have to add /dev/random or / dev/null, but almost never /proc. What
> > does force you to add /proc is any entry point daemon like sshd because
> > it needs to set the loginuid. If we switch away from /proc, then sshd or
> > crond will no longer /require/ procfs to be available which again
> > simplifies the system design.
> > 
> > > Removing proc support for auid/ses would be a
> > > long-term deprecation if accepted.
> > 
> > It might need to just be turned into readonly for a while. But then
> > again,
> > perhaps auid and session should be part of /proc/<pid>/status? Maybe this
> > can be done independently and ahead of the container work so there is a
> > migration path for things that read auid or session. TBH, maybe this
> > should have been done from the beginning.
> 
> How about making loginuid/contid/capcontid writable only via netlink but
> still provide the /proc interface for reading?  Deprecation of proc can
> be left as a decision for later.  This way sshd/crond/getty don't need
> /proc, but the info is still there for tools that want to read it.

This also sounds good to me. But I still think loginuid and audit sessionid 
should get written in /proc/<pid>/status so that all process information is 
consolidated in one place.

-Steve
Richard Guy Briggs Nov. 1, 2019, 3:21 p.m. UTC | #21
On 2019-11-01 11:13, Steve Grubb wrote:
> On Friday, November 1, 2019 11:09:27 AM EDT Richard Guy Briggs wrote:
> > On 2019-10-31 10:50, Steve Grubb wrote:
> > > Hello,
> > > 
> > > TLDR;  I see a lot of benefit to switching away from procfs for setting
> > > auid & sessionid.
> > > 
> > > On Wednesday, October 30, 2019 6:03:20 PM EDT Richard Guy Briggs wrote:
> > > > > Also, for the record, removing the audit loginuid from procfs is not
> > > > > something to take lightly, if at all; like it or not, it's part of
> > > > > the
> > > > > kernel API.
> > > 
> > > It can also be used by tools to iterate processes related to one user or
> > > session. I use this in my Intrusion Prevention System which will land in
> > > audit user space at some point in the future.
> > > 
> > > > Oh, I'm quite aware of how important this change is and it was
> > > > discussed
> > > > with Steve Grubb who saw the concern and value of considering such a
> > > > disruptive change.
> > > 
> > > Actually, I advocated for syscall. I think the gist of Eric's idea was
> > > that / proc is the intersection of many nasty problems. By relying on
> > > it, you can't simplify the API to reduce the complexity. Almost no
> > > program actually needs access to /proc. ps does. But almost everything
> > > else is happy without it. For example, when you setup chroot jails, you
> > > may have to add /dev/random or / dev/null, but almost never /proc. What
> > > does force you to add /proc is any entry point daemon like sshd because
> > > it needs to set the loginuid. If we switch away from /proc, then sshd or
> > > crond will no longer /require/ procfs to be available which again
> > > simplifies the system design.
> > > 
> > > > Removing proc support for auid/ses would be a
> > > > long-term deprecation if accepted.
> > > 
> > > It might need to just be turned into readonly for a while. But then
> > > again,
> > > perhaps auid and session should be part of /proc/<pid>/status? Maybe this
> > > can be done independently and ahead of the container work so there is a
> > > migration path for things that read auid or session. TBH, maybe this
> > > should have been done from the beginning.
> > 
> > How about making loginuid/contid/capcontid writable only via netlink but
> > still provide the /proc interface for reading?  Deprecation of proc can
> > be left as a decision for later.  This way sshd/crond/getty don't need
> > /proc, but the info is still there for tools that want to read it.
> 
> This also sounds good to me. But I still think loginuid and audit sessionid 
> should get written in /proc/<pid>/status so that all process information is 
> consolidated in one place.

I don't have a problem adding auid/sessionid to /proc/<pid>/status with
other related information, but it is disruptive to deprecate the
existing interface which could be a seperate step.

> -Steve

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Paul Moore Nov. 1, 2019, 4:22 p.m. UTC | #22
On Fri, Nov 1, 2019 at 11:10 AM Richard Guy Briggs <rgb@redhat.com> wrote:
> On 2019-10-31 10:50, Steve Grubb wrote:
> > Hello,
> >
> > TLDR;  I see a lot of benefit to switching away from procfs for setting auid &
> > sessionid.
> >
> > On Wednesday, October 30, 2019 6:03:20 PM EDT Richard Guy Briggs wrote:
> > > > Also, for the record, removing the audit loginuid from procfs is not
> > > > something to take lightly, if at all; like it or not, it's part of the
> > > > kernel API.
> >
> > It can also be used by tools to iterate processes related to one user or
> > session. I use this in my Intrusion Prevention System which will land in
> > audit user space at some point in the future.
> >
> > > Oh, I'm quite aware of how important this change is and it was discussed
> > > with Steve Grubb who saw the concern and value of considering such a
> > > disruptive change.
> >
> > Actually, I advocated for syscall. I think the gist of Eric's idea was that /
> > proc is the intersection of many nasty problems. By relying on it, you can't
> > simplify the API to reduce the complexity. Almost no program actually needs
> > access to /proc. ps does. But almost everything else is happy without it. For
> > example, when you setup chroot jails, you may have to add /dev/random or /
> > dev/null, but almost never /proc. What does force you to add /proc is any
> > entry point daemon like sshd because it needs to set the loginuid. If we
> > switch away from /proc, then sshd or crond will no longer /require/ procfs to
> > be available which again simplifies the system design.
> >
> > > Removing proc support for auid/ses would be a
> > > long-term deprecation if accepted.
> >
> > It might need to just be turned into readonly for a while. But then again,
> > perhaps auid and session should be part of /proc/<pid>/status? Maybe this can
> > be done independently and ahead of the container work so there is a migration
> > path for things that read auid or session. TBH, maybe this should have been
> > done from the beginning.
>
> How about making loginuid/contid/capcontid writable only via netlink but
> still provide the /proc interface for reading?  Deprecation of proc can
> be left as a decision for later.  This way sshd/crond/getty don't need
> /proc, but the info is still there for tools that want to read it.

Just so there is no confusion for the next patchset: I think it would
be a mistake to include any changes to loginuid in your next patchset,
even as a "RFC" at the end.  Also, barring some shocking comments from
Eric relating to the imminent death of /proc in containers, I think it
would also be a mistake to include the netlink API.

Let's keep it small and focused :)
diff mbox series

Patch

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 1ce27af686ea..dcc53e62e266 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -117,6 +117,7 @@  struct audit_task_info {
 	kuid_t			loginuid;
 	unsigned int		sessionid;
 	struct audit_cont	*cont;
+	u32			capcontid;
 #ifdef CONFIG_AUDITSYSCALL
 	struct audit_context	*ctx;
 #endif
@@ -224,6 +225,14 @@  static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
 	return tsk->audit->sessionid;
 }
 
+static inline u32 audit_get_capcontid(struct task_struct *tsk)
+{
+	if (!tsk->audit)
+		return 0;
+	return tsk->audit->capcontid;
+}
+
+extern int audit_set_capcontid(struct task_struct *tsk, u32 enable);
 extern int audit_set_contid(struct task_struct *tsk, u64 contid);
 
 static inline u64 audit_get_contid(struct task_struct *tsk)
@@ -309,6 +318,11 @@  static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
 	return AUDIT_SID_UNSET;
 }
 
+static inline u32 audit_get_capcontid(struct task_struct *tsk)
+{
+	return 0;
+}
+
 static inline u64 audit_get_contid(struct task_struct *tsk)
 {
 	return AUDIT_CID_UNSET;
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index eef42c8eea77..011b0a8ee9b2 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -78,6 +78,8 @@ 
 #define AUDIT_GET_LOGINUID	1024	/* Get loginuid of a task */
 #define AUDIT_SET_LOGINUID	1025	/* Set loginuid of a task */
 #define AUDIT_GET_SESSIONID	1026	/* Set sessionid of a task */
+#define AUDIT_GET_CAPCONTID	1027	/* Get cap_contid of a task */
+#define AUDIT_SET_CAPCONTID	1028	/* Set cap_contid of a task */
 
 #define AUDIT_FIRST_USER_MSG	1100	/* Userspace messages mostly uninteresting to kernel */
 #define AUDIT_USER_AVC		1107	/* We filter this differently */
diff --git a/kernel/audit.c b/kernel/audit.c
index a70c9184e5d9..7160da464849 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1192,6 +1192,14 @@  static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
 	case AUDIT_GET_SESSIONID:
 		return 0;
 		break;
+	case AUDIT_GET_CAPCONTID:
+	case AUDIT_SET_CAPCONTID:
+	case AUDIT_GET_CONTID:
+	case AUDIT_SET_CONTID:
+		if (!netlink_capable(skb, CAP_AUDIT_CONTROL) && !audit_get_capcontid(current))
+			return -EPERM;
+		return 0;
+		break;
 	default:  /* do more checks below */
 		break;
 	}
@@ -1227,8 +1235,6 @@  static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
 	case AUDIT_TTY_SET:
 	case AUDIT_TRIM:
 	case AUDIT_MAKE_EQUIV:
-	case AUDIT_GET_CONTID:
-	case AUDIT_SET_CONTID:
 	case AUDIT_SET_LOGINUID:
 		/* Only support auditd and auditctl in initial pid namespace
 		 * for now. */
@@ -1304,6 +1310,23 @@  static int audit_get_contid_status(struct sk_buff *skb)
 	return 0;
 }
 
+static int audit_get_capcontid_status(struct sk_buff *skb)
+{
+	struct nlmsghdr *nlh = nlmsg_hdr(skb);
+	u32 seq = nlh->nlmsg_seq;
+	void *data = nlmsg_data(nlh);
+	struct audit_capcontid_status cs;
+
+	cs.pid = ((struct audit_capcontid_status *)data)->pid;
+	if (!cs.pid)
+		cs.pid = task_tgid_nr(current);
+	rcu_read_lock();
+	cs.enable = audit_get_capcontid(find_task_by_vpid(cs.pid));
+	rcu_read_unlock();
+	audit_send_reply(skb, seq, AUDIT_GET_CAPCONTID, 0, 0, &cs, sizeof(cs));
+	return 0;
+}
+
 struct audit_loginuid_status { uid_t loginuid; };
 
 static int audit_get_loginuid_status(struct sk_buff *skb)
@@ -1779,6 +1802,27 @@  static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (err)
 			return err;
 		break;
+	case AUDIT_SET_CAPCONTID: {
+		struct audit_capcontid_status *s = data;
+		struct task_struct *tsk;
+
+		/* check if new data is valid */
+		if (nlmsg_len(nlh) < sizeof(*s))
+			return -EINVAL;
+		tsk = find_get_task_by_vpid(s->pid);
+		if (!tsk)
+			return -EINVAL;
+
+		err = audit_set_capcontid(tsk, s->enable);
+		put_task_struct(tsk);
+		return err;
+		break;
+	}
+	case AUDIT_GET_CAPCONTID:
+		err = audit_get_capcontid_status(skb);
+		if (err)
+			return err;
+		break;
 	case AUDIT_SET_LOGINUID: {
 		uid_t *loginuid = data;
 		kuid_t kloginuid;
@@ -2711,6 +2755,56 @@  static struct task_struct *audit_cont_owner(struct task_struct *tsk)
 	return NULL;
 }
 
+int audit_set_capcontid(struct task_struct *task, u32 enable)
+{
+	u32 oldcapcontid;
+	int rc = 0;
+	struct audit_buffer *ab;
+	uid_t uid;
+	struct tty_struct *tty;
+	char comm[sizeof(current->comm)];
+
+	if (!task->audit)
+		return -ENOPROTOOPT;
+	oldcapcontid = audit_get_capcontid(task);
+	/* if task is not descendant, block */
+	if (task == current)
+		rc = -EBADSLT;
+	else if (!task_is_descendant(current, task))
+		rc = -EXDEV;
+	else if (current_user_ns() == &init_user_ns) {
+		if (!capable(CAP_AUDIT_CONTROL) && !audit_get_capcontid(current))
+			rc = -EPERM;
+	}
+	if (!rc)
+		task->audit->capcontid = enable;
+
+	if (!audit_enabled)
+		return rc;
+
+	ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_SET_CAPCONTID);
+	if (!ab)
+		return rc;
+
+	uid = from_kuid(&init_user_ns, task_uid(current));
+	tty = audit_get_tty();
+	audit_log_format(ab,
+			 "opid=%d capcontid=%u old-capcontid=%u pid=%d uid=%u auid=%u tty=%s ses=%u",
+			 task_tgid_nr(task), enable, oldcapcontid,
+			 task_tgid_nr(current), uid,
+			 from_kuid(&init_user_ns, audit_get_loginuid(current)),
+			 tty ? tty_name(tty) : "(none)",
+			 audit_get_sessionid(current));
+	audit_put_tty(tty);
+	audit_log_task_context(ab);
+	audit_log_format(ab, " comm=");
+	audit_log_untrustedstring(ab, get_task_comm(comm, current));
+	audit_log_d_path_exe(ab, current->mm);
+	audit_log_format(ab, " res=%d", !rc);
+	audit_log_end(ab);
+	return rc;
+}
+
 /*
  * audit_set_contid - set current task's audit contid
  * @task: target task
diff --git a/kernel/audit.h b/kernel/audit.h
index cb25341c1a0f..ac4694e88485 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -231,6 +231,11 @@  struct audit_contid_status {
 	u64	id;
 };
 
+struct audit_capcontid_status {
+	pid_t	pid;
+	u32	enable;
+};
+
 #define AUDIT_CONTID_DEPTH	5
 
 /* Indicates that audit should log the full pathname. */