diff mbox series

[ghak90,V7,04/21] audit: convert to contid list to check for orch/engine ownership

Message ID 6fb4e270bfafef3d0477a06b0365fdcc5a5305b5.1568834524.git.rgb@redhat.com (mailing list archive)
State New, archived
Headers show
Series audit: implement container identifier | expand

Commit Message

Richard Guy Briggs Sept. 19, 2019, 1:22 a.m. UTC
Store the audit container identifier in a refcounted kernel object that
is added to the master list of audit container identifiers.  This will
allow multiple container orchestrators/engines to work on the same
machine without danger of inadvertantly re-using an existing identifier.
It will also allow an orchestrator to inject a process into an existing
container by checking if the original container owner is the one
injecting the task.  A hash table list is used to optimize searches.

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
---
 include/linux/audit.h | 26 ++++++++++++++--
 kernel/audit.c        | 86 ++++++++++++++++++++++++++++++++++++++++++++++++---
 kernel/audit.h        |  8 +++++
 3 files changed, 112 insertions(+), 8 deletions(-)

Comments

Neil Horman Sept. 26, 2019, 2:46 p.m. UTC | #1
On Wed, Sep 18, 2019 at 09:22:21PM -0400, Richard Guy Briggs wrote:
> Store the audit container identifier in a refcounted kernel object that
> is added to the master list of audit container identifiers.  This will
> allow multiple container orchestrators/engines to work on the same
> machine without danger of inadvertantly re-using an existing identifier.
> It will also allow an orchestrator to inject a process into an existing
> container by checking if the original container owner is the one
> injecting the task.  A hash table list is used to optimize searches.
> 
> Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> ---
>  include/linux/audit.h | 26 ++++++++++++++--
>  kernel/audit.c        | 86 ++++++++++++++++++++++++++++++++++++++++++++++++---
>  kernel/audit.h        |  8 +++++
>  3 files changed, 112 insertions(+), 8 deletions(-)
> 
> diff --git a/include/linux/audit.h b/include/linux/audit.h
> index f2e3b81f2942..e317807cdd3e 100644
> --- a/include/linux/audit.h
> +++ b/include/linux/audit.h
> @@ -95,10 +95,18 @@ struct audit_ntp_data {
>  struct audit_ntp_data {};
>  #endif
>  
> +struct audit_cont {
> +	struct list_head	list;
> +	u64			id;
> +	struct task_struct	*owner;
> +	refcount_t              refcount;
> +	struct rcu_head         rcu;
> +};
> +
>  struct audit_task_info {
>  	kuid_t			loginuid;
>  	unsigned int		sessionid;
> -	u64			contid;
> +	struct audit_cont	*cont;
>  #ifdef CONFIG_AUDITSYSCALL
>  	struct audit_context	*ctx;
>  #endif
> @@ -203,11 +211,15 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
>  
>  static inline u64 audit_get_contid(struct task_struct *tsk)
>  {
> -	if (!tsk->audit)
> +	if (!tsk->audit || !tsk->audit->cont)
>  		return AUDIT_CID_UNSET;
> -	return tsk->audit->contid;
> +	return tsk->audit->cont->id;
>  }
>  
> +extern struct audit_cont *audit_cont(struct task_struct *tsk);
> +
> +extern void audit_cont_put(struct audit_cont *cont);
> +
I see that you manual increment this refcount at various call sites, why
no corresponding audit_contid_hold function?

Neil

>  extern u32 audit_enabled;
>  
>  extern int audit_signal_info(int sig, struct task_struct *t);
> @@ -277,6 +289,14 @@ static inline u64 audit_get_contid(struct task_struct *tsk)
>  	return AUDIT_CID_UNSET;
>  }
>  
> +static inline struct audit_cont *audit_cont(struct task_struct *tsk)
> +{
> +	return NULL;
> +}
> +
> +static inline void audit_cont_put(struct audit_cont *cont)
> +{ }
> +
>  #define audit_enabled AUDIT_OFF
>  
>  static inline int audit_signal_info(int sig, struct task_struct *t)
> diff --git a/kernel/audit.c b/kernel/audit.c
> index a36ea57cbb61..ea0899130cc1 100644
> --- a/kernel/audit.c
> +++ b/kernel/audit.c
> @@ -137,6 +137,8 @@ struct audit_net {
>  
>  /* Hash for inode-based rules */
>  struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
> +/* Hash for contid-based rules */
> +struct list_head audit_contid_hash[AUDIT_CONTID_BUCKETS];
>  
>  static struct kmem_cache *audit_buffer_cache;
>  
> @@ -204,6 +206,8 @@ struct audit_reply {
>  
>  static struct kmem_cache *audit_task_cache;
>  
> +static DEFINE_SPINLOCK(audit_contid_list_lock);
> +
>  void __init audit_task_init(void)
>  {
>  	audit_task_cache = kmem_cache_create("audit_task",
> @@ -231,7 +235,9 @@ int audit_alloc(struct task_struct *tsk)
>  	}
>  	info->loginuid = audit_get_loginuid(current);
>  	info->sessionid = audit_get_sessionid(current);
> -	info->contid = audit_get_contid(current);
> +	info->cont = audit_cont(current);
> +	if (info->cont)
> +		refcount_inc(&info->cont->refcount);
>  	tsk->audit = info;
>  
>  	ret = audit_alloc_syscall(tsk);
> @@ -246,7 +252,7 @@ int audit_alloc(struct task_struct *tsk)
>  struct audit_task_info init_struct_audit = {
>  	.loginuid = INVALID_UID,
>  	.sessionid = AUDIT_SID_UNSET,
> -	.contid = AUDIT_CID_UNSET,
> +	.cont = NULL,
>  #ifdef CONFIG_AUDITSYSCALL
>  	.ctx = NULL,
>  #endif
> @@ -266,6 +272,9 @@ void audit_free(struct task_struct *tsk)
>  	/* Freeing the audit_task_info struct must be performed after
>  	 * audit_log_exit() due to need for loginuid and sessionid.
>  	 */
> +	spin_lock(&audit_contid_list_lock); 
> +	audit_cont_put(tsk->audit->cont);
> +	spin_unlock(&audit_contid_list_lock); 
>  	info = tsk->audit;
>  	tsk->audit = NULL;
>  	kmem_cache_free(audit_task_cache, info);
> @@ -1657,6 +1666,9 @@ static int __init audit_init(void)
>  	for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
>  		INIT_LIST_HEAD(&audit_inode_hash[i]);
>  
> +	for (i = 0; i < AUDIT_CONTID_BUCKETS; i++)
> +		INIT_LIST_HEAD(&audit_contid_hash[i]);
> +
>  	mutex_init(&audit_cmd_mutex.lock);
>  	audit_cmd_mutex.owner = NULL;
>  
> @@ -2356,6 +2368,32 @@ int audit_signal_info(int sig, struct task_struct *t)
>  	return audit_signal_info_syscall(t);
>  }
>  
> +struct audit_cont *audit_cont(struct task_struct *tsk)
> +{
> +	if (!tsk->audit || !tsk->audit->cont)
> +		return NULL;
> +	return tsk->audit->cont;
> +}
> +
> +/* audit_contid_list_lock must be held by caller */
> +void audit_cont_put(struct audit_cont *cont)
> +{
> +	if (!cont)
> +		return;
> +	if (refcount_dec_and_test(&cont->refcount)) {
> +		put_task_struct(cont->owner);
> +		list_del_rcu(&cont->list);
> +		kfree_rcu(cont, rcu);
> +	}
> +}
> +
> +static struct task_struct *audit_cont_owner(struct task_struct *tsk)
> +{
> +	if (tsk->audit && tsk->audit->cont)
> +		return tsk->audit->cont->owner;
> +	return NULL;
> +}
> +
>  /*
>   * audit_set_contid - set current task's audit contid
>   * @task: target task
> @@ -2382,9 +2420,12 @@ int audit_set_contid(struct task_struct *task, u64 contid)
>  	}
>  	oldcontid = audit_get_contid(task);
>  	read_lock(&tasklist_lock);
> -	/* Don't allow the audit containerid to be unset */
> +	/* Don't allow the contid to be unset */
>  	if (!audit_contid_valid(contid))
>  		rc = -EINVAL;
> +	/* Don't allow the contid to be set to the same value again */
> +	else if (contid == oldcontid) {
> +		rc = -EADDRINUSE;
>  	/* if we don't have caps, reject */
>  	else if (!capable(CAP_AUDIT_CONTROL))
>  		rc = -EPERM;
> @@ -2397,8 +2438,43 @@ int audit_set_contid(struct task_struct *task, u64 contid)
>  	else if (audit_contid_set(task))
>  		rc = -ECHILD;
>  	read_unlock(&tasklist_lock);
> -	if (!rc)
> -		task->audit->contid = contid;
> +	if (!rc) {
> +		struct audit_cont *oldcont = audit_cont(task);
> +		struct audit_cont *cont = NULL;
> +		struct audit_cont *newcont = NULL;
> +		int h = audit_hash_contid(contid);
> +
> +		spin_lock(&audit_contid_list_lock);
> +		list_for_each_entry_rcu(cont, &audit_contid_hash[h], list)
> +			if (cont->id == contid) {
> +				/* task injection to existing container */
> +				if (current == cont->owner) {
> +					refcount_inc(&cont->refcount);
> +					newcont = cont;
> +				} else {
> +					rc = -ENOTUNIQ;
> +					goto conterror;
> +				}
> +			}
> +		if (!newcont) {
> +			newcont = kmalloc(sizeof(struct audit_cont), GFP_ATOMIC);
> +			if (newcont) {
> +				INIT_LIST_HEAD(&newcont->list);
> +				newcont->id = contid;
> +				get_task_struct(current);
> +				newcont->owner = current;
> +				refcount_set(&newcont->refcount, 1);
> +				list_add_rcu(&newcont->list, &audit_contid_hash[h]);
> +			} else {
> +				rc = -ENOMEM;
> +				goto conterror;
> +			}
> +		}
> +		task->audit->cont = newcont;
> +		audit_cont_put(oldcont);
> +conterror:
> +		spin_unlock(&audit_contid_list_lock);
> +	}
>  	task_unlock(task);
>  
>  	if (!audit_enabled)
> diff --git a/kernel/audit.h b/kernel/audit.h
> index 16bd03b88e0d..e4a31aa92dfe 100644
> --- a/kernel/audit.h
> +++ b/kernel/audit.h
> @@ -211,6 +211,14 @@ static inline int audit_hash_ino(u32 ino)
>  	return (ino & (AUDIT_INODE_BUCKETS-1));
>  }
>  
> +#define AUDIT_CONTID_BUCKETS	32
> +extern struct list_head audit_contid_hash[AUDIT_CONTID_BUCKETS];
> +
> +static inline int audit_hash_contid(u64 contid)
> +{
> +	return (contid & (AUDIT_CONTID_BUCKETS-1));
> +}
> +
>  /* Indicates that audit should log the full pathname. */
>  #define AUDIT_NAME_FULL -1
>  
> -- 
> 1.8.3.1
> 
>
Paul Moore Oct. 11, 2019, 12:38 a.m. UTC | #2
On Wed, Sep 18, 2019 at 9:24 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> Store the audit container identifier in a refcounted kernel object that
> is added to the master list of audit container identifiers.  This will
> allow multiple container orchestrators/engines to work on the same
> machine without danger of inadvertantly re-using an existing identifier.
> It will also allow an orchestrator to inject a process into an existing
> container by checking if the original container owner is the one
> injecting the task.  A hash table list is used to optimize searches.
>
> Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> ---
>  include/linux/audit.h | 26 ++++++++++++++--
>  kernel/audit.c        | 86 ++++++++++++++++++++++++++++++++++++++++++++++++---
>  kernel/audit.h        |  8 +++++
>  3 files changed, 112 insertions(+), 8 deletions(-)

One general comment before we go off into the weeds on this ... I can
understand why you wanted to keep this patch separate from the earlier
patches, but as we get closer to having mergeable code this should get
folded into the previous patches.  For example, there shouldn't be a
change in audit_task_info where you change the contid field from a u64
to struct pointer, it should be a struct pointer from the start.

It's also disappointing that idr appears to only be for 32-bit ID
values, if we had a 64-bit idr I think we could simplify this greatly.

> diff --git a/include/linux/audit.h b/include/linux/audit.h
> index f2e3b81f2942..e317807cdd3e 100644
> --- a/include/linux/audit.h
> +++ b/include/linux/audit.h
> @@ -95,10 +95,18 @@ struct audit_ntp_data {
>  struct audit_ntp_data {};
>  #endif
>
> +struct audit_cont {
> +       struct list_head        list;
> +       u64                     id;
> +       struct task_struct      *owner;
> +       refcount_t              refcount;
> +       struct rcu_head         rcu;
> +};

It seems as though in most of the code you are using "contid", any
reason why didn't stick with that naming scheme here, e.g. "struct
audit_contid"?

>  struct audit_task_info {
>         kuid_t                  loginuid;
>         unsigned int            sessionid;
> -       u64                     contid;
> +       struct audit_cont       *cont;

Same, why not stick with "contid"?

>  #ifdef CONFIG_AUDITSYSCALL
>         struct audit_context    *ctx;
>  #endif
> @@ -203,11 +211,15 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
>
>  static inline u64 audit_get_contid(struct task_struct *tsk)
>  {
> -       if (!tsk->audit)
> +       if (!tsk->audit || !tsk->audit->cont)
>                 return AUDIT_CID_UNSET;
> -       return tsk->audit->contid;
> +       return tsk->audit->cont->id;
>  }

Assuming for a moment that we implement an audit_contid_get() (see
Neil's comment as well as mine below), we probably need to name this
something different so we don't all lose our minds when we read this
code.  On the plus side we can probably preface it with an underscore
since it is a static, in which case _audit_contid_get() might be okay,
but I'm open to suggestions.

> +extern struct audit_cont *audit_cont(struct task_struct *tsk);
> +
> +extern void audit_cont_put(struct audit_cont *cont);

More of the "contid" vs "cont".

>  extern u32 audit_enabled;
>
>  extern int audit_signal_info(int sig, struct task_struct *t);
> @@ -277,6 +289,14 @@ static inline u64 audit_get_contid(struct task_struct *tsk)
>         return AUDIT_CID_UNSET;
>  }
>
> +static inline struct audit_cont *audit_cont(struct task_struct *tsk)
> +{
> +       return NULL;
> +}
> +
> +static inline void audit_cont_put(struct audit_cont *cont)
> +{ }
> +
>  #define audit_enabled AUDIT_OFF
>
>  static inline int audit_signal_info(int sig, struct task_struct *t)
> diff --git a/kernel/audit.c b/kernel/audit.c
> index a36ea57cbb61..ea0899130cc1 100644
> --- a/kernel/audit.c
> +++ b/kernel/audit.c
> @@ -137,6 +137,8 @@ struct audit_net {
>
>  /* Hash for inode-based rules */
>  struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
> +/* Hash for contid-based rules */
> +struct list_head audit_contid_hash[AUDIT_CONTID_BUCKETS];
>
>  static struct kmem_cache *audit_buffer_cache;
>
> @@ -204,6 +206,8 @@ struct audit_reply {
>
>  static struct kmem_cache *audit_task_cache;
>
> +static DEFINE_SPINLOCK(audit_contid_list_lock);

Since it looks like this protectects audit_contid_hash, I think it
would be better to move it up underneath audit_contid_hash.

>  void __init audit_task_init(void)
>  {
>         audit_task_cache = kmem_cache_create("audit_task",
> @@ -231,7 +235,9 @@ int audit_alloc(struct task_struct *tsk)
>         }
>         info->loginuid = audit_get_loginuid(current);
>         info->sessionid = audit_get_sessionid(current);
> -       info->contid = audit_get_contid(current);
> +       info->cont = audit_cont(current);
> +       if (info->cont)
> +               refcount_inc(&info->cont->refcount);

See the other comments about a "get" function, but I think we need a
RCU read lock around the above, no?

>         tsk->audit = info;
>
>         ret = audit_alloc_syscall(tsk);
> @@ -246,7 +252,7 @@ int audit_alloc(struct task_struct *tsk)
>  struct audit_task_info init_struct_audit = {
>         .loginuid = INVALID_UID,
>         .sessionid = AUDIT_SID_UNSET,
> -       .contid = AUDIT_CID_UNSET,
> +       .cont = NULL,

More "cont" vs "contid".

>  #ifdef CONFIG_AUDITSYSCALL
>         .ctx = NULL,
>  #endif
> @@ -266,6 +272,9 @@ void audit_free(struct task_struct *tsk)
>         /* Freeing the audit_task_info struct must be performed after
>          * audit_log_exit() due to need for loginuid and sessionid.
>          */
> +       spin_lock(&audit_contid_list_lock);
> +       audit_cont_put(tsk->audit->cont);
> +       spin_unlock(&audit_contid_list_lock);

Perhaps this will make sense as I get further into the patchset, but
why not move the spin lock operations into audit_[cont/contid]_put()?

>         info = tsk->audit;
>         tsk->audit = NULL;
>         kmem_cache_free(audit_task_cache, info);
> @@ -1657,6 +1666,9 @@ static int __init audit_init(void)
>         for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
>                 INIT_LIST_HEAD(&audit_inode_hash[i]);
>
> +       for (i = 0; i < AUDIT_CONTID_BUCKETS; i++)
> +               INIT_LIST_HEAD(&audit_contid_hash[i]);
> +
>         mutex_init(&audit_cmd_mutex.lock);
>         audit_cmd_mutex.owner = NULL;
>
> @@ -2356,6 +2368,32 @@ int audit_signal_info(int sig, struct task_struct *t)
>         return audit_signal_info_syscall(t);
>  }
>
> +struct audit_cont *audit_cont(struct task_struct *tsk)
> +{
> +       if (!tsk->audit || !tsk->audit->cont)
> +               return NULL;
> +       return tsk->audit->cont;
> +}
> +
> +/* audit_contid_list_lock must be held by caller */
> +void audit_cont_put(struct audit_cont *cont)
> +{
> +       if (!cont)
> +               return;
> +       if (refcount_dec_and_test(&cont->refcount)) {
> +               put_task_struct(cont->owner);
> +               list_del_rcu(&cont->list);
> +               kfree_rcu(cont, rcu);
> +       }
> +}

I tend to agree with Neil's previous comment; if we've got a
audit_[cont/contid]_put(), why not an audit_[cont/contid]_get()?

> +static struct task_struct *audit_cont_owner(struct task_struct *tsk)
> +{
> +       if (tsk->audit && tsk->audit->cont)
> +               return tsk->audit->cont->owner;
> +       return NULL;
> +}

I'm not sure if this is possible (I haven't make my way through the
entire patchset) and the function above isn't used in this patch (why
is it here?), but it seems like it would be safer to convert this into
an audit_contid_isowner() function that simply returns 1/0 depending
on if the passed task_struct is the owner or not of a passed audit
container ID value?

>  /*
>   * audit_set_contid - set current task's audit contid
>   * @task: target task
> @@ -2382,9 +2420,12 @@ int audit_set_contid(struct task_struct *task, u64 contid)
>         }
>         oldcontid = audit_get_contid(task);
>         read_lock(&tasklist_lock);
> -       /* Don't allow the audit containerid to be unset */
> +       /* Don't allow the contid to be unset */
>         if (!audit_contid_valid(contid))
>                 rc = -EINVAL;
> +       /* Don't allow the contid to be set to the same value again */
> +       else if (contid == oldcontid) {
> +               rc = -EADDRINUSE;
>         /* if we don't have caps, reject */
>         else if (!capable(CAP_AUDIT_CONTROL))
>                 rc = -EPERM;

RCU read lock?  It's a bit dicey since I believe the tasklist_lock is
going to provide us the safety we need, but if we are going to claim
that the audit container ID list is protected by RCU we should
probably use it.

> @@ -2397,8 +2438,43 @@ int audit_set_contid(struct task_struct *task, u64 contid)
>         else if (audit_contid_set(task))
>                 rc = -ECHILD;
>         read_unlock(&tasklist_lock);
> -       if (!rc)
> -               task->audit->contid = contid;
> +       if (!rc) {
> +               struct audit_cont *oldcont = audit_cont(task);

Previously we held the tasklist_lock to protect the audit container ID
associated with the struct, should we still be holding it here?

Regardless, I worry that the lock dependencies between the
tasklist_lock and the audit_contid_list_lock are going to be tricky.
It might be nice to document the relationship in a comment up near
where you declare audit_contid_list_lock.

> +               struct audit_cont *cont = NULL;
> +               struct audit_cont *newcont = NULL;
> +               int h = audit_hash_contid(contid);
> +
> +               spin_lock(&audit_contid_list_lock);
> +               list_for_each_entry_rcu(cont, &audit_contid_hash[h], list)
> +                       if (cont->id == contid) {
> +                               /* task injection to existing container */
> +                               if (current == cont->owner) {

I understand the desire to limit a given audit container ID to the
orchestrator that created it, but are we certain that we can track
audit container ID "ownership" via a single instance of a task_struct?
 What happens when the orchestrator stops/restarts/crashes?  Do we
even care?

> +                                       refcount_inc(&cont->refcount);
> +                                       newcont = cont;

We can bail out of the loop here, yes?

> +                               } else {
> +                                       rc = -ENOTUNIQ;
> +                                       goto conterror;
> +                               }
> +                       }
> +               if (!newcont) {
> +                       newcont = kmalloc(sizeof(struct audit_cont), GFP_ATOMIC);
> +                       if (newcont) {
> +                               INIT_LIST_HEAD(&newcont->list);
> +                               newcont->id = contid;
> +                               get_task_struct(current);
> +                               newcont->owner = current;
> +                               refcount_set(&newcont->refcount, 1);
> +                               list_add_rcu(&newcont->list, &audit_contid_hash[h]);
> +                       } else {
> +                               rc = -ENOMEM;
> +                               goto conterror;
> +                       }
> +               }
> +               task->audit->cont = newcont;
> +               audit_cont_put(oldcont);
> +conterror:
> +               spin_unlock(&audit_contid_list_lock);
> +       }
>         task_unlock(task);
>
>         if (!audit_enabled)
> diff --git a/kernel/audit.h b/kernel/audit.h
> index 16bd03b88e0d..e4a31aa92dfe 100644
> --- a/kernel/audit.h
> +++ b/kernel/audit.h
> @@ -211,6 +211,14 @@ static inline int audit_hash_ino(u32 ino)
>         return (ino & (AUDIT_INODE_BUCKETS-1));
>  }
>
> +#define AUDIT_CONTID_BUCKETS   32
> +extern struct list_head audit_contid_hash[AUDIT_CONTID_BUCKETS];
> +
> +static inline int audit_hash_contid(u64 contid)
> +{
> +       return (contid & (AUDIT_CONTID_BUCKETS-1));
> +}
> +
>  /* Indicates that audit should log the full pathname. */
>  #define AUDIT_NAME_FULL -1
>

--
paul moore
www.paul-moore.com
Richard Guy Briggs Oct. 25, 2019, 8 p.m. UTC | #3
On 2019-09-26 10:46, Neil Horman wrote:
> On Wed, Sep 18, 2019 at 09:22:21PM -0400, Richard Guy Briggs wrote:
> > Store the audit container identifier in a refcounted kernel object that
> > is added to the master list of audit container identifiers.  This will
> > allow multiple container orchestrators/engines to work on the same
> > machine without danger of inadvertantly re-using an existing identifier.
> > It will also allow an orchestrator to inject a process into an existing
> > container by checking if the original container owner is the one
> > injecting the task.  A hash table list is used to optimize searches.
> > 
> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > ---
> >  include/linux/audit.h | 26 ++++++++++++++--
> >  kernel/audit.c        | 86 ++++++++++++++++++++++++++++++++++++++++++++++++---
> >  kernel/audit.h        |  8 +++++
> >  3 files changed, 112 insertions(+), 8 deletions(-)
> > 
> > diff --git a/include/linux/audit.h b/include/linux/audit.h
> > index f2e3b81f2942..e317807cdd3e 100644
> > --- a/include/linux/audit.h
> > +++ b/include/linux/audit.h
> > @@ -95,10 +95,18 @@ struct audit_ntp_data {
> >  struct audit_ntp_data {};
> >  #endif
> >  
> > +struct audit_cont {
> > +	struct list_head	list;
> > +	u64			id;
> > +	struct task_struct	*owner;
> > +	refcount_t              refcount;
> > +	struct rcu_head         rcu;
> > +};
> > +
> >  struct audit_task_info {
> >  	kuid_t			loginuid;
> >  	unsigned int		sessionid;
> > -	u64			contid;
> > +	struct audit_cont	*cont;
> >  #ifdef CONFIG_AUDITSYSCALL
> >  	struct audit_context	*ctx;
> >  #endif
> > @@ -203,11 +211,15 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
> >  
> >  static inline u64 audit_get_contid(struct task_struct *tsk)
> >  {
> > -	if (!tsk->audit)
> > +	if (!tsk->audit || !tsk->audit->cont)
> >  		return AUDIT_CID_UNSET;
> > -	return tsk->audit->contid;
> > +	return tsk->audit->cont->id;
> >  }
> >  
> > +extern struct audit_cont *audit_cont(struct task_struct *tsk);
> > +
> > +extern void audit_cont_put(struct audit_cont *cont);
> > +
> I see that you manual increment this refcount at various call sites, why
> no corresponding audit_contid_hold function?

I was trying to avoid the get function due to having one site where I
needed the pointer for later but didn't need a refcount to it so that I
could release the refcount it if it was replaced by another cont object.
A hold function would just contain one line that would call the
refcount_inc().  If I did convert things over to a get function, it
would hide some of this extra conditional code in the main calling
function, but in one place I could just call put immediately to
neutralize that unneeded refcount.

Would you see any issue with that extra get/put refcount that would only
happen in the case of changing a contid in a nesting situation?

> Neil
> 
> >  extern u32 audit_enabled;
> >  
> >  extern int audit_signal_info(int sig, struct task_struct *t);
> > @@ -277,6 +289,14 @@ static inline u64 audit_get_contid(struct task_struct *tsk)
> >  	return AUDIT_CID_UNSET;
> >  }
> >  
> > +static inline struct audit_cont *audit_cont(struct task_struct *tsk)
> > +{
> > +	return NULL;
> > +}
> > +
> > +static inline void audit_cont_put(struct audit_cont *cont)
> > +{ }
> > +
> >  #define audit_enabled AUDIT_OFF
> >  
> >  static inline int audit_signal_info(int sig, struct task_struct *t)
> > diff --git a/kernel/audit.c b/kernel/audit.c
> > index a36ea57cbb61..ea0899130cc1 100644
> > --- a/kernel/audit.c
> > +++ b/kernel/audit.c
> > @@ -137,6 +137,8 @@ struct audit_net {
> >  
> >  /* Hash for inode-based rules */
> >  struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
> > +/* Hash for contid-based rules */
> > +struct list_head audit_contid_hash[AUDIT_CONTID_BUCKETS];
> >  
> >  static struct kmem_cache *audit_buffer_cache;
> >  
> > @@ -204,6 +206,8 @@ struct audit_reply {
> >  
> >  static struct kmem_cache *audit_task_cache;
> >  
> > +static DEFINE_SPINLOCK(audit_contid_list_lock);
> > +
> >  void __init audit_task_init(void)
> >  {
> >  	audit_task_cache = kmem_cache_create("audit_task",
> > @@ -231,7 +235,9 @@ int audit_alloc(struct task_struct *tsk)
> >  	}
> >  	info->loginuid = audit_get_loginuid(current);
> >  	info->sessionid = audit_get_sessionid(current);
> > -	info->contid = audit_get_contid(current);
> > +	info->cont = audit_cont(current);
> > +	if (info->cont)
> > +		refcount_inc(&info->cont->refcount);
> >  	tsk->audit = info;
> >  
> >  	ret = audit_alloc_syscall(tsk);
> > @@ -246,7 +252,7 @@ int audit_alloc(struct task_struct *tsk)
> >  struct audit_task_info init_struct_audit = {
> >  	.loginuid = INVALID_UID,
> >  	.sessionid = AUDIT_SID_UNSET,
> > -	.contid = AUDIT_CID_UNSET,
> > +	.cont = NULL,
> >  #ifdef CONFIG_AUDITSYSCALL
> >  	.ctx = NULL,
> >  #endif
> > @@ -266,6 +272,9 @@ void audit_free(struct task_struct *tsk)
> >  	/* Freeing the audit_task_info struct must be performed after
> >  	 * audit_log_exit() due to need for loginuid and sessionid.
> >  	 */
> > +	spin_lock(&audit_contid_list_lock); 
> > +	audit_cont_put(tsk->audit->cont);
> > +	spin_unlock(&audit_contid_list_lock); 
> >  	info = tsk->audit;
> >  	tsk->audit = NULL;
> >  	kmem_cache_free(audit_task_cache, info);
> > @@ -1657,6 +1666,9 @@ static int __init audit_init(void)
> >  	for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
> >  		INIT_LIST_HEAD(&audit_inode_hash[i]);
> >  
> > +	for (i = 0; i < AUDIT_CONTID_BUCKETS; i++)
> > +		INIT_LIST_HEAD(&audit_contid_hash[i]);
> > +
> >  	mutex_init(&audit_cmd_mutex.lock);
> >  	audit_cmd_mutex.owner = NULL;
> >  
> > @@ -2356,6 +2368,32 @@ int audit_signal_info(int sig, struct task_struct *t)
> >  	return audit_signal_info_syscall(t);
> >  }
> >  
> > +struct audit_cont *audit_cont(struct task_struct *tsk)
> > +{
> > +	if (!tsk->audit || !tsk->audit->cont)
> > +		return NULL;
> > +	return tsk->audit->cont;
> > +}
> > +
> > +/* audit_contid_list_lock must be held by caller */
> > +void audit_cont_put(struct audit_cont *cont)
> > +{
> > +	if (!cont)
> > +		return;
> > +	if (refcount_dec_and_test(&cont->refcount)) {
> > +		put_task_struct(cont->owner);
> > +		list_del_rcu(&cont->list);
> > +		kfree_rcu(cont, rcu);
> > +	}
> > +}
> > +
> > +static struct task_struct *audit_cont_owner(struct task_struct *tsk)
> > +{
> > +	if (tsk->audit && tsk->audit->cont)
> > +		return tsk->audit->cont->owner;
> > +	return NULL;
> > +}
> > +
> >  /*
> >   * audit_set_contid - set current task's audit contid
> >   * @task: target task
> > @@ -2382,9 +2420,12 @@ int audit_set_contid(struct task_struct *task, u64 contid)
> >  	}
> >  	oldcontid = audit_get_contid(task);
> >  	read_lock(&tasklist_lock);
> > -	/* Don't allow the audit containerid to be unset */
> > +	/* Don't allow the contid to be unset */
> >  	if (!audit_contid_valid(contid))
> >  		rc = -EINVAL;
> > +	/* Don't allow the contid to be set to the same value again */
> > +	else if (contid == oldcontid) {
> > +		rc = -EADDRINUSE;
> >  	/* if we don't have caps, reject */
> >  	else if (!capable(CAP_AUDIT_CONTROL))
> >  		rc = -EPERM;
> > @@ -2397,8 +2438,43 @@ int audit_set_contid(struct task_struct *task, u64 contid)
> >  	else if (audit_contid_set(task))
> >  		rc = -ECHILD;
> >  	read_unlock(&tasklist_lock);
> > -	if (!rc)
> > -		task->audit->contid = contid;
> > +	if (!rc) {
> > +		struct audit_cont *oldcont = audit_cont(task);
> > +		struct audit_cont *cont = NULL;
> > +		struct audit_cont *newcont = NULL;
> > +		int h = audit_hash_contid(contid);
> > +
> > +		spin_lock(&audit_contid_list_lock);
> > +		list_for_each_entry_rcu(cont, &audit_contid_hash[h], list)
> > +			if (cont->id == contid) {
> > +				/* task injection to existing container */
> > +				if (current == cont->owner) {
> > +					refcount_inc(&cont->refcount);
> > +					newcont = cont;
> > +				} else {
> > +					rc = -ENOTUNIQ;
> > +					goto conterror;
> > +				}
> > +			}
> > +		if (!newcont) {
> > +			newcont = kmalloc(sizeof(struct audit_cont), GFP_ATOMIC);
> > +			if (newcont) {
> > +				INIT_LIST_HEAD(&newcont->list);
> > +				newcont->id = contid;
> > +				get_task_struct(current);
> > +				newcont->owner = current;
> > +				refcount_set(&newcont->refcount, 1);
> > +				list_add_rcu(&newcont->list, &audit_contid_hash[h]);
> > +			} else {
> > +				rc = -ENOMEM;
> > +				goto conterror;
> > +			}
> > +		}
> > +		task->audit->cont = newcont;
> > +		audit_cont_put(oldcont);
> > +conterror:
> > +		spin_unlock(&audit_contid_list_lock);
> > +	}
> >  	task_unlock(task);
> >  
> >  	if (!audit_enabled)
> > diff --git a/kernel/audit.h b/kernel/audit.h
> > index 16bd03b88e0d..e4a31aa92dfe 100644
> > --- a/kernel/audit.h
> > +++ b/kernel/audit.h
> > @@ -211,6 +211,14 @@ static inline int audit_hash_ino(u32 ino)
> >  	return (ino & (AUDIT_INODE_BUCKETS-1));
> >  }
> >  
> > +#define AUDIT_CONTID_BUCKETS	32
> > +extern struct list_head audit_contid_hash[AUDIT_CONTID_BUCKETS];
> > +
> > +static inline int audit_hash_contid(u64 contid)
> > +{
> > +	return (contid & (AUDIT_CONTID_BUCKETS-1));
> > +}
> > +
> >  /* Indicates that audit should log the full pathname. */
> >  #define AUDIT_NAME_FULL -1
> >  
> > -- 
> > 1.8.3.1
> > 
> > 

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Richard Guy Briggs Oct. 25, 2019, 9 p.m. UTC | #4
On 2019-10-10 20:38, Paul Moore wrote:
> On Wed, Sep 18, 2019 at 9:24 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > Store the audit container identifier in a refcounted kernel object that
> > is added to the master list of audit container identifiers.  This will
> > allow multiple container orchestrators/engines to work on the same
> > machine without danger of inadvertantly re-using an existing identifier.
> > It will also allow an orchestrator to inject a process into an existing
> > container by checking if the original container owner is the one
> > injecting the task.  A hash table list is used to optimize searches.
> >
> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > ---
> >  include/linux/audit.h | 26 ++++++++++++++--
> >  kernel/audit.c        | 86 ++++++++++++++++++++++++++++++++++++++++++++++++---
> >  kernel/audit.h        |  8 +++++
> >  3 files changed, 112 insertions(+), 8 deletions(-)
> 
> One general comment before we go off into the weeds on this ... I can
> understand why you wanted to keep this patch separate from the earlier
> patches, but as we get closer to having mergeable code this should get
> folded into the previous patches.  For example, there shouldn't be a
> change in audit_task_info where you change the contid field from a u64
> to struct pointer, it should be a struct pointer from the start.

I should have marked this patchset as RFC even though it was v7 due to a
lot of new ideas/code that was added with uncertainties needing comment
and direction.

> It's also disappointing that idr appears to only be for 32-bit ID
> values, if we had a 64-bit idr I think we could simplify this greatly.

Perhaps.  I do still see value in letting the orchestrator choose the
value.

> > diff --git a/include/linux/audit.h b/include/linux/audit.h
> > index f2e3b81f2942..e317807cdd3e 100644
> > --- a/include/linux/audit.h
> > +++ b/include/linux/audit.h
> > @@ -95,10 +95,18 @@ struct audit_ntp_data {
> >  struct audit_ntp_data {};
> >  #endif
> >
> > +struct audit_cont {
> > +       struct list_head        list;
> > +       u64                     id;
> > +       struct task_struct      *owner;
> > +       refcount_t              refcount;
> > +       struct rcu_head         rcu;
> > +};
> 
> It seems as though in most of the code you are using "contid", any
> reason why didn't stick with that naming scheme here, e.g. "struct
> audit_contid"?

I was using contid to refer to the value itself and cont to refer to the
refcounted object.  I find cont a bit too terse, so I'm still thinking
of changing it.  Perhaps contobj?

> >  struct audit_task_info {
> >         kuid_t                  loginuid;
> >         unsigned int            sessionid;
> > -       u64                     contid;
> > +       struct audit_cont       *cont;
> 
> Same, why not stick with "contid"?

^^^

> >  #ifdef CONFIG_AUDITSYSCALL
> >         struct audit_context    *ctx;
> >  #endif
> > @@ -203,11 +211,15 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
> >
> >  static inline u64 audit_get_contid(struct task_struct *tsk)
> >  {
> > -       if (!tsk->audit)
> > +       if (!tsk->audit || !tsk->audit->cont)
> >                 return AUDIT_CID_UNSET;
> > -       return tsk->audit->contid;
> > +       return tsk->audit->cont->id;
> >  }
> 
> Assuming for a moment that we implement an audit_contid_get() (see
> Neil's comment as well as mine below), we probably need to name this
> something different so we don't all lose our minds when we read this
> code.  On the plus side we can probably preface it with an underscore
> since it is a static, in which case _audit_contid_get() might be okay,
> but I'm open to suggestions.

I'm fine with the "_" prefix, can you point to precedent or convention?

> > +extern struct audit_cont *audit_cont(struct task_struct *tsk);
> > +
> > +extern void audit_cont_put(struct audit_cont *cont);
> 
> More of the "contid" vs "cont".

^^^

> >  extern u32 audit_enabled;
> >
> >  extern int audit_signal_info(int sig, struct task_struct *t);
> > @@ -277,6 +289,14 @@ static inline u64 audit_get_contid(struct task_struct *tsk)
> >         return AUDIT_CID_UNSET;
> >  }
> >
> > +static inline struct audit_cont *audit_cont(struct task_struct *tsk)
> > +{
> > +       return NULL;
> > +}
> > +
> > +static inline void audit_cont_put(struct audit_cont *cont)
> > +{ }
> > +
> >  #define audit_enabled AUDIT_OFF
> >
> >  static inline int audit_signal_info(int sig, struct task_struct *t)
> > diff --git a/kernel/audit.c b/kernel/audit.c
> > index a36ea57cbb61..ea0899130cc1 100644
> > --- a/kernel/audit.c
> > +++ b/kernel/audit.c
> > @@ -137,6 +137,8 @@ struct audit_net {
> >
> >  /* Hash for inode-based rules */
> >  struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
> > +/* Hash for contid-based rules */
> > +struct list_head audit_contid_hash[AUDIT_CONTID_BUCKETS];
> >
> >  static struct kmem_cache *audit_buffer_cache;
> >
> > @@ -204,6 +206,8 @@ struct audit_reply {
> >
> >  static struct kmem_cache *audit_task_cache;
> >
> > +static DEFINE_SPINLOCK(audit_contid_list_lock);
> 
> Since it looks like this protectects audit_contid_hash, I think it
> would be better to move it up underneath audit_contid_hash.

Agreed.

> >  void __init audit_task_init(void)
> >  {
> >         audit_task_cache = kmem_cache_create("audit_task",
> > @@ -231,7 +235,9 @@ int audit_alloc(struct task_struct *tsk)
> >         }
> >         info->loginuid = audit_get_loginuid(current);
> >         info->sessionid = audit_get_sessionid(current);
> > -       info->contid = audit_get_contid(current);
> > +       info->cont = audit_cont(current);
> > +       if (info->cont)
> > +               refcount_inc(&info->cont->refcount);
> 
> See the other comments about a "get" function, but I think we need a
> RCU read lock around the above, no?

The rcu read lock is to protect the list rather than the cont object
itself, the latter of which is protected by its refcount.

> >         tsk->audit = info;
> >
> >         ret = audit_alloc_syscall(tsk);
> > @@ -246,7 +252,7 @@ int audit_alloc(struct task_struct *tsk)
> >  struct audit_task_info init_struct_audit = {
> >         .loginuid = INVALID_UID,
> >         .sessionid = AUDIT_SID_UNSET,
> > -       .contid = AUDIT_CID_UNSET,
> > +       .cont = NULL,
> 
> More "cont" vs "contid".

^^^

> >  #ifdef CONFIG_AUDITSYSCALL
> >         .ctx = NULL,
> >  #endif
> > @@ -266,6 +272,9 @@ void audit_free(struct task_struct *tsk)
> >         /* Freeing the audit_task_info struct must be performed after
> >          * audit_log_exit() due to need for loginuid and sessionid.
> >          */
> > +       spin_lock(&audit_contid_list_lock);
> > +       audit_cont_put(tsk->audit->cont);
> > +       spin_unlock(&audit_contid_list_lock);
> 
> Perhaps this will make sense as I get further into the patchset, but
> why not move the spin lock operations into audit_[cont/contid]_put()?

audit_cont_put() is recursive in patch 18/21, which would have been
evident if 18/21 was squashed into this one as you pointed out there...

> >         info = tsk->audit;
> >         tsk->audit = NULL;
> >         kmem_cache_free(audit_task_cache, info);
> > @@ -1657,6 +1666,9 @@ static int __init audit_init(void)
> >         for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
> >                 INIT_LIST_HEAD(&audit_inode_hash[i]);
> >
> > +       for (i = 0; i < AUDIT_CONTID_BUCKETS; i++)
> > +               INIT_LIST_HEAD(&audit_contid_hash[i]);
> > +
> >         mutex_init(&audit_cmd_mutex.lock);
> >         audit_cmd_mutex.owner = NULL;
> >
> > @@ -2356,6 +2368,32 @@ int audit_signal_info(int sig, struct task_struct *t)
> >         return audit_signal_info_syscall(t);
> >  }
> >
> > +struct audit_cont *audit_cont(struct task_struct *tsk)
> > +{
> > +       if (!tsk->audit || !tsk->audit->cont)
> > +               return NULL;
> > +       return tsk->audit->cont;
> > +}
> > +
> > +/* audit_contid_list_lock must be held by caller */
> > +void audit_cont_put(struct audit_cont *cont)
> > +{
> > +       if (!cont)
> > +               return;
> > +       if (refcount_dec_and_test(&cont->refcount)) {
> > +               put_task_struct(cont->owner);
> > +               list_del_rcu(&cont->list);
> > +               kfree_rcu(cont, rcu);
> > +       }
> > +}
> 
> I tend to agree with Neil's previous comment; if we've got a
> audit_[cont/contid]_put(), why not an audit_[cont/contid]_get()?

^^^

> > +static struct task_struct *audit_cont_owner(struct task_struct *tsk)
> > +{
> > +       if (tsk->audit && tsk->audit->cont)
> > +               return tsk->audit->cont->owner;
> > +       return NULL;
> > +}
> 
> I'm not sure if this is possible (I haven't make my way through the
> entire patchset) and the function above isn't used in this patch (why
> is it here?), but it seems like it would be safer to convert this into
> an audit_contid_isowner() function that simply returns 1/0 depending
> on if the passed task_struct is the owner or not of a passed audit
> container ID value?

Agreed since it is only ever compared with current.  It can be moved to
14/21.

> >  /*
> >   * audit_set_contid - set current task's audit contid
> >   * @task: target task
> > @@ -2382,9 +2420,12 @@ int audit_set_contid(struct task_struct *task, u64 contid)
> >         }
> >         oldcontid = audit_get_contid(task);
> >         read_lock(&tasklist_lock);
> > -       /* Don't allow the audit containerid to be unset */
> > +       /* Don't allow the contid to be unset */
> >         if (!audit_contid_valid(contid))
> >                 rc = -EINVAL;
> > +       /* Don't allow the contid to be set to the same value again */
> > +       else if (contid == oldcontid) {
> > +               rc = -EADDRINUSE;
> >         /* if we don't have caps, reject */
> >         else if (!capable(CAP_AUDIT_CONTROL))
> >                 rc = -EPERM;
> 
> RCU read lock?  It's a bit dicey since I believe the tasklist_lock is
> going to provide us the safety we need, but if we are going to claim
> that the audit container ID list is protected by RCU we should
> probably use it.

Yes, perhaps, but to protect the task read, not the list, until it is
accessed.  Getting the contid value or cont pointer via the task does
not involve the list.  The cont pointer is protected by its refcount.

> > @@ -2397,8 +2438,43 @@ int audit_set_contid(struct task_struct *task, u64 contid)
> >         else if (audit_contid_set(task))
> >                 rc = -ECHILD;
> >         read_unlock(&tasklist_lock);
> > -       if (!rc)
> > -               task->audit->contid = contid;
> > +       if (!rc) {
> > +               struct audit_cont *oldcont = audit_cont(task);
> 
> Previously we held the tasklist_lock to protect the audit container ID
> associated with the struct, should we still be holding it here?

We held the tasklist_lock to protect access to the target task's
child/parent/thread relationships.

> Regardless, I worry that the lock dependencies between the
> tasklist_lock and the audit_contid_list_lock are going to be tricky.
> It might be nice to document the relationship in a comment up near
> where you declare audit_contid_list_lock.

I don't think there should be a conflict between the two.

The contid_list_lock doesn't care if the cont object is associated to a
particular task.

> > +               struct audit_cont *cont = NULL;
> > +               struct audit_cont *newcont = NULL;
> > +               int h = audit_hash_contid(contid);
> > +
> > +               spin_lock(&audit_contid_list_lock);
> > +               list_for_each_entry_rcu(cont, &audit_contid_hash[h], list)
> > +                       if (cont->id == contid) {
> > +                               /* task injection to existing container */
> > +                               if (current == cont->owner) {
> 
> I understand the desire to limit a given audit container ID to the
> orchestrator that created it, but are we certain that we can track
> audit container ID "ownership" via a single instance of a task_struct?

Are you suggesting that a task_struct representing a task may be
replaced for a specific task?  I don't believe that will ever happen.

>  What happens when the orchestrator stops/restarts/crashes?  Do we
> even care?

Reap all of its containers?

> > +                                       refcount_inc(&cont->refcount);
> > +                                       newcont = cont;
> 
> We can bail out of the loop here, yes?

Yes, that would be a performance improvement, but not functional bug,
thanks.  :-)

> > +                               } else {
> > +                                       rc = -ENOTUNIQ;
> > +                                       goto conterror;
> > +                               }
> > +                       }
> > +               if (!newcont) {
> > +                       newcont = kmalloc(sizeof(struct audit_cont), GFP_ATOMIC);
> > +                       if (newcont) {
> > +                               INIT_LIST_HEAD(&newcont->list);
> > +                               newcont->id = contid;
> > +                               get_task_struct(current);
> > +                               newcont->owner = current;
> > +                               refcount_set(&newcont->refcount, 1);
> > +                               list_add_rcu(&newcont->list, &audit_contid_hash[h]);
> > +                       } else {
> > +                               rc = -ENOMEM;
> > +                               goto conterror;
> > +                       }
> > +               }
> > +               task->audit->cont = newcont;
> > +               audit_cont_put(oldcont);
> > +conterror:
> > +               spin_unlock(&audit_contid_list_lock);
> > +       }
> >         task_unlock(task);
> >
> >         if (!audit_enabled)
> > diff --git a/kernel/audit.h b/kernel/audit.h
> > index 16bd03b88e0d..e4a31aa92dfe 100644
> > --- a/kernel/audit.h
> > +++ b/kernel/audit.h
> > @@ -211,6 +211,14 @@ static inline int audit_hash_ino(u32 ino)
> >         return (ino & (AUDIT_INODE_BUCKETS-1));
> >  }
> >
> > +#define AUDIT_CONTID_BUCKETS   32
> > +extern struct list_head audit_contid_hash[AUDIT_CONTID_BUCKETS];
> > +
> > +static inline int audit_hash_contid(u64 contid)
> > +{
> > +       return (contid & (AUDIT_CONTID_BUCKETS-1));
> > +}
> > +
> >  /* Indicates that audit should log the full pathname. */
> >  #define AUDIT_NAME_FULL -1
> >
> 
> --
> paul moore
> www.paul-moore.com

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Neil Horman Oct. 28, 2019, 12:20 p.m. UTC | #5
On Fri, Oct 25, 2019 at 04:00:19PM -0400, Richard Guy Briggs wrote:
> On 2019-09-26 10:46, Neil Horman wrote:
> > On Wed, Sep 18, 2019 at 09:22:21PM -0400, Richard Guy Briggs wrote:
> > > Store the audit container identifier in a refcounted kernel object that
> > > is added to the master list of audit container identifiers.  This will
> > > allow multiple container orchestrators/engines to work on the same
> > > machine without danger of inadvertantly re-using an existing identifier.
> > > It will also allow an orchestrator to inject a process into an existing
> > > container by checking if the original container owner is the one
> > > injecting the task.  A hash table list is used to optimize searches.
> > > 
> > > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > > ---
> > >  include/linux/audit.h | 26 ++++++++++++++--
> > >  kernel/audit.c        | 86 ++++++++++++++++++++++++++++++++++++++++++++++++---
> > >  kernel/audit.h        |  8 +++++
> > >  3 files changed, 112 insertions(+), 8 deletions(-)
> > > 
> > > diff --git a/include/linux/audit.h b/include/linux/audit.h
> > > index f2e3b81f2942..e317807cdd3e 100644
> > > --- a/include/linux/audit.h
> > > +++ b/include/linux/audit.h
> > > @@ -95,10 +95,18 @@ struct audit_ntp_data {
> > >  struct audit_ntp_data {};
> > >  #endif
> > >  
> > > +struct audit_cont {
> > > +	struct list_head	list;
> > > +	u64			id;
> > > +	struct task_struct	*owner;
> > > +	refcount_t              refcount;
> > > +	struct rcu_head         rcu;
> > > +};
> > > +
> > >  struct audit_task_info {
> > >  	kuid_t			loginuid;
> > >  	unsigned int		sessionid;
> > > -	u64			contid;
> > > +	struct audit_cont	*cont;
> > >  #ifdef CONFIG_AUDITSYSCALL
> > >  	struct audit_context	*ctx;
> > >  #endif
> > > @@ -203,11 +211,15 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
> > >  
> > >  static inline u64 audit_get_contid(struct task_struct *tsk)
> > >  {
> > > -	if (!tsk->audit)
> > > +	if (!tsk->audit || !tsk->audit->cont)
> > >  		return AUDIT_CID_UNSET;
> > > -	return tsk->audit->contid;
> > > +	return tsk->audit->cont->id;
> > >  }
> > >  
> > > +extern struct audit_cont *audit_cont(struct task_struct *tsk);
> > > +
> > > +extern void audit_cont_put(struct audit_cont *cont);
> > > +
> > I see that you manual increment this refcount at various call sites, why
> > no corresponding audit_contid_hold function?
> 
> I was trying to avoid the get function due to having one site where I
> needed the pointer for later but didn't need a refcount to it so that I
> could release the refcount it if it was replaced by another cont object.
> A hold function would just contain one line that would call the
> refcount_inc().  If I did convert things over to a get function, it
> would hide some of this extra conditional code in the main calling
> function, but in one place I could just call put immediately to
> neutralize that unneeded refcount.
> 
Ok, but this pattern:

static inline u64 __audit_contid_get(struct audit_cont *c) {
        return c->id;
}

audit_contid_get(struct audit_cont *c) {
        refcount_hold(c)
        return __audit_contid_get(c)
}

Squares that up, doesn't it?  It gives you an internal non refcount
holding version then to use.

> Would you see any issue with that extra get/put refcount that would only
> happen in the case of changing a contid in a nesting situation?
> 
No, I personally wouldn't have an issue with it, but the above would
make it pretty readable I think

> > Neil
> > 
> > >  extern u32 audit_enabled;
> > >  
> > >  extern int audit_signal_info(int sig, struct task_struct *t);
> > > @@ -277,6 +289,14 @@ static inline u64 audit_get_contid(struct task_struct *tsk)
> > >  	return AUDIT_CID_UNSET;
> > >  }
> > >  
> > > +static inline struct audit_cont *audit_cont(struct task_struct *tsk)
> > > +{
> > > +	return NULL;
> > > +}
> > > +
> > > +static inline void audit_cont_put(struct audit_cont *cont)
> > > +{ }
> > > +
> > >  #define audit_enabled AUDIT_OFF
> > >  
> > >  static inline int audit_signal_info(int sig, struct task_struct *t)
> > > diff --git a/kernel/audit.c b/kernel/audit.c
> > > index a36ea57cbb61..ea0899130cc1 100644
> > > --- a/kernel/audit.c
> > > +++ b/kernel/audit.c
> > > @@ -137,6 +137,8 @@ struct audit_net {
> > >  
> > >  /* Hash for inode-based rules */
> > >  struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
> > > +/* Hash for contid-based rules */
> > > +struct list_head audit_contid_hash[AUDIT_CONTID_BUCKETS];
> > >  
> > >  static struct kmem_cache *audit_buffer_cache;
> > >  
> > > @@ -204,6 +206,8 @@ struct audit_reply {
> > >  
> > >  static struct kmem_cache *audit_task_cache;
> > >  
> > > +static DEFINE_SPINLOCK(audit_contid_list_lock);
> > > +
> > >  void __init audit_task_init(void)
> > >  {
> > >  	audit_task_cache = kmem_cache_create("audit_task",
> > > @@ -231,7 +235,9 @@ int audit_alloc(struct task_struct *tsk)
> > >  	}
> > >  	info->loginuid = audit_get_loginuid(current);
> > >  	info->sessionid = audit_get_sessionid(current);
> > > -	info->contid = audit_get_contid(current);
> > > +	info->cont = audit_cont(current);
> > > +	if (info->cont)
> > > +		refcount_inc(&info->cont->refcount);
> > >  	tsk->audit = info;
> > >  
> > >  	ret = audit_alloc_syscall(tsk);
> > > @@ -246,7 +252,7 @@ int audit_alloc(struct task_struct *tsk)
> > >  struct audit_task_info init_struct_audit = {
> > >  	.loginuid = INVALID_UID,
> > >  	.sessionid = AUDIT_SID_UNSET,
> > > -	.contid = AUDIT_CID_UNSET,
> > > +	.cont = NULL,
> > >  #ifdef CONFIG_AUDITSYSCALL
> > >  	.ctx = NULL,
> > >  #endif
> > > @@ -266,6 +272,9 @@ void audit_free(struct task_struct *tsk)
> > >  	/* Freeing the audit_task_info struct must be performed after
> > >  	 * audit_log_exit() due to need for loginuid and sessionid.
> > >  	 */
> > > +	spin_lock(&audit_contid_list_lock); 
> > > +	audit_cont_put(tsk->audit->cont);
> > > +	spin_unlock(&audit_contid_list_lock); 
> > >  	info = tsk->audit;
> > >  	tsk->audit = NULL;
> > >  	kmem_cache_free(audit_task_cache, info);
> > > @@ -1657,6 +1666,9 @@ static int __init audit_init(void)
> > >  	for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
> > >  		INIT_LIST_HEAD(&audit_inode_hash[i]);
> > >  
> > > +	for (i = 0; i < AUDIT_CONTID_BUCKETS; i++)
> > > +		INIT_LIST_HEAD(&audit_contid_hash[i]);
> > > +
> > >  	mutex_init(&audit_cmd_mutex.lock);
> > >  	audit_cmd_mutex.owner = NULL;
> > >  
> > > @@ -2356,6 +2368,32 @@ int audit_signal_info(int sig, struct task_struct *t)
> > >  	return audit_signal_info_syscall(t);
> > >  }
> > >  
> > > +struct audit_cont *audit_cont(struct task_struct *tsk)
> > > +{
> > > +	if (!tsk->audit || !tsk->audit->cont)
> > > +		return NULL;
> > > +	return tsk->audit->cont;
> > > +}
> > > +
> > > +/* audit_contid_list_lock must be held by caller */
> > > +void audit_cont_put(struct audit_cont *cont)
> > > +{
> > > +	if (!cont)
> > > +		return;
> > > +	if (refcount_dec_and_test(&cont->refcount)) {
> > > +		put_task_struct(cont->owner);
> > > +		list_del_rcu(&cont->list);
> > > +		kfree_rcu(cont, rcu);
> > > +	}
> > > +}
> > > +
> > > +static struct task_struct *audit_cont_owner(struct task_struct *tsk)
> > > +{
> > > +	if (tsk->audit && tsk->audit->cont)
> > > +		return tsk->audit->cont->owner;
> > > +	return NULL;
> > > +}
> > > +
> > >  /*
> > >   * audit_set_contid - set current task's audit contid
> > >   * @task: target task
> > > @@ -2382,9 +2420,12 @@ int audit_set_contid(struct task_struct *task, u64 contid)
> > >  	}
> > >  	oldcontid = audit_get_contid(task);
> > >  	read_lock(&tasklist_lock);
> > > -	/* Don't allow the audit containerid to be unset */
> > > +	/* Don't allow the contid to be unset */
> > >  	if (!audit_contid_valid(contid))
> > >  		rc = -EINVAL;
> > > +	/* Don't allow the contid to be set to the same value again */
> > > +	else if (contid == oldcontid) {
> > > +		rc = -EADDRINUSE;
> > >  	/* if we don't have caps, reject */
> > >  	else if (!capable(CAP_AUDIT_CONTROL))
> > >  		rc = -EPERM;
> > > @@ -2397,8 +2438,43 @@ int audit_set_contid(struct task_struct *task, u64 contid)
> > >  	else if (audit_contid_set(task))
> > >  		rc = -ECHILD;
> > >  	read_unlock(&tasklist_lock);
> > > -	if (!rc)
> > > -		task->audit->contid = contid;
> > > +	if (!rc) {
> > > +		struct audit_cont *oldcont = audit_cont(task);
> > > +		struct audit_cont *cont = NULL;
> > > +		struct audit_cont *newcont = NULL;
> > > +		int h = audit_hash_contid(contid);
> > > +
> > > +		spin_lock(&audit_contid_list_lock);
> > > +		list_for_each_entry_rcu(cont, &audit_contid_hash[h], list)
> > > +			if (cont->id == contid) {
> > > +				/* task injection to existing container */
> > > +				if (current == cont->owner) {
> > > +					refcount_inc(&cont->refcount);
> > > +					newcont = cont;
> > > +				} else {
> > > +					rc = -ENOTUNIQ;
> > > +					goto conterror;
> > > +				}
> > > +			}
> > > +		if (!newcont) {
> > > +			newcont = kmalloc(sizeof(struct audit_cont), GFP_ATOMIC);
> > > +			if (newcont) {
> > > +				INIT_LIST_HEAD(&newcont->list);
> > > +				newcont->id = contid;
> > > +				get_task_struct(current);
> > > +				newcont->owner = current;
> > > +				refcount_set(&newcont->refcount, 1);
> > > +				list_add_rcu(&newcont->list, &audit_contid_hash[h]);
> > > +			} else {
> > > +				rc = -ENOMEM;
> > > +				goto conterror;
> > > +			}
> > > +		}
> > > +		task->audit->cont = newcont;
> > > +		audit_cont_put(oldcont);
> > > +conterror:
> > > +		spin_unlock(&audit_contid_list_lock);
> > > +	}
> > >  	task_unlock(task);
> > >  
> > >  	if (!audit_enabled)
> > > diff --git a/kernel/audit.h b/kernel/audit.h
> > > index 16bd03b88e0d..e4a31aa92dfe 100644
> > > --- a/kernel/audit.h
> > > +++ b/kernel/audit.h
> > > @@ -211,6 +211,14 @@ static inline int audit_hash_ino(u32 ino)
> > >  	return (ino & (AUDIT_INODE_BUCKETS-1));
> > >  }
> > >  
> > > +#define AUDIT_CONTID_BUCKETS	32
> > > +extern struct list_head audit_contid_hash[AUDIT_CONTID_BUCKETS];
> > > +
> > > +static inline int audit_hash_contid(u64 contid)
> > > +{
> > > +	return (contid & (AUDIT_CONTID_BUCKETS-1));
> > > +}
> > > +
> > >  /* Indicates that audit should log the full pathname. */
> > >  #define AUDIT_NAME_FULL -1
> > >  
> > > -- 
> > > 1.8.3.1
> > > 
> > > 
> 
> - RGB
> 
> --
> Richard Guy Briggs <rgb@redhat.com>
> Sr. S/W Engineer, Kernel Security, Base Operating Systems
> Remote, Ottawa, Red Hat Canada
> IRC: rgb, SunRaycer
> Voice: +1.647.777.2635, Internal: (81) 32635
> 
>
Paul Moore Nov. 8, 2019, 6:26 p.m. UTC | #6
On Fri, Oct 25, 2019 at 5:00 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> On 2019-10-10 20:38, Paul Moore wrote:
> > On Wed, Sep 18, 2019 at 9:24 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > Store the audit container identifier in a refcounted kernel object that
> > > is added to the master list of audit container identifiers.  This will
> > > allow multiple container orchestrators/engines to work on the same
> > > machine without danger of inadvertantly re-using an existing identifier.
> > > It will also allow an orchestrator to inject a process into an existing
> > > container by checking if the original container owner is the one
> > > injecting the task.  A hash table list is used to optimize searches.
> > >
> > > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > > ---
> > >  include/linux/audit.h | 26 ++++++++++++++--
> > >  kernel/audit.c        | 86 ++++++++++++++++++++++++++++++++++++++++++++++++---
> > >  kernel/audit.h        |  8 +++++
> > >  3 files changed, 112 insertions(+), 8 deletions(-)
> >
> > One general comment before we go off into the weeds on this ... I can
> > understand why you wanted to keep this patch separate from the earlier
> > patches, but as we get closer to having mergeable code this should get
> > folded into the previous patches.  For example, there shouldn't be a
> > change in audit_task_info where you change the contid field from a u64
> > to struct pointer, it should be a struct pointer from the start.
>
> I should have marked this patchset as RFC even though it was v7 due to a
> lot of new ideas/code that was added with uncertainties needing comment
> and direction.
>
> > It's also disappointing that idr appears to only be for 32-bit ID
> > values, if we had a 64-bit idr I think we could simplify this greatly.
>
> Perhaps.  I do still see value in letting the orchestrator choose the
> value.

Agreed.  I was just thinking out loud that it seems like much of what
we need could be a generic library mechanism similar to, but not quite
like, the existing idr code.

> > > diff --git a/include/linux/audit.h b/include/linux/audit.h
> > > index f2e3b81f2942..e317807cdd3e 100644
> > > --- a/include/linux/audit.h
> > > +++ b/include/linux/audit.h
> > > @@ -95,10 +95,18 @@ struct audit_ntp_data {
> > >  struct audit_ntp_data {};
> > >  #endif
> > >
> > > +struct audit_cont {
> > > +       struct list_head        list;
> > > +       u64                     id;
> > > +       struct task_struct      *owner;
> > > +       refcount_t              refcount;
> > > +       struct rcu_head         rcu;
> > > +};
> >
> > It seems as though in most of the code you are using "contid", any
> > reason why didn't stick with that naming scheme here, e.g. "struct
> > audit_contid"?
>
> I was using contid to refer to the value itself and cont to refer to the
> refcounted object.  I find cont a bit too terse, so I'm still thinking
> of changing it.  Perhaps contobj?

Yes, just "cont" is a bit too ambiguous considering we have both
integer values and structures being passed around.  Whatever you
decide on, a common base with separate suffixes seems like a good
idea.

FWIW, I still think the "audit container ID" : "ACID" thing is kinda funny ;)

> > > @@ -203,11 +211,15 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
> > >
> > >  static inline u64 audit_get_contid(struct task_struct *tsk)
> > >  {
> > > -       if (!tsk->audit)
> > > +       if (!tsk->audit || !tsk->audit->cont)
> > >                 return AUDIT_CID_UNSET;
> > > -       return tsk->audit->contid;
> > > +       return tsk->audit->cont->id;
> > >  }
> >
> > Assuming for a moment that we implement an audit_contid_get() (see
> > Neil's comment as well as mine below), we probably need to name this
> > something different so we don't all lose our minds when we read this
> > code.  On the plus side we can probably preface it with an underscore
> > since it is a static, in which case _audit_contid_get() might be okay,
> > but I'm open to suggestions.
>
> I'm fine with the "_" prefix, can you point to precedent or convention?

Generally kernel functions which are "special"/private/unsafe/etc.
have a one, or two, underscore prefix.  If you don't want to add the
prefix, that's fine, but please change the name as mentioned
previously.

> > > @@ -231,7 +235,9 @@ int audit_alloc(struct task_struct *tsk)
> > >         }
> > >         info->loginuid = audit_get_loginuid(current);
> > >         info->sessionid = audit_get_sessionid(current);
> > > -       info->contid = audit_get_contid(current);
> > > +       info->cont = audit_cont(current);
> > > +       if (info->cont)
> > > +               refcount_inc(&info->cont->refcount);
> >
> > See the other comments about a "get" function, but I think we need a
> > RCU read lock around the above, no?
>
> The rcu read lock is to protect the list rather than the cont object
> itself, the latter of which is protected by its refcount.

What protects you from info->cont going away between when you fetch
the pointer via audit_cont() to when you dereference it in
refcount_inc()?

> > > @@ -2397,8 +2438,43 @@ int audit_set_contid(struct task_struct *task, u64 contid)
> > >         else if (audit_contid_set(task))
> > >                 rc = -ECHILD;
> > >         read_unlock(&tasklist_lock);
> > > -       if (!rc)
> > > -               task->audit->contid = contid;
> > > +       if (!rc) {
> > > +               struct audit_cont *oldcont = audit_cont(task);
> >
> > Previously we held the tasklist_lock to protect the audit container ID
> > associated with the struct, should we still be holding it here?
>
> We held the tasklist_lock to protect access to the target task's
> child/parent/thread relationships.

What protects us in the case of simultaneous calls to audit_set_contid()?

> > Regardless, I worry that the lock dependencies between the
> > tasklist_lock and the audit_contid_list_lock are going to be tricky.
> > It might be nice to document the relationship in a comment up near
> > where you declare audit_contid_list_lock.
>
> I don't think there should be a conflict between the two.
>
> The contid_list_lock doesn't care if the cont object is associated to a
> particular task.

Please document the relationship between the two, I worry we could
easily run into lockdep problems without a clearly defined ordering.

> > > +               struct audit_cont *cont = NULL;
> > > +               struct audit_cont *newcont = NULL;
> > > +               int h = audit_hash_contid(contid);
> > > +
> > > +               spin_lock(&audit_contid_list_lock);
> > > +               list_for_each_entry_rcu(cont, &audit_contid_hash[h], list)
> > > +                       if (cont->id == contid) {
> > > +                               /* task injection to existing container */
> > > +                               if (current == cont->owner) {
> >
> > I understand the desire to limit a given audit container ID to the
> > orchestrator that created it, but are we certain that we can track
> > audit container ID "ownership" via a single instance of a task_struct?
>
> Are you suggesting that a task_struct representing a task may be
> replaced for a specific task?  I don't believe that will ever happen.
>
> >  What happens when the orchestrator stops/restarts/crashes?  Do we
> > even care?
>
> Reap all of its containers?

These were genuine questions, I'm not suggesting anything in
particular, I'm just curious about how we handle an orchestrator that
isn't continuously running ... is this possible?  Do we care?
diff mbox series

Patch

diff --git a/include/linux/audit.h b/include/linux/audit.h
index f2e3b81f2942..e317807cdd3e 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -95,10 +95,18 @@  struct audit_ntp_data {
 struct audit_ntp_data {};
 #endif
 
+struct audit_cont {
+	struct list_head	list;
+	u64			id;
+	struct task_struct	*owner;
+	refcount_t              refcount;
+	struct rcu_head         rcu;
+};
+
 struct audit_task_info {
 	kuid_t			loginuid;
 	unsigned int		sessionid;
-	u64			contid;
+	struct audit_cont	*cont;
 #ifdef CONFIG_AUDITSYSCALL
 	struct audit_context	*ctx;
 #endif
@@ -203,11 +211,15 @@  static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
 
 static inline u64 audit_get_contid(struct task_struct *tsk)
 {
-	if (!tsk->audit)
+	if (!tsk->audit || !tsk->audit->cont)
 		return AUDIT_CID_UNSET;
-	return tsk->audit->contid;
+	return tsk->audit->cont->id;
 }
 
+extern struct audit_cont *audit_cont(struct task_struct *tsk);
+
+extern void audit_cont_put(struct audit_cont *cont);
+
 extern u32 audit_enabled;
 
 extern int audit_signal_info(int sig, struct task_struct *t);
@@ -277,6 +289,14 @@  static inline u64 audit_get_contid(struct task_struct *tsk)
 	return AUDIT_CID_UNSET;
 }
 
+static inline struct audit_cont *audit_cont(struct task_struct *tsk)
+{
+	return NULL;
+}
+
+static inline void audit_cont_put(struct audit_cont *cont)
+{ }
+
 #define audit_enabled AUDIT_OFF
 
 static inline int audit_signal_info(int sig, struct task_struct *t)
diff --git a/kernel/audit.c b/kernel/audit.c
index a36ea57cbb61..ea0899130cc1 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -137,6 +137,8 @@  struct audit_net {
 
 /* Hash for inode-based rules */
 struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
+/* Hash for contid-based rules */
+struct list_head audit_contid_hash[AUDIT_CONTID_BUCKETS];
 
 static struct kmem_cache *audit_buffer_cache;
 
@@ -204,6 +206,8 @@  struct audit_reply {
 
 static struct kmem_cache *audit_task_cache;
 
+static DEFINE_SPINLOCK(audit_contid_list_lock);
+
 void __init audit_task_init(void)
 {
 	audit_task_cache = kmem_cache_create("audit_task",
@@ -231,7 +235,9 @@  int audit_alloc(struct task_struct *tsk)
 	}
 	info->loginuid = audit_get_loginuid(current);
 	info->sessionid = audit_get_sessionid(current);
-	info->contid = audit_get_contid(current);
+	info->cont = audit_cont(current);
+	if (info->cont)
+		refcount_inc(&info->cont->refcount);
 	tsk->audit = info;
 
 	ret = audit_alloc_syscall(tsk);
@@ -246,7 +252,7 @@  int audit_alloc(struct task_struct *tsk)
 struct audit_task_info init_struct_audit = {
 	.loginuid = INVALID_UID,
 	.sessionid = AUDIT_SID_UNSET,
-	.contid = AUDIT_CID_UNSET,
+	.cont = NULL,
 #ifdef CONFIG_AUDITSYSCALL
 	.ctx = NULL,
 #endif
@@ -266,6 +272,9 @@  void audit_free(struct task_struct *tsk)
 	/* Freeing the audit_task_info struct must be performed after
 	 * audit_log_exit() due to need for loginuid and sessionid.
 	 */
+	spin_lock(&audit_contid_list_lock); 
+	audit_cont_put(tsk->audit->cont);
+	spin_unlock(&audit_contid_list_lock); 
 	info = tsk->audit;
 	tsk->audit = NULL;
 	kmem_cache_free(audit_task_cache, info);
@@ -1657,6 +1666,9 @@  static int __init audit_init(void)
 	for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
 		INIT_LIST_HEAD(&audit_inode_hash[i]);
 
+	for (i = 0; i < AUDIT_CONTID_BUCKETS; i++)
+		INIT_LIST_HEAD(&audit_contid_hash[i]);
+
 	mutex_init(&audit_cmd_mutex.lock);
 	audit_cmd_mutex.owner = NULL;
 
@@ -2356,6 +2368,32 @@  int audit_signal_info(int sig, struct task_struct *t)
 	return audit_signal_info_syscall(t);
 }
 
+struct audit_cont *audit_cont(struct task_struct *tsk)
+{
+	if (!tsk->audit || !tsk->audit->cont)
+		return NULL;
+	return tsk->audit->cont;
+}
+
+/* audit_contid_list_lock must be held by caller */
+void audit_cont_put(struct audit_cont *cont)
+{
+	if (!cont)
+		return;
+	if (refcount_dec_and_test(&cont->refcount)) {
+		put_task_struct(cont->owner);
+		list_del_rcu(&cont->list);
+		kfree_rcu(cont, rcu);
+	}
+}
+
+static struct task_struct *audit_cont_owner(struct task_struct *tsk)
+{
+	if (tsk->audit && tsk->audit->cont)
+		return tsk->audit->cont->owner;
+	return NULL;
+}
+
 /*
  * audit_set_contid - set current task's audit contid
  * @task: target task
@@ -2382,9 +2420,12 @@  int audit_set_contid(struct task_struct *task, u64 contid)
 	}
 	oldcontid = audit_get_contid(task);
 	read_lock(&tasklist_lock);
-	/* Don't allow the audit containerid to be unset */
+	/* Don't allow the contid to be unset */
 	if (!audit_contid_valid(contid))
 		rc = -EINVAL;
+	/* Don't allow the contid to be set to the same value again */
+	else if (contid == oldcontid) {
+		rc = -EADDRINUSE;
 	/* if we don't have caps, reject */
 	else if (!capable(CAP_AUDIT_CONTROL))
 		rc = -EPERM;
@@ -2397,8 +2438,43 @@  int audit_set_contid(struct task_struct *task, u64 contid)
 	else if (audit_contid_set(task))
 		rc = -ECHILD;
 	read_unlock(&tasklist_lock);
-	if (!rc)
-		task->audit->contid = contid;
+	if (!rc) {
+		struct audit_cont *oldcont = audit_cont(task);
+		struct audit_cont *cont = NULL;
+		struct audit_cont *newcont = NULL;
+		int h = audit_hash_contid(contid);
+
+		spin_lock(&audit_contid_list_lock);
+		list_for_each_entry_rcu(cont, &audit_contid_hash[h], list)
+			if (cont->id == contid) {
+				/* task injection to existing container */
+				if (current == cont->owner) {
+					refcount_inc(&cont->refcount);
+					newcont = cont;
+				} else {
+					rc = -ENOTUNIQ;
+					goto conterror;
+				}
+			}
+		if (!newcont) {
+			newcont = kmalloc(sizeof(struct audit_cont), GFP_ATOMIC);
+			if (newcont) {
+				INIT_LIST_HEAD(&newcont->list);
+				newcont->id = contid;
+				get_task_struct(current);
+				newcont->owner = current;
+				refcount_set(&newcont->refcount, 1);
+				list_add_rcu(&newcont->list, &audit_contid_hash[h]);
+			} else {
+				rc = -ENOMEM;
+				goto conterror;
+			}
+		}
+		task->audit->cont = newcont;
+		audit_cont_put(oldcont);
+conterror:
+		spin_unlock(&audit_contid_list_lock);
+	}
 	task_unlock(task);
 
 	if (!audit_enabled)
diff --git a/kernel/audit.h b/kernel/audit.h
index 16bd03b88e0d..e4a31aa92dfe 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -211,6 +211,14 @@  static inline int audit_hash_ino(u32 ino)
 	return (ino & (AUDIT_INODE_BUCKETS-1));
 }
 
+#define AUDIT_CONTID_BUCKETS	32
+extern struct list_head audit_contid_hash[AUDIT_CONTID_BUCKETS];
+
+static inline int audit_hash_contid(u64 contid)
+{
+	return (contid & (AUDIT_CONTID_BUCKETS-1));
+}
+
 /* Indicates that audit should log the full pathname. */
 #define AUDIT_NAME_FULL -1