diff mbox series

[v10,18/27] integrity/ima: Define ns_status for storing namespaced iint data

Message ID 20220201203735.164593-19-stefanb@linux.ibm.com (mailing list archive)
State New, archived
Headers show
Series ima: Namespace IMA with audit support in IMA-ns | expand

Commit Message

Stefan Berger Feb. 1, 2022, 8:37 p.m. UTC
From: Mehmet Kayaalp <mkayaalp@linux.vnet.ibm.com>

Add an rbtree to the IMA namespace structure that stores a namespaced
version of iint->flags in ns_status struct. Similar to the
integrity_iint_cache, both the iint and ns_status are looked up using the
inode pointer value. The lookup, allocate, and insertion code is also
similar.

In subsequent patches we will have to find all ns_status entries an iint
is being used in and reset flags there. To do this, connect a list of
ns_status to the integrity_iint_cache and provide a reader-writer
lock in the integrity_iint_cache to lock access to the list.

To simplify the code in the non-namespaces case embed an ns_status in
the integrity_iint_cache and have it linked into the iint's ns_status list
when calling ima_get_ns_status().

When getting an ns_status first try to find it in the RB tree. Here we can
run into the situation that an ns_status found in the RB tree has a
different iint associated with it for the same inode. In this case we need
to delete the ns_status structure and get a new one.

There are two cases for freeing:
- when the iint is freed (inode deletion): Walk the list of ns_status
  entries and disconnect each ns_status from the list; take the
  writer lock to protect access to the list; also, take the item off the
  per-namespace rbtree

- when the ima_namepace is freed: While walking the rbtree, remove the
  ns_status from the list while also holding the iint's writer lock;
  to be able to grab the lock we have to have a pointer to the iint on
  the ns_status structure.

To avoid an ns_status to be freed by the two cases concurrently, prevent
these two cases to run concurrently. Therefore, groups of threads
deleting either inodes or ima_namespaces are allowed to run concurrently
but no two threads may run and one delete an inode and the other an
ima_namespace.

Signed-off-by: Mehmet Kayaalp <mkayaalp@linux.vnet.ibm.com>
Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>

---
v9:
 - Move ns_status into integrity.h and embedded it into integrity_iint_cache
   for the non-CONFIG_IMA_NS case
---
 include/linux/ima.h                      |   7 +
 security/integrity/iint.c                |   7 +
 security/integrity/ima/Makefile          |   2 +-
 security/integrity/ima/ima.h             |  17 ++
 security/integrity/ima/ima_init_ima_ns.c |   5 +
 security/integrity/ima/ima_ns.c          |   1 +
 security/integrity/ima/ima_ns_status.c   | 344 +++++++++++++++++++++++
 security/integrity/integrity.h           |  35 ++-
 8 files changed, 416 insertions(+), 2 deletions(-)
 create mode 100644 security/integrity/ima/ima_ns_status.c

Comments

Mimi Zohar Feb. 23, 2022, 4:12 p.m. UTC | #1
On Tue, 2022-02-01 at 15:37 -0500, Stefan Berger wrote:
> From: Mehmet Kayaalp <mkayaalp@linux.vnet.ibm.com>
> 
> Add an rbtree to the IMA namespace structure that stores a namespaced
> version of iint->flags in ns_status struct. Similar to the
> integrity_iint_cache, both the iint and ns_status are looked up using the
> inode pointer value. The lookup, allocate, and insertion code is also
> similar.
> 
> In subsequent patches we will have to find all ns_status entries an iint
> is being used in and reset flags there. To do this, connect a list of
> ns_status to the integrity_iint_cache and provide a reader-writer
> lock in the integrity_iint_cache to lock access to the list.
> 
> To simplify the code in the non-namespaces case embed an ns_status in
> the integrity_iint_cache and have it linked into the iint's ns_status list
> when calling ima_get_ns_status().
> 
> When getting an ns_status first try to find it in the RB tree. Here we can
> run into the situation that an ns_status found in the RB tree has a
> different iint associated with it for the same inode. In this case we need
> to delete the ns_status structure and get a new one.
> 
> There are two cases for freeing:
> - when the iint is freed (inode deletion): Walk the list of ns_status
>   entries and disconnect each ns_status from the list; take the
>   writer lock to protect access to the list; also, take the item off the
>   per-namespace rbtree
> 
> - when the ima_namepace is freed: While walking the rbtree, remove the
>   ns_status from the list while also holding the iint's writer lock;
>   to be able to grab the lock we have to have a pointer to the iint on
>   the ns_status structure.
> 
> To avoid an ns_status to be freed by the two cases concurrently, prevent
> these two cases to run concurrently. Therefore, groups of threads
> deleting either inodes or ima_namespaces are allowed to run concurrently
> but no two threads may run and one delete an inode and the other an
> ima_namespace.

The locking involved here is really complex.  I'm sure you thought
about it a lot, but isn't there a better alternative?

> 
> Signed-off-by: Mehmet Kayaalp <mkayaalp@linux.vnet.ibm.com>
> Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
> 
> ---
> v9:
>  - Move ns_status into integrity.h and embedded it into integrity_iint_cache
>    for the non-CONFIG_IMA_NS case

<snip>

>  
> +/*
> + * ima_free_ns_status_tree - free all items on the ns_status_tree and take each
> + *                           one off the list; yield to ns_list free'ers
> + *
> + * This function is called when an ima_namespace is freed. All entries in the
> + * rbtree will be taken off their list and collected in a garbage collection
> + * list and freed at the end. This allows to walk the rbtree again.
> + */
> +void ima_free_ns_status_tree(struct ima_namespace *ns)
> +{
> +	struct ns_status *ns_status, *next;
> +	struct llist_node *node;
> +	LLIST_HEAD(garbage);
> +	unsigned int ctr;
> +	bool restart;
> +
> +	do {
> +		ctr = 0;
> +		restart = false;
> +
> +		lock_group(GRP_NS_STATUS_TREE);
> +		write_lock(&ns->ns_tree_lock);
> +
> +		rbtree_postorder_for_each_entry_safe(ns_status, next,
> +						     &ns->ns_status_tree,
> +						     rb_node) {
> +			write_lock(&ns_status->iint->ns_list_lock);
> +			if (!list_empty(&ns_status->ns_next)) {
> +				list_del_init(&ns_status->ns_next);
> +				llist_add(&ns_status->gc_llist, &garbage);
> +				ctr++;
> +			}

At this point when the namespace is being deleted, no entries are being
added to the rbtree, so it is safe to remove the nodes here.  There's
no need to first create a list and then remove them.

> +			write_unlock(&ns_status->iint->ns_list_lock);
> +
> +			/*
> +			 * After some progress yield to any waiting ns_list
> +			 * free'ers.
> +			 */
> +			if (atomic_read(&ns_list_waiters) > 0 && ctr >= 5) {
> +				restart = true;
> +				break;
> +			}

Giving priority to removing entries in the iint cache is important, but
I wish there was a better alternative.

> +		}
> +
> +		write_unlock(&ns->ns_tree_lock);
> +		unlock_group(GRP_NS_STATUS_TREE);
> +	} while (restart);
> +
> +	node = llist_del_all(&garbage);
> +	llist_for_each_entry_safe(ns_status, next, node, gc_llist)
> +		ns_status_free(ns, ns_status);
> +
> +	kmem_cache_destroy(ns->ns_status_cache);
> +}
Stefan Berger Feb. 24, 2022, 2:21 a.m. UTC | #2
On 2/23/22 11:12, Mimi Zohar wrote:
> On Tue, 2022-02-01 at 15:37 -0500, Stefan Berger wrote:
>> From: Mehmet Kayaalp <mkayaalp@linux.vnet.ibm.com>
>>
>> Add an rbtree to the IMA namespace structure that stores a namespaced
>> version of iint->flags in ns_status struct. Similar to the
>> integrity_iint_cache, both the iint and ns_status are looked up using the
>> inode pointer value. The lookup, allocate, and insertion code is also
>> similar.
>>
>> In subsequent patches we will have to find all ns_status entries an iint
>> is being used in and reset flags there. To do this, connect a list of
>> ns_status to the integrity_iint_cache and provide a reader-writer
>> lock in the integrity_iint_cache to lock access to the list.
>>
>> To simplify the code in the non-namespaces case embed an ns_status in
>> the integrity_iint_cache and have it linked into the iint's ns_status list
>> when calling ima_get_ns_status().
>>
>> When getting an ns_status first try to find it in the RB tree. Here we can
>> run into the situation that an ns_status found in the RB tree has a
>> different iint associated with it for the same inode. In this case we need
>> to delete the ns_status structure and get a new one.
>>
>> There are two cases for freeing:
>> - when the iint is freed (inode deletion): Walk the list of ns_status
>>    entries and disconnect each ns_status from the list; take the
>>    writer lock to protect access to the list; also, take the item off the
>>    per-namespace rbtree
>>
>> - when the ima_namepace is freed: While walking the rbtree, remove the
>>    ns_status from the list while also holding the iint's writer lock;
>>    to be able to grab the lock we have to have a pointer to the iint on
>>    the ns_status structure.
>>
>> To avoid an ns_status to be freed by the two cases concurrently, prevent
>> these two cases to run concurrently. Therefore, groups of threads
>> deleting either inodes or ima_namespaces are allowed to run concurrently
>> but no two threads may run and one delete an inode and the other an
>> ima_namespace.
> The locking involved here is really complex.  I'm sure you thought
> about it a lot, but isn't there a better alternative?

I am afraid this is a difficult question and a short and concise answer 
is not possible...

The complexity of the locking is driven by concurrency and the data 
structures that are involved. The data structures (existing global iint 
rbtree, ns_status structure, and per namespace rbtree for ns_status) and 
how they are organized and connected (via linked lists) are a 
consequence of the fact that we need to be able to handle files shared 
between IMA namespaces (and the host) so that re-auditing, re-measuring 
and re-appraisal of files after file modifications or modifications of 
the security.ima xattr (by any namespaces) can be done efficiently. 
Furthermore, it helps to efficiently remove all the status information 
that an IMA namespace has kept for files it audited/measured/appraised. 
The goal was to make this as scalable as possible by having each 
namespace get out of the way of other namespaces by preventing them from 
locking each other out too much. The single biggest problem are files 
shared between IMA namespaces.

The best argument for the design I can come up with is the 'Big O 
notation' describing the time complexity of operations.


The existing global iint rbtree maintains IMA status information for 
each inode. Lookup and insertion of data into the gloab iint rbtree  is 
O(log(n)), thus optimal.

To accommodate re-auditing/re-measurement/re-appraisal, which is driven 
by resetting status flags, I connected a list of ns_status structures, 
in which each namespace maintains its status information for each inode, 
to the iint maintained in that global rbtree. The resetting of status 
flags is fast because traversing the list after a lookup in the tree is 
O(n). Lookup + resetting the flags therefore is O(log(n) + n). If the 
list didn't exist we would have to search all IMA namespaces for the 
inode to be able to reset the flags, resulting in O(n * log(n)) time 
complexity, which is of course much worse. So, the list of ns_status 
linked to an iint has a good reason: better time complexity to traverse 
the list and reset status flags. Beside  that it also supports fast 
handling of deletion of files where the iint has to be delete from the 
global rbtree and the ns_status list it holds must also be deleted (each 
ns_status also needs to be delete from a per IMA-namespace rbtree then)


There's also a per-IMA namespace rbtree for each inode that serves two 
purposes:

a) Fast lookup of ns_status (O(log(n)) for an IMA namespace; at least to 
insert an ns_status into this tree we need not write-lock the iint tree 
but the initial iint creation required the write-locking of the iint tree

b) Maintaining a collection of inodes that the namespace has 
audited/measured/appraised for efficient deletion upon IMA namespace 
teardown: We can traverse this tree in O(n) time and determine which 
iints have no more namespace users and delete them from the iint tree.


Now the dilemma with this is that an ns_status structure is connected to 
a list hanging off the iint and on top of this it is part of an rbtree. 
And this is where the 'group locking' is coming from. What we need to 
prevent is that an ns_status is deleted from its iint list (when a file 
is deleted) while it is also deleted from the per-IMA namespace rbtree 
(when the namespace is deleted). Both must not be done concurrently. 
What is possible is that a group of threads may tear down namespaces and 
the other group may act on file deletion, but threads from both groups 
must not run concurrently.


Now we can at least look at two alternatives for the per-IMA namespace 
rbtree.

1) One alternative is to use a list instead of an rbtree. We would loose 
the fast lookup via the per IMA namespace tree and get O(n) lookup times 
but quick insertion into the list [O(1)]. We still would have the 
collection of inodes. And we would still have the dilemma that an 
ns_status would be connected to two lists, thus requiring the group 
locking. I don't think using a list instead of an rbtree is a solution.

2) We could try to get rid of the per-IMA namespace rbtree altogether 
and just use the global iint rbtree that exists today with a list of 
ns_status connected to its iints. If we do this we would loose the 
knowledge of which inodes a namespace has an ns_status structure for. 
The only way we would find this is by traversing the global iint tree 
(O(n)) and follow each iint list (O(m)) to see whether we find an 
ns_status holding information about the iint. The time complexity for 
this would be O(n*m) but much less than O(n^2). A downside would also be 
that we would have to keep a lock on the global iint rbtree while 
traversing it, thus locking out those that want to add inodes to the 
tree. On the upside it would allow us to get rid of the group locking. 
Lookup of an ns_status in the global iint tree would be O(n) + O(m) and 
insertion would be O(n) + O(1).


Certainly, the alternative is 2) with its own trade-offs. My guess is 
some sort of yielding could probably also be helpful there then to avoid 
blocking higher priority operations than deleting of a namespace.


>> +/*
>> + * ima_free_ns_status_tree - free all items on the ns_status_tree and take each
>> + *                           one off the list; yield to ns_list free'ers
>> + *
>> + * This function is called when an ima_namespace is freed. All entries in the
>> + * rbtree will be taken off their list and collected in a garbage collection
>> + * list and freed at the end. This allows to walk the rbtree again.
>> + */
>> +void ima_free_ns_status_tree(struct ima_namespace *ns)
>> +{
>> +	struct ns_status *ns_status, *next;
>> +	struct llist_node *node;
>> +	LLIST_HEAD(garbage);
>> +	unsigned int ctr;
>> +	bool restart;
>> +
>> +	do {
>> +		ctr = 0;
>> +		restart = false;
>> +
>> +		lock_group(GRP_NS_STATUS_TREE);
>> +		write_lock(&ns->ns_tree_lock);
>> +
>> +		rbtree_postorder_for_each_entry_safe(ns_status, next,
>> +						     &ns->ns_status_tree,
>> +						     rb_node) {
>> +			write_lock(&ns_status->iint->ns_list_lock);
>> +			if (!list_empty(&ns_status->ns_next)) {
>> +				list_del_init(&ns_status->ns_next);
>> +				llist_add(&ns_status->gc_llist, &garbage);
>> +				ctr++;
>> +			}
> At this point when the namespace is being deleted, no entries are being
> added to the rbtree, so it is safe to remove the nodes here.  There's
> no need to first create a list and then remove them.


We are traversing the per-namespace rbtree here and later on in this 
series (21/27) we try to determine which iints are now unused in the 
global iint tree and delete the iints there. Due to the locking that's 
necessary to remove the iint from the global rbtree all the ns_status 
are collected in this list here already now. True, in this patch here it 
is not necessary but in 21/27 it will be necessary to have them on this 
list. I had thought about merging the patches but its complex enough as-is.


    Stefan


>
>> +			write_unlock(&ns_status->iint->ns_list_lock);
>> +
>> +			/*
>> +			 * After some progress yield to any waiting ns_list
>> +			 * free'ers.
>> +			 */
>> +			if (atomic_read(&ns_list_waiters) > 0 && ctr >= 5) {
>> +				restart = true;
>> +				break;
>> +			}
> Giving priority to removing entries in the iint cache is important, but
> I wish there was a better alternative.
>
>> +		}
>> +
>> +		write_unlock(&ns->ns_tree_lock);
>> +		unlock_group(GRP_NS_STATUS_TREE);
>> +	} while (restart);
>> +
>> +	node = llist_del_all(&garbage);
>> +	llist_for_each_entry_safe(ns_status, next, node, gc_llist)
>> +		ns_status_free(ns, ns_status);
>> +
>> +	kmem_cache_destroy(ns->ns_status_cache);
>> +}
Stefan Berger Feb. 24, 2022, 2:49 a.m. UTC | #3
On 2/23/22 21:21, Stefan Berger wrote:
>
> On 2/23/22 11:12, Mimi Zohar wrote:
>> On Tue, 2022-02-01 at 15:37 -0500, Stefan Berger wrote:
>>> From: Mehmet Kayaalp <mkayaalp@linux.vnet.ibm.com>
>>>
>>> Add an rbtree to the IMA namespace structure that stores a namespaced
>>> version of iint->flags in ns_status struct. Similar to the
>>> integrity_iint_cache, both the iint and ns_status are looked up 
>>> using the
>>> inode pointer value. The lookup, allocate, and insertion code is also
>>> similar.
>>>
>>> In subsequent patches we will have to find all ns_status entries an 
>>> iint
>>> is being used in and reset flags there. To do this, connect a list of
>>> ns_status to the integrity_iint_cache and provide a reader-writer
>>> lock in the integrity_iint_cache to lock access to the list.
>>>
>>> To simplify the code in the non-namespaces case embed an ns_status in
>>> the integrity_iint_cache and have it linked into the iint's 
>>> ns_status list
>>> when calling ima_get_ns_status().
>>>
>>> When getting an ns_status first try to find it in the RB tree. Here 
>>> we can
>>> run into the situation that an ns_status found in the RB tree has a
>>> different iint associated with it for the same inode. In this case 
>>> we need
>>> to delete the ns_status structure and get a new one.
>>>
>>> There are two cases for freeing:
>>> - when the iint is freed (inode deletion): Walk the list of ns_status
>>>    entries and disconnect each ns_status from the list; take the
>>>    writer lock to protect access to the list; also, take the item 
>>> off the
>>>    per-namespace rbtree
>>>
>>> - when the ima_namepace is freed: While walking the rbtree, remove the
>>>    ns_status from the list while also holding the iint's writer lock;
>>>    to be able to grab the lock we have to have a pointer to the iint on
>>>    the ns_status structure.
>>>
>>> To avoid an ns_status to be freed by the two cases concurrently, 
>>> prevent
>>> these two cases to run concurrently. Therefore, groups of threads
>>> deleting either inodes or ima_namespaces are allowed to run 
>>> concurrently
>>> but no two threads may run and one delete an inode and the other an
>>> ima_namespace.
>> The locking involved here is really complex.  I'm sure you thought
>> about it a lot, but isn't there a better alternative?
>
> I am afraid this is a difficult question and a short and concise 
> answer is not possible...
>
> The complexity of the locking is driven by concurrency and the data 
> structures that are involved. The data structures (existing global 
> iint rbtree, ns_status structure, and per namespace rbtree for 
> ns_status) and how they are organized and connected (via linked lists) 
> are a consequence of the fact that we need to be able to handle files 
> shared between IMA namespaces (and the host) so that re-auditing, 
> re-measuring and re-appraisal of files after file modifications or 
> modifications of the security.ima xattr (by any namespaces) can be 
> done efficiently. Furthermore, it helps to efficiently remove all the 
> status information that an IMA namespace has kept for files it 
> audited/measured/appraised. The goal was to make this as scalable as 
> possible by having each namespace get out of the way of other 
> namespaces by preventing them from locking each other out too much. 
> The single biggest problem are files shared between IMA namespaces.
>
> The best argument for the design I can come up with is the 'Big O 
> notation' describing the time complexity of operations.
>
>
> The existing global iint rbtree maintains IMA status information for 
> each inode. Lookup and insertion of data into the gloab iint rbtree  
> is O(log(n)), thus optimal.
>
> To accommodate re-auditing/re-measurement/re-appraisal, which is 
> driven by resetting status flags, I connected a list of ns_status 
> structures, in which each namespace maintains its status information 
> for each inode, to the iint maintained in that global rbtree. The 
> resetting of status flags is fast because traversing the list after a 
> lookup in the tree is O(n). Lookup + resetting the flags therefore is 
> O(log(n) + n). If the list didn't exist we would have to search all 
> IMA namespaces for the inode to be able to reset the flags, resulting 
> in O(n * log(n)) time complexity, which is of course much worse. So, 
> the list of ns_status linked to an iint has a good reason: better time 
> complexity to traverse the list and reset status flags. Beside  that 
> it also supports fast handling of deletion of files where the iint has 
> to be delete from the global rbtree and the ns_status list it holds 
> must also be deleted (each ns_status also needs to be delete from a 
> per IMA-namespace rbtree then)
>
>
> There's also a per-IMA namespace rbtree for each inode that serves two 
> purposes:
>
> a) Fast lookup of ns_status (O(log(n)) for an IMA namespace; at least 
> to insert an ns_status into this tree we need not write-lock the iint 
> tree but the initial iint creation required the write-locking of the 
> iint tree
>
> b) Maintaining a collection of inodes that the namespace has 
> audited/measured/appraised for efficient deletion upon IMA namespace 
> teardown: We can traverse this tree in O(n) time and determine which 
> iints have no more namespace users and delete them from the iint tree.
>
>
> Now the dilemma with this is that an ns_status structure is connected 
> to a list hanging off the iint and on top of this it is part of an 
> rbtree. And this is where the 'group locking' is coming from. What we 
> need to prevent is that an ns_status is deleted from its iint list 
> (when a file is deleted) while it is also deleted from the per-IMA 
> namespace rbtree (when the namespace is deleted). Both must not be 
> done concurrently. What is possible is that a group of threads may 
> tear down namespaces and the other group may act on file deletion, but 
> threads from both groups must not run concurrently.
>
>
> Now we can at least look at two alternatives for the per-IMA namespace 
> rbtree.
>
> 1) One alternative is to use a list instead of an rbtree. We would 
> loose the fast lookup via the per IMA namespace tree and get O(n) 
> lookup times but quick insertion into the list [O(1)]. We still would 
> have the collection of inodes. And we would still have the dilemma 
> that an ns_status would be connected to two lists, thus requiring the 
> group locking. I don't think using a list instead of an rbtree is a 
> solution.
>
> 2) We could try to get rid of the per-IMA namespace rbtree altogether 
> and just use the global iint rbtree that exists today with a list of 
> ns_status connected to its iints. If we do this we would loose the 
> knowledge of which inodes a namespace has an ns_status structure for. 
> The only way we would find this is by traversing the global iint tree 
> (O(n)) and follow each iint list (O(m)) to see whether we find an 
> ns_status holding information about the iint. The time complexity for 
> this would be O(n*m) but much less than O(n^2). A downside would also 
> be that we would have to keep a lock on the global iint rbtree while 
> traversing it, thus locking out those that want to add inodes to the 
> tree. On the upside it would allow us to get rid of the group locking. 
> Lookup of an ns_status in the global iint tree would be O(n) + O(m) 
> and insertion would be O(n) + O(1).
>
>
> Certainly, the alternative is 2) with its own trade-offs. My guess is 
> some sort of yielding could probably also be helpful there then to 
> avoid blocking higher priority operations than deleting of a namespace.


I forgot to mention: It makes a difference if one has to walk the global 
iint tree to find the few ns_status for the namespace among possibly 
thousands of entries in that tree than having a per-IMA namespace rbtree 
that has these few ns_status right there. So walking the iint tree is 
more like O(N) versus O(n) walking the per-IMA namespace rbtree.
diff mbox series

Patch

diff --git a/include/linux/ima.h b/include/linux/ima.h
index 964a08702573..2fe32f1bf84b 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -225,12 +225,19 @@  static inline bool ima_appraise_signature(enum kernel_read_file_id func)
 
 void free_ima_ns(struct user_namespace *ns);
 
+void ima_free_ns_status_list(struct list_head *head, rwlock_t *ns_list_lock);
+
 #else
 
 static inline void free_ima_ns(struct user_namespace *user_ns)
 {
 }
 
+static inline void ima_free_ns_status_list(struct list_head *head,
+					   rwlock_t *ns_list_lock)
+{
+}
+
 #endif /* CONFIG_IMA_NS */
 
 #endif /* _LINUX_IMA_H */
diff --git a/security/integrity/iint.c b/security/integrity/iint.c
index 8638976f7990..371cbceaf479 100644
--- a/security/integrity/iint.c
+++ b/security/integrity/iint.c
@@ -19,6 +19,7 @@ 
 #include <linux/uaccess.h>
 #include <linux/security.h>
 #include <linux/lsm_hooks.h>
+#include <linux/ima.h>
 #include "integrity.h"
 
 static struct rb_root integrity_iint_tree = RB_ROOT;
@@ -82,6 +83,8 @@  static void iint_free(struct integrity_iint_cache *iint)
 	iint->ima_creds_status = INTEGRITY_UNKNOWN;
 	iint->evm_status = INTEGRITY_UNKNOWN;
 	iint->measured_pcrs = 0;
+	rwlock_init(&iint->ns_list_lock);
+	INIT_LIST_HEAD(&iint->ns_list);
 	kmem_cache_free(iint_cache, iint);
 }
 
@@ -155,6 +158,8 @@  void integrity_inode_free(struct inode *inode)
 	rb_erase(&iint->rb_node, &integrity_iint_tree);
 	write_unlock(&integrity_iint_lock);
 
+	ima_free_ns_status_list(&iint->ns_list, &iint->ns_list_lock);
+
 	iint_free(iint);
 }
 
@@ -170,6 +175,8 @@  static void init_once(void *foo)
 	iint->ima_creds_status = INTEGRITY_UNKNOWN;
 	iint->evm_status = INTEGRITY_UNKNOWN;
 	mutex_init(&iint->mutex);
+	rwlock_init(&iint->ns_list_lock);
+	INIT_LIST_HEAD(&iint->ns_list);
 }
 
 static int __init integrity_iintcache_init(void)
diff --git a/security/integrity/ima/Makefile b/security/integrity/ima/Makefile
index b86a35fbed60..edfb0c30a063 100644
--- a/security/integrity/ima/Makefile
+++ b/security/integrity/ima/Makefile
@@ -14,7 +14,7 @@  ima-$(CONFIG_HAVE_IMA_KEXEC) += ima_kexec.o
 ima-$(CONFIG_IMA_BLACKLIST_KEYRING) += ima_mok.o
 ima-$(CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS) += ima_asymmetric_keys.o
 ima-$(CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS) += ima_queue_keys.o
-ima-$(CONFIG_IMA_NS) += ima_ns.o
+ima-$(CONFIG_IMA_NS) += ima_ns.o ima_ns_status.o
 
 ifeq ($(CONFIG_EFI),y)
 ima-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_efi.o
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index 751e936a6311..4af8f2c4df40 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -123,6 +123,10 @@  struct ima_h_table {
 };
 
 struct ima_namespace {
+	struct rb_root ns_status_tree;
+	rwlock_t ns_tree_lock;
+	struct kmem_cache *ns_status_cache;
+
 	/* policy rules */
 	struct list_head ima_default_rules;
 	struct list_head ima_policy_rules;
@@ -507,6 +511,12 @@  static inline struct ima_namespace
 
 struct ima_namespace *create_ima_ns(void);
 
+struct ns_status *ima_get_ns_status(struct ima_namespace *ns,
+				    struct inode *inode,
+				    struct integrity_iint_cache *iint);
+
+void ima_free_ns_status_tree(struct ima_namespace *ns);
+
 #else
 
 static inline struct ima_namespace *create_ima_ns(void)
@@ -515,6 +525,13 @@  static inline struct ima_namespace *create_ima_ns(void)
 	return ERR_PTR(-EFAULT);
 }
 
+static inline struct ns_status *ima_get_ns_status(struct ima_namespace *ns,
+						  struct inode *inode,
+						  struct integrity_iint_cache *iint)
+{
+	return NULL;
+}
+
 #endif /* CONFIG_IMA_NS */
 
 #endif /* __LINUX_IMA_H */
diff --git a/security/integrity/ima/ima_init_ima_ns.c b/security/integrity/ima/ima_init_ima_ns.c
index 166dab4a3126..d4ddfd1de60b 100644
--- a/security/integrity/ima/ima_init_ima_ns.c
+++ b/security/integrity/ima/ima_init_ima_ns.c
@@ -12,6 +12,11 @@  int ima_init_namespace(struct ima_namespace *ns)
 {
 	int rc;
 
+	ns->ns_status_tree = RB_ROOT;
+	rwlock_init(&ns->ns_tree_lock);
+	/* Use KMEM_CACHE for simplicity */
+	ns->ns_status_cache = KMEM_CACHE(ns_status, SLAB_PANIC);
+
 	INIT_LIST_HEAD(&ns->ima_default_rules);
 	INIT_LIST_HEAD(&ns->ima_policy_rules);
 	INIT_LIST_HEAD(&ns->ima_temp_rules);
diff --git a/security/integrity/ima/ima_ns.c b/security/integrity/ima/ima_ns.c
index b3b81a1e313e..29af6fea2d74 100644
--- a/security/integrity/ima/ima_ns.c
+++ b/security/integrity/ima/ima_ns.c
@@ -29,6 +29,7 @@  static void destroy_ima_ns(struct ima_namespace *ns)
 	unregister_blocking_lsm_notifier(&ns->ima_lsm_policy_notifier);
 	kfree(ns->arch_policy_entry);
 	ima_free_policy_rules(ns);
+	ima_free_ns_status_tree(ns);
 }
 
 void free_ima_ns(struct user_namespace *user_ns)
diff --git a/security/integrity/ima/ima_ns_status.c b/security/integrity/ima/ima_ns_status.c
new file mode 100644
index 000000000000..9c753caad6ac
--- /dev/null
+++ b/security/integrity/ima/ima_ns_status.c
@@ -0,0 +1,344 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2016-2021 IBM Corporation
+ * Author:
+ *  Yuqiong Sun <suny@us.ibm.com>
+ *  Stefan Berger <stefanb@linux.vnet.ibm.com>
+ */
+
+#include <linux/user_namespace.h>
+#include <linux/ima.h>
+
+#include "ima.h"
+
+/*
+ * An ns_status must be on a per-namespace rbtree and on a per-iint list.
+ *
+ * Locking order for ns_status:
+ * 1) ns->ns_tree_lock  : Lock the rbtree
+ * 2) iint->ns_list_lock: Lock the list
+ *
+ * An ns_status can be freed either by walking the rbtree (namespace deletion)
+ * or by walking the linked list of ns_status (inode/iint deletion). There are
+ * two functions that implement each one of the cases. To avoid concurrent
+ * freeing of the same ns_status, the two freeing paths cannot be run
+ * concurrently but each path can be run by multiple threads since no two
+ * threads will free the same inode/iint and no two threads will free the same
+ * namespace. Grouping threads like this ensures that:
+ * - while walking the rbtree: all ns_status will be on their list and the iint
+ *                             will still exist
+ * - while walking the list:   all ns_status will be on their rbtree
+ */
+enum lk_group {
+	GRP_NS_STATUS_LIST = 0,
+	GRP_NS_STATUS_TREE
+};
+
+static atomic_t lg_ctr[2] = {
+	ATOMIC_INIT(0),
+	ATOMIC_INIT(0)
+};
+
+static DEFINE_SPINLOCK(lg_ctr_lock);
+
+static struct wait_queue_head lg_wq[2] = {
+	__WAIT_QUEUE_HEAD_INITIALIZER(lg_wq[0]),
+	__WAIT_QUEUE_HEAD_INITIALIZER(lg_wq[1])
+};
+
+static atomic_t ns_list_waiters = ATOMIC_INIT(0);
+
+/*
+ * Any number of concurrent threads may free ns_status's in either one of the
+ * groups but the groups must not run concurrently. The GRP_NS_STATUS_TREE
+ * group yields to waiters in the GRP_NS_STATUS_LIST group since namespace
+ * deletion has more time.
+ */
+static void lock_group(enum lk_group group)
+{
+	unsigned long flags;
+	bool done = false;
+	int announced = 0;
+
+	while (1) {
+		spin_lock_irqsave(&lg_ctr_lock, flags);
+
+		switch (group) {
+		case GRP_NS_STATUS_LIST:
+			if (atomic_read(&lg_ctr[GRP_NS_STATUS_TREE]) == 0) {
+				if (announced)
+					atomic_dec(&ns_list_waiters);
+				done = true;
+				atomic_inc(&lg_ctr[GRP_NS_STATUS_LIST]);
+			} else {
+				/* rbtree being deleted; announce waiting */
+				if (!announced) {
+					atomic_inc(&ns_list_waiters);
+					announced = 1;
+				}
+			}
+			break;
+		case GRP_NS_STATUS_TREE:
+			if (atomic_read(&lg_ctr[GRP_NS_STATUS_LIST]) == 0 &&
+			    atomic_read(&ns_list_waiters) == 0) {
+				done = true;
+				atomic_inc(&lg_ctr[GRP_NS_STATUS_TREE]);
+			}
+			break;
+		}
+
+		spin_unlock_irqrestore(&lg_ctr_lock, flags);
+
+		if (done)
+			break;
+
+		/* wait until opposite group is done */
+		switch (group) {
+		case GRP_NS_STATUS_LIST:
+			wait_event_interruptible
+			    (lg_wq[GRP_NS_STATUS_LIST],
+			     atomic_read(&lg_ctr[GRP_NS_STATUS_TREE]) == 0);
+			break;
+		case GRP_NS_STATUS_TREE:
+			wait_event_interruptible
+			    (lg_wq[GRP_NS_STATUS_TREE],
+			     atomic_read(&lg_ctr[GRP_NS_STATUS_LIST]) == 0 &&
+			     atomic_read(&ns_list_waiters) == 0);
+			break;
+		}
+	}
+}
+
+static void unlock_group(enum lk_group group)
+{
+	switch (group) {
+	case GRP_NS_STATUS_LIST:
+		if (atomic_dec_and_test(&lg_ctr[GRP_NS_STATUS_LIST]))
+			wake_up_interruptible_all(&lg_wq[GRP_NS_STATUS_TREE]);
+		break;
+	case GRP_NS_STATUS_TREE:
+		if (atomic_dec_and_test(&lg_ctr[GRP_NS_STATUS_TREE]))
+			wake_up_interruptible_all(&lg_wq[GRP_NS_STATUS_LIST]);
+		break;
+	}
+}
+
+static void ns_status_free(struct ima_namespace *ns,
+			   struct ns_status *ns_status)
+{
+	pr_debug("FREE ns_status: %p\n", ns_status);
+
+	kmem_cache_free(ns->ns_status_cache, ns_status);
+}
+
+/*
+ * ima_free_ns_status_tree - free all items on the ns_status_tree and take each
+ *                           one off the list; yield to ns_list free'ers
+ *
+ * This function is called when an ima_namespace is freed. All entries in the
+ * rbtree will be taken off their list and collected in a garbage collection
+ * list and freed at the end. This allows to walk the rbtree again.
+ */
+void ima_free_ns_status_tree(struct ima_namespace *ns)
+{
+	struct ns_status *ns_status, *next;
+	struct llist_node *node;
+	LLIST_HEAD(garbage);
+	unsigned int ctr;
+	bool restart;
+
+	do {
+		ctr = 0;
+		restart = false;
+
+		lock_group(GRP_NS_STATUS_TREE);
+		write_lock(&ns->ns_tree_lock);
+
+		rbtree_postorder_for_each_entry_safe(ns_status, next,
+						     &ns->ns_status_tree,
+						     rb_node) {
+			write_lock(&ns_status->iint->ns_list_lock);
+			if (!list_empty(&ns_status->ns_next)) {
+				list_del_init(&ns_status->ns_next);
+				llist_add(&ns_status->gc_llist, &garbage);
+				ctr++;
+			}
+			write_unlock(&ns_status->iint->ns_list_lock);
+
+			/*
+			 * After some progress yield to any waiting ns_list
+			 * free'ers.
+			 */
+			if (atomic_read(&ns_list_waiters) > 0 && ctr >= 5) {
+				restart = true;
+				break;
+			}
+		}
+
+		write_unlock(&ns->ns_tree_lock);
+		unlock_group(GRP_NS_STATUS_TREE);
+	} while (restart);
+
+	node = llist_del_all(&garbage);
+	llist_for_each_entry_safe(ns_status, next, node, gc_llist)
+		ns_status_free(ns, ns_status);
+
+	kmem_cache_destroy(ns->ns_status_cache);
+}
+
+/*
+ * ima_free_ns_status_list: free the list of ns_status items and take
+ *                          each one off its namespace rbtree
+ */
+void ima_free_ns_status_list(struct list_head *head, rwlock_t *ns_list_lock)
+{
+	struct ns_status *ns_status;
+
+	lock_group(GRP_NS_STATUS_LIST);
+
+	while (1) {
+		write_lock(ns_list_lock);
+		ns_status = list_first_entry_or_null(head, struct ns_status,
+						     ns_next);
+		if (ns_status)
+			list_del_init(&ns_status->ns_next);
+		write_unlock(ns_list_lock);
+
+		if (!ns_status)
+			break;
+
+		write_lock(&ns_status->ns->ns_tree_lock);
+
+		rb_erase(&ns_status->rb_node, &ns_status->ns->ns_status_tree);
+		RB_CLEAR_NODE(&ns_status->rb_node);
+
+		write_unlock(&ns_status->ns->ns_tree_lock);
+
+		ns_status_free(ns_status->ns, ns_status);
+	}
+
+	unlock_group(GRP_NS_STATUS_LIST);
+}
+
+/*
+ * ns_status_find - return the ns_status associated with an inode;
+ *                  caller must hold lock for tree
+ */
+static struct ns_status *ns_status_find(struct ima_namespace *ns,
+					struct inode *inode)
+{
+	struct ns_status *ns_status;
+	struct rb_node *n = ns->ns_status_tree.rb_node;
+
+	while (n) {
+		ns_status = rb_entry(n, struct ns_status, rb_node);
+
+		if (inode < ns_status->inode)
+			n = n->rb_left;
+		else if (inode > ns_status->inode)
+			n = n->rb_right;
+		else
+			break;
+	}
+	if (!n)
+		return NULL;
+
+	return ns_status;
+}
+
+static void insert_ns_status(struct ima_namespace *ns, struct inode *inode,
+			     struct ns_status *ns_status)
+{
+	struct rb_node **p;
+	struct rb_node *node, *parent = NULL;
+	struct ns_status *test_status;
+
+	p = &ns->ns_status_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		test_status = rb_entry(parent, struct ns_status, rb_node);
+		if (inode < test_status->inode)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+	node = &ns_status->rb_node;
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, &ns->ns_status_tree);
+}
+
+static void ns_status_unlink(struct ima_namespace *ns,
+			     struct ns_status *ns_status)
+{
+	write_lock(&ns_status->iint->ns_list_lock);
+	if (!list_empty(&ns_status->ns_next))
+		list_del_init(&ns_status->ns_next);
+	write_unlock(&ns_status->iint->ns_list_lock);
+
+	rb_erase(&ns_status->rb_node, &ns->ns_status_tree);
+	RB_CLEAR_NODE(&ns_status->rb_node);
+}
+
+struct ns_status *ima_get_ns_status(struct ima_namespace *ns,
+				    struct inode *inode,
+				    struct integrity_iint_cache *iint)
+{
+	struct ns_status *ns_status;
+	bool get_new = true;
+
+	/*
+	 * Prevent finding the status via the list (inode/iint deletion) since
+	 * we may free it.
+	 */
+	lock_group(GRP_NS_STATUS_TREE);
+
+	write_lock(&ns->ns_tree_lock);
+
+	ns_status = ns_status_find(ns, inode);
+	if (ns_status) {
+		if (unlikely(ns_status->iint != iint)) {
+			/* Same inode but stale iint: free it and get new */
+			ns_status_unlink(ns, ns_status);
+			ns_status_free(ns, ns_status);
+		} else if (inode->i_ino == ns_status->i_ino &&
+			   inode->i_generation == ns_status->i_generation) {
+			goto unlock;
+		} else {
+			/* Reuse of ns_status is possible but need reset */
+			ns_status_reset(ns_status);
+			get_new = false;
+		}
+	}
+
+	if (get_new) {
+		ns_status = kmem_cache_alloc(ns->ns_status_cache, GFP_NOFS);
+		if (!ns_status) {
+			ns_status = ERR_PTR(-ENOMEM);
+			goto unlock;
+		}
+
+		pr_debug("NEW  ns_status: %p\n", ns_status);
+
+		ns_status_init(ns_status);
+		insert_ns_status(ns, inode, ns_status);
+	}
+
+	ns_status->iint = iint;
+	ns_status->inode = inode;
+	ns_status->ns = ns;
+	ns_status->i_ino = inode->i_ino;
+	ns_status->i_generation = inode->i_generation;
+
+	/* make visible on list */
+	write_lock(&iint->ns_list_lock);
+	if (list_empty(&ns_status->ns_next))
+		list_add_tail(&ns_status->ns_next, &iint->ns_list);
+	write_unlock(&iint->ns_list_lock);
+
+unlock:
+	write_unlock(&ns->ns_tree_lock);
+
+	unlock_group(GRP_NS_STATUS_TREE);
+
+	return ns_status;
+}
diff --git a/security/integrity/integrity.h b/security/integrity/integrity.h
index 547425c20e11..b7d5ca108900 100644
--- a/security/integrity/integrity.h
+++ b/security/integrity/integrity.h
@@ -122,13 +122,39 @@  struct signature_v2_hdr {
 	uint8_t sig[];		/* signature payload */
 } __packed;
 
+/* integrity status of an inode in a namespace */
+struct ns_status {
+	struct list_head ns_next;
+	unsigned long flags;		/* flags split with iint */
+#ifdef CONFIG_IMA_NS
+	struct rb_node rb_node;
+	struct integrity_iint_cache *iint;
+	struct inode *inode;
+	struct ima_namespace *ns;
+	ino_t i_ino;
+	u32 i_generation;
+	struct llist_node gc_llist;	/* used while freeing */
+#endif
+};
+
+static inline void ns_status_reset(struct ns_status *ns_status)
+{
+	ns_status->flags = 0;
+}
+
+static inline void ns_status_init(struct ns_status *ns_status)
+{
+	INIT_LIST_HEAD(&ns_status->ns_next);
+	ns_status_reset(ns_status);
+}
+
 /* integrity data associated with an inode */
 struct integrity_iint_cache {
 	struct rb_node rb_node;	/* rooted in integrity_iint_tree */
 	struct mutex mutex;	/* protects: version, flags, digest */
 	struct inode *inode;	/* back pointer to inode in question */
 	u64 version;		/* track inode changes */
-	unsigned long flags;
+	unsigned long flags;	/* flags split with ns_status */
 	unsigned long measured_pcrs;
 	unsigned long atomic_flags;
 	enum integrity_status ima_file_status:4;
@@ -138,6 +164,13 @@  struct integrity_iint_cache {
 	enum integrity_status ima_creds_status:4;
 	enum integrity_status evm_status:4;
 	struct ima_digest_data *ima_hash;
+
+	/*
+	 * Lock and list of ns_status for files shared by different
+	 * namespaces
+	 */
+	rwlock_t ns_list_lock;
+	struct list_head ns_list;
 };
 
 /* rbtree tree calls to lookup, insert, delete