diff mbox series

[09/23] ns: Introduce ns_idr to be able to iterate all allocated namespaces in the system

Message ID 159611040870.535980.13460189038999722608.stgit@localhost.localdomain (mailing list archive)
State New, archived
Headers show
Series proc: Introduce /proc/namespaces/ directory to expose namespaces lineary | expand

Commit Message

Kirill Tkhai July 30, 2020, noon UTC
This patch introduces a new IDR and functions to add/remove and iterate
registered namespaces in the system. It will be used to list namespaces
in /proc/namespaces/... in next patches.

The IDR is protected by ns_idr, and it's choosen to be a spinlock (not
mutex) to allow calling ns_idr_unregister() from put_xxx_ns() methods,
which may be called from (say) softirq context. Spinlock allows us
to avoid introduction of kwork on top of put_xxx_ns() to call mutex_lock().

We introduce a new IDR, because there is no appropriate items to reuse
instead of this. The closest proc_inum_ida does not fit our goals:
it is IDA and its convertation to IDR will bring a big overhead by proc
entries, which are not interested in IDR functionality (pointers).

Read access to ns_idr is made lockless (see ns_get_next()). This is made
for better parallelism and better performance from start. One new requirement
to do this is that namespace memory must be freed one RCU grace period
after ns_idr_unregister(). Some namespaces types already does this (say, net),
the rest will be converted to use kfree_rcu()/etc, where they become
linked to the IDR. See next patches in this series for the details.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 fs/nsfs.c                 |   76 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/ns_common.h |   10 ++++++
 include/linux/proc_ns.h   |   11 ++++---
 3 files changed, 92 insertions(+), 5 deletions(-)

Comments

Matthew Wilcox (Oracle) July 30, 2020, 12:23 p.m. UTC | #1
On Thu, Jul 30, 2020 at 03:00:08PM +0300, Kirill Tkhai wrote:
> This patch introduces a new IDR and functions to add/remove and iterate
> registered namespaces in the system. It will be used to list namespaces
> in /proc/namespaces/... in next patches.

Looks like you could use an XArray for this and it would be fewer lines of
code.

>  
>  static struct vfsmount *nsfs_mnt;
> +static DEFINE_SPINLOCK(ns_lock);
> +static DEFINE_IDR(ns_idr);

XArray includes its own spinlock.

> +/*
> + * Add a newly created ns to ns_idr. The ns must be fully
> + * initialized since it becomes available for ns_get_next()
> + * right after we exit this function.
> + */
> +int ns_idr_register(struct ns_common *ns)
> +{
> +	int ret, id = ns->inum - PROC_NS_MIN_INO;
> +
> +	if (WARN_ON(id < 0))
> +		return -EINVAL;
> +
> +	idr_preload(GFP_KERNEL);
> +	spin_lock_irq(&ns_lock);
> +	ret = idr_alloc(&ns_idr, ns, id, id + 1, GFP_ATOMIC);
> +	spin_unlock_irq(&ns_lock);
> +	idr_preload_end();
> +	return ret < 0 ? ret : 0;

This would simply be return xa_insert_irq(...);

> +}
> +
> +/*
> + * Remove a dead ns from ns_idr. Note, that ns memory must
> + * be freed not earlier then one RCU grace period after
> + * this function, since ns_get_next() uses RCU to iterate the IDR.
> + */
> +void ns_idr_unregister(struct ns_common *ns)
> +{
> +	int id = ns->inum - PROC_NS_MIN_INO;
> +	unsigned long flags;
> +
> +	if (WARN_ON(id < 0))
> +		return;
> +
> +	spin_lock_irqsave(&ns_lock, flags);
> +	idr_remove(&ns_idr, id);
> +	spin_unlock_irqrestore(&ns_lock, flags);
> +}

xa_erase_irqsave();

> +
> +/*
> + * This returns ns with inum greater than @id or NULL.
> + * @id is updated to refer the ns inum.
> + */
> +struct ns_common *ns_get_next(unsigned int *id)
> +{
> +	struct ns_common *ns;
> +
> +	if (*id < PROC_NS_MIN_INO - 1)
> +		*id = PROC_NS_MIN_INO - 1;
> +
> +	*id += 1;
> +	*id -= PROC_NS_MIN_INO;
> +
> +	rcu_read_lock();
> +	do {
> +		ns = idr_get_next(&ns_idr, id);
> +		if (!ns)
> +			break;

xa_find_after();

You'll want a temporary unsigned long to work with ...

> +		if (!refcount_inc_not_zero(&ns->count)) {
> +			ns = NULL;
> +			*id += 1;

you won't need this increment.
Kirill Tkhai July 30, 2020, 1:32 p.m. UTC | #2
On 30.07.2020 15:23, Matthew Wilcox wrote:
> On Thu, Jul 30, 2020 at 03:00:08PM +0300, Kirill Tkhai wrote:
>> This patch introduces a new IDR and functions to add/remove and iterate
>> registered namespaces in the system. It will be used to list namespaces
>> in /proc/namespaces/... in next patches.
> 
> Looks like you could use an XArray for this and it would be fewer lines of
> code.
> 
>>  
>>  static struct vfsmount *nsfs_mnt;
>> +static DEFINE_SPINLOCK(ns_lock);
>> +static DEFINE_IDR(ns_idr);
> 
> XArray includes its own spinlock.
> 
>> +/*
>> + * Add a newly created ns to ns_idr. The ns must be fully
>> + * initialized since it becomes available for ns_get_next()
>> + * right after we exit this function.
>> + */
>> +int ns_idr_register(struct ns_common *ns)
>> +{
>> +	int ret, id = ns->inum - PROC_NS_MIN_INO;
>> +
>> +	if (WARN_ON(id < 0))
>> +		return -EINVAL;
>> +
>> +	idr_preload(GFP_KERNEL);
>> +	spin_lock_irq(&ns_lock);
>> +	ret = idr_alloc(&ns_idr, ns, id, id + 1, GFP_ATOMIC);
>> +	spin_unlock_irq(&ns_lock);
>> +	idr_preload_end();
>> +	return ret < 0 ? ret : 0;
> 
> This would simply be return xa_insert_irq(...);
> 
>> +}
>> +
>> +/*
>> + * Remove a dead ns from ns_idr. Note, that ns memory must
>> + * be freed not earlier then one RCU grace period after
>> + * this function, since ns_get_next() uses RCU to iterate the IDR.
>> + */
>> +void ns_idr_unregister(struct ns_common *ns)
>> +{
>> +	int id = ns->inum - PROC_NS_MIN_INO;
>> +	unsigned long flags;
>> +
>> +	if (WARN_ON(id < 0))
>> +		return;
>> +
>> +	spin_lock_irqsave(&ns_lock, flags);
>> +	idr_remove(&ns_idr, id);
>> +	spin_unlock_irqrestore(&ns_lock, flags);
>> +}
> 
> xa_erase_irqsave();

static inline void *xa_erase_irqsave(struct xarray *xa, unsigned long index)
{
	unsigned long flags;
        void *entry;

        xa_lock_irqsave(xa, flags);
        entry = __xa_erase(xa, index);
        xa_unlock_irqrestore(xa, flags);

        return entry;
}

>> +
>> +/*
>> + * This returns ns with inum greater than @id or NULL.
>> + * @id is updated to refer the ns inum.
>> + */
>> +struct ns_common *ns_get_next(unsigned int *id)
>> +{
>> +	struct ns_common *ns;
>> +
>> +	if (*id < PROC_NS_MIN_INO - 1)
>> +		*id = PROC_NS_MIN_INO - 1;
>> +
>> +	*id += 1;
>> +	*id -= PROC_NS_MIN_INO;
>> +
>> +	rcu_read_lock();
>> +	do {
>> +		ns = idr_get_next(&ns_idr, id);
>> +		if (!ns)
>> +			break;
> 
> xa_find_after();
> 
> You'll want a temporary unsigned long to work with ...
> 
>> +		if (!refcount_inc_not_zero(&ns->count)) {
>> +			ns = NULL;
>> +			*id += 1;
> 
> you won't need this increment.

Why? I don't see a way xarray allows to avoid this.
Matthew Wilcox (Oracle) July 30, 2020, 1:56 p.m. UTC | #3
On Thu, Jul 30, 2020 at 04:32:22PM +0300, Kirill Tkhai wrote:
> On 30.07.2020 15:23, Matthew Wilcox wrote:
> > xa_erase_irqsave();
> 
> static inline void *xa_erase_irqsave(struct xarray *xa, unsigned long index)
> {
> 	unsigned long flags;
>         void *entry;
> 
>         xa_lock_irqsave(xa, flags);
>         entry = __xa_erase(xa, index);
>         xa_unlock_irqrestore(xa, flags);
> 
>         return entry;
> }

was there a question here?

> >> +struct ns_common *ns_get_next(unsigned int *id)
> >> +{
> >> +	struct ns_common *ns;
> >> +
> >> +	if (*id < PROC_NS_MIN_INO - 1)
> >> +		*id = PROC_NS_MIN_INO - 1;
> >> +
> >> +	*id += 1;
> >> +	*id -= PROC_NS_MIN_INO;
> >> +
> >> +	rcu_read_lock();
> >> +	do {
> >> +		ns = idr_get_next(&ns_idr, id);
> >> +		if (!ns)
> >> +			break;
> > 
> > xa_find_after();
> > 
> > You'll want a temporary unsigned long to work with ...
> > 
> >> +		if (!refcount_inc_not_zero(&ns->count)) {
> >> +			ns = NULL;
> >> +			*id += 1;
> > 
> > you won't need this increment.
> 
> Why? I don't see a way xarray allows to avoid this.

It's embedded in xa_find_after().
Kirill Tkhai July 30, 2020, 2:12 p.m. UTC | #4
On 30.07.2020 16:56, Matthew Wilcox wrote:
> On Thu, Jul 30, 2020 at 04:32:22PM +0300, Kirill Tkhai wrote:
>> On 30.07.2020 15:23, Matthew Wilcox wrote:
>>> xa_erase_irqsave();
>>
>> static inline void *xa_erase_irqsave(struct xarray *xa, unsigned long index)
>> {
>> 	unsigned long flags;
>>         void *entry;
>>
>>         xa_lock_irqsave(xa, flags);
>>         entry = __xa_erase(xa, index);
>>         xa_unlock_irqrestore(xa, flags);
>>
>>         return entry;
>> }
> 
> was there a question here?

No, I just I will add this in separate patch.
 
>>>> +struct ns_common *ns_get_next(unsigned int *id)
>>>> +{
>>>> +	struct ns_common *ns;
>>>> +
>>>> +	if (*id < PROC_NS_MIN_INO - 1)
>>>> +		*id = PROC_NS_MIN_INO - 1;
>>>> +
>>>> +	*id += 1;
>>>> +	*id -= PROC_NS_MIN_INO;
>>>> +
>>>> +	rcu_read_lock();
>>>> +	do {
>>>> +		ns = idr_get_next(&ns_idr, id);
>>>> +		if (!ns)
>>>> +			break;
>>>
>>> xa_find_after();
>>>
>>> You'll want a temporary unsigned long to work with ...
>>>
>>>> +		if (!refcount_inc_not_zero(&ns->count)) {
>>>> +			ns = NULL;
>>>> +			*id += 1;
>>>
>>> you won't need this increment.
>>
>> Why? I don't see a way xarray allows to avoid this.
> 
> It's embedded in xa_find_after().
 
How is it embedded to check ns->count that it knows nothing?
Matthew Wilcox (Oracle) July 30, 2020, 2:15 p.m. UTC | #5
On Thu, Jul 30, 2020 at 05:12:09PM +0300, Kirill Tkhai wrote:
> On 30.07.2020 16:56, Matthew Wilcox wrote:
> > On Thu, Jul 30, 2020 at 04:32:22PM +0300, Kirill Tkhai wrote:
> >> On 30.07.2020 15:23, Matthew Wilcox wrote:
> >>> xa_erase_irqsave();
> >>
> >> static inline void *xa_erase_irqsave(struct xarray *xa, unsigned long index)
> >> {
> >> 	unsigned long flags;
> >>         void *entry;
> >>
> >>         xa_lock_irqsave(xa, flags);
> >>         entry = __xa_erase(xa, index);
> >>         xa_unlock_irqrestore(xa, flags);
> >>
> >>         return entry;
> >> }
> > 
> > was there a question here?
> 
> No, I just I will add this in separate patch.

Ah, yes.  Thanks!

> >>>> +struct ns_common *ns_get_next(unsigned int *id)
> >>>> +{
> >>>> +	struct ns_common *ns;
> >>>> +
> >>>> +	if (*id < PROC_NS_MIN_INO - 1)
> >>>> +		*id = PROC_NS_MIN_INO - 1;
> >>>> +
> >>>> +	*id += 1;
> >>>> +	*id -= PROC_NS_MIN_INO;
> >>>> +
> >>>> +	rcu_read_lock();
> >>>> +	do {
> >>>> +		ns = idr_get_next(&ns_idr, id);
> >>>> +		if (!ns)
> >>>> +			break;
> >>>
> >>> xa_find_after();
> >>>
> >>> You'll want a temporary unsigned long to work with ...
> >>>
> >>>> +		if (!refcount_inc_not_zero(&ns->count)) {
> >>>> +			ns = NULL;
> >>>> +			*id += 1;
> >>>
> >>> you won't need this increment.
> >>
> >> Why? I don't see a way xarray allows to avoid this.
> > 
> > It's embedded in xa_find_after().
>  
> How is it embedded to check ns->count that it knows nothing?

I meant you won't need to increment '*id'.  The refcount is, of course,
your business.
Kirill Tkhai July 30, 2020, 2:20 p.m. UTC | #6
On 30.07.2020 17:15, Matthew Wilcox wrote:
> On Thu, Jul 30, 2020 at 05:12:09PM +0300, Kirill Tkhai wrote:
>> On 30.07.2020 16:56, Matthew Wilcox wrote:
>>> On Thu, Jul 30, 2020 at 04:32:22PM +0300, Kirill Tkhai wrote:
>>>> On 30.07.2020 15:23, Matthew Wilcox wrote:
>>>>> xa_erase_irqsave();
>>>>
>>>> static inline void *xa_erase_irqsave(struct xarray *xa, unsigned long index)
>>>> {
>>>> 	unsigned long flags;
>>>>         void *entry;
>>>>
>>>>         xa_lock_irqsave(xa, flags);
>>>>         entry = __xa_erase(xa, index);
>>>>         xa_unlock_irqrestore(xa, flags);
>>>>
>>>>         return entry;
>>>> }
>>>
>>> was there a question here?
>>
>> No, I just I will add this in separate patch.
> 
> Ah, yes.  Thanks!
> 
>>>>>> +struct ns_common *ns_get_next(unsigned int *id)
>>>>>> +{
>>>>>> +	struct ns_common *ns;
>>>>>> +
>>>>>> +	if (*id < PROC_NS_MIN_INO - 1)
>>>>>> +		*id = PROC_NS_MIN_INO - 1;
>>>>>> +
>>>>>> +	*id += 1;
>>>>>> +	*id -= PROC_NS_MIN_INO;
>>>>>> +
>>>>>> +	rcu_read_lock();
>>>>>> +	do {
>>>>>> +		ns = idr_get_next(&ns_idr, id);
>>>>>> +		if (!ns)
>>>>>> +			break;
>>>>>
>>>>> xa_find_after();
>>>>>
>>>>> You'll want a temporary unsigned long to work with ...
>>>>>
>>>>>> +		if (!refcount_inc_not_zero(&ns->count)) {
>>>>>> +			ns = NULL;
>>>>>> +			*id += 1;
>>>>>
>>>>> you won't need this increment.
>>>>
>>>> Why? I don't see a way xarray allows to avoid this.
>>>
>>> It's embedded in xa_find_after().
>>  
>> How is it embedded to check ns->count that it knows nothing?
> 
> I meant you won't need to increment '*id'.  The refcount is, of course,
> your business.

Ok, this brings comfort to me, because first time I thought xarray is a
big brother, which knows everything about my counters :)
diff mbox series

Patch

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 800c1d0eb0d0..ee4be67d3a0b 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -11,10 +11,13 @@ 
 #include <linux/user_namespace.h>
 #include <linux/nsfs.h>
 #include <linux/uaccess.h>
+#include <linux/idr.h>
 
 #include "internal.h"
 
 static struct vfsmount *nsfs_mnt;
+static DEFINE_SPINLOCK(ns_lock);
+static DEFINE_IDR(ns_idr);
 
 static long ns_ioctl(struct file *filp, unsigned int ioctl,
 			unsigned long arg);
@@ -304,3 +307,76 @@  void __init nsfs_init(void)
 		panic("can't set nsfs up\n");
 	nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER;
 }
+
+/*
+ * Add a newly created ns to ns_idr. The ns must be fully
+ * initialized since it becomes available for ns_get_next()
+ * right after we exit this function.
+ */
+int ns_idr_register(struct ns_common *ns)
+{
+	int ret, id = ns->inum - PROC_NS_MIN_INO;
+
+	if (WARN_ON(id < 0))
+		return -EINVAL;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock_irq(&ns_lock);
+	ret = idr_alloc(&ns_idr, ns, id, id + 1, GFP_ATOMIC);
+	spin_unlock_irq(&ns_lock);
+	idr_preload_end();
+
+	return ret < 0 ? ret : 0;
+}
+
+/*
+ * Remove a dead ns from ns_idr. Note, that ns memory must
+ * be freed not earlier then one RCU grace period after
+ * this function, since ns_get_next() uses RCU to iterate the IDR.
+ */
+void ns_idr_unregister(struct ns_common *ns)
+{
+	int id = ns->inum - PROC_NS_MIN_INO;
+	unsigned long flags;
+
+	if (WARN_ON(id < 0))
+		return;
+
+	spin_lock_irqsave(&ns_lock, flags);
+	idr_remove(&ns_idr, id);
+	spin_unlock_irqrestore(&ns_lock, flags);
+}
+
+/*
+ * This returns ns with inum greater than @id or NULL.
+ * @id is updated to refer the ns inum.
+ */
+struct ns_common *ns_get_next(unsigned int *id)
+{
+	struct ns_common *ns;
+
+	if (*id < PROC_NS_MIN_INO - 1)
+		*id = PROC_NS_MIN_INO - 1;
+
+	*id += 1;
+	*id -= PROC_NS_MIN_INO;
+
+	rcu_read_lock();
+	do {
+		ns = idr_get_next(&ns_idr, id);
+		if (!ns)
+			break;
+		if (!refcount_inc_not_zero(&ns->count)) {
+			ns = NULL;
+			*id += 1;
+		}
+	} while (!ns);
+	rcu_read_unlock();
+
+	if (ns) {
+		*id += PROC_NS_MIN_INO;
+		WARN_ON(*id != ns->inum);
+	}
+
+	return ns;
+}
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 27db02ebdf36..5f460e97151a 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -4,6 +4,12 @@ 
 
 struct proc_ns_operations;
 
+/*
+ * Common part of all namespaces. Note, that we link namespaces
+ * into IDR, and they are dereferenced via RCU. So, a namespace
+ * memory is allowed to be freed one RCU grace period after final
+ * .count put. See ns_get_next() for the details.
+ */
 struct ns_common {
 	atomic_long_t stashed;
 	const struct proc_ns_operations *ops;
@@ -11,4 +17,8 @@  struct ns_common {
 	refcount_t count;
 };
 
+extern int ns_idr_register(struct ns_common *ns);
+extern void ns_idr_unregister(struct ns_common *ns);
+extern struct ns_common *ns_get_next(unsigned int *id);
+
 #endif
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 75807ecef880..906e6ebb43e4 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -40,12 +40,13 @@  extern const struct proc_ns_operations timens_for_children_operations;
  */
 enum {
 	PROC_ROOT_INO		= 1,
-	PROC_IPC_INIT_INO	= 0xEFFFFFFFU,
-	PROC_UTS_INIT_INO	= 0xEFFFFFFEU,
-	PROC_USER_INIT_INO	= 0xEFFFFFFDU,
-	PROC_PID_INIT_INO	= 0xEFFFFFFCU,
-	PROC_CGROUP_INIT_INO	= 0xEFFFFFFBU,
 	PROC_TIME_INIT_INO	= 0xEFFFFFFAU,
+	PROC_NS_MIN_INO		= PROC_TIME_INIT_INO,
+	PROC_CGROUP_INIT_INO	= 0xEFFFFFFBU,
+	PROC_PID_INIT_INO	= 0xEFFFFFFCU,
+	PROC_USER_INIT_INO	= 0xEFFFFFFDU,
+	PROC_UTS_INIT_INO	= 0xEFFFFFFEU,
+	PROC_IPC_INIT_INO	= 0xEFFFFFFFU,
 };
 
 #ifdef CONFIG_PROC_FS