diff mbox series

[v4,2/6] mounts: keep list of mounts in an rbtree

Message ID 20231025140205.3586473-3-mszeredi@redhat.com (mailing list archive)
State Changes Requested
Delegated to: Paul Moore
Headers show
Series querying mount attributes | expand

Commit Message

Miklos Szeredi Oct. 25, 2023, 2:02 p.m. UTC
When adding a mount to a namespace insert it into an rbtree rooted in the
mnt_namespace instead of a linear list.

The mnt.mnt_list is still used to set up the mount tree and for
propagation, but not after the mount has been added to a namespace.  Hence
mnt_list can live in union with rb_node.  Use MNT_ONRB mount flag to
validate that the mount is on the correct list.

This allows removing the cursor used for reading /proc/$PID/mountinfo.  The
mnt_id_unique of the next mount can be used as an index into the seq file.

Tested by inserting 100k bind mounts, unsharing the mount namespace, and
unmounting.  No performance regressions have been observed.

For the last mount in the 100k list the statmount() call was more than 100x
faster due to the mount ID lookup not having to do a linear search.  This
patch makes the overhead of mount ID lookup non-observable in this range.

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/mount.h            |  24 +++---
 fs/namespace.c        | 190 ++++++++++++++++++++----------------------
 fs/pnode.c            |   2 +-
 fs/proc_namespace.c   |   3 -
 include/linux/mount.h |   5 +-
 5 files changed, 106 insertions(+), 118 deletions(-)

Comments

Ian Kent Oct. 27, 2023, 3:11 a.m. UTC | #1
On 25/10/23 22:02, Miklos Szeredi wrote:
> When adding a mount to a namespace insert it into an rbtree rooted in the
> mnt_namespace instead of a linear list.
>
> The mnt.mnt_list is still used to set up the mount tree and for
> propagation, but not after the mount has been added to a namespace.  Hence
> mnt_list can live in union with rb_node.  Use MNT_ONRB mount flag to
> validate that the mount is on the correct list.

Is that accurate? Propagation occurs at mount and also at umount.


IDG how the change to umount_one() works, it looks like umount_list()

uses mnt_list. It looks like propagate_umount() is also using mnt_list.


Am I missing something obvious?

Ian

>
> This allows removing the cursor used for reading /proc/$PID/mountinfo.  The
> mnt_id_unique of the next mount can be used as an index into the seq file.
>
> Tested by inserting 100k bind mounts, unsharing the mount namespace, and
> unmounting.  No performance regressions have been observed.
>
> For the last mount in the 100k list the statmount() call was more than 100x
> faster due to the mount ID lookup not having to do a linear search.  This
> patch makes the overhead of mount ID lookup non-observable in this range.
>
> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> ---
>   fs/mount.h            |  24 +++---
>   fs/namespace.c        | 190 ++++++++++++++++++++----------------------
>   fs/pnode.c            |   2 +-
>   fs/proc_namespace.c   |   3 -
>   include/linux/mount.h |   5 +-
>   5 files changed, 106 insertions(+), 118 deletions(-)
>
> diff --git a/fs/mount.h b/fs/mount.h
> index a14f762b3f29..4a42fc68f4cc 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -8,19 +8,13 @@
>   struct mnt_namespace {
>   	struct ns_common	ns;
>   	struct mount *	root;
> -	/*
> -	 * Traversal and modification of .list is protected by either
> -	 * - taking namespace_sem for write, OR
> -	 * - taking namespace_sem for read AND taking .ns_lock.
> -	 */
> -	struct list_head	list;
> -	spinlock_t		ns_lock;
> +	struct rb_root		mounts; /* Protected by namespace_sem */
>   	struct user_namespace	*user_ns;
>   	struct ucounts		*ucounts;
>   	u64			seq;	/* Sequence number to prevent loops */
>   	wait_queue_head_t poll;
>   	u64 event;
> -	unsigned int		mounts; /* # of mounts in the namespace */
> +	unsigned int		nr_mounts; /* # of mounts in the namespace */
>   	unsigned int		pending_mounts;
>   } __randomize_layout;
>   
> @@ -55,7 +49,10 @@ struct mount {
>   	struct list_head mnt_child;	/* and going through their mnt_child */
>   	struct list_head mnt_instance;	/* mount instance on sb->s_mounts */
>   	const char *mnt_devname;	/* Name of device e.g. /dev/dsk/hda1 */
> -	struct list_head mnt_list;
> +	union {
> +		struct rb_node mnt_node;	/* Under ns->mounts */
> +		struct list_head mnt_list;
> +	};
>   	struct list_head mnt_expire;	/* link in fs-specific expiry list */
>   	struct list_head mnt_share;	/* circular list of shared mounts */
>   	struct list_head mnt_slave_list;/* list of slave mounts */
> @@ -128,7 +125,6 @@ struct proc_mounts {
>   	struct mnt_namespace *ns;
>   	struct path root;
>   	int (*show)(struct seq_file *, struct vfsmount *);
> -	struct mount cursor;
>   };
>   
>   extern const struct seq_operations mounts_op;
> @@ -147,4 +143,12 @@ static inline bool is_anon_ns(struct mnt_namespace *ns)
>   	return ns->seq == 0;
>   }
>   
> +static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
> +{
> +	WARN_ON(!(mnt->mnt.mnt_flags & MNT_ONRB));
> +	mnt->mnt.mnt_flags &= ~MNT_ONRB;
> +	rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts);
> +	list_add_tail(&mnt->mnt_list, dt_list);
> +}
> +
>   extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
> diff --git a/fs/namespace.c b/fs/namespace.c
> index e02bc5f41c7b..0eab47ffc76c 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -732,21 +732,6 @@ struct vfsmount *lookup_mnt(const struct path *path)
>   	return m;
>   }
>   
> -static inline void lock_ns_list(struct mnt_namespace *ns)
> -{
> -	spin_lock(&ns->ns_lock);
> -}
> -
> -static inline void unlock_ns_list(struct mnt_namespace *ns)
> -{
> -	spin_unlock(&ns->ns_lock);
> -}
> -
> -static inline bool mnt_is_cursor(struct mount *mnt)
> -{
> -	return mnt->mnt.mnt_flags & MNT_CURSOR;
> -}
> -
>   /*
>    * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
>    *                         current mount namespace.
> @@ -765,19 +750,15 @@ static inline bool mnt_is_cursor(struct mount *mnt)
>   bool __is_local_mountpoint(struct dentry *dentry)
>   {
>   	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
> -	struct mount *mnt;
> +	struct mount *mnt, *n;
>   	bool is_covered = false;
>   
>   	down_read(&namespace_sem);
> -	lock_ns_list(ns);
> -	list_for_each_entry(mnt, &ns->list, mnt_list) {
> -		if (mnt_is_cursor(mnt))
> -			continue;
> +	rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
>   		is_covered = (mnt->mnt_mountpoint == dentry);
>   		if (is_covered)
>   			break;
>   	}
> -	unlock_ns_list(ns);
>   	up_read(&namespace_sem);
>   
>   	return is_covered;
> @@ -1024,6 +1005,30 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m
>   	mnt_add_count(old_parent, -1);
>   }
>   
> +static inline struct mount *node_to_mount(struct rb_node *node)
> +{
> +	return rb_entry(node, struct mount, mnt_node);
> +}
> +
> +static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
> +{
> +	struct rb_node **link = &ns->mounts.rb_node;
> +	struct rb_node *parent = NULL;
> +
> +	WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB);
> +	mnt->mnt_ns = ns;
> +	while (*link) {
> +		parent = *link;
> +		if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique)
> +			link = &parent->rb_left;
> +		else
> +			link = &parent->rb_right;
> +	}
> +	rb_link_node(&mnt->mnt_node, parent, link);
> +	rb_insert_color(&mnt->mnt_node, &ns->mounts);
> +	mnt->mnt.mnt_flags |= MNT_ONRB;
> +}
> +
>   /*
>    * vfsmount lock must be held for write
>    */
> @@ -1037,12 +1042,13 @@ static void commit_tree(struct mount *mnt)
>   	BUG_ON(parent == mnt);
>   
>   	list_add_tail(&head, &mnt->mnt_list);
> -	list_for_each_entry(m, &head, mnt_list)
> -		m->mnt_ns = n;
> +	while (!list_empty(&head)) {
> +		m = list_first_entry(&head, typeof(*m), mnt_list);
> +		list_del(&m->mnt_list);
>   
> -	list_splice(&head, n->list.prev);
> -
> -	n->mounts += n->pending_mounts;
> +		mnt_add_to_ns(n, m);
> +	}
> +	n->nr_mounts += n->pending_mounts;
>   	n->pending_mounts = 0;
>   
>   	__attach_mnt(mnt, parent);
> @@ -1190,7 +1196,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
>   	}
>   
>   	mnt->mnt.mnt_flags = old->mnt.mnt_flags;
> -	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
> +	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB);
>   
>   	atomic_inc(&sb->s_active);
>   	mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
> @@ -1415,65 +1421,57 @@ struct vfsmount *mnt_clone_internal(const struct path *path)
>   	return &p->mnt;
>   }
>   
> -#ifdef CONFIG_PROC_FS
> -static struct mount *mnt_list_next(struct mnt_namespace *ns,
> -				   struct list_head *p)
> +/*
> + * Returns the mount which either has the specified mnt_id, or has the next
> + * smallest id after the specified one.
> + */
> +static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id)
>   {
> -	struct mount *mnt, *ret = NULL;
> +	struct rb_node *node = ns->mounts.rb_node;
> +	struct mount *ret = NULL;
>   
> -	lock_ns_list(ns);
> -	list_for_each_continue(p, &ns->list) {
> -		mnt = list_entry(p, typeof(*mnt), mnt_list);
> -		if (!mnt_is_cursor(mnt)) {
> -			ret = mnt;
> -			break;
> +	while (node) {
> +		struct mount *m = node_to_mount(node);
> +
> +		if (mnt_id <= m->mnt_id_unique) {
> +			ret = node_to_mount(node);
> +			if (mnt_id == m->mnt_id_unique)
> +				break;
> +			node = node->rb_left;
> +		} else {
> +			node = node->rb_right;
>   		}
>   	}
> -	unlock_ns_list(ns);
> -
>   	return ret;
>   }
>   
> +#ifdef CONFIG_PROC_FS
> +
>   /* iterator; we want it to have access to namespace_sem, thus here... */
>   static void *m_start(struct seq_file *m, loff_t *pos)
>   {
>   	struct proc_mounts *p = m->private;
> -	struct list_head *prev;
>   
>   	down_read(&namespace_sem);
> -	if (!*pos) {
> -		prev = &p->ns->list;
> -	} else {
> -		prev = &p->cursor.mnt_list;
>   
> -		/* Read after we'd reached the end? */
> -		if (list_empty(prev))
> -			return NULL;
> -	}
> -
> -	return mnt_list_next(p->ns, prev);
> +	return mnt_find_id_at(p->ns, *pos);
>   }
>   
>   static void *m_next(struct seq_file *m, void *v, loff_t *pos)
>   {
> -	struct proc_mounts *p = m->private;
> -	struct mount *mnt = v;
> +	struct mount *next = NULL, *mnt = v;
> +	struct rb_node *node = rb_next(&mnt->mnt_node);
>   
>   	++*pos;
> -	return mnt_list_next(p->ns, &mnt->mnt_list);
> +	if (node) {
> +		next = node_to_mount(node);
> +		*pos = next->mnt_id_unique;
> +	}
> +	return next;
>   }
>   
>   static void m_stop(struct seq_file *m, void *v)
>   {
> -	struct proc_mounts *p = m->private;
> -	struct mount *mnt = v;
> -
> -	lock_ns_list(p->ns);
> -	if (mnt)
> -		list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list);
> -	else
> -		list_del_init(&p->cursor.mnt_list);
> -	unlock_ns_list(p->ns);
>   	up_read(&namespace_sem);
>   }
>   
> @@ -1491,14 +1489,6 @@ const struct seq_operations mounts_op = {
>   	.show	= m_show,
>   };
>   
> -void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
> -{
> -	down_read(&namespace_sem);
> -	lock_ns_list(ns);
> -	list_del(&cursor->mnt_list);
> -	unlock_ns_list(ns);
> -	up_read(&namespace_sem);
> -}
>   #endif  /* CONFIG_PROC_FS */
>   
>   /**
> @@ -1640,7 +1630,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
>   	/* Gather the mounts to umount */
>   	for (p = mnt; p; p = next_mnt(p, mnt)) {
>   		p->mnt.mnt_flags |= MNT_UMOUNT;
> -		list_move(&p->mnt_list, &tmp_list);
> +		if (p->mnt.mnt_flags & MNT_ONRB)
> +			move_from_ns(p, &tmp_list);
> +		else
> +			list_move(&p->mnt_list, &tmp_list);
>   	}
>   
>   	/* Hide the mounts from mnt_mounts */
> @@ -1660,7 +1653,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
>   		list_del_init(&p->mnt_list);
>   		ns = p->mnt_ns;
>   		if (ns) {
> -			ns->mounts--;
> +			ns->nr_mounts--;
>   			__touch_mnt_namespace(ns);
>   		}
>   		p->mnt_ns = NULL;
> @@ -1786,14 +1779,16 @@ static int do_umount(struct mount *mnt, int flags)
>   
>   	event++;
>   	if (flags & MNT_DETACH) {
> -		if (!list_empty(&mnt->mnt_list))
> +		if (mnt->mnt.mnt_flags & MNT_ONRB ||
> +		    !list_empty(&mnt->mnt_list))
>   			umount_tree(mnt, UMOUNT_PROPAGATE);
>   		retval = 0;
>   	} else {
>   		shrink_submounts(mnt);
>   		retval = -EBUSY;
>   		if (!propagate_mount_busy(mnt, 2)) {
> -			if (!list_empty(&mnt->mnt_list))
> +			if (mnt->mnt.mnt_flags & MNT_ONRB ||
> +			    !list_empty(&mnt->mnt_list))
>   				umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
>   			retval = 0;
>   		}
> @@ -2211,9 +2206,9 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
>   	unsigned int mounts = 0;
>   	struct mount *p;
>   
> -	if (ns->mounts >= max)
> +	if (ns->nr_mounts >= max)
>   		return -ENOSPC;
> -	max -= ns->mounts;
> +	max -= ns->nr_mounts;
>   	if (ns->pending_mounts >= max)
>   		return -ENOSPC;
>   	max -= ns->pending_mounts;
> @@ -2357,8 +2352,12 @@ static int attach_recursive_mnt(struct mount *source_mnt,
>   		touch_mnt_namespace(source_mnt->mnt_ns);
>   	} else {
>   		if (source_mnt->mnt_ns) {
> +			LIST_HEAD(head);
> +
>   			/* move from anon - the caller will destroy */
> -			list_del_init(&source_mnt->mnt_ns->list);
> +			for (p = source_mnt; p; p = next_mnt(p, source_mnt))
> +				move_from_ns(p, &head);
> +			list_del_init(&head);
>   		}
>   		if (beneath)
>   			mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
> @@ -2669,11 +2668,10 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
>   
>   	lock_mount_hash();
>   	for (p = mnt; p; p = next_mnt(p, mnt)) {
> -		p->mnt_ns = ns;
> -		ns->mounts++;
> +		mnt_add_to_ns(ns, p);
> +		ns->nr_mounts++;
>   	}
>   	ns->root = mnt;
> -	list_add_tail(&ns->list, &mnt->mnt_list);
>   	mntget(&mnt->mnt);
>   	unlock_mount_hash();
>   	namespace_unlock();
> @@ -3736,9 +3734,8 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
>   	if (!anon)
>   		new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
>   	refcount_set(&new_ns->ns.count, 1);
> -	INIT_LIST_HEAD(&new_ns->list);
> +	new_ns->mounts = RB_ROOT;
>   	init_waitqueue_head(&new_ns->poll);
> -	spin_lock_init(&new_ns->ns_lock);
>   	new_ns->user_ns = get_user_ns(user_ns);
>   	new_ns->ucounts = ucounts;
>   	return new_ns;
> @@ -3785,7 +3782,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
>   		unlock_mount_hash();
>   	}
>   	new_ns->root = new;
> -	list_add_tail(&new_ns->list, &new->mnt_list);
>   
>   	/*
>   	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
> @@ -3795,8 +3791,8 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
>   	p = old;
>   	q = new;
>   	while (p) {
> -		q->mnt_ns = new_ns;
> -		new_ns->mounts++;
> +		mnt_add_to_ns(new_ns, q);
> +		new_ns->nr_mounts++;
>   		if (new_fs) {
>   			if (&p->mnt == new_fs->root.mnt) {
>   				new_fs->root.mnt = mntget(&q->mnt);
> @@ -3838,10 +3834,9 @@ struct dentry *mount_subtree(struct vfsmount *m, const char *name)
>   		mntput(m);
>   		return ERR_CAST(ns);
>   	}
> -	mnt->mnt_ns = ns;
>   	ns->root = mnt;
> -	ns->mounts++;
> -	list_add(&mnt->mnt_list, &ns->list);
> +	ns->nr_mounts++;
> +	mnt_add_to_ns(ns, mnt);
>   
>   	err = vfs_path_lookup(m->mnt_root, m,
>   			name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
> @@ -4019,10 +4014,9 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
>   		goto err_path;
>   	}
>   	mnt = real_mount(newmount.mnt);
> -	mnt->mnt_ns = ns;
>   	ns->root = mnt;
> -	ns->mounts = 1;
> -	list_add(&mnt->mnt_list, &ns->list);
> +	ns->nr_mounts = 1;
> +	mnt_add_to_ns(ns, mnt);
>   	mntget(newmount.mnt);
>   
>   	/* Attach to an apparent O_PATH fd with a note that we need to unmount
> @@ -4693,10 +4687,9 @@ static void __init init_mount_tree(void)
>   	if (IS_ERR(ns))
>   		panic("Can't allocate initial namespace");
>   	m = real_mount(mnt);
> -	m->mnt_ns = ns;
>   	ns->root = m;
> -	ns->mounts = 1;
> -	list_add(&m->mnt_list, &ns->list);
> +	ns->nr_mounts = 1;
> +	mnt_add_to_ns(ns, m);
>   	init_task.nsproxy->mnt_ns = ns;
>   	get_mnt_ns(ns);
>   
> @@ -4823,18 +4816,14 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
>   				int *new_mnt_flags)
>   {
>   	int new_flags = *new_mnt_flags;
> -	struct mount *mnt;
> +	struct mount *mnt, *n;
>   	bool visible = false;
>   
>   	down_read(&namespace_sem);
> -	lock_ns_list(ns);
> -	list_for_each_entry(mnt, &ns->list, mnt_list) {
> +	rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
>   		struct mount *child;
>   		int mnt_flags;
>   
> -		if (mnt_is_cursor(mnt))
> -			continue;
> -
>   		if (mnt->mnt.mnt_sb->s_type != sb->s_type)
>   			continue;
>   
> @@ -4882,7 +4871,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
>   	next:	;
>   	}
>   found:
> -	unlock_ns_list(ns);
>   	up_read(&namespace_sem);
>   	return visible;
>   }
> diff --git a/fs/pnode.c b/fs/pnode.c
> index e4d0340393d5..a799e0315cc9 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -468,7 +468,7 @@ static void umount_one(struct mount *mnt, struct list_head *to_umount)
>   	mnt->mnt.mnt_flags |= MNT_UMOUNT;
>   	list_del_init(&mnt->mnt_child);
>   	list_del_init(&mnt->mnt_umounting);
> -	list_move_tail(&mnt->mnt_list, to_umount);
> +	move_from_ns(mnt, to_umount);
>   }
>   
>   /*
> diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
> index 250eb5bf7b52..73d2274d5f59 100644
> --- a/fs/proc_namespace.c
> +++ b/fs/proc_namespace.c
> @@ -283,8 +283,6 @@ static int mounts_open_common(struct inode *inode, struct file *file,
>   	p->ns = ns;
>   	p->root = root;
>   	p->show = show;
> -	INIT_LIST_HEAD(&p->cursor.mnt_list);
> -	p->cursor.mnt.mnt_flags = MNT_CURSOR;
>   
>   	return 0;
>   
> @@ -301,7 +299,6 @@ static int mounts_release(struct inode *inode, struct file *file)
>   	struct seq_file *m = file->private_data;
>   	struct proc_mounts *p = m->private;
>   	path_put(&p->root);
> -	mnt_cursor_del(p->ns, &p->cursor);
>   	put_mnt_ns(p->ns);
>   	return seq_release_private(inode, file);
>   }
> diff --git a/include/linux/mount.h b/include/linux/mount.h
> index 4f40b40306d0..7952eddc835c 100644
> --- a/include/linux/mount.h
> +++ b/include/linux/mount.h
> @@ -50,8 +50,7 @@ struct path;
>   #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME )
>   
>   #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
> -			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \
> -			    MNT_CURSOR)
> +			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | MNT_ONRB)
>   
>   #define MNT_INTERNAL	0x4000
>   
> @@ -65,7 +64,7 @@ struct path;
>   #define MNT_SYNC_UMOUNT		0x2000000
>   #define MNT_MARKED		0x4000000
>   #define MNT_UMOUNT		0x8000000
> -#define MNT_CURSOR		0x10000000
> +#define MNT_ONRB		0x10000000
>   
>   struct vfsmount {
>   	struct dentry *mnt_root;	/* root of the mounted tree */
Miklos Szeredi Oct. 27, 2023, 8:17 a.m. UTC | #2
On Fri, Oct 27, 2023 at 5:12 AM Ian Kent <raven@themaw.net> wrote:
>
> On 25/10/23 22:02, Miklos Szeredi wrote:

> > The mnt.mnt_list is still used to set up the mount tree and for
> > propagation, but not after the mount has been added to a namespace.  Hence
> > mnt_list can live in union with rb_node.  Use MNT_ONRB mount flag to
> > validate that the mount is on the correct list.
>
> Is that accurate, propagation occurs at mount and also at umount.

When propagating a mount, the new mount's mnt_list is used as a head
for the new propagated mounts.  These are then moved to the rb tree by
commit_tree().

When umounting there's a "to umount" list called tmp_list in
umount_tree(), this list is used to collect direct umounts and then
propagated umounts.  The direct umounts are added in umount_tree(),
the propagated ones umount_one().

Note: umount_tree() can be called on a not yet finished mount, in that
case the mounts are still on mnt_list, so umount_tree() needs to deal
with both.

> IDG how the change to umount_one() works, it looks like umount_list()
>
> uses mnt_list. It looks like propagate_umount() is also using mnt_list.
>
>
> Am I missing something obvious?

So when a mount is part of a namespace (either anonymous or not) it is
on the rb tree, when not then it can temporarily be on mnt_list.
MNT_ONRB flag is used to validate that the mount is on the list that
we expect it to be on, but also to detect the case of the mount setup
being aborted.

We could handle the second case differently, since we should be able
to tell when we are removing the mount from a namespace and when we
are aborting a mount, but this was the least invasive way to do this.

Thanks,
Miklos
Ian Kent Oct. 28, 2023, 1:36 a.m. UTC | #3
On 27/10/23 16:17, Miklos Szeredi wrote:
> On Fri, Oct 27, 2023 at 5:12 AM Ian Kent <raven@themaw.net> wrote:
>> On 25/10/23 22:02, Miklos Szeredi wrote:
>>> The mnt.mnt_list is still used to set up the mount tree and for
>>> propagation, but not after the mount has been added to a namespace.  Hence
>>> mnt_list can live in union with rb_node.  Use MNT_ONRB mount flag to
>>> validate that the mount is on the correct list.
>> Is that accurate, propagation occurs at mount and also at umount.
> When propagating a mount, the new mount's mnt_list is used as a head
> for the new propagated mounts.  These are then moved to the rb tree by
> commit_tree().
>
> When umounting there's a "to umount" list called tmp_list in
> umount_tree(), this list is used to collect direct umounts and then
> propagated umounts.  The direct umounts are added in umount_tree(),
> the propagated ones umount_one().
>
> Note: umount_tree() can be called on a not yet finished mount, in that
> case the mounts are still on mnt_list, so umount_tree() needs to deal
> with both.
>
>> IDG how the change to umount_one() works, it looks like umount_list()
>>
>> uses mnt_list. It looks like propagate_umount() is also using mnt_list.
>>
>>
>> Am I missing something obvious?
> So when a mount is part of a namespace (either anonymous or not) it is
> on the rb tree, when not then it can temporarily be on mnt_list.
> MNT_ONRB flag is used to validate that the mount is on the list that
> we expect it to be on, but also to detect the case of the mount setup
> being aborted.
>
> We could handle the second case differently, since we should be able
> to tell when we are removing the mount from a namespace and when we
> are aborting a mount, but this was the least invasive way to do this.

Thanks for the explanation, what you've said is essentially what I

understood reading the series.


But I still haven't quite got this so I'll need to spend more time

on this part of the patch series.


That's not a problem, ;).


Ian

>
> Thanks,
> Miklos
>
Ian Kent Oct. 30, 2023, 5:37 a.m. UTC | #4
On 28/10/23 09:36, Ian Kent wrote:
> On 27/10/23 16:17, Miklos Szeredi wrote:
>> On Fri, Oct 27, 2023 at 5:12 AM Ian Kent <raven@themaw.net> wrote:
>>> On 25/10/23 22:02, Miklos Szeredi wrote:
>>>> The mnt.mnt_list is still used to set up the mount tree and for
>>>> propagation, but not after the mount has been added to a 
>>>> namespace.  Hence
>>>> mnt_list can live in union with rb_node.  Use MNT_ONRB mount flag to
>>>> validate that the mount is on the correct list.
>>> Is that accurate, propagation occurs at mount and also at umount.
>> When propagating a mount, the new mount's mnt_list is used as a head
>> for the new propagated mounts.  These are then moved to the rb tree by
>> commit_tree().
>>
>> When umounting there's a "to umount" list called tmp_list in
>> umount_tree(), this list is used to collect direct umounts and then
>> propagated umounts.  The direct umounts are added in umount_tree(),
>> the propagated ones umount_one().
>>
>> Note: umount_tree() can be called on a not yet finished mount, in that
>> case the mounts are still on mnt_list, so umount_tree() needs to deal
>> with both.
>>
>>> IDG how the change to umount_one() works, it looks like umount_list()
>>>
>>> uses mnt_list. It looks like propagate_umount() is also using mnt_list.
>>>
>>>
>>> Am I missing something obvious?
>> So when a mount is part of a namespace (either anonymous or not) it is
>> on the rb tree, when not then it can temporarily be on mnt_list.
>> MNT_ONRB flag is used to validate that the mount is on the list that
>> we expect it to be on, but also to detect the case of the mount setup
>> being aborted.
>>
>> We could handle the second case differently, since we should be able
>> to tell when we are removing the mount from a namespace and when we
>> are aborting a mount, but this was the least invasive way to do this.
>
> Thanks for the explanation, what you've said is essentially what I
>
> understood reading the series.
>
>
> But I still haven't quite got this so I'll need to spend more time
>
> on this part of the patch series.
>
>
> That's not a problem, ;).

After cloning your git tree and looking in there I don't see what

I was concerned about so I think I was confused by obscurity by

diff rather than seeing a real problem, ;)


Still that union worries me a little bit so I'll keep looking at

the code for a while.


>
>
> Ian
>
>>
>> Thanks,
>> Miklos
>>
>
Ian Kent Oct. 30, 2023, 5:45 a.m. UTC | #5
On 30/10/23 13:37, Ian Kent wrote:
> On 28/10/23 09:36, Ian Kent wrote:
>> On 27/10/23 16:17, Miklos Szeredi wrote:
>>> On Fri, Oct 27, 2023 at 5:12 AM Ian Kent <raven@themaw.net> wrote:
>>>> On 25/10/23 22:02, Miklos Szeredi wrote:
>>>>> The mnt.mnt_list is still used to set up the mount tree and for
>>>>> propagation, but not after the mount has been added to a 
>>>>> namespace.  Hence
>>>>> mnt_list can live in union with rb_node.  Use MNT_ONRB mount flag to
>>>>> validate that the mount is on the correct list.
>>>> Is that accurate, propagation occurs at mount and also at umount.
>>> When propagating a mount, the new mount's mnt_list is used as a head
>>> for the new propagated mounts.  These are then moved to the rb tree by
>>> commit_tree().
>>>
>>> When umounting there's a "to umount" list called tmp_list in
>>> umount_tree(), this list is used to collect direct umounts and then
>>> propagated umounts.  The direct umounts are added in umount_tree(),
>>> the propagated ones umount_one().
>>>
>>> Note: umount_tree() can be called on a not yet finished mount, in that
>>> case the mounts are still on mnt_list, so umount_tree() needs to deal
>>> with both.
>>>
>>>> IDG how the change to umount_one() works, it looks like umount_list()
>>>>
>>>> uses mnt_list. It looks like propagate_umount() is also using 
>>>> mnt_list.
>>>>
>>>>
>>>> Am I missing something obvious?
>>> So when a mount is part of a namespace (either anonymous or not) it is
>>> on the rb tree, when not then it can temporarily be on mnt_list.
>>> MNT_ONRB flag is used to validate that the mount is on the list that
>>> we expect it to be on, but also to detect the case of the mount setup
>>> being aborted.
>>>
>>> We could handle the second case differently, since we should be able
>>> to tell when we are removing the mount from a namespace and when we
>>> are aborting a mount, but this was the least invasive way to do this.
>>
>> Thanks for the explanation, what you've said is essentially what I
>>
>> understood reading the series.
>>
>>
>> But I still haven't quite got this so I'll need to spend more time
>>
>> on this part of the patch series.
>>
>>
>> That's not a problem, ;).
>
> After cloning your git tree and looking in there I don't see what
>
> I was concerned about so I think I was confused by obscurity by
>
> diff rather than seeing a real problem, ;)
>
>
> Still that union worries me a little bit so I'll keep looking at
>
> the code for a while.

Is fs/namespace.c:iterate_mounts() a problem?

It's called from:

1) ./kernel/audit_tree.c:709: if (iterate_mounts(compare_root,
2) ./kernel/audit_tree.c:839:    err = iterate_mounts(tag_mount, tree, mnt);
3) ./kernel/audit_tree.c:917:        failed = iterate_mounts(tag_mount, 
tree, tagged);


 From functions 1) audit_trim_trees(), 2) audit_add_tree_rule() and

3) audit_tag_tree().


>
>
>>
>>
>> Ian
>>
>>>
>>> Thanks,
>>> Miklos
>>>
>>
>
Miklos Szeredi Oct. 30, 2023, 9:06 a.m. UTC | #6
On Mon, Oct 30, 2023 at 6:45 AM Ian Kent <raven@themaw.net> wrote:

> Is fs/namespace.c:iterate_mounts() a problem?
>
> It's called from:
>
> 1) ./kernel/audit_tree.c:709: if (iterate_mounts(compare_root,
> 2) ./kernel/audit_tree.c:839:    err = iterate_mounts(tag_mount, tree, mnt);
> 3) ./kernel/audit_tree.c:917:        failed = iterate_mounts(tag_mount,
> tree, tagged);
>
>
>  From functions 1) audit_trim_trees(), 2) audit_add_tree_rule() and
>
> 3) audit_tag_tree().

So that interface works like this:

 - collect_mounts() creates a temporary copy of a mount tree, mounts
are chained on mnt_list.

 - iterate_mounts() is used to do some work on the temporary tree

 - drop_collected_mounts() frees the temporary tree

These mounts are never installed in a namespace.  My guess is that a
private copy is used instead of the original mount tree to prevent
races.

Thanks,
Miklos
Ian Kent Oct. 31, 2023, 1:23 a.m. UTC | #7
On 30/10/23 17:06, Miklos Szeredi wrote:
> On Mon, Oct 30, 2023 at 6:45 AM Ian Kent <raven@themaw.net> wrote:
>
>> Is fs/namespace.c:iterate_mounts() a problem?
>>
>> It's called from:
>>
>> 1) ./kernel/audit_tree.c:709: if (iterate_mounts(compare_root,
>> 2) ./kernel/audit_tree.c:839:    err = iterate_mounts(tag_mount, tree, mnt);
>> 3) ./kernel/audit_tree.c:917:        failed = iterate_mounts(tag_mount,
>> tree, tagged);
>>
>>
>>   From functions 1) audit_trim_trees(), 2) audit_add_tree_rule() and
>>
>> 3) audit_tag_tree().
> So that interface works like this:
>
>   - collect_mounts() creates a temporary copy of a mount tree, mounts
> are chained on mnt_list.

Right, sorry for the noise, I didn't look far enough.


Ian

>
>   - iterate_mounts() is used to do some work on the temporary tree
>
>   - drop_collected_mounts() frees the temporary tree
>
> These mounts are never installed in a namespace.  My guess is that a
> private copy is used instead of the original mount tree to prevent
> races.
>
> Thanks,
> Miklos
>
diff mbox series

Patch

diff --git a/fs/mount.h b/fs/mount.h
index a14f762b3f29..4a42fc68f4cc 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -8,19 +8,13 @@ 
 struct mnt_namespace {
 	struct ns_common	ns;
 	struct mount *	root;
-	/*
-	 * Traversal and modification of .list is protected by either
-	 * - taking namespace_sem for write, OR
-	 * - taking namespace_sem for read AND taking .ns_lock.
-	 */
-	struct list_head	list;
-	spinlock_t		ns_lock;
+	struct rb_root		mounts; /* Protected by namespace_sem */
 	struct user_namespace	*user_ns;
 	struct ucounts		*ucounts;
 	u64			seq;	/* Sequence number to prevent loops */
 	wait_queue_head_t poll;
 	u64 event;
-	unsigned int		mounts; /* # of mounts in the namespace */
+	unsigned int		nr_mounts; /* # of mounts in the namespace */
 	unsigned int		pending_mounts;
 } __randomize_layout;
 
@@ -55,7 +49,10 @@  struct mount {
 	struct list_head mnt_child;	/* and going through their mnt_child */
 	struct list_head mnt_instance;	/* mount instance on sb->s_mounts */
 	const char *mnt_devname;	/* Name of device e.g. /dev/dsk/hda1 */
-	struct list_head mnt_list;
+	union {
+		struct rb_node mnt_node;	/* Under ns->mounts */
+		struct list_head mnt_list;
+	};
 	struct list_head mnt_expire;	/* link in fs-specific expiry list */
 	struct list_head mnt_share;	/* circular list of shared mounts */
 	struct list_head mnt_slave_list;/* list of slave mounts */
@@ -128,7 +125,6 @@  struct proc_mounts {
 	struct mnt_namespace *ns;
 	struct path root;
 	int (*show)(struct seq_file *, struct vfsmount *);
-	struct mount cursor;
 };
 
 extern const struct seq_operations mounts_op;
@@ -147,4 +143,12 @@  static inline bool is_anon_ns(struct mnt_namespace *ns)
 	return ns->seq == 0;
 }
 
+static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
+{
+	WARN_ON(!(mnt->mnt.mnt_flags & MNT_ONRB));
+	mnt->mnt.mnt_flags &= ~MNT_ONRB;
+	rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts);
+	list_add_tail(&mnt->mnt_list, dt_list);
+}
+
 extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
diff --git a/fs/namespace.c b/fs/namespace.c
index e02bc5f41c7b..0eab47ffc76c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -732,21 +732,6 @@  struct vfsmount *lookup_mnt(const struct path *path)
 	return m;
 }
 
-static inline void lock_ns_list(struct mnt_namespace *ns)
-{
-	spin_lock(&ns->ns_lock);
-}
-
-static inline void unlock_ns_list(struct mnt_namespace *ns)
-{
-	spin_unlock(&ns->ns_lock);
-}
-
-static inline bool mnt_is_cursor(struct mount *mnt)
-{
-	return mnt->mnt.mnt_flags & MNT_CURSOR;
-}
-
 /*
  * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
  *                         current mount namespace.
@@ -765,19 +750,15 @@  static inline bool mnt_is_cursor(struct mount *mnt)
 bool __is_local_mountpoint(struct dentry *dentry)
 {
 	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
-	struct mount *mnt;
+	struct mount *mnt, *n;
 	bool is_covered = false;
 
 	down_read(&namespace_sem);
-	lock_ns_list(ns);
-	list_for_each_entry(mnt, &ns->list, mnt_list) {
-		if (mnt_is_cursor(mnt))
-			continue;
+	rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
 		is_covered = (mnt->mnt_mountpoint == dentry);
 		if (is_covered)
 			break;
 	}
-	unlock_ns_list(ns);
 	up_read(&namespace_sem);
 
 	return is_covered;
@@ -1024,6 +1005,30 @@  void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m
 	mnt_add_count(old_parent, -1);
 }
 
+static inline struct mount *node_to_mount(struct rb_node *node)
+{
+	return rb_entry(node, struct mount, mnt_node);
+}
+
+static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
+{
+	struct rb_node **link = &ns->mounts.rb_node;
+	struct rb_node *parent = NULL;
+
+	WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB);
+	mnt->mnt_ns = ns;
+	while (*link) {
+		parent = *link;
+		if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+	rb_link_node(&mnt->mnt_node, parent, link);
+	rb_insert_color(&mnt->mnt_node, &ns->mounts);
+	mnt->mnt.mnt_flags |= MNT_ONRB;
+}
+
 /*
  * vfsmount lock must be held for write
  */
@@ -1037,12 +1042,13 @@  static void commit_tree(struct mount *mnt)
 	BUG_ON(parent == mnt);
 
 	list_add_tail(&head, &mnt->mnt_list);
-	list_for_each_entry(m, &head, mnt_list)
-		m->mnt_ns = n;
+	while (!list_empty(&head)) {
+		m = list_first_entry(&head, typeof(*m), mnt_list);
+		list_del(&m->mnt_list);
 
-	list_splice(&head, n->list.prev);
-
-	n->mounts += n->pending_mounts;
+		mnt_add_to_ns(n, m);
+	}
+	n->nr_mounts += n->pending_mounts;
 	n->pending_mounts = 0;
 
 	__attach_mnt(mnt, parent);
@@ -1190,7 +1196,7 @@  static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	}
 
 	mnt->mnt.mnt_flags = old->mnt.mnt_flags;
-	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
+	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB);
 
 	atomic_inc(&sb->s_active);
 	mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
@@ -1415,65 +1421,57 @@  struct vfsmount *mnt_clone_internal(const struct path *path)
 	return &p->mnt;
 }
 
-#ifdef CONFIG_PROC_FS
-static struct mount *mnt_list_next(struct mnt_namespace *ns,
-				   struct list_head *p)
+/*
+ * Returns the mount which either has the specified mnt_id, or has the next
+ * smallest id after the specified one.
+ */
+static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id)
 {
-	struct mount *mnt, *ret = NULL;
+	struct rb_node *node = ns->mounts.rb_node;
+	struct mount *ret = NULL;
 
-	lock_ns_list(ns);
-	list_for_each_continue(p, &ns->list) {
-		mnt = list_entry(p, typeof(*mnt), mnt_list);
-		if (!mnt_is_cursor(mnt)) {
-			ret = mnt;
-			break;
+	while (node) {
+		struct mount *m = node_to_mount(node);
+
+		if (mnt_id <= m->mnt_id_unique) {
+			ret = node_to_mount(node);
+			if (mnt_id == m->mnt_id_unique)
+				break;
+			node = node->rb_left;
+		} else {
+			node = node->rb_right;
 		}
 	}
-	unlock_ns_list(ns);
-
 	return ret;
 }
 
+#ifdef CONFIG_PROC_FS
+
 /* iterator; we want it to have access to namespace_sem, thus here... */
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
 	struct proc_mounts *p = m->private;
-	struct list_head *prev;
 
 	down_read(&namespace_sem);
-	if (!*pos) {
-		prev = &p->ns->list;
-	} else {
-		prev = &p->cursor.mnt_list;
 
-		/* Read after we'd reached the end? */
-		if (list_empty(prev))
-			return NULL;
-	}
-
-	return mnt_list_next(p->ns, prev);
+	return mnt_find_id_at(p->ns, *pos);
 }
 
 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct proc_mounts *p = m->private;
-	struct mount *mnt = v;
+	struct mount *next = NULL, *mnt = v;
+	struct rb_node *node = rb_next(&mnt->mnt_node);
 
 	++*pos;
-	return mnt_list_next(p->ns, &mnt->mnt_list);
+	if (node) {
+		next = node_to_mount(node);
+		*pos = next->mnt_id_unique;
+	}
+	return next;
 }
 
 static void m_stop(struct seq_file *m, void *v)
 {
-	struct proc_mounts *p = m->private;
-	struct mount *mnt = v;
-
-	lock_ns_list(p->ns);
-	if (mnt)
-		list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list);
-	else
-		list_del_init(&p->cursor.mnt_list);
-	unlock_ns_list(p->ns);
 	up_read(&namespace_sem);
 }
 
@@ -1491,14 +1489,6 @@  const struct seq_operations mounts_op = {
 	.show	= m_show,
 };
 
-void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
-{
-	down_read(&namespace_sem);
-	lock_ns_list(ns);
-	list_del(&cursor->mnt_list);
-	unlock_ns_list(ns);
-	up_read(&namespace_sem);
-}
 #endif  /* CONFIG_PROC_FS */
 
 /**
@@ -1640,7 +1630,10 @@  static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 	/* Gather the mounts to umount */
 	for (p = mnt; p; p = next_mnt(p, mnt)) {
 		p->mnt.mnt_flags |= MNT_UMOUNT;
-		list_move(&p->mnt_list, &tmp_list);
+		if (p->mnt.mnt_flags & MNT_ONRB)
+			move_from_ns(p, &tmp_list);
+		else
+			list_move(&p->mnt_list, &tmp_list);
 	}
 
 	/* Hide the mounts from mnt_mounts */
@@ -1660,7 +1653,7 @@  static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 		list_del_init(&p->mnt_list);
 		ns = p->mnt_ns;
 		if (ns) {
-			ns->mounts--;
+			ns->nr_mounts--;
 			__touch_mnt_namespace(ns);
 		}
 		p->mnt_ns = NULL;
@@ -1786,14 +1779,16 @@  static int do_umount(struct mount *mnt, int flags)
 
 	event++;
 	if (flags & MNT_DETACH) {
-		if (!list_empty(&mnt->mnt_list))
+		if (mnt->mnt.mnt_flags & MNT_ONRB ||
+		    !list_empty(&mnt->mnt_list))
 			umount_tree(mnt, UMOUNT_PROPAGATE);
 		retval = 0;
 	} else {
 		shrink_submounts(mnt);
 		retval = -EBUSY;
 		if (!propagate_mount_busy(mnt, 2)) {
-			if (!list_empty(&mnt->mnt_list))
+			if (mnt->mnt.mnt_flags & MNT_ONRB ||
+			    !list_empty(&mnt->mnt_list))
 				umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
 			retval = 0;
 		}
@@ -2211,9 +2206,9 @@  int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
 	unsigned int mounts = 0;
 	struct mount *p;
 
-	if (ns->mounts >= max)
+	if (ns->nr_mounts >= max)
 		return -ENOSPC;
-	max -= ns->mounts;
+	max -= ns->nr_mounts;
 	if (ns->pending_mounts >= max)
 		return -ENOSPC;
 	max -= ns->pending_mounts;
@@ -2357,8 +2352,12 @@  static int attach_recursive_mnt(struct mount *source_mnt,
 		touch_mnt_namespace(source_mnt->mnt_ns);
 	} else {
 		if (source_mnt->mnt_ns) {
+			LIST_HEAD(head);
+
 			/* move from anon - the caller will destroy */
-			list_del_init(&source_mnt->mnt_ns->list);
+			for (p = source_mnt; p; p = next_mnt(p, source_mnt))
+				move_from_ns(p, &head);
+			list_del_init(&head);
 		}
 		if (beneath)
 			mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
@@ -2669,11 +2668,10 @@  static struct file *open_detached_copy(struct path *path, bool recursive)
 
 	lock_mount_hash();
 	for (p = mnt; p; p = next_mnt(p, mnt)) {
-		p->mnt_ns = ns;
-		ns->mounts++;
+		mnt_add_to_ns(ns, p);
+		ns->nr_mounts++;
 	}
 	ns->root = mnt;
-	list_add_tail(&ns->list, &mnt->mnt_list);
 	mntget(&mnt->mnt);
 	unlock_mount_hash();
 	namespace_unlock();
@@ -3736,9 +3734,8 @@  static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
 	if (!anon)
 		new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
 	refcount_set(&new_ns->ns.count, 1);
-	INIT_LIST_HEAD(&new_ns->list);
+	new_ns->mounts = RB_ROOT;
 	init_waitqueue_head(&new_ns->poll);
-	spin_lock_init(&new_ns->ns_lock);
 	new_ns->user_ns = get_user_ns(user_ns);
 	new_ns->ucounts = ucounts;
 	return new_ns;
@@ -3785,7 +3782,6 @@  struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 		unlock_mount_hash();
 	}
 	new_ns->root = new;
-	list_add_tail(&new_ns->list, &new->mnt_list);
 
 	/*
 	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -3795,8 +3791,8 @@  struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 	p = old;
 	q = new;
 	while (p) {
-		q->mnt_ns = new_ns;
-		new_ns->mounts++;
+		mnt_add_to_ns(new_ns, q);
+		new_ns->nr_mounts++;
 		if (new_fs) {
 			if (&p->mnt == new_fs->root.mnt) {
 				new_fs->root.mnt = mntget(&q->mnt);
@@ -3838,10 +3834,9 @@  struct dentry *mount_subtree(struct vfsmount *m, const char *name)
 		mntput(m);
 		return ERR_CAST(ns);
 	}
-	mnt->mnt_ns = ns;
 	ns->root = mnt;
-	ns->mounts++;
-	list_add(&mnt->mnt_list, &ns->list);
+	ns->nr_mounts++;
+	mnt_add_to_ns(ns, mnt);
 
 	err = vfs_path_lookup(m->mnt_root, m,
 			name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
@@ -4019,10 +4014,9 @@  SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
 		goto err_path;
 	}
 	mnt = real_mount(newmount.mnt);
-	mnt->mnt_ns = ns;
 	ns->root = mnt;
-	ns->mounts = 1;
-	list_add(&mnt->mnt_list, &ns->list);
+	ns->nr_mounts = 1;
+	mnt_add_to_ns(ns, mnt);
 	mntget(newmount.mnt);
 
 	/* Attach to an apparent O_PATH fd with a note that we need to unmount
@@ -4693,10 +4687,9 @@  static void __init init_mount_tree(void)
 	if (IS_ERR(ns))
 		panic("Can't allocate initial namespace");
 	m = real_mount(mnt);
-	m->mnt_ns = ns;
 	ns->root = m;
-	ns->mounts = 1;
-	list_add(&m->mnt_list, &ns->list);
+	ns->nr_mounts = 1;
+	mnt_add_to_ns(ns, m);
 	init_task.nsproxy->mnt_ns = ns;
 	get_mnt_ns(ns);
 
@@ -4823,18 +4816,14 @@  static bool mnt_already_visible(struct mnt_namespace *ns,
 				int *new_mnt_flags)
 {
 	int new_flags = *new_mnt_flags;
-	struct mount *mnt;
+	struct mount *mnt, *n;
 	bool visible = false;
 
 	down_read(&namespace_sem);
-	lock_ns_list(ns);
-	list_for_each_entry(mnt, &ns->list, mnt_list) {
+	rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
 		struct mount *child;
 		int mnt_flags;
 
-		if (mnt_is_cursor(mnt))
-			continue;
-
 		if (mnt->mnt.mnt_sb->s_type != sb->s_type)
 			continue;
 
@@ -4882,7 +4871,6 @@  static bool mnt_already_visible(struct mnt_namespace *ns,
 	next:	;
 	}
 found:
-	unlock_ns_list(ns);
 	up_read(&namespace_sem);
 	return visible;
 }
diff --git a/fs/pnode.c b/fs/pnode.c
index e4d0340393d5..a799e0315cc9 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -468,7 +468,7 @@  static void umount_one(struct mount *mnt, struct list_head *to_umount)
 	mnt->mnt.mnt_flags |= MNT_UMOUNT;
 	list_del_init(&mnt->mnt_child);
 	list_del_init(&mnt->mnt_umounting);
-	list_move_tail(&mnt->mnt_list, to_umount);
+	move_from_ns(mnt, to_umount);
 }
 
 /*
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 250eb5bf7b52..73d2274d5f59 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -283,8 +283,6 @@  static int mounts_open_common(struct inode *inode, struct file *file,
 	p->ns = ns;
 	p->root = root;
 	p->show = show;
-	INIT_LIST_HEAD(&p->cursor.mnt_list);
-	p->cursor.mnt.mnt_flags = MNT_CURSOR;
 
 	return 0;
 
@@ -301,7 +299,6 @@  static int mounts_release(struct inode *inode, struct file *file)
 	struct seq_file *m = file->private_data;
 	struct proc_mounts *p = m->private;
 	path_put(&p->root);
-	mnt_cursor_del(p->ns, &p->cursor);
 	put_mnt_ns(p->ns);
 	return seq_release_private(inode, file);
 }
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 4f40b40306d0..7952eddc835c 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -50,8 +50,7 @@  struct path;
 #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME )
 
 #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
-			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \
-			    MNT_CURSOR)
+			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | MNT_ONRB)
 
 #define MNT_INTERNAL	0x4000
 
@@ -65,7 +64,7 @@  struct path;
 #define MNT_SYNC_UMOUNT		0x2000000
 #define MNT_MARKED		0x4000000
 #define MNT_UMOUNT		0x8000000
-#define MNT_CURSOR		0x10000000
+#define MNT_ONRB		0x10000000
 
 struct vfsmount {
 	struct dentry *mnt_root;	/* root of the mounted tree */