diff mbox

Fix a race in put_mountpoint.

Message ID 20170103040052.GB1555@ZenIV.linux.org.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Al Viro Jan. 3, 2017, 4 a.m. UTC
On Tue, Jan 03, 2017 at 04:17:14PM +1300, Eric W. Biederman wrote:
> Al Viro <viro@ZenIV.linux.org.uk> writes:
> 
> > On Tue, Jan 03, 2017 at 01:51:36PM +1300, Eric W. Biederman wrote:
> >
> >> The only significant thing I see is that you have not taken the
> >> mount_lock on the path where new_mountpoint adds the new struct
> >> mountpoint into the mountpoint hash table.
> >
> > Umm...  Point, but I really don't like that bouncing mount_lock up
> > and down there.  It's not going to cause any serious overhead,
> > but it just looks ugly... ;-/
> >
> > Let me think for a while...
> 
> The other possibility is to grab namespace_sem in mntput_no_expire
> around the call of umount_mnt.  That is the only path where
> put_mountpoint can be called where we are not holding namespace_sem.
> That works in the small but I haven't traced the callers of mntput and
> mntput_no_expire yet to see if it works in practice.

No, that's a really bad idea.  Final mntput should _not_ happen under
namespace_lock, but I don't want grabbing it in that place.

How about this instead:

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Eric W. Biederman Jan. 4, 2017, 3:52 a.m. UTC | #1
Al Viro <viro@ZenIV.linux.org.uk> writes:

> On Tue, Jan 03, 2017 at 04:17:14PM +1300, Eric W. Biederman wrote:
>> Al Viro <viro@ZenIV.linux.org.uk> writes:
>> 
>> > On Tue, Jan 03, 2017 at 01:51:36PM +1300, Eric W. Biederman wrote:
>> >
>> >> The only significant thing I see is that you have not taken the
>> >> mount_lock on the path where new_mountpoint adds the new struct
>> >> mountpoint into the mountpoint hash table.
>> >
>> > Umm...  Point, but I really don't like that bouncing mount_lock up
>> > and down there.  It's not going to cause any serious overhead,
>> > but it just looks ugly... ;-/
>> >
>> > Let me think for a while...
>> 
>> The other possibility is to grab namespace_sem in mntput_no_expire
>> around the call of umount_mnt.  That is the only path where
>> put_mountpoint can be called where we are not holding namespace_sem.
>> That works in the small but I haven't traced the callers of mntput and
>> mntput_no_expire yet to see if it works in practice.
>
> No, that's a really bad idea.  Final mntput should _not_ happen under
> namespace_lock, but I don't want grabbing it in that place.

Agreed.  That just makes the code harder to maintain later on.

> How about this instead:

I really don't like the logic inlined as my patch to kill shadow mounts
needs to acquire a mountpoint which may not already have been
allocated as well.

Beyond that we can make the logic simpler by causing d_set_mounted to
fail if the flag is already set and synchronize on that.  Which means
we don't have to verify the ordering between mount_lock
and rename_lock (from d_set_mounted) is not a problem, which makes
backports easier to verify.

Patch follows.

Eric
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/namespace.c b/fs/namespace.c
index b5b1259e064f..20fc797277f8 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -742,29 +742,6 @@  static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
 	return NULL;
 }
 
-static struct mountpoint *new_mountpoint(struct dentry *dentry)
-{
-	struct hlist_head *chain = mp_hash(dentry);
-	struct mountpoint *mp;
-	int ret;
-
-	mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
-	if (!mp)
-		return ERR_PTR(-ENOMEM);
-
-	ret = d_set_mounted(dentry);
-	if (ret) {
-		kfree(mp);
-		return ERR_PTR(ret);
-	}
-
-	mp->m_dentry = dentry;
-	mp->m_count = 1;
-	hlist_add_head(&mp->m_hash, chain);
-	INIT_HLIST_HEAD(&mp->m_list);
-	return mp;
-}
-
 static void put_mountpoint(struct mountpoint *mp)
 {
 	if (!--mp->m_count) {
@@ -1595,11 +1572,11 @@  void __detach_mounts(struct dentry *dentry)
 	struct mount *mnt;
 
 	namespace_lock();
+	lock_mount_hash();
 	mp = lookup_mountpoint(dentry);
 	if (IS_ERR_OR_NULL(mp))
 		goto out_unlock;
 
-	lock_mount_hash();
 	event++;
 	while (!hlist_empty(&mp->m_list)) {
 		mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
@@ -1609,9 +1586,9 @@  void __detach_mounts(struct dentry *dentry)
 		}
 		else umount_tree(mnt, UMOUNT_CONNECTED);
 	}
-	unlock_mount_hash();
 	put_mountpoint(mp);
 out_unlock:
+	unlock_mount_hash();
 	namespace_unlock();
 }
 
@@ -2027,8 +2004,11 @@  static int attach_recursive_mnt(struct mount *source_mnt,
 
 static struct mountpoint *lock_mount(struct path *path)
 {
+	struct mountpoint *mp = NULL;
 	struct vfsmount *mnt;
 	struct dentry *dentry = path->dentry;
+	int err;
+
 retry:
 	inode_lock(dentry->d_inode);
 	if (unlikely(cant_mount(dentry))) {
@@ -2037,29 +2017,60 @@  static struct mountpoint *lock_mount(struct path *path)
 	}
 	namespace_lock();
 	mnt = lookup_mnt(path);
-	if (likely(!mnt)) {
-		struct mountpoint *mp = lookup_mountpoint(dentry);
-		if (!mp)
-			mp = new_mountpoint(dentry);
-		if (IS_ERR(mp)) {
+	if (unlikely(mnt)) {
+		namespace_unlock();
+		inode_unlock(path->dentry->d_inode);
+		path_put(path);
+		path->mnt = mnt;
+		dentry = path->dentry = dget(mnt->mnt_root);
+		goto retry;
+	}
+
+	/*
+	 * OK, we have namespace_lock held, nothing is overmounting
+	 * *path and inode of mountpoint to be is locked.
+	 */
+	if (likely(!d_mountpoint(dentry)))
+		mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
+	read_seqlock_excl(&mount_lock);
+	if (!mp && !d_mountpoint(dentry)) {
+		read_sequnlock_excl(&mount_lock);
+		mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
+		read_seqlock_excl(&mount_lock);
+	}
+	if (d_mountpoint(dentry)) {
+		kfree(mp);
+		mp = lookup_mountpoint(dentry);
+	} else {
+		if (unlikely(!mp)) {
+			read_sequnlock_excl(&mount_lock);
 			namespace_unlock();
 			inode_unlock(dentry->d_inode);
-			return mp;
+			return ERR_PTR(-ENOMEM);
 		}
-		return mp;
+		err = d_set_mounted(dentry);
+		if (unlikely(err)) {
+			kfree(mp);
+			read_sequnlock_excl(&mount_lock);
+			namespace_unlock();
+			inode_unlock(dentry->d_inode);
+			return ERR_PTR(err);
+		}
+		mp->m_dentry = dentry;
+		mp->m_count = 1;
+		hlist_add_head(&mp->m_hash, mp_hash(dentry));
+		INIT_HLIST_HEAD(&mp->m_list);
 	}
-	namespace_unlock();
-	inode_unlock(path->dentry->d_inode);
-	path_put(path);
-	path->mnt = mnt;
-	dentry = path->dentry = dget(mnt->mnt_root);
-	goto retry;
+	read_sequnlock_excl(&mount_lock);
+	return mp;
 }
 
 static void unlock_mount(struct mountpoint *where)
 {
 	struct dentry *dentry = where->m_dentry;
+	read_seqlock_excl(&mount_lock);
 	put_mountpoint(where);
+	read_sequnlock_excl(&mount_lock);
 	namespace_unlock();
 	inode_unlock(dentry->d_inode);
 }
@@ -3137,7 +3148,9 @@  SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	list_del_init(&new_mnt->mnt_expire);
 	unlock_mount_hash();
 	chroot_fs_refs(&root, &new);
+	read_seqlock_excl(&mount_lock);
 	put_mountpoint(root_mp);
+	read_sequnlock_excl(&mount_lock);
 	error = 0;
 out4:
 	unlock_mount(old_mp);