diff mbox series

[RFC,v2,02/34] fix automount/automount race properly

Message ID 20200223011626.4103706-2-viro@ZenIV.linux.org.uk (mailing list archive)
State New, archived
Headers show
Series [RFC,v2,01/34] do_add_mount(): lift lock_mount/unlock_mount into callers | expand

Commit Message

Al Viro Feb. 23, 2020, 1:15 a.m. UTC
From: Al Viro <viro@zeniv.linux.org.uk>

Protection against automount/automount races (two threads hitting the same
referral point at the same time) is based upon do_add_mount() prevention of
identical overmounts - trying to overmount the root of mounted tree with
the same tree fails with -EBUSY.  It's unreliable (the other thread might've
mounted something on top of the automount it has triggered) *and* causes
no end of headache for follow_automount() and its caller, since
finish_automount() behaves like do_new_mount() - if the mountpoint to be is
overmounted, it mounts on top what's overmounting it.  It's not only wrong
(we want to go into what's overmounting the automount point and quietly
discard what we planned to mount there), it introduces the possibility of
original parent mount getting dropped.  That's what 8aef18845266 (VFS: Fix
vfsmount overput on simultaneous automount) deals with, but it can't do
anything about the reliability of conflict detection - if something had
been overmounted the other thread's automount (e.g. that other thread
having stepped into automount in mount(2)), we don't get that -EBUSY and
the result is
	 referral point under automounted NFS under explicit overmount
under another copy of automounted NFS

What we need is finish_automount() *NOT* digging into overmounts - if it
finds one, it should just quietly discard the thing it was asked to mount.
And don't bother with actually crossing into the results of finish_automount() -
the same loop that calls follow_automount() will do that just fine on the
next iteration.

IOW, instead of calling lock_mount() have finish_automount() do it manually,
_without_ the "move into overmount and retry" part.  And leave crossing into
the results to the caller of follow_automount(), which simplifies it a lot.

Moral: if you end up with a lot of glue working around the calling conventions
of something, perhaps these calling conventions are simply wrong...

Fixes: 8aef18845266 (VFS: Fix vfsmount overput on simultaneous automount)
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c     | 29 ++++-------------------------
 fs/namespace.c | 41 ++++++++++++++++++++++++++++++++++-------
 2 files changed, 38 insertions(+), 32 deletions(-)

Comments

Linus Torvalds Feb. 23, 2020, 2:07 a.m. UTC | #1
On Sat, Feb 22, 2020 at 5:16 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> +
> +discard2:
> +       namespace_unlock();
> +discard1:
> +       inode_unlock(dentry->d_inode);
> +discard:
>         /* remove m from any expiration list it may be on */

Would you mind re-naming those labels?

I realize that the numbering may help show that the error handling is
done in the reverse order, but I bet that a nice name could so that
too.

              Linus
Al Viro Feb. 27, 2020, 7:43 p.m. UTC | #2
On Sat, Feb 22, 2020 at 06:07:39PM -0800, Linus Torvalds wrote:
> On Sat, Feb 22, 2020 at 5:16 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > +
> > +discard2:
> > +       namespace_unlock();
> > +discard1:
> > +       inode_unlock(dentry->d_inode);
> > +discard:
> >         /* remove m from any expiration list it may be on */
> 
> Would you mind re-naming those labels?
> 
> I realize that the numbering may help show that the error handling is
> done in the reverse order, but I bet that a nice name could so that
> too.

Umm...  A bit of reordering in the beginning eliminates discard1, suggesting
s/discard2/discard_locked/...  Incremental would be

diff --git a/fs/namespace.c b/fs/namespace.c
index 6228fd1ef94f..777c3116e62e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2844,22 +2844,22 @@ int finish_automount(struct vfsmount *m, struct path *path)
 	 * got", not "try to mount it on top".
 	 */
 	inode_lock(dentry->d_inode);
+	namespace_lock();
 	if (unlikely(cant_mount(dentry))) {
 		err = -ENOENT;
-		goto discard1;
+		goto discard_locked;
 	}
-	namespace_lock();
 	rcu_read_lock();
 	if (unlikely(__lookup_mnt(path->mnt, dentry))) {
 		rcu_read_unlock();
 		err = 0;
-		goto discard2;
+		goto discard_locked;
 	}
 	rcu_read_unlock();
 	mp = get_mountpoint(dentry);
 	if (IS_ERR(mp)) {
 		err = PTR_ERR(mp);
-		goto discard2;
+		goto discard_locked;
 	}
 
 	err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
@@ -2869,9 +2869,8 @@ int finish_automount(struct vfsmount *m, struct path *path)
 	mntput(m);
 	return 0;
 
-discard2:
+discard_locked:
 	namespace_unlock();
-discard1:
 	inode_unlock(dentry->d_inode);
 discard:
 	/* remove m from any expiration list it may be on */
Linus Torvalds Feb. 27, 2020, 8 p.m. UTC | #3
On Thu, Feb 27, 2020 at 11:43 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> Umm...  A bit of reordering in the beginning eliminates discard1, suggesting
> s/discard2/discard_locked/...  Incremental would be

Ack, thanks.

          Linus
diff mbox series

Patch

diff --git a/fs/namei.c b/fs/namei.c
index db6565c99825..626eddb33508 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1208,11 +1208,9 @@  EXPORT_SYMBOL(follow_up);
  * - return -EISDIR to tell follow_managed() to stop and return the path we
  *   were called with.
  */
-static int follow_automount(struct path *path, struct nameidata *nd,
-			    bool *need_mntput)
+static int follow_automount(struct path *path, struct nameidata *nd)
 {
 	struct vfsmount *mnt;
-	int err;
 
 	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
 		return -EREMOTE;
@@ -1253,29 +1251,10 @@  static int follow_automount(struct path *path, struct nameidata *nd,
 		return PTR_ERR(mnt);
 	}
 
-	if (!mnt) /* mount collision */
-		return 0;
-
-	if (!*need_mntput) {
-		/* lock_mount() may release path->mnt on error */
-		mntget(path->mnt);
-		*need_mntput = true;
-	}
-	err = finish_automount(mnt, path);
-
-	switch (err) {
-	case -EBUSY:
-		/* Someone else made a mount here whilst we were busy */
+	if (!mnt)
 		return 0;
-	case 0:
-		path_put(path);
-		path->mnt = mnt;
-		path->dentry = dget(mnt->mnt_root);
-		return 0;
-	default:
-		return err;
-	}
 
+	return finish_automount(mnt, path);
 }
 
 /*
@@ -1333,7 +1312,7 @@  static int follow_managed(struct path *path, struct nameidata *nd)
 
 		/* Handle an automount point */
 		if (flags & DCACHE_NEED_AUTOMOUNT) {
-			ret = follow_automount(path, nd, &need_mntput);
+			ret = follow_automount(path, nd);
 			if (ret < 0)
 				break;
 			continue;
diff --git a/fs/namespace.c b/fs/namespace.c
index dcd015fafe01..6228fd1ef94f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2823,6 +2823,7 @@  static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
 
 int finish_automount(struct vfsmount *m, struct path *path)
 {
+	struct dentry *dentry = path->dentry;
 	struct mount *mnt = real_mount(m);
 	struct mountpoint *mp;
 	int err;
@@ -2832,21 +2833,47 @@  int finish_automount(struct vfsmount *m, struct path *path)
 	BUG_ON(mnt_get_count(mnt) < 2);
 
 	if (m->mnt_sb == path->mnt->mnt_sb &&
-	    m->mnt_root == path->dentry) {
+	    m->mnt_root == dentry) {
 		err = -ELOOP;
-		goto fail;
+		goto discard;
 	}
 
-	mp = lock_mount(path);
+	/*
+	 * we don't want to use lock_mount() - in this case finding something
+	 * that overmounts our mountpoint to be means "quitely drop what we've
+	 * got", not "try to mount it on top".
+	 */
+	inode_lock(dentry->d_inode);
+	if (unlikely(cant_mount(dentry))) {
+		err = -ENOENT;
+		goto discard1;
+	}
+	namespace_lock();
+	rcu_read_lock();
+	if (unlikely(__lookup_mnt(path->mnt, dentry))) {
+		rcu_read_unlock();
+		err = 0;
+		goto discard2;
+	}
+	rcu_read_unlock();
+	mp = get_mountpoint(dentry);
 	if (IS_ERR(mp)) {
 		err = PTR_ERR(mp);
-		goto fail;
+		goto discard2;
 	}
+
 	err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
 	unlock_mount(mp);
-	if (!err)
-		return 0;
-fail:
+	if (unlikely(err))
+		goto discard;
+	mntput(m);
+	return 0;
+
+discard2:
+	namespace_unlock();
+discard1:
+	inode_unlock(dentry->d_inode);
+discard:
 	/* remove m from any expiration list it may be on */
 	if (!list_empty(&mnt->mnt_expire)) {
 		namespace_lock();