diff mbox series

[09/11] nfsd: Allow filehandle lookup to cross internal mount points.

Message ID 162742546556.32498.16708762469227881912.stgit@noble.brown (mailing list archive)
State New, archived
Headers show
Series expose btrfs subvols in mount table correctly | expand

Commit Message

NeilBrown July 27, 2021, 10:37 p.m. UTC
Enhance nfsd to detect internal mounts and to cross them without
requiring a new export.

Also ensure the fsid reported is different for different submounts.  We
do this by xoring in the ino of the mounted-on directory.  This makes
sense for btrfs at least.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 fs/nfsd/nfs3xdr.c |   28 +++++++++++++++++++++-------
 fs/nfsd/nfs4xdr.c |   34 +++++++++++++++++++++++-----------
 fs/nfsd/nfsfh.c   |    7 ++++++-
 fs/nfsd/vfs.c     |   11 +++++++++--
 4 files changed, 59 insertions(+), 21 deletions(-)

Comments

J. Bruce Fields July 28, 2021, 7:15 p.m. UTC | #1
On Wed, Jul 28, 2021 at 08:37:45AM +1000, NeilBrown wrote:
> Enhance nfsd to detect internal mounts and to cross them without
> requiring a new export.

Why don't we want a new export?

(Honest question, it's not obvious to me what the best behavior is.)

--b.

> 
> Also ensure the fsid reported is different for different submounts.  We
> do this by xoring in the ino of the mounted-on directory.  This makes
> sense for btrfs at least.
> 
> Signed-off-by: NeilBrown <neilb@suse.de>
> ---
>  fs/nfsd/nfs3xdr.c |   28 +++++++++++++++++++++-------
>  fs/nfsd/nfs4xdr.c |   34 +++++++++++++++++++++++-----------
>  fs/nfsd/nfsfh.c   |    7 ++++++-
>  fs/nfsd/vfs.c     |   11 +++++++++--
>  4 files changed, 59 insertions(+), 21 deletions(-)
> 
> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> index 67af0c5c1543..80b1cc0334fa 100644
> --- a/fs/nfsd/nfs3xdr.c
> +++ b/fs/nfsd/nfs3xdr.c
> @@ -370,6 +370,8 @@ svcxdr_encode_fattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr,
>  	case FSIDSOURCE_UUID:
>  		fsid = ((u64 *)fhp->fh_export->ex_uuid)[0];
>  		fsid ^= ((u64 *)fhp->fh_export->ex_uuid)[1];
> +		if (fhp->fh_mnt != fhp->fh_export->ex_path.mnt)
> +			fsid ^= nfsd_get_mounted_on(fhp->fh_mnt);
>  		break;
>  	default:
>  		fsid = (u64)huge_encode_dev(fhp->fh_dentry->d_sb->s_dev);
> @@ -1094,8 +1096,8 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
>  	__be32 rv = nfserr_noent;
>  
>  	dparent = cd->fh.fh_dentry;
> -	exp  = cd->fh.fh_export;
> -	child.mnt = cd->fh.fh_mnt;
> +	exp  = exp_get(cd->fh.fh_export);
> +	child.mnt = mntget(cd->fh.fh_mnt);
>  
>  	if (isdotent(name, namlen)) {
>  		if (namlen == 2) {
> @@ -1112,15 +1114,27 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
>  			child.dentry = dget(dparent);
>  	} else
>  		child.dentry = lookup_positive_unlocked(name, dparent, namlen);
> -	if (IS_ERR(child.dentry))
> +	if (IS_ERR(child.dentry)) {
> +		mntput(child.mnt);
> +		exp_put(exp);
>  		return rv;
> -	if (d_mountpoint(child.dentry))
> -		goto out;
> -	if (child.dentry->d_inode->i_ino != ino)
> +	}
> +	/* If child is a mountpoint, then we want to expose the fact
> +	 * so client can create a mountpoint.  If not, then a different
> +	 * ino number probably means a race with rename, so avoid providing
> +	 * too much detail.
> +	 */
> +	if (nfsd_mountpoint(child.dentry, exp)) {
> +		int err;
> +		err = nfsd_cross_mnt(cd->rqstp, &child, &exp);
> +		if (err)
> +			goto out;
> +	} else if (child.dentry->d_inode->i_ino != ino)
>  		goto out;
>  	rv = fh_compose(fhp, exp, &child, &cd->fh);
>  out:
> -	dput(child.dentry);
> +	path_put(&child);
> +	exp_put(exp);
>  	return rv;
>  }
>  
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index d5683b6a74b2..4dbc99ed2c8b 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -2817,6 +2817,8 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
>  	struct kstat stat;
>  	struct svc_fh *tempfh = NULL;
>  	struct kstatfs statfs;
> +	u64 mounted_on_ino;
> +	u64 sub_fsid;
>  	__be32 *p;
>  	int starting_len = xdr->buf->len;
>  	int attrlen_offset;
> @@ -2871,6 +2873,24 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
>  			goto out;
>  		fhp = tempfh;
>  	}
> +	if ((bmval0 & FATTR4_WORD0_FSID) ||
> +	    (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
> +		mounted_on_ino = stat.ino;
> +		sub_fsid = 0;
> +		/*
> +		 * The inode number that the current mnt is mounted on is
> +		 * used for MOUNTED_ON_FILED if we are at the root,
> +		 * and for sub_fsid if mnt is not the export mnt.
> +		 */
> +		if (ignore_crossmnt == 0) {
> +			u64 moi = nfsd_get_mounted_on(mnt);
> +
> +			if (dentry == mnt->mnt_root && moi)
> +				mounted_on_ino = moi;
> +			if (mnt != exp->ex_path.mnt)
> +				sub_fsid = moi;
> +		}
> +	}
>  	if (bmval0 & FATTR4_WORD0_ACL) {
>  		err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl);
>  		if (err == -EOPNOTSUPP)
> @@ -3008,6 +3028,8 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
>  		case FSIDSOURCE_UUID:
>  			p = xdr_encode_opaque_fixed(p, exp->ex_uuid,
>  								EX_UUID_LEN);
> +			if (mnt != exp->ex_path.mnt)
> +				*(u64*)(p-2) ^= sub_fsid;
>  			break;
>  		}
>  	}
> @@ -3253,20 +3275,10 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
>  		*p++ = cpu_to_be32(stat.mtime.tv_nsec);
>  	}
>  	if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
> -		u64 ino;
> -
>  		p = xdr_reserve_space(xdr, 8);
>  		if (!p)
>  			goto out_resource;
> -		/*
> -		 * Get parent's attributes if not ignoring crossmount
> -		 * and this is the root of a cross-mounted filesystem.
> -		 */
> -		if (ignore_crossmnt == 0 && dentry == mnt->mnt_root)
> -			ino = nfsd_get_mounted_on(mnt);
> -		if (!ino)
> -			ino = stat.ino;
> -		p = xdr_encode_hyper(p, ino);
> +		p = xdr_encode_hyper(p, mounted_on_ino);
>  	}
>  #ifdef CONFIG_NFSD_PNFS
>  	if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) {
> diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
> index 4023046f63e2..4b53838bca89 100644
> --- a/fs/nfsd/nfsfh.c
> +++ b/fs/nfsd/nfsfh.c
> @@ -9,7 +9,7 @@
>   */
>  
>  #include <linux/exportfs.h>
> -
> +#include <linux/namei.h>
>  #include <linux/sunrpc/svcauth_gss.h>
>  #include "nfsd.h"
>  #include "vfs.h"
> @@ -285,6 +285,11 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
>  			default:
>  				dentry = ERR_PTR(-ESTALE);
>  			}
> +		} else if (nfsd_mountpoint(dentry, exp)) {
> +			struct path path = { .mnt = mnt, .dentry = dentry };
> +			follow_down(&path, LOOKUP_AUTOMOUNT);
> +			mnt = path.mnt;
> +			dentry = path.dentry;
>  		}
>  	}
>  	if (dentry == NULL)
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index baa12ac36ece..22523e1cd478 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -64,7 +64,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct path *path_parent,
>  			    .dentry = dget(path_parent->dentry)};
>  	int err = 0;
>  
> -	err = follow_down(&path, 0);
> +	err = follow_down(&path, LOOKUP_AUTOMOUNT);
>  	if (err < 0)
>  		goto out;
>  	if (path.mnt == path_parent->mnt && path.dentry == path_parent->dentry &&
> @@ -73,6 +73,13 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct path *path_parent,
>  		path_put(&path);
>  		goto out;
>  	}
> +	if (mount_is_internal(path.mnt)) {
> +		/* Use the new path, but don't look for a new export */
> +		/* FIXME should I check NOHIDE in this case?? */
> +		path_put(path_parent);
> +		*path_parent = path;
> +		goto out;
> +	}
>  
>  	exp2 = rqst_exp_get_by_name(rqstp, &path);
>  	if (IS_ERR(exp2)) {
> @@ -157,7 +164,7 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
>  		return 1;
>  	if (nfsd4_is_junction(dentry))
>  		return 1;
> -	if (d_mountpoint(dentry))
> +	if (d_managed(dentry))
>  		/*
>  		 * Might only be a mountpoint in a different namespace,
>  		 * but we need to check.
>
NeilBrown July 28, 2021, 10:29 p.m. UTC | #2
On Thu, 29 Jul 2021, J. Bruce Fields wrote:
> On Wed, Jul 28, 2021 at 08:37:45AM +1000, NeilBrown wrote:
> > Enhance nfsd to detect internal mounts and to cross them without
> > requiring a new export.
> 
> Why don't we want a new export?
> 
> (Honest question, it's not obvious to me what the best behavior is.)

Because a new export means asking user-space to determine if the mount
is exported and to provide a filehandle-prefix for it.  A large part of
the point of this it to avoid using a different filehandle-prefix.

I haven't yet thought deeply about how the 'crossmnt' flag (for v3)
should affect crossing these internal mounts.  My current feeling is
that it shouldn't as it really is just one big filesystem being
exported, which happens to internally have different inode-number
spaces. 
Unfortuantely this technically violates the RFC as the fsid is not meant
to change when you do a LOOKUP ...

NeilBrown
Al Viro July 30, 2021, 12:42 a.m. UTC | #3
On Wed, Jul 28, 2021 at 08:37:45AM +1000, NeilBrown wrote:

> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index baa12ac36ece..22523e1cd478 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -64,7 +64,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct path *path_parent,
>  			    .dentry = dget(path_parent->dentry)};
>  	int err = 0;
>  
> -	err = follow_down(&path, 0);
> +	err = follow_down(&path, LOOKUP_AUTOMOUNT);
>  	if (err < 0)
>  		goto out;
>  	if (path.mnt == path_parent->mnt && path.dentry == path_parent->dentry &&
> @@ -73,6 +73,13 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct path *path_parent,
>  		path_put(&path);
>  		goto out;
>  	}
> +	if (mount_is_internal(path.mnt)) {
> +		/* Use the new path, but don't look for a new export */
> +		/* FIXME should I check NOHIDE in this case?? */
> +		path_put(path_parent);
> +		*path_parent = path;
> +		goto out;
> +	}

... IOW, mount_is_internal() is called with no exclusion whatsoever.  What's there
to
	* keep its return value valid?
	* prevent fetching ->mnt_mountpoint, getting preempted away, having
the mount moved *and* what used to be ->mnt_mountpoint evicted from dcache,
now that it's no longer pinned, then mount_is_internal() regaining CPU and
dereferencing ->mnt_mountpoint, which now points to hell knows what?
NeilBrown July 30, 2021, 5:43 a.m. UTC | #4
On Fri, 30 Jul 2021, Al Viro wrote:
> On Wed, Jul 28, 2021 at 08:37:45AM +1000, NeilBrown wrote:
> 
> > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > index baa12ac36ece..22523e1cd478 100644
> > --- a/fs/nfsd/vfs.c
> > +++ b/fs/nfsd/vfs.c
> > @@ -64,7 +64,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct path *path_parent,
> >  			    .dentry = dget(path_parent->dentry)};
> >  	int err = 0;
> >  
> > -	err = follow_down(&path, 0);
> > +	err = follow_down(&path, LOOKUP_AUTOMOUNT);
> >  	if (err < 0)
> >  		goto out;
> >  	if (path.mnt == path_parent->mnt && path.dentry == path_parent->dentry &&
> > @@ -73,6 +73,13 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct path *path_parent,
> >  		path_put(&path);
> >  		goto out;
> >  	}
> > +	if (mount_is_internal(path.mnt)) {
> > +		/* Use the new path, but don't look for a new export */
> > +		/* FIXME should I check NOHIDE in this case?? */
> > +		path_put(path_parent);
> > +		*path_parent = path;
> > +		goto out;
> > +	}
> 
> ... IOW, mount_is_internal() is called with no exclusion whatsoever.  What's there
> to
> 	* keep its return value valid?
> 	* prevent fetching ->mnt_mountpoint, getting preempted away, having
> the mount moved *and* what used to be ->mnt_mountpoint evicted from dcache,
> now that it's no longer pinned, then mount_is_internal() regaining CPU and
> dereferencing ->mnt_mountpoint, which now points to hell knows what?
> 

Yes, mount_is_internal needs to same mount_lock protection that
lookup_mnt() has.  Thanks.

I don't think it matter how long the result remains valid.  The only
realistic transtion is from True to False, but the fact that it *was*
True means that it is acceptable for the lookup to have succeeded.
i.e.  If the mountpoint was moved which a request was being processed it
will either cause the same result as if it happened before the request
started, or after it finished.  Either seems OK.

Thanks,
NeilBrown
diff mbox series

Patch

diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 67af0c5c1543..80b1cc0334fa 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -370,6 +370,8 @@  svcxdr_encode_fattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 	case FSIDSOURCE_UUID:
 		fsid = ((u64 *)fhp->fh_export->ex_uuid)[0];
 		fsid ^= ((u64 *)fhp->fh_export->ex_uuid)[1];
+		if (fhp->fh_mnt != fhp->fh_export->ex_path.mnt)
+			fsid ^= nfsd_get_mounted_on(fhp->fh_mnt);
 		break;
 	default:
 		fsid = (u64)huge_encode_dev(fhp->fh_dentry->d_sb->s_dev);
@@ -1094,8 +1096,8 @@  compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
 	__be32 rv = nfserr_noent;
 
 	dparent = cd->fh.fh_dentry;
-	exp  = cd->fh.fh_export;
-	child.mnt = cd->fh.fh_mnt;
+	exp  = exp_get(cd->fh.fh_export);
+	child.mnt = mntget(cd->fh.fh_mnt);
 
 	if (isdotent(name, namlen)) {
 		if (namlen == 2) {
@@ -1112,15 +1114,27 @@  compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
 			child.dentry = dget(dparent);
 	} else
 		child.dentry = lookup_positive_unlocked(name, dparent, namlen);
-	if (IS_ERR(child.dentry))
+	if (IS_ERR(child.dentry)) {
+		mntput(child.mnt);
+		exp_put(exp);
 		return rv;
-	if (d_mountpoint(child.dentry))
-		goto out;
-	if (child.dentry->d_inode->i_ino != ino)
+	}
+	/* If child is a mountpoint, then we want to expose the fact
+	 * so client can create a mountpoint.  If not, then a different
+	 * ino number probably means a race with rename, so avoid providing
+	 * too much detail.
+	 */
+	if (nfsd_mountpoint(child.dentry, exp)) {
+		int err;
+		err = nfsd_cross_mnt(cd->rqstp, &child, &exp);
+		if (err)
+			goto out;
+	} else if (child.dentry->d_inode->i_ino != ino)
 		goto out;
 	rv = fh_compose(fhp, exp, &child, &cd->fh);
 out:
-	dput(child.dentry);
+	path_put(&child);
+	exp_put(exp);
 	return rv;
 }
 
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index d5683b6a74b2..4dbc99ed2c8b 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2817,6 +2817,8 @@  nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
 	struct kstat stat;
 	struct svc_fh *tempfh = NULL;
 	struct kstatfs statfs;
+	u64 mounted_on_ino;
+	u64 sub_fsid;
 	__be32 *p;
 	int starting_len = xdr->buf->len;
 	int attrlen_offset;
@@ -2871,6 +2873,24 @@  nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
 			goto out;
 		fhp = tempfh;
 	}
+	if ((bmval0 & FATTR4_WORD0_FSID) ||
+	    (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
+		mounted_on_ino = stat.ino;
+		sub_fsid = 0;
+		/*
+		 * The inode number that the current mnt is mounted on is
+		 * used for MOUNTED_ON_FILED if we are at the root,
+		 * and for sub_fsid if mnt is not the export mnt.
+		 */
+		if (ignore_crossmnt == 0) {
+			u64 moi = nfsd_get_mounted_on(mnt);
+
+			if (dentry == mnt->mnt_root && moi)
+				mounted_on_ino = moi;
+			if (mnt != exp->ex_path.mnt)
+				sub_fsid = moi;
+		}
+	}
 	if (bmval0 & FATTR4_WORD0_ACL) {
 		err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl);
 		if (err == -EOPNOTSUPP)
@@ -3008,6 +3028,8 @@  nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
 		case FSIDSOURCE_UUID:
 			p = xdr_encode_opaque_fixed(p, exp->ex_uuid,
 								EX_UUID_LEN);
+			if (mnt != exp->ex_path.mnt)
+				*(u64*)(p-2) ^= sub_fsid;
 			break;
 		}
 	}
@@ -3253,20 +3275,10 @@  nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
 		*p++ = cpu_to_be32(stat.mtime.tv_nsec);
 	}
 	if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
-		u64 ino;
-
 		p = xdr_reserve_space(xdr, 8);
 		if (!p)
 			goto out_resource;
-		/*
-		 * Get parent's attributes if not ignoring crossmount
-		 * and this is the root of a cross-mounted filesystem.
-		 */
-		if (ignore_crossmnt == 0 && dentry == mnt->mnt_root)
-			ino = nfsd_get_mounted_on(mnt);
-		if (!ino)
-			ino = stat.ino;
-		p = xdr_encode_hyper(p, ino);
+		p = xdr_encode_hyper(p, mounted_on_ino);
 	}
 #ifdef CONFIG_NFSD_PNFS
 	if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) {
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 4023046f63e2..4b53838bca89 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -9,7 +9,7 @@ 
  */
 
 #include <linux/exportfs.h>
-
+#include <linux/namei.h>
 #include <linux/sunrpc/svcauth_gss.h>
 #include "nfsd.h"
 #include "vfs.h"
@@ -285,6 +285,11 @@  static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
 			default:
 				dentry = ERR_PTR(-ESTALE);
 			}
+		} else if (nfsd_mountpoint(dentry, exp)) {
+			struct path path = { .mnt = mnt, .dentry = dentry };
+			follow_down(&path, LOOKUP_AUTOMOUNT);
+			mnt = path.mnt;
+			dentry = path.dentry;
 		}
 	}
 	if (dentry == NULL)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index baa12ac36ece..22523e1cd478 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -64,7 +64,7 @@  nfsd_cross_mnt(struct svc_rqst *rqstp, struct path *path_parent,
 			    .dentry = dget(path_parent->dentry)};
 	int err = 0;
 
-	err = follow_down(&path, 0);
+	err = follow_down(&path, LOOKUP_AUTOMOUNT);
 	if (err < 0)
 		goto out;
 	if (path.mnt == path_parent->mnt && path.dentry == path_parent->dentry &&
@@ -73,6 +73,13 @@  nfsd_cross_mnt(struct svc_rqst *rqstp, struct path *path_parent,
 		path_put(&path);
 		goto out;
 	}
+	if (mount_is_internal(path.mnt)) {
+		/* Use the new path, but don't look for a new export */
+		/* FIXME should I check NOHIDE in this case?? */
+		path_put(path_parent);
+		*path_parent = path;
+		goto out;
+	}
 
 	exp2 = rqst_exp_get_by_name(rqstp, &path);
 	if (IS_ERR(exp2)) {
@@ -157,7 +164,7 @@  int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
 		return 1;
 	if (nfsd4_is_junction(dentry))
 		return 1;
-	if (d_mountpoint(dentry))
+	if (d_managed(dentry))
 		/*
 		 * Might only be a mountpoint in a different namespace,
 		 * but we need to check.