Message ID | 162742546556.32498.16708762469227881912.stgit@noble.brown (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | expose btrfs subvols in mount table correctly | expand |
On Wed, Jul 28, 2021 at 08:37:45AM +1000, NeilBrown wrote: > Enhance nfsd to detect internal mounts and to cross them without > requiring a new export. Why don't we want a new export? (Honest question, it's not obvious to me what the best behavior is.) --b. > > Also ensure the fsid reported is different for different submounts. We > do this by xoring in the ino of the mounted-on directory. This makes > sense for btrfs at least. > > Signed-off-by: NeilBrown <neilb@suse.de> > --- > fs/nfsd/nfs3xdr.c | 28 +++++++++++++++++++++------- > fs/nfsd/nfs4xdr.c | 34 +++++++++++++++++++++++----------- > fs/nfsd/nfsfh.c | 7 ++++++- > fs/nfsd/vfs.c | 11 +++++++++-- > 4 files changed, 59 insertions(+), 21 deletions(-) > > diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c > index 67af0c5c1543..80b1cc0334fa 100644 > --- a/fs/nfsd/nfs3xdr.c > +++ b/fs/nfsd/nfs3xdr.c > @@ -370,6 +370,8 @@ svcxdr_encode_fattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr, > case FSIDSOURCE_UUID: > fsid = ((u64 *)fhp->fh_export->ex_uuid)[0]; > fsid ^= ((u64 *)fhp->fh_export->ex_uuid)[1]; > + if (fhp->fh_mnt != fhp->fh_export->ex_path.mnt) > + fsid ^= nfsd_get_mounted_on(fhp->fh_mnt); > break; > default: > fsid = (u64)huge_encode_dev(fhp->fh_dentry->d_sb->s_dev); > @@ -1094,8 +1096,8 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, > __be32 rv = nfserr_noent; > > dparent = cd->fh.fh_dentry; > - exp = cd->fh.fh_export; > - child.mnt = cd->fh.fh_mnt; > + exp = exp_get(cd->fh.fh_export); > + child.mnt = mntget(cd->fh.fh_mnt); > > if (isdotent(name, namlen)) { > if (namlen == 2) { > @@ -1112,15 +1114,27 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, > child.dentry = dget(dparent); > } else > child.dentry = lookup_positive_unlocked(name, dparent, namlen); > - if (IS_ERR(child.dentry)) > + if (IS_ERR(child.dentry)) { > + mntput(child.mnt); > + exp_put(exp); > return rv; > - if (d_mountpoint(child.dentry)) > - goto out; > - if (child.dentry->d_inode->i_ino != ino) > + } > + /* If child is a mountpoint, then we want to expose the fact > + * so client can create a mountpoint. If not, then a different > + * ino number probably means a race with rename, so avoid providing > + * too much detail. > + */ > + if (nfsd_mountpoint(child.dentry, exp)) { > + int err; > + err = nfsd_cross_mnt(cd->rqstp, &child, &exp); > + if (err) > + goto out; > + } else if (child.dentry->d_inode->i_ino != ino) > goto out; > rv = fh_compose(fhp, exp, &child, &cd->fh); > out: > - dput(child.dentry); > + path_put(&child); > + exp_put(exp); > return rv; > } > > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c > index d5683b6a74b2..4dbc99ed2c8b 100644 > --- a/fs/nfsd/nfs4xdr.c > +++ b/fs/nfsd/nfs4xdr.c > @@ -2817,6 +2817,8 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, > struct kstat stat; > struct svc_fh *tempfh = NULL; > struct kstatfs statfs; > + u64 mounted_on_ino; > + u64 sub_fsid; > __be32 *p; > int starting_len = xdr->buf->len; > int attrlen_offset; > @@ -2871,6 +2873,24 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, > goto out; > fhp = tempfh; > } > + if ((bmval0 & FATTR4_WORD0_FSID) || > + (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID)) { > + mounted_on_ino = stat.ino; > + sub_fsid = 0; > + /* > + * The inode number that the current mnt is mounted on is > + * used for MOUNTED_ON_FILED if we are at the root, > + * and for sub_fsid if mnt is not the export mnt. > + */ > + if (ignore_crossmnt == 0) { > + u64 moi = nfsd_get_mounted_on(mnt); > + > + if (dentry == mnt->mnt_root && moi) > + mounted_on_ino = moi; > + if (mnt != exp->ex_path.mnt) > + sub_fsid = moi; > + } > + } > if (bmval0 & FATTR4_WORD0_ACL) { > err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl); > if (err == -EOPNOTSUPP) > @@ -3008,6 +3028,8 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, > case FSIDSOURCE_UUID: > p = xdr_encode_opaque_fixed(p, exp->ex_uuid, > EX_UUID_LEN); > + if (mnt != exp->ex_path.mnt) > + *(u64*)(p-2) ^= sub_fsid; > break; > } > } > @@ -3253,20 +3275,10 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, > *p++ = cpu_to_be32(stat.mtime.tv_nsec); > } > if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { > - u64 ino; > - > p = xdr_reserve_space(xdr, 8); > if (!p) > goto out_resource; > - /* > - * Get parent's attributes if not ignoring crossmount > - * and this is the root of a cross-mounted filesystem. > - */ > - if (ignore_crossmnt == 0 && dentry == mnt->mnt_root) > - ino = nfsd_get_mounted_on(mnt); > - if (!ino) > - ino = stat.ino; > - p = xdr_encode_hyper(p, ino); > + p = xdr_encode_hyper(p, mounted_on_ino); > } > #ifdef CONFIG_NFSD_PNFS > if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { > diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c > index 4023046f63e2..4b53838bca89 100644 > --- a/fs/nfsd/nfsfh.c > +++ b/fs/nfsd/nfsfh.c > @@ -9,7 +9,7 @@ > */ > > #include <linux/exportfs.h> > - > +#include <linux/namei.h> > #include <linux/sunrpc/svcauth_gss.h> > #include "nfsd.h" > #include "vfs.h" > @@ -285,6 +285,11 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) > default: > dentry = ERR_PTR(-ESTALE); > } > + } else if (nfsd_mountpoint(dentry, exp)) { > + struct path path = { .mnt = mnt, .dentry = dentry }; > + follow_down(&path, LOOKUP_AUTOMOUNT); > + mnt = path.mnt; > + dentry = path.dentry; > } > } > if (dentry == NULL) > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c > index baa12ac36ece..22523e1cd478 100644 > --- a/fs/nfsd/vfs.c > +++ b/fs/nfsd/vfs.c > @@ -64,7 +64,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct path *path_parent, > .dentry = dget(path_parent->dentry)}; > int err = 0; > > - err = follow_down(&path, 0); > + err = follow_down(&path, LOOKUP_AUTOMOUNT); > if (err < 0) > goto out; > if (path.mnt == path_parent->mnt && path.dentry == path_parent->dentry && > @@ -73,6 +73,13 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct path *path_parent, > path_put(&path); > goto out; > } > + if (mount_is_internal(path.mnt)) { > + /* Use the new path, but don't look for a new export */ > + /* FIXME should I check NOHIDE in this case?? */ > + path_put(path_parent); > + *path_parent = path; > + goto out; > + } > > exp2 = rqst_exp_get_by_name(rqstp, &path); > if (IS_ERR(exp2)) { > @@ -157,7 +164,7 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) > return 1; > if (nfsd4_is_junction(dentry)) > return 1; > - if (d_mountpoint(dentry)) > + if (d_managed(dentry)) > /* > * Might only be a mountpoint in a different namespace, > * but we need to check. >
On Thu, 29 Jul 2021, J. Bruce Fields wrote: > On Wed, Jul 28, 2021 at 08:37:45AM +1000, NeilBrown wrote: > > Enhance nfsd to detect internal mounts and to cross them without > > requiring a new export. > > Why don't we want a new export? > > (Honest question, it's not obvious to me what the best behavior is.) Because a new export means asking user-space to determine if the mount is exported and to provide a filehandle-prefix for it. A large part of the point of this it to avoid using a different filehandle-prefix. I haven't yet thought deeply about how the 'crossmnt' flag (for v3) should affect crossing these internal mounts. My current feeling is that it shouldn't as it really is just one big filesystem being exported, which happens to internally have different inode-number spaces. Unfortuantely this technically violates the RFC as the fsid is not meant to change when you do a LOOKUP ... NeilBrown
On Wed, Jul 28, 2021 at 08:37:45AM +1000, NeilBrown wrote: > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c > index baa12ac36ece..22523e1cd478 100644 > --- a/fs/nfsd/vfs.c > +++ b/fs/nfsd/vfs.c > @@ -64,7 +64,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct path *path_parent, > .dentry = dget(path_parent->dentry)}; > int err = 0; > > - err = follow_down(&path, 0); > + err = follow_down(&path, LOOKUP_AUTOMOUNT); > if (err < 0) > goto out; > if (path.mnt == path_parent->mnt && path.dentry == path_parent->dentry && > @@ -73,6 +73,13 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct path *path_parent, > path_put(&path); > goto out; > } > + if (mount_is_internal(path.mnt)) { > + /* Use the new path, but don't look for a new export */ > + /* FIXME should I check NOHIDE in this case?? */ > + path_put(path_parent); > + *path_parent = path; > + goto out; > + } ... IOW, mount_is_internal() is called with no exclusion whatsoever. What's there to * keep its return value valid? * prevent fetching ->mnt_mountpoint, getting preempted away, having the mount moved *and* what used to be ->mnt_mountpoint evicted from dcache, now that it's no longer pinned, then mount_is_internal() regaining CPU and dereferencing ->mnt_mountpoint, which now points to hell knows what?
On Fri, 30 Jul 2021, Al Viro wrote: > On Wed, Jul 28, 2021 at 08:37:45AM +1000, NeilBrown wrote: > > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c > > index baa12ac36ece..22523e1cd478 100644 > > --- a/fs/nfsd/vfs.c > > +++ b/fs/nfsd/vfs.c > > @@ -64,7 +64,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct path *path_parent, > > .dentry = dget(path_parent->dentry)}; > > int err = 0; > > > > - err = follow_down(&path, 0); > > + err = follow_down(&path, LOOKUP_AUTOMOUNT); > > if (err < 0) > > goto out; > > if (path.mnt == path_parent->mnt && path.dentry == path_parent->dentry && > > @@ -73,6 +73,13 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct path *path_parent, > > path_put(&path); > > goto out; > > } > > + if (mount_is_internal(path.mnt)) { > > + /* Use the new path, but don't look for a new export */ > > + /* FIXME should I check NOHIDE in this case?? */ > > + path_put(path_parent); > > + *path_parent = path; > > + goto out; > > + } > > ... IOW, mount_is_internal() is called with no exclusion whatsoever. What's there > to > * keep its return value valid? > * prevent fetching ->mnt_mountpoint, getting preempted away, having > the mount moved *and* what used to be ->mnt_mountpoint evicted from dcache, > now that it's no longer pinned, then mount_is_internal() regaining CPU and > dereferencing ->mnt_mountpoint, which now points to hell knows what? > Yes, mount_is_internal needs to same mount_lock protection that lookup_mnt() has. Thanks. I don't think it matter how long the result remains valid. The only realistic transtion is from True to False, but the fact that it *was* True means that it is acceptable for the lookup to have succeeded. i.e. If the mountpoint was moved which a request was being processed it will either cause the same result as if it happened before the request started, or after it finished. Either seems OK. Thanks, NeilBrown
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 67af0c5c1543..80b1cc0334fa 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -370,6 +370,8 @@ svcxdr_encode_fattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr, case FSIDSOURCE_UUID: fsid = ((u64 *)fhp->fh_export->ex_uuid)[0]; fsid ^= ((u64 *)fhp->fh_export->ex_uuid)[1]; + if (fhp->fh_mnt != fhp->fh_export->ex_path.mnt) + fsid ^= nfsd_get_mounted_on(fhp->fh_mnt); break; default: fsid = (u64)huge_encode_dev(fhp->fh_dentry->d_sb->s_dev); @@ -1094,8 +1096,8 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, __be32 rv = nfserr_noent; dparent = cd->fh.fh_dentry; - exp = cd->fh.fh_export; - child.mnt = cd->fh.fh_mnt; + exp = exp_get(cd->fh.fh_export); + child.mnt = mntget(cd->fh.fh_mnt); if (isdotent(name, namlen)) { if (namlen == 2) { @@ -1112,15 +1114,27 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, child.dentry = dget(dparent); } else child.dentry = lookup_positive_unlocked(name, dparent, namlen); - if (IS_ERR(child.dentry)) + if (IS_ERR(child.dentry)) { + mntput(child.mnt); + exp_put(exp); return rv; - if (d_mountpoint(child.dentry)) - goto out; - if (child.dentry->d_inode->i_ino != ino) + } + /* If child is a mountpoint, then we want to expose the fact + * so client can create a mountpoint. If not, then a different + * ino number probably means a race with rename, so avoid providing + * too much detail. + */ + if (nfsd_mountpoint(child.dentry, exp)) { + int err; + err = nfsd_cross_mnt(cd->rqstp, &child, &exp); + if (err) + goto out; + } else if (child.dentry->d_inode->i_ino != ino) goto out; rv = fh_compose(fhp, exp, &child, &cd->fh); out: - dput(child.dentry); + path_put(&child); + exp_put(exp); return rv; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index d5683b6a74b2..4dbc99ed2c8b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2817,6 +2817,8 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, struct kstat stat; struct svc_fh *tempfh = NULL; struct kstatfs statfs; + u64 mounted_on_ino; + u64 sub_fsid; __be32 *p; int starting_len = xdr->buf->len; int attrlen_offset; @@ -2871,6 +2873,24 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, goto out; fhp = tempfh; } + if ((bmval0 & FATTR4_WORD0_FSID) || + (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID)) { + mounted_on_ino = stat.ino; + sub_fsid = 0; + /* + * The inode number that the current mnt is mounted on is + * used for MOUNTED_ON_FILED if we are at the root, + * and for sub_fsid if mnt is not the export mnt. + */ + if (ignore_crossmnt == 0) { + u64 moi = nfsd_get_mounted_on(mnt); + + if (dentry == mnt->mnt_root && moi) + mounted_on_ino = moi; + if (mnt != exp->ex_path.mnt) + sub_fsid = moi; + } + } if (bmval0 & FATTR4_WORD0_ACL) { err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl); if (err == -EOPNOTSUPP) @@ -3008,6 +3028,8 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, case FSIDSOURCE_UUID: p = xdr_encode_opaque_fixed(p, exp->ex_uuid, EX_UUID_LEN); + if (mnt != exp->ex_path.mnt) + *(u64*)(p-2) ^= sub_fsid; break; } } @@ -3253,20 +3275,10 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, *p++ = cpu_to_be32(stat.mtime.tv_nsec); } if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { - u64 ino; - p = xdr_reserve_space(xdr, 8); if (!p) goto out_resource; - /* - * Get parent's attributes if not ignoring crossmount - * and this is the root of a cross-mounted filesystem. - */ - if (ignore_crossmnt == 0 && dentry == mnt->mnt_root) - ino = nfsd_get_mounted_on(mnt); - if (!ino) - ino = stat.ino; - p = xdr_encode_hyper(p, ino); + p = xdr_encode_hyper(p, mounted_on_ino); } #ifdef CONFIG_NFSD_PNFS if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 4023046f63e2..4b53838bca89 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -9,7 +9,7 @@ */ #include <linux/exportfs.h> - +#include <linux/namei.h> #include <linux/sunrpc/svcauth_gss.h> #include "nfsd.h" #include "vfs.h" @@ -285,6 +285,11 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) default: dentry = ERR_PTR(-ESTALE); } + } else if (nfsd_mountpoint(dentry, exp)) { + struct path path = { .mnt = mnt, .dentry = dentry }; + follow_down(&path, LOOKUP_AUTOMOUNT); + mnt = path.mnt; + dentry = path.dentry; } } if (dentry == NULL) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index baa12ac36ece..22523e1cd478 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -64,7 +64,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct path *path_parent, .dentry = dget(path_parent->dentry)}; int err = 0; - err = follow_down(&path, 0); + err = follow_down(&path, LOOKUP_AUTOMOUNT); if (err < 0) goto out; if (path.mnt == path_parent->mnt && path.dentry == path_parent->dentry && @@ -73,6 +73,13 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct path *path_parent, path_put(&path); goto out; } + if (mount_is_internal(path.mnt)) { + /* Use the new path, but don't look for a new export */ + /* FIXME should I check NOHIDE in this case?? */ + path_put(path_parent); + *path_parent = path; + goto out; + } exp2 = rqst_exp_get_by_name(rqstp, &path); if (IS_ERR(exp2)) { @@ -157,7 +164,7 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) return 1; if (nfsd4_is_junction(dentry)) return 1; - if (d_mountpoint(dentry)) + if (d_managed(dentry)) /* * Might only be a mountpoint in a different namespace, * but we need to check.
Enhance nfsd to detect internal mounts and to cross them without requiring a new export. Also ensure the fsid reported is different for different submounts. We do this by xoring in the ino of the mounted-on directory. This makes sense for btrfs at least. Signed-off-by: NeilBrown <neilb@suse.de> --- fs/nfsd/nfs3xdr.c | 28 +++++++++++++++++++++------- fs/nfsd/nfs4xdr.c | 34 +++++++++++++++++++++++----------- fs/nfsd/nfsfh.c | 7 ++++++- fs/nfsd/vfs.c | 11 +++++++++-- 4 files changed, 59 insertions(+), 21 deletions(-)