diff mbox series

[2/4] nfsd: pre/post attr is using wrong change attribute

Message ID 1605583086-19869-2-git-send-email-bfields@redhat.com (mailing list archive)
State New, archived
Headers show
Series [1/4] nfsd: move fill_{pre,post}_wcc to nfsfh.c | expand

Commit Message

J. Bruce Fields Nov. 17, 2020, 3:18 a.m. UTC
From: "J. Bruce Fields" <bfields@redhat.com>

fill_{pre,post}_wcc are unconditionally using i_version even when the
underlying filesystem doesn't have proper support for i_version.

Move the code that chooses which i_version to use to the common
nfsd4_change_attribute().

The NFSEXP_V4ROOT case probably doesn't matter (the pseudoroot
filesystem is usually read-only and unlikely to see operations with pre
and post change attributes), but let's put it in the same place anyway
for consistency.

Fixes: c654b8a9cba6 ("nfsd: support ext4 i_version")
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4xdr.c | 11 +----------
 fs/nfsd/nfsfh.c   | 11 +++++++----
 fs/nfsd/nfsfh.h   | 23 -----------------------
 fs/nfsd/vfs.c     | 32 ++++++++++++++++++++++++++++++++
 fs/nfsd/vfs.h     |  3 +++
 5 files changed, 43 insertions(+), 37 deletions(-)

Comments

Jeff Layton Nov. 17, 2020, 12:34 p.m. UTC | #1
On Mon, 2020-11-16 at 22:18 -0500, J. Bruce Fields wrote:
> From: "J. Bruce Fields" <bfields@redhat.com>
> 
> fill_{pre/post}_attr are unconditionally using i_version even when the
> underlying filesystem doesn't have proper support for i_version.
> 
> Move the code that chooses which i_version to use to the common
> nfsd4_change_attribute().
> 
> The NFSEXP_V4ROOT case probably doesn't matter (the pseudoroot
> filesystem is usually read-only and unlikely to see operations with pre
> and post change attributes), but let's put it in the same place anyway
> for consistency.
> 
> Fixes: c654b8a9cba6 ("nfsd: support ext4 i_version")
> Signed-off-by: J. Bruce Fields <bfields@redhat.com>
> ---
>  fs/nfsd/nfs4xdr.c | 11 +----------
>  fs/nfsd/nfsfh.c   | 11 +++++++----
>  fs/nfsd/nfsfh.h   | 23 -----------------------
>  fs/nfsd/vfs.c     | 32 ++++++++++++++++++++++++++++++++
>  fs/nfsd/vfs.h     |  3 +++
>  5 files changed, 43 insertions(+), 37 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 833a2c64dfe8..6806207b6d18 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -2295,16 +2295,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
>  static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
>  			     struct svc_export *exp)
>  {
> -	if (exp->ex_flags & NFSEXP_V4ROOT) {
> -		*p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
> -		*p++ = 0;
> -	} else if (IS_I_VERSION(inode)) {
> -		p = xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode));
> -	} else {
> -		*p++ = cpu_to_be32(stat->ctime.tv_sec);
> -		*p++ = cpu_to_be32(stat->ctime.tv_nsec);
> -	}
> -	return p;
> +	return xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode, exp));
>  }
>  
> 
> 
> 
>  /*
> diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
> index b3b4e8809aa9..4fbe1413e767 100644
> --- a/fs/nfsd/nfsfh.c
> +++ b/fs/nfsd/nfsfh.c
> @@ -719,6 +719,7 @@ void fill_pre_wcc(struct svc_fh *fhp)
>  {
>  	struct inode    *inode;
>  	struct kstat	stat;
> +	struct svc_export *exp = fhp->fh_export;
>  	__be32 err;
>  
> 
> 
> 
>  	if (fhp->fh_pre_saved)
> @@ -736,7 +737,7 @@ void fill_pre_wcc(struct svc_fh *fhp)
>  	fhp->fh_pre_mtime = stat.mtime;
>  	fhp->fh_pre_ctime = stat.ctime;
>  	fhp->fh_pre_size  = stat.size;
> -	fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
> +	fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode, exp);
>  	fhp->fh_pre_saved = true;
>  }
>  
> 
> 
> 
> @@ -746,17 +747,19 @@ void fill_pre_wcc(struct svc_fh *fhp)
>  void fill_post_wcc(struct svc_fh *fhp)
>  {
>  	__be32 err;
> +	struct inode *inode = d_inode(fhp->fh_dentry);
> +	struct svc_export *exp = fhp->fh_export;
>  
> 
> 
> 
>  	if (fhp->fh_post_saved)
>  		printk("nfsd: inode locked twice during operation.\n");
>  
> 
> 
> 
>  	err = fh_getattr(fhp, &fhp->fh_post_attr);
> -	fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr,
> -						     d_inode(fhp->fh_dentry));
> +	fhp->fh_post_change =
> +			nfsd4_change_attribute(&fhp->fh_post_attr, inode, exp);
>  	if (err) {
>  		fhp->fh_post_saved = false;
>  		/* Grab the ctime anyway - set_change_info might use it */
> -		fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime;
> +		fhp->fh_post_attr.ctime = inode->i_ctime;
>  	} else
>  		fhp->fh_post_saved = true;
>  }
> diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
> index 56cfbc361561..547aef9b3265 100644
> --- a/fs/nfsd/nfsfh.h
> +++ b/fs/nfsd/nfsfh.h
> @@ -245,29 +245,6 @@ fh_clear_wcc(struct svc_fh *fhp)
>  	fhp->fh_pre_saved = false;
>  }
>  
> 
> 
> 
> -/*
> - * We could use i_version alone as the change attribute.  However,
> - * i_version can go backwards after a reboot.  On its own that doesn't
> - * necessarily cause a problem, but if i_version goes backwards and then
> - * is incremented again it could reuse a value that was previously used
> - * before boot, and a client who queried the two values might
> - * incorrectly assume nothing changed.
> - *
> - * By using both ctime and the i_version counter we guarantee that as
> - * long as time doesn't go backwards we never reuse an old value.
> - */
> -static inline u64 nfsd4_change_attribute(struct kstat *stat,
> -					 struct inode *inode)
> -{
> -	u64 chattr;
> -
> -	chattr =  stat->ctime.tv_sec;
> -	chattr <<= 30;
> -	chattr += stat->ctime.tv_nsec;
> -	chattr += inode_query_iversion(inode);
> -	return chattr;
> -}
> -
>  extern void fill_pre_wcc(struct svc_fh *fhp);
>  extern void fill_post_wcc(struct svc_fh *fhp);
>  #else
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 1ecaceebee13..2c71b02dd1fe 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -2390,3 +2390,35 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
>  
> 
> 
> 
>  	return err? nfserrno(err) : 0;
>  }
> +
> +/*
> + * We could use i_version alone as the change attribute.  However,
> + * i_version can go backwards after a reboot.  On its own that doesn't
> + * necessarily cause a problem, but if i_version goes backwards and then
> + * is incremented again it could reuse a value that was previously used
> + * before boot, and a client who queried the two values might
> + * incorrectly assume nothing changed.
> + *
> + * By using both ctime and the i_version counter we guarantee that as
> + * long as time doesn't go backwards we never reuse an old value.
> + */
> +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode,
> +					 struct svc_export *exp)
> +{
> +	u64 chattr;
> +
> +	if (exp->ex_flags & NFSEXP_V4ROOT) {
> +		chattr = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
> +		chattr <<= 32;
> +	} else if (IS_I_VERSION(inode)) {
> +		chattr = stat->ctime.tv_sec;
> +		chattr <<= 30;
> +		chattr += stat->ctime.tv_nsec;
> +		chattr += inode_query_iversion(inode);
> +	} else {
> +		chattr = stat->ctime.tv_sec;
> +		chattr <<= 32;
> +		chattr += stat->ctime.tv_nsec;
> +	}
> +	return chattr;
> +}


I don't think I described what I was thinking well. Let me try again...

There should be no need to change the code in iversion.h -- I think we
can do this in a way that's confined to just nfsd/export code.

What I would suggest is to have nfsd4_change_attribute call the
fetch_iversion op if it exists, instead of checking IS_I_VERSION and
doing the stuff in that block. If fetch_iversion is NULL, then just use
the ctime.

Then, you just need to make sure that the filesystems' export_ops have
an appropriate fetch_iversion vector. xfs, ext4 and btrfs can just call
inode_query_iversion, and NFS and Ceph can call inode_peek_iversion_raw.
The rest of the filesystems can leave fetch_iversion as NULL (since we
don't want to use it on them).

> diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
> index a2442ebe5acf..26ed15256340 100644
> --- a/fs/nfsd/vfs.h
> +++ b/fs/nfsd/vfs.h
> @@ -132,6 +132,9 @@ __be32		nfsd_statfs(struct svc_rqst *, struct svc_fh *,
>  __be32		nfsd_permission(struct svc_rqst *, struct svc_export *,
>  				struct dentry *, int);
>  
> 
> 
> 
> +u64		nfsd4_change_attribute(struct kstat *stat, struct inode *inode,
> +				struct svc_export *exp);
> +
>  static inline int fh_want_write(struct svc_fh *fh)
>  {
>  	int ret;
J. Bruce Fields Nov. 17, 2020, 3:25 p.m. UTC | #2
On Mon, Nov 16, 2020 at 10:18:04PM -0500, J. Bruce Fields wrote:
> From: "J. Bruce Fields" <bfields@redhat.com>
> 
> fill_{pre/post}_attr are unconditionally using i_version even when the
> underlying filesystem doesn't have proper support for i_version.

Actually, I didn't have this quite right....

These values are queried, but they aren't used, thanks to the
"change_supported" field of nfsd4_change_info; in set_change_info():

	cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry));

and then later on encode_cinfo() chooses to use the stored change attribute
or ctime values depending on whether change_supported is set.

But as of the ctime changes, just querying the change attribute here has
side effects.

So, that explains why Daire's team was seeing a performance regression,
while no one was complaining about our returned change info being
garbage.

Anyway.

--b.

> 
> Move the code that chooses which i_version to use to the common
> nfsd4_change_attribute().
> 
> The NFSEXP_V4ROOT case probably doesn't matter (the pseudoroot
> filesystem is usually read-only and unlikely to see operations with pre
> and post change attributes), but let's put it in the same place anyway
> for consistency.
> 
> Fixes: c654b8a9cba6 ("nfsd: support ext4 i_version")
> Signed-off-by: J. Bruce Fields <bfields@redhat.com>
> ---
>  fs/nfsd/nfs4xdr.c | 11 +----------
>  fs/nfsd/nfsfh.c   | 11 +++++++----
>  fs/nfsd/nfsfh.h   | 23 -----------------------
>  fs/nfsd/vfs.c     | 32 ++++++++++++++++++++++++++++++++
>  fs/nfsd/vfs.h     |  3 +++
>  5 files changed, 43 insertions(+), 37 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 833a2c64dfe8..6806207b6d18 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -2295,16 +2295,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
>  static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
>  			     struct svc_export *exp)
>  {
> -	if (exp->ex_flags & NFSEXP_V4ROOT) {
> -		*p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
> -		*p++ = 0;
> -	} else if (IS_I_VERSION(inode)) {
> -		p = xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode));
> -	} else {
> -		*p++ = cpu_to_be32(stat->ctime.tv_sec);
> -		*p++ = cpu_to_be32(stat->ctime.tv_nsec);
> -	}
> -	return p;
> +	return xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode, exp));
>  }
>  
>  /*
> diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
> index b3b4e8809aa9..4fbe1413e767 100644
> --- a/fs/nfsd/nfsfh.c
> +++ b/fs/nfsd/nfsfh.c
> @@ -719,6 +719,7 @@ void fill_pre_wcc(struct svc_fh *fhp)
>  {
>  	struct inode    *inode;
>  	struct kstat	stat;
> +	struct svc_export *exp = fhp->fh_export;
>  	__be32 err;
>  
>  	if (fhp->fh_pre_saved)
> @@ -736,7 +737,7 @@ void fill_pre_wcc(struct svc_fh *fhp)
>  	fhp->fh_pre_mtime = stat.mtime;
>  	fhp->fh_pre_ctime = stat.ctime;
>  	fhp->fh_pre_size  = stat.size;
> -	fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
> +	fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode, exp);
>  	fhp->fh_pre_saved = true;
>  }
>  
> @@ -746,17 +747,19 @@ void fill_pre_wcc(struct svc_fh *fhp)
>  void fill_post_wcc(struct svc_fh *fhp)
>  {
>  	__be32 err;
> +	struct inode *inode = d_inode(fhp->fh_dentry);
> +	struct svc_export *exp = fhp->fh_export;
>  
>  	if (fhp->fh_post_saved)
>  		printk("nfsd: inode locked twice during operation.\n");
>  
>  	err = fh_getattr(fhp, &fhp->fh_post_attr);
> -	fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr,
> -						     d_inode(fhp->fh_dentry));
> +	fhp->fh_post_change =
> +			nfsd4_change_attribute(&fhp->fh_post_attr, inode, exp);
>  	if (err) {
>  		fhp->fh_post_saved = false;
>  		/* Grab the ctime anyway - set_change_info might use it */
> -		fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime;
> +		fhp->fh_post_attr.ctime = inode->i_ctime;
>  	} else
>  		fhp->fh_post_saved = true;
>  }
> diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
> index 56cfbc361561..547aef9b3265 100644
> --- a/fs/nfsd/nfsfh.h
> +++ b/fs/nfsd/nfsfh.h
> @@ -245,29 +245,6 @@ fh_clear_wcc(struct svc_fh *fhp)
>  	fhp->fh_pre_saved = false;
>  }
>  
> -/*
> - * We could use i_version alone as the change attribute.  However,
> - * i_version can go backwards after a reboot.  On its own that doesn't
> - * necessarily cause a problem, but if i_version goes backwards and then
> - * is incremented again it could reuse a value that was previously used
> - * before boot, and a client who queried the two values might
> - * incorrectly assume nothing changed.
> - *
> - * By using both ctime and the i_version counter we guarantee that as
> - * long as time doesn't go backwards we never reuse an old value.
> - */
> -static inline u64 nfsd4_change_attribute(struct kstat *stat,
> -					 struct inode *inode)
> -{
> -	u64 chattr;
> -
> -	chattr =  stat->ctime.tv_sec;
> -	chattr <<= 30;
> -	chattr += stat->ctime.tv_nsec;
> -	chattr += inode_query_iversion(inode);
> -	return chattr;
> -}
> -
>  extern void fill_pre_wcc(struct svc_fh *fhp);
>  extern void fill_post_wcc(struct svc_fh *fhp);
>  #else
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 1ecaceebee13..2c71b02dd1fe 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -2390,3 +2390,35 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
>  
>  	return err? nfserrno(err) : 0;
>  }
> +
> +/*
> + * We could use i_version alone as the change attribute.  However,
> + * i_version can go backwards after a reboot.  On its own that doesn't
> + * necessarily cause a problem, but if i_version goes backwards and then
> + * is incremented again it could reuse a value that was previously used
> + * before boot, and a client who queried the two values might
> + * incorrectly assume nothing changed.
> + *
> + * By using both ctime and the i_version counter we guarantee that as
> + * long as time doesn't go backwards we never reuse an old value.
> + */
> +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode,
> +					 struct svc_export *exp)
> +{
> +	u64 chattr;
> +
> +	if (exp->ex_flags & NFSEXP_V4ROOT) {
> +		chattr = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
> +		chattr <<= 32;
> +	} else if (IS_I_VERSION(inode)) {
> +		chattr = stat->ctime.tv_sec;
> +		chattr <<= 30;
> +		chattr += stat->ctime.tv_nsec;
> +		chattr += inode_query_iversion(inode);
> +	} else {
> +		chattr = stat->ctime.tv_sec;
> +		chattr <<= 32;
> +		chattr += stat->ctime.tv_nsec;
> +	}
> +	return chattr;
> +}
> diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
> index a2442ebe5acf..26ed15256340 100644
> --- a/fs/nfsd/vfs.h
> +++ b/fs/nfsd/vfs.h
> @@ -132,6 +132,9 @@ __be32		nfsd_statfs(struct svc_rqst *, struct svc_fh *,
>  __be32		nfsd_permission(struct svc_rqst *, struct svc_export *,
>  				struct dentry *, int);
>  
> +u64		nfsd4_change_attribute(struct kstat *stat, struct inode *inode,
> +				struct svc_export *exp);
> +
>  static inline int fh_want_write(struct svc_fh *fh)
>  {
>  	int ret;
> -- 
> 2.28.0
J. Bruce Fields Nov. 17, 2020, 3:26 p.m. UTC | #3
On Tue, Nov 17, 2020 at 07:34:49AM -0500, Jeff Layton wrote:
> I don't think I described what I was thinking well. Let me try again...
> 
> There should be no need to change the code in iversion.h -- I think we
> can do this in a way that's confined to just nfsd/export code.
> 
> What I would suggest is to have nfsd4_change_attribute call the
> fetch_iversion op if it exists, instead of checking IS_I_VERSION and
> doing the stuff in that block. If fetch_iversion is NULL, then just use
> the ctime.
> 
> Then, you just need to make sure that the filesystems' export_ops have
> an appropriate fetch_iversion vector. xfs, ext4 and btrfs can just call
> inode_query_iversion, and NFS and Ceph can call inode_peek_iversion_raw.
> The rest of the filesystems can leave fetch_iversion as NULL (since we
> don't want to use it on them).

Thanks for your patience, that makes sense, I'll try it.

--b.
Jeff Layton Nov. 17, 2020, 3:34 p.m. UTC | #4
On Tue, 2020-11-17 at 10:26 -0500, J. Bruce Fields wrote:
> On Tue, Nov 17, 2020 at 07:34:49AM -0500, Jeff Layton wrote:
> > I don't think I described what I was thinking well. Let me try again...
> > 
> > There should be no need to change the code in iversion.h -- I think we
> > can do this in a way that's confined to just nfsd/export code.
> > 
> > What I would suggest is to have nfsd4_change_attribute call the
> > fetch_iversion op if it exists, instead of checking IS_I_VERSION and
> > doing the stuff in that block. If fetch_iversion is NULL, then just use
> > the ctime.
> > 
> > Then, you just need to make sure that the filesystems' export_ops have
> > an appropriate fetch_iversion vector. xfs, ext4 and btrfs can just call
> > inode_query_iversion, and NFS and Ceph can call inode_peek_iversion_raw.
> > The rest of the filesystems can leave fetch_iversion as NULL (since we
> > don't want to use it on them).
> 
> Thanks for your patience, that makes sense, I'll try it.
> 

There is one gotcha in here though... ext4 needs to also handle the case
where SB_I_VERSION is not set. The simple fix might be to just have
different export ops for ext4 based on whether it was mounted with -o
iversion or not, but maybe there is some better way to do it?
J. Bruce Fields Nov. 20, 2020, 10:38 p.m. UTC | #5
On Tue, Nov 17, 2020 at 10:34:57AM -0500, Jeff Layton wrote:
> On Tue, 2020-11-17 at 10:26 -0500, J. Bruce Fields wrote:
> > On Tue, Nov 17, 2020 at 07:34:49AM -0500, Jeff Layton wrote:
> > > I don't think I described what I was thinking well. Let me try again...
> > > 
> > > There should be no need to change the code in iversion.h -- I think we
> > > can do this in a way that's confined to just nfsd/export code.
> > > 
> > > What I would suggest is to have nfsd4_change_attribute call the
> > > fetch_iversion op if it exists, instead of checking IS_I_VERSION and
> > > doing the stuff in that block. If fetch_iversion is NULL, then just use
> > > the ctime.
> > > 
> > > Then, you just need to make sure that the filesystems' export_ops have
> > > an appropriate fetch_iversion vector. xfs, ext4 and btrfs can just call
> > > inode_query_iversion, and NFS and Ceph can call inode_peek_iversion_raw.
> > > The rest of the filesystems can leave fetch_iversion as NULL (since we
> > > don't want to use it on them).
> > 
> > Thanks for your patience, that makes sense, I'll try it.
> > 
> 
> There is one gotcha in here though... ext4 needs to also handle the case
> where SB_I_VERSION is not set. The simple fix might be to just have
> different export ops for ext4 based on whether it was mounted with -o
> iversion or not, but maybe there is some better way to do it?

I was thinking ext4's export op could check for I_VERSION on its own and
vary behavior based on that.

I'll follow up with new patches in a moment.

I think the first one's all that's needed to fix the problem Daire
identified.  I'm a little less sure of the rest.

Lightly tested, just by running them through my usual regression tests
(which don't re-export) and then running connectathon on a 4.2 re-export
of a 4.2 mount.

The latter triggered a crash preceded by a KASAN use-after-free warning.
Looks like it might be a problem with blocking lock notifications,
probably not related to these patches.

--b.
J. Bruce Fields Nov. 20, 2020, 10:44 p.m. UTC | #6
On Fri, Nov 20, 2020 at 05:38:31PM -0500, J. Bruce Fields wrote:
> On Tue, Nov 17, 2020 at 10:34:57AM -0500, Jeff Layton wrote:
> > On Tue, 2020-11-17 at 10:26 -0500, J. Bruce Fields wrote:
> > > On Tue, Nov 17, 2020 at 07:34:49AM -0500, Jeff Layton wrote:
> > > > I don't think I described what I was thinking well. Let me try again...
> > > > 
> > > > There should be no need to change the code in iversion.h -- I think we
> > > > can do this in a way that's confined to just nfsd/export code.
> > > > 
> > > > What I would suggest is to have nfsd4_change_attribute call the
> > > > fetch_iversion op if it exists, instead of checking IS_I_VERSION and
> > > > doing the stuff in that block. If fetch_iversion is NULL, then just use
> > > > the ctime.
> > > > 
> > > > Then, you just need to make sure that the filesystems' export_ops have
> > > > an appropriate fetch_iversion vector. xfs, ext4 and btrfs can just call
> > > > inode_query_iversion, and NFS and Ceph can call inode_peek_iversion_raw.
> > > > The rest of the filesystems can leave fetch_iversion as NULL (since we
> > > > don't want to use it on them).
> > > 
> > > Thanks for your patience, that makes sense, I'll try it.
> > > 
> > 
> > There is one gotcha in here though... ext4 needs to also handle the case
> > where SB_I_VERSION is not set. The simple fix might be to just have
> > different export ops for ext4 based on whether it was mounted with -o
> > iversion or not, but maybe there is some better way to do it?
> 
> I was thinking ext4's export op could check for I_VERSION on its own and
> vary behavior based on that.
> 
> I'll follow up with new patches in a moment.
> 
> I think the first one's all that's needed to fix the problem Daire
> identified.  I'm a little less sure of the rest.
> 
> Lightly tested, just by running them through my usual regression tests
> (which don't re-export) and then running connectathon on a 4.2 re-export
> of a 4.2 mount.
> 
> The latter triggered a crash preceded by a KASAN use-after free warning.
> Looks like it might be a problem with blocking lock notifications,
> probably not related to these patches.

Another nit I ran across:

Some NFSv4 directory-modifying operations return pre- and post- change
attributes together with an "atomic" flag that's supposed to indicate
whether the change attributes were read atomically with the operation.
It looks like we're setting the atomic flag under the assumptions that
local vfs locks are sufficient to guarantee atomicity, which isn't right
when we're exporting a distributed filesystem.

In the case we're reexporting NFS I guess ideal would be to use the pre-
and post- attributes that the original server returned and also save
having to do extra getattr calls.  Not sure how we'd do that,
though--more export operations?  Maybe for now we could just figure out
when to turn off the atomic bit.

--b.
Jeff Layton Nov. 21, 2020, 1:03 a.m. UTC | #7
On Fri, 2020-11-20 at 17:44 -0500, J. Bruce Fields wrote:
> On Fri, Nov 20, 2020 at 05:38:31PM -0500, J. Bruce Fields wrote:
> > On Tue, Nov 17, 2020 at 10:34:57AM -0500, Jeff Layton wrote:
> > > On Tue, 2020-11-17 at 10:26 -0500, J. Bruce Fields wrote:
> > > > On Tue, Nov 17, 2020 at 07:34:49AM -0500, Jeff Layton wrote:
> > > > > I don't think I described what I was thinking well. Let me try again...
> > > > > 
> > > > > There should be no need to change the code in iversion.h -- I think we
> > > > > can do this in a way that's confined to just nfsd/export code.
> > > > > 
> > > > > What I would suggest is to have nfsd4_change_attribute call the
> > > > > fetch_iversion op if it exists, instead of checking IS_I_VERSION and
> > > > > doing the stuff in that block. If fetch_iversion is NULL, then just use
> > > > > the ctime.
> > > > > 
> > > > > Then, you just need to make sure that the filesystems' export_ops have
> > > > > an appropriate fetch_iversion vector. xfs, ext4 and btrfs can just call
> > > > > inode_query_iversion, and NFS and Ceph can call inode_peek_iversion_raw.
> > > > > The rest of the filesystems can leave fetch_iversion as NULL (since we
> > > > > don't want to use it on them).
> > > > 
> > > > Thanks for your patience, that makes sense, I'll try it.
> > > > 
> > > 
> > > There is one gotcha in here though... ext4 needs to also handle the case
> > > where SB_I_VERSION is not set. The simple fix might be to just have
> > > different export ops for ext4 based on whether it was mounted with -o
> > > iversion or not, but maybe there is some better way to do it?
> > 
> > I was thinking ext4's export op could check for I_VERSION on its own and
> > vary behavior based on that.
> > 
> > I'll follow up with new patches in a moment.
> > 
> > I think the first one's all that's needed to fix the problem Daire
> > identified.  I'm a little less sure of the rest.
> > 
> > Lightly tested, just by running them through my usual regression tests
> > (which don't re-export) and then running connectathon on a 4.2 re-export
> > of a 4.2 mount.
> > 
> > The latter triggered a crash preceded by a KASAN use-after free warning.
> > Looks like it might be a problem with blocking lock notifications,
> > probably not related to these patches.
> 

The set looks pretty reasonable at first glance. Nice work.

Once you put this in, I'll plan to add a suitable fetch_iversion op for
ceph too.

> Another nit I ran across:
> 
> Some NFSv4 directory-modifying operations return pre- and post- change
> attributes together with an "atomic" flag that's supposed to indicate
> whether the change attributes were read atomically with the operation.
> It looks like we're setting the atomic flag under the assumptions that
> local vfs locks are sufficient to guarantee atomicity, which isn't right
> when we're exporting a distributed filesystem.
> 
> In the case we're reexporting NFS I guess ideal would be to use the pre-
> and post- attributes that the original server returned and also save
> having to do extra getattr calls.  Not sure how we'd do that,
> though--more export operations?  Maybe for now we could just figure out
> when to turn off the atomic bit.

Oh yeah, good point.

I'm not even sure that local locks are really enough -- IIRC, there are
still some race windows between doing the metadata operations and the
getattrs called to fill pre/post op attrs. Still, those windows are a
lot larger on something like NFS, so setting the flag there is really
stretching things.

One hacky fix might be to add a flags field to export_operations, and
have one that indicates that the atomic flag shouldn't be set. Then we
could add that flag to all of the netfs' (nfs, ceph, cifs), and anywhere
else that we thought it appropriate?

That approach might be helpful later too since we're starting to see a
wider variety of exportable filesystems these days. We may need more
"quirk" flags like this.
Daire Byrne Nov. 21, 2020, 9:44 p.m. UTC | #8
----- On 21 Nov, 2020, at 01:03, Jeff Layton jlayton@kernel.org wrote:
> On Fri, 2020-11-20 at 17:44 -0500, J. Bruce Fields wrote:
>> On Fri, Nov 20, 2020 at 05:38:31PM -0500, J. Bruce Fields wrote:
>> > On Tue, Nov 17, 2020 at 10:34:57AM -0500, Jeff Layton wrote:
>> > > On Tue, 2020-11-17 at 10:26 -0500, J. Bruce Fields wrote:
>> > > > On Tue, Nov 17, 2020 at 07:34:49AM -0500, Jeff Layton wrote:
>> > > > > I don't think I described what I was thinking well. Let me try again...
>> > > > > 
>> > > > > There should be no need to change the code in iversion.h -- I think we
>> > > > > can do this in a way that's confined to just nfsd/export code.
>> > > > > 
>> > > > > What I would suggest is to have nfsd4_change_attribute call the
>> > > > > fetch_iversion op if it exists, instead of checking IS_I_VERSION and
>> > > > > doing the stuff in that block. If fetch_iversion is NULL, then just use
>> > > > > the ctime.
>> > > > > 
>> > > > > Then, you just need to make sure that the filesystems' export_ops have
>> > > > > an appropriate fetch_iversion vector. xfs, ext4 and btrfs can just call
>> > > > > inode_query_iversion, and NFS and Ceph can call inode_peek_iversion_raw.
>> > > > > The rest of the filesystems can leave fetch_iversion as NULL (since we
>> > > > > don't want to use it on them).
>> > > > 
>> > > > Thanks for your patience, that makes sense, I'll try it.
>> > > > 
>> > > 
>> > > There is one gotcha in here though... ext4 needs to also handle the case
>> > > where SB_I_VERSION is not set. The simple fix might be to just have
>> > > different export ops for ext4 based on whether it was mounted with -o
>> > > iversion or not, but maybe there is some better way to do it?
>> > 
>> > I was thinking ext4's export op could check for I_VERSION on its own and
>> > vary behavior based on that.
>> > 
>> > I'll follow up with new patches in a moment.
>> > 
>> > I think the first one's all that's needed to fix the problem Daire
>> > identified.  I'm a little less sure of the rest.

I can confirm that patch 1/8 alone does indeed address the reported revalidation issue for us (as did the previous patch). The re-export server's client cache seems to remain intact and can serve the same cached results to multiple clients.

>> > Lightly tested, just by running them through my usual regression tests
>> > (which don't re-export) and then running connectathon on a 4.2 re-export
>> > of a 4.2 mount.
>> > 
>> > The latter triggered a crash preceded by a KASAN use-after free warning.
>> > Looks like it might be a problem with blocking lock notifications,
>> > probably not related to these patches.
>> >
> The set looks pretty reasonable at first glance. Nice work.
> 
> Once you put this in, I'll plan to add a suitable fetch_iversion op for
> ceph too.
> 
>> Another nit I ran across:
>> 
>> Some NFSv4 directory-modifying operations return pre- and post- change
>> attributes together with an "atomic" flag that's supposed to indicate
>> whether the change attributes were read atomically with the operation.
>> It looks like we're setting the atomic flag under the assumptions that
>> local vfs locks are sufficient to guarantee atomicity, which isn't right
>> when we're exporting a distributed filesystem.
>> 
>> In the case we're reexporting NFS I guess ideal would be to use the pre-
>> and post- attributes that the original server returned and also save
>> having to do extra getattr calls.  Not sure how we'd do that,
>> though--more export operations?  Maybe for now we could just figure out
>> when to turn off the atomic bit.
> 
> Oh yeah, good point.
> 
> I'm not even sure that local locks are really enough -- IIRC, there are
> still some race windows between doing the metadata operations and the
> getattrs called to fill pre/post op attrs. Still, those windows are a
> lot larger on something like NFS, so setting the flag there is really
> stretching things.
> 
> One hacky fix might be to add a flags field to export_operations, and
> have one that indicates that the atomic flag shouldn't be set. Then we
> could add that flag to all of the netfs' (nfs, ceph, cifs), and anywhere
> else that we thought it appropriate?
> 
> That approach might be helpful later too since we're starting to see a
> wider variety of exportable filesystems these days. We may need more
> "quirk" flags like this.
> --
> Jeff Layton <jlayton@kernel.org>

I should also mention that I still see a lot of unexpected repeat lookups even with the iversion optimisation patches with certain workloads. For example, looking at a network capture on the re-export server I might see 100s of getattr calls to the originating server for the same filehandle within 30 seconds, which I would have expected the client cache to serve. But it could also be that the client cache is under memory pressure and not holding that data for very long.

But now I do wonder if these NFSv4 directory modifications and pre/post change attributes could be one potential contributor? I might run some production loads with a v3 re-export of a v3 server to see if that changes anything.

Many thanks again for the patches, I will take the entire set and run them through our production re-export workloads to see if anything shakes out.

Daire
J. Bruce Fields Nov. 22, 2020, 12:02 a.m. UTC | #9
On Sat, Nov 21, 2020 at 09:44:29PM +0000, Daire Byrne wrote:
> ----- On 21 Nov, 2020, at 01:03, Jeff Layton jlayton@kernel.org wrote:
> > On Fri, 2020-11-20 at 17:44 -0500, J. Bruce Fields wrote:
> >> On Fri, Nov 20, 2020 at 05:38:31PM -0500, J. Bruce Fields wrote:
> >> > I think the first one's all that's needed to fix the problem Daire
> >> > identified.  I'm a little less sure of the rest.
> 
> I can confirm that patch 1/8 alone does indeed address the reported revalidation issue for us (as did the previous patch). The re-export server's client cache seems to remain intact and can serve the same cached results to multiple clients.

Thanks again for the testing.

> I should also mention that I still see a lot of unexpected repeat
> lookups even with the iversion optimisation patches with certain
> workloads. For example, looking at a network capture on the re-export
> server I might see 100s of getattr calls to the originating server for
> the same filehandle within 30 seconds which I would have expected the
> client cache to serve. But it could also be that the client cache is
> under memory pressure and not holding that data for very long.

That sounds weird.  Is the filehandle for a file or a directory?  Is the
file or directory actually changing at the time, and if so, is it the
client that's changing it?

Remind me what the setup is--a v3 re-export of a v4 mount?

--b.

> But now I do wonder if these NFSv4 directory modifications and
> pre/post change attributes could be one potential contributor? I might
> run some production loads with a v3 re-export of a v3 server to see if
> that changes anything.
> 
> Many thanks again for the patches, I will take the entire set and run
> them through our production re-export workloads to see if anything
> shakes out.
> 
> Daire
Daire Byrne Nov. 22, 2020, 1:55 a.m. UTC | #10
----- On 22 Nov, 2020, at 00:02, bfields bfields@fieldses.org wrote:
>> I should also mention that I still see a lot of unexpected repeat
>> lookups even with the iversion optimisation patches with certain
>> workloads. For example, looking at a network capture on the re-export
>> server I might see 100s of getattr calls to the originating server for
>> the same filehandle within 30 seconds which I would have expected the
>> client cache to serve. But it could also be that the client cache is
>> under memory pressure and not holding that data for very long.
> 
> That sounds weird.  Is the filehandle for a file or a directory?  Is the
> file or directory actually changing at the time, and if so, is it the
> client that's changing it?
> 
> Remind me what the setup is--a v3 re-export of a v4 mount?

Maybe this discussion should go back into the "Adventures in re-exporting" thread? But to give a quick answer here anyway...

The workload I have been looking at recently is a NFSv3 re-export of a NFSv4.2 mount. I can also say that it is generally when new files are being written to a directory. So yes, the files and dir are changing at the time but I still didn't expect to see so many repeated getattr neatly bundled together in short bursts, e.g. (re-export server = 10.156.12.1, originating server 10.21.22.117).

54544  88.147927  10.156.12.1 → 10.21.22.117 NFS 326 V4 Call SETATTR FH: 0x4dbdfb01
54547  88.160469  10.156.12.1 → 10.21.22.117 NFS 350 V4 Call SETATTR FH: 0x4dbdfb01
54556  88.185592  10.156.12.1 → 10.21.22.117 NFS 330 V4 Call SETATTR FH: 0x4dbdfb01
54559  88.198350  10.156.12.1 → 10.21.22.117 NFS 350 V4 Call SETATTR FH: 0x4dbdfb01
54562  88.211670  10.156.12.1 → 10.21.22.117 NFS 326 V4 Call SETATTR FH: 0x4dbdfb01
54565  88.243251  10.156.12.1 → 10.21.22.117 NFS 350 V4 Call OPEN DH: 0x4dbdfb01/
54637  88.269587  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
55078  88.277138  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call COMMIT FH: 0x4dbdfb01 Offset: 0 Len: 0
57747  88.390197  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
57748  88.390212  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
57749  88.390215  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
57750  88.390218  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
57751  88.390220  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
57752  88.390222  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
57753  88.390231  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
57754  88.390261  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call COMMIT FH: 0x4dbdfb01 Offset: 0 Len: 0
57755  88.390292  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
57852  88.415541  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call COMMIT FH: 0x4dbdfb01 Offset: 0 Len: 0
57853  88.415551  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
58965  88.442004  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
60201  88.486231  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
60615  88.505453  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
60616  88.505473  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
60617  88.505477  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
60618  88.505480  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
60619  88.505482  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call COMMIT FH: 0x4dbdfb01 Offset: 0 Len: 0

Often I only capture an open dh followed by a flurry of getattr:

 3068  24.603153  10.156.12.1 → 10.21.22.117 NFS 350 V4 Call OPEN DH: 0xb63a98ec/
 3089  24.641542  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
 3093  24.642172  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
 3140  24.719930  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
 3360  24.769423  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
 3376  24.771353  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
 3436  24.782817  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
 3569  24.798207  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
 3753  24.855233  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
 3777  24.856130  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
 3824  24.862919  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
 3873  24.873890  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
 4001  24.898289  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
 4070  24.925970  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
 4127  24.940616  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
 4174  24.985160  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
 4343  25.007565  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
 4344  25.008343  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
 4358  25.036177  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec

The common workload is that we will have multiple clients of the re-export server all writing different (frame) files into the same directory at the same time. But on the re-export server it is ultimately 16 threads of nfsd making those calls to the originating server.

The re-export server's client should be the only one making most of the changes, although there are other NFSv3 clients of the originating servers that could conceivably be updating files too.

Like I said, it might be interesting to see if we see the same behaviour with a NFSv3 re-export of an NFSv3 server.

Daire
J. Bruce Fields Nov. 22, 2020, 3:03 a.m. UTC | #11
On Sun, Nov 22, 2020 at 01:55:50AM +0000, Daire Byrne wrote:
> 
> ----- On 22 Nov, 2020, at 00:02, bfields bfields@fieldses.org wrote:
> >> I should also mention that I still see a lot of unexpected repeat
> >> lookups even with the iversion optimisation patches with certain
> >> workloads. For example, looking at a network capture on the re-export
> >> server I might see 100s of getattr calls to the originating server for
> >> the same filehandle within 30 seconds which I would have expected the
> >> client cache to serve. But it could also be that the client cache is
> >> under memory pressure and not holding that data for very long.
> > 
> > That sounds weird.  Is the filehandle for a file or a directory?  Is the
> > file or directory actually changing at the time, and if so, is it the
> > client that's changing it?
> > 
> > Remind me what the setup is--a v3 re-export of a v4 mount?
> 
> Maybe this discussion should go back into the "Advenvetures in re-exporting" thread? But to give a quick answer here anyway...
> 
> The workload I have been looking at recently is a NFSv3 re-export of a NFSv4.2 mount. I can also say that it is generally when new files are being written to a directory. So yes, the files and dir are changing at the time but I still didn't expect to see so many repeated getattr neatly bundled together in short bursts, e.g. (re-export server = 10.156.12.1, originating server 10.21.22.117).

Well, I guess the pre/post-op attributes could contribute to the
problem, in that they could unnecessarily turn a COMMIT into

	GETATTR
	COMMIT
	GETATTR

And ditto for anything that modifies file or directory contents.  But
I'd've thought some of those could have been cached.  Also it looks like
you've got more GETATTRs than that.  Hm.

--b.

> 
> 54544  88.147927  10.156.12.1 → 10.21.22.117 NFS 326 V4 Call SETATTR FH: 0x4dbdfb01
> 54547  88.160469  10.156.12.1 → 10.21.22.117 NFS 350 V4 Call SETATTR FH: 0x4dbdfb01
> 54556  88.185592  10.156.12.1 → 10.21.22.117 NFS 330 V4 Call SETATTR FH: 0x4dbdfb01
> 54559  88.198350  10.156.12.1 → 10.21.22.117 NFS 350 V4 Call SETATTR FH: 0x4dbdfb01
> 54562  88.211670  10.156.12.1 → 10.21.22.117 NFS 326 V4 Call SETATTR FH: 0x4dbdfb01
> 54565  88.243251  10.156.12.1 → 10.21.22.117 NFS 350 V4 Call OPEN DH: 0x4dbdfb01/
> 54637  88.269587  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
> 55078  88.277138  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call COMMIT FH: 0x4dbdfb01 Offset: 0 Len: 0
> 57747  88.390197  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
> 57748  88.390212  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
> 57749  88.390215  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
> 57750  88.390218  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
> 57751  88.390220  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
> 57752  88.390222  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
> 57753  88.390231  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
> 57754  88.390261  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call COMMIT FH: 0x4dbdfb01 Offset: 0 Len: 0
> 57755  88.390292  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
> 57852  88.415541  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call COMMIT FH: 0x4dbdfb01 Offset: 0 Len: 0
> 57853  88.415551  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
> 58965  88.442004  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
> 60201  88.486231  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
> 60615  88.505453  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
> 60616  88.505473  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
> 60617  88.505477  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
> 60618  88.505480  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0x4dbdfb01
> 60619  88.505482  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call COMMIT FH: 0x4dbdfb01 Offset: 0 Len: 0
> 
> Often I only capture an open dh followed by a flurry of getattr:
> 
>  3068  24.603153  10.156.12.1 → 10.21.22.117 NFS 350 V4 Call OPEN DH: 0xb63a98ec/
>  3089  24.641542  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
>  3093  24.642172  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
>  3140  24.719930  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
>  3360  24.769423  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
>  3376  24.771353  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
>  3436  24.782817  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
>  3569  24.798207  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
>  3753  24.855233  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
>  3777  24.856130  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
>  3824  24.862919  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
>  3873  24.873890  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
>  4001  24.898289  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
>  4070  24.925970  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
>  4127  24.940616  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
>  4174  24.985160  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
>  4343  25.007565  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
>  4344  25.008343  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
>  4358  25.036177  10.156.12.1 → 10.21.22.117 NFS 282 V4 Call GETATTR FH: 0xb63a98ec
> 
> The common workload is that we will have multiple clients of the re-export server all writing different (frame) files into the same directory at the same time. But on the re-export server it is ultimately 16 threads of nfsd making those calls to the originating server.
> 
> The re-export server's client should be the only one making most of the changes, although there are other NFSv3 clients of the originating servers that could conceivably be updating files too.
> 
> Like I said, it might be interesting to see if we see the same behaviour with a NFSv3 re-export of an NFSv3 server.
> 
> Daire
Daire Byrne Nov. 23, 2020, 8:07 p.m. UTC | #12
----- On 22 Nov, 2020, at 03:03, bfields bfields@fieldses.org wrote:
>> The workload I have been looking at recently is a NFSv3 re-export of a NFSv4.2
>> mount. I can also say that it is generally when new files are being written to
>> a directory. So yes, the files and dir are changing at the time but I still
>> didn't expect to see so many repeated getattr neatly bundled together in short
>> bursts, e.g. (re-export server = 10.156.12.1, originating server 10.21.22.117).
> 
> Well, I guess the pre/post-op attributes could contribute to the
> problem, in that they could unnecessarily turn a COMMIT into
> 
>	GETATTR
>	COMMIT
>	GETATTR
> 
> And ditto for anything that modifies file or directory contents.  But
> I'd've thought some of those could have been cached.  Also it looks like
> you've got more GETATTRs than that.  Hm.

Yea, I definitely see those COMMITs surrounded by GETATTRs with NFSv4.2... But as you say, I get way more repeat GETATTRs for the same filehandles.

I switched to a NFSv4.2 re-export of a NFSv3 server and saw the same kind of thing - sometimes the wire would see 4-5 GETATTRs for the same FH in tight sequence with nothing in between. So then I started thinking.... how does nconnect work again? Because my re-export server is mounting the originating server with nconnect=16 and the flurries of repeat GETATTRs often contain a count in that ballpark.

I need to re-test without nconnect... Maybe that's how it's supposed to work and I'm just being oversensitive after this iversion issue.

Daire
diff mbox series

Patch

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 833a2c64dfe8..6806207b6d18 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2295,16 +2295,7 @@  nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
 			     struct svc_export *exp)
 {
-	if (exp->ex_flags & NFSEXP_V4ROOT) {
-		*p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
-		*p++ = 0;
-	} else if (IS_I_VERSION(inode)) {
-		p = xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode));
-	} else {
-		*p++ = cpu_to_be32(stat->ctime.tv_sec);
-		*p++ = cpu_to_be32(stat->ctime.tv_nsec);
-	}
-	return p;
+	return xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode, exp));
 }
 
 /*
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index b3b4e8809aa9..4fbe1413e767 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -719,6 +719,7 @@  void fill_pre_wcc(struct svc_fh *fhp)
 {
 	struct inode    *inode;
 	struct kstat	stat;
+	struct svc_export *exp = fhp->fh_export;
 	__be32 err;
 
 	if (fhp->fh_pre_saved)
@@ -736,7 +737,7 @@  void fill_pre_wcc(struct svc_fh *fhp)
 	fhp->fh_pre_mtime = stat.mtime;
 	fhp->fh_pre_ctime = stat.ctime;
 	fhp->fh_pre_size  = stat.size;
-	fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
+	fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode, exp);
 	fhp->fh_pre_saved = true;
 }
 
@@ -746,17 +747,19 @@  void fill_pre_wcc(struct svc_fh *fhp)
 void fill_post_wcc(struct svc_fh *fhp)
 {
 	__be32 err;
+	struct inode *inode = d_inode(fhp->fh_dentry);
+	struct svc_export *exp = fhp->fh_export;
 
 	if (fhp->fh_post_saved)
 		printk("nfsd: inode locked twice during operation.\n");
 
 	err = fh_getattr(fhp, &fhp->fh_post_attr);
-	fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr,
-						     d_inode(fhp->fh_dentry));
+	fhp->fh_post_change =
+			nfsd4_change_attribute(&fhp->fh_post_attr, inode, exp);
 	if (err) {
 		fhp->fh_post_saved = false;
 		/* Grab the ctime anyway - set_change_info might use it */
-		fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime;
+		fhp->fh_post_attr.ctime = inode->i_ctime;
 	} else
 		fhp->fh_post_saved = true;
 }
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 56cfbc361561..547aef9b3265 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -245,29 +245,6 @@  fh_clear_wcc(struct svc_fh *fhp)
 	fhp->fh_pre_saved = false;
 }
 
-/*
- * We could use i_version alone as the change attribute.  However,
- * i_version can go backwards after a reboot.  On its own that doesn't
- * necessarily cause a problem, but if i_version goes backwards and then
- * is incremented again it could reuse a value that was previously used
- * before boot, and a client who queried the two values might
- * incorrectly assume nothing changed.
- *
- * By using both ctime and the i_version counter we guarantee that as
- * long as time doesn't go backwards we never reuse an old value.
- */
-static inline u64 nfsd4_change_attribute(struct kstat *stat,
-					 struct inode *inode)
-{
-	u64 chattr;
-
-	chattr =  stat->ctime.tv_sec;
-	chattr <<= 30;
-	chattr += stat->ctime.tv_nsec;
-	chattr += inode_query_iversion(inode);
-	return chattr;
-}
-
 extern void fill_pre_wcc(struct svc_fh *fhp);
 extern void fill_post_wcc(struct svc_fh *fhp);
 #else
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 1ecaceebee13..2c71b02dd1fe 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -2390,3 +2390,35 @@  nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 
 	return err? nfserrno(err) : 0;
 }
+
+/*
+ * We could use i_version alone as the change attribute.  However,
+ * i_version can go backwards after a reboot.  On its own that doesn't
+ * necessarily cause a problem, but if i_version goes backwards and then
+ * is incremented again it could reuse a value that was previously used
+ * before boot, and a client who queried the two values might
+ * incorrectly assume nothing changed.
+ *
+ * By using both ctime and the i_version counter we guarantee that as
+ * long as time doesn't go backwards we never reuse an old value.
+ */
+u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode,
+					 struct svc_export *exp)
+{
+	u64 chattr;
+
+	if (exp->ex_flags & NFSEXP_V4ROOT) {
+		chattr = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
+		chattr <<= 32;
+	} else if (IS_I_VERSION(inode)) {
+		chattr = stat->ctime.tv_sec;
+		chattr <<= 30;
+		chattr += stat->ctime.tv_nsec;
+		chattr += inode_query_iversion(inode);
+	} else {
+		chattr = stat->ctime.tv_sec;
+		chattr <<= 32;
+		chattr += stat->ctime.tv_nsec;
+	}
+	return chattr;
+}
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index a2442ebe5acf..26ed15256340 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -132,6 +132,9 @@  __be32		nfsd_statfs(struct svc_rqst *, struct svc_fh *,
 __be32		nfsd_permission(struct svc_rqst *, struct svc_export *,
 				struct dentry *, int);
 
+u64		nfsd4_change_attribute(struct kstat *stat, struct inode *inode,
+				struct svc_export *exp);
+
 static inline int fh_want_write(struct svc_fh *fh)
 {
 	int ret;