diff mbox

[v2,2/2] nfsd: store stat times in fill_pre_wcc() instead of inode times

Message ID 1514992475-8142-3-git-send-email-amir73il@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Amir Goldstein Jan. 3, 2018, 3:14 p.m. UTC
The time values in stat and inode may differ for overlayfs and stat time
values are the correct ones to use. This is also consistent with the fact
that fill_post_wcc() also stores stat time values.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/nfsd/nfs3xdr.c | 31 ++++++++++++++++++++++++++++++-
 fs/nfsd/nfs4xdr.c |  2 +-
 fs/nfsd/nfsfh.h   | 28 ++++++----------------------
 3 files changed, 37 insertions(+), 24 deletions(-)

Comments

Jeff Layton Jan. 3, 2018, 3:41 p.m. UTC | #1
On Wed, 2018-01-03 at 17:14 +0200, Amir Goldstein wrote:
> The time values in stat and inode may differ for overlayfs and stat time
> values are the correct ones to use. This is also consistent with the fact
> that fill_post_wcc() also stores stat time values.
> 
> Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> ---
>  fs/nfsd/nfs3xdr.c | 31 ++++++++++++++++++++++++++++++-
>  fs/nfsd/nfs4xdr.c |  2 +-
>  fs/nfsd/nfsfh.h   | 28 ++++++----------------------
>  3 files changed, 37 insertions(+), 24 deletions(-)
> 
> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> index 2758480555fa..1a70581e1cb2 100644
> --- a/fs/nfsd/nfs3xdr.c
> +++ b/fs/nfsd/nfs3xdr.c
> @@ -251,6 +251,34 @@ encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
>  }
>  
>  /*
> + * Fill in the pre_op attr for the wcc data
> + */
> +void fill_pre_wcc(struct svc_fh *fhp)
> +{
> +	struct inode    *inode;
> +	struct kstat	stat;
> +	__be32 err;
> +
> +	if (fhp->fh_pre_saved)
> +		return;
> +
> +	inode = d_inode(fhp->fh_dentry);
> +	err = fh_getattr(fhp, &stat);
> +	if (err) {
> +		/* Grab the times from inode anyway */
> +		stat.mtime = inode->i_mtime;
> +		stat.ctime = inode->i_ctime;
> +		stat.size  = inode->i_size;
> +	}
> +

Might be best to instead just not supply pre/post op attrs if the
getattr fails? They are technically optional with v3 -- we can just set
the attributes_follow bit to false there.

For v4, it's a little more complicated. Scraping it out of the inode
might be necessary for the cases where we need a change_info4.

Either way, it'd be good to know that these getattrs are failing if this
occurs. Maybe a KERN_WARNING printk or something might be good there?


> +	fhp->fh_pre_mtime = stat.mtime;
> +	fhp->fh_pre_ctime = stat.ctime;
> +	fhp->fh_pre_size  = stat.size;
> +	fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
> +	fhp->fh_pre_saved = true;
> +}
> +
> +/*
>   * Fill in the post_op attr for the wcc data
>   */
>  void fill_post_wcc(struct svc_fh *fhp)
> @@ -261,7 +289,8 @@ void fill_post_wcc(struct svc_fh *fhp)
>  		printk("nfsd: inode locked twice during operation.\n");
>  
>  	err = fh_getattr(fhp, &fhp->fh_post_attr);
> -	fhp->fh_post_change = nfsd4_change_attribute(d_inode(fhp->fh_dentry));
> +	fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr,
> +						     d_inode(fhp->fh_dentry));
>  	if (err) {
>  		fhp->fh_post_saved = false;
>  		/* Grab the ctime anyway - set_change_info might use it */
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 2c61c6b8ae09..fbe11ad9b9ff 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -1991,7 +1991,7 @@ static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
>  		*p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
>  		*p++ = 0;
>  	} else if (IS_I_VERSION(inode)) {
> -		p = xdr_encode_hyper(p, nfsd4_change_attribute(inode));
> +		p = xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode));
>  	} else {
>  		*p++ = cpu_to_be32(stat->ctime.tv_sec);
>  		*p++ = cpu_to_be32(stat->ctime.tv_nsec);
> diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
> index 43f31cf49bae..99be87b50ebe 100644
> --- a/fs/nfsd/nfsfh.h
> +++ b/fs/nfsd/nfsfh.h
> @@ -252,36 +252,20 @@ fh_clear_wcc(struct svc_fh *fhp)
>   * By using both ctime and the i_version counter we guarantee that as
>   * long as time doesn't go backwards we never reuse an old value.
>   */
> -static inline u64 nfsd4_change_attribute(struct inode *inode)
> +static inline u64 nfsd4_change_attribute(struct kstat *stat,
> +					 struct inode *inode)
>  {
>  	u64 chattr;
>  
> -	chattr =  inode->i_ctime.tv_sec;
> +	chattr =  stat->ctime.tv_sec;
>  	chattr <<= 30;
> -	chattr += inode->i_ctime.tv_nsec;
> +	chattr += stat->ctime.tv_nsec;
>  	chattr += inode->i_version;
>  	return chattr;
>  }
>  
> -/*
> - * Fill in the pre_op attr for the wcc data
> - */
> -static inline void
> -fill_pre_wcc(struct svc_fh *fhp)
> -{
> -	struct inode    *inode;
> -
> -	inode = d_inode(fhp->fh_dentry);
> -	if (!fhp->fh_pre_saved) {
> -		fhp->fh_pre_mtime = inode->i_mtime;
> -		fhp->fh_pre_ctime = inode->i_ctime;
> -		fhp->fh_pre_size  = inode->i_size;
> -		fhp->fh_pre_change = nfsd4_change_attribute(inode);
> -		fhp->fh_pre_saved = true;
> -	}
> -}
> -
> -extern void fill_post_wcc(struct svc_fh *);
> +extern void fill_pre_wcc(struct svc_fh *fhp);
> +extern void fill_post_wcc(struct svc_fh *fhp);
>  #else
>  #define fh_clear_wcc(ignored)
>  #define fill_pre_wcc(ignored)
Amir Goldstein Jan. 3, 2018, 3:48 p.m. UTC | #2
On Wed, Jan 3, 2018 at 5:41 PM, Jeff Layton <jlayton@kernel.org> wrote:
> On Wed, 2018-01-03 at 17:14 +0200, Amir Goldstein wrote:
>> The time values in stat and inode may differ for overlayfs and stat time
>> values are the correct ones to use. This is also consistent with the fact
>> that fill_post_wcc() also stores stat time values.
>>
>> Signed-off-by: Amir Goldstein <amir73il@gmail.com>
>> ---
>>  fs/nfsd/nfs3xdr.c | 31 ++++++++++++++++++++++++++++++-
>>  fs/nfsd/nfs4xdr.c |  2 +-
>>  fs/nfsd/nfsfh.h   | 28 ++++++----------------------
>>  3 files changed, 37 insertions(+), 24 deletions(-)
>>
>> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
>> index 2758480555fa..1a70581e1cb2 100644
>> --- a/fs/nfsd/nfs3xdr.c
>> +++ b/fs/nfsd/nfs3xdr.c
>> @@ -251,6 +251,34 @@ encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
>>  }
>>
>>  /*
>> + * Fill in the pre_op attr for the wcc data
>> + */
>> +void fill_pre_wcc(struct svc_fh *fhp)
>> +{
>> +     struct inode    *inode;
>> +     struct kstat    stat;
>> +     __be32 err;
>> +
>> +     if (fhp->fh_pre_saved)
>> +             return;
>> +
>> +     inode = d_inode(fhp->fh_dentry);
>> +     err = fh_getattr(fhp, &stat);
>> +     if (err) {
>> +             /* Grab the times from inode anyway */
>> +             stat.mtime = inode->i_mtime;
>> +             stat.ctime = inode->i_ctime;
>> +             stat.size  = inode->i_size;
>> +     }
>> +
>
> Might be best to instead just not supply pre/post op attrs if the
> getattr fails? They are technically optional with v3 -- we can just set
> the attributes_follow bit to false there.

I considered to set fh_pre_saved = false on error just like
fill_post_wcc() does, but wasn't sure of the consequences or how to test
for that matter, so I chose a more delicate fallback instead.


>
> For v4, it's a little more complicated. Scraping it out of the inode
> might be necessary for the cases where we need a change_info4.
>
> Either way, it'd be good to know that these getattrs are failing if this
> occurs. Maybe a KERN_WARNING printk or something might be good there?
>

As you wish, but fill_post_wcc() does not emit a warning, so...

I'll wait for feedback from Bruce as well.
Let me know the error handing of your choice.

Thanks,
Amir.
Jeff Layton Jan. 3, 2018, 6:45 p.m. UTC | #3
On Wed, 2018-01-03 at 17:48 +0200, Amir Goldstein wrote:
> On Wed, Jan 3, 2018 at 5:41 PM, Jeff Layton <jlayton@kernel.org> wrote:
> > On Wed, 2018-01-03 at 17:14 +0200, Amir Goldstein wrote:
> > > The time values in stat and inode may differ for overlayfs and stat time
> > > values are the correct ones to use. This is also consistent with the fact
> > > that fill_post_wcc() also stores stat time values.
> > > 
> > > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> > > ---
> > >  fs/nfsd/nfs3xdr.c | 31 ++++++++++++++++++++++++++++++-
> > >  fs/nfsd/nfs4xdr.c |  2 +-
> > >  fs/nfsd/nfsfh.h   | 28 ++++++----------------------
> > >  3 files changed, 37 insertions(+), 24 deletions(-)
> > > 
> > > diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> > > index 2758480555fa..1a70581e1cb2 100644
> > > --- a/fs/nfsd/nfs3xdr.c
> > > +++ b/fs/nfsd/nfs3xdr.c
> > > @@ -251,6 +251,34 @@ encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
> > >  }
> > > 
> > >  /*
> > > + * Fill in the pre_op attr for the wcc data
> > > + */
> > > +void fill_pre_wcc(struct svc_fh *fhp)
> > > +{
> > > +     struct inode    *inode;
> > > +     struct kstat    stat;
> > > +     __be32 err;
> > > +
> > > +     if (fhp->fh_pre_saved)
> > > +             return;
> > > +
> > > +     inode = d_inode(fhp->fh_dentry);
> > > +     err = fh_getattr(fhp, &stat);
> > > +     if (err) {
> > > +             /* Grab the times from inode anyway */
> > > +             stat.mtime = inode->i_mtime;
> > > +             stat.ctime = inode->i_ctime;
> > > +             stat.size  = inode->i_size;
> > > +     }
> > > +
> > 
> > Might be best to instead just not supply pre/post op attrs if the
> > getattr fails? They are technically optional with v3 -- we can just set
> > the attributes_follow bit to false there.
> 
> I considered to set fh_pre_saved = false on error just like
> fill_post_wcc() does, but wasn't sure of the consequences or how to test
> for that matter, so I chose a more delicate fallback instead.
> 

Take care with the BUG_ON in set_change_info if you do that. 

Note that all of this is really just to handle weak cache consistency in
v3, and the change_info4 value in v4. When the info is not reliable, the
client doesn't trust its cache, for the most part. Getting it wrong
shouldn't be fatal in most cases.

For v3 you can just clear the attributes_follow bit when you fill out
the info, and leave it zeroed out. I had a patch a few years ago that
did this on a per-export basis that you're welcome to take an run with:

    https://patchwork.kernel.org/patch/7159311/

Obviously, the conditions for doing this here are different.

For v4, I think we can just try to scrape what we have like you're doing
here, and simply ensure that the "atomic" field being encoded in
encode_cinfo is false when this occurs.

> > 
> > For v4, it's a little more complicated. Scraping it out of the inode
> > might be necessary for the cases where we need a change_info4.
> > 
> > Either way, it'd be good to know that these getattrs are failing if this
> > occurs. Maybe a KERN_WARNING printk or something might be good there?
> > 
> 
> As you wish, but fill_post_wcc() does not emit a warning, so...
> 
> I'll wait for feedback from Bruce as well.
> Let me know the error handing of your choice.
> 

Yeah, it doesn't. I think that's good info to know though.

All of this is to support client-server cache coherency, and bugs in
those areas can be subtle and hard to detect. A stack trace is probably
not terribly helpful, but doing a one-time printk to show that you got
an error there may be.
Amir Goldstein Jan. 3, 2018, 9:03 p.m. UTC | #4
On Wed, Jan 3, 2018 at 8:45 PM, Jeff Layton <jlayton@kernel.org> wrote:
> On Wed, 2018-01-03 at 17:48 +0200, Amir Goldstein wrote:
>> On Wed, Jan 3, 2018 at 5:41 PM, Jeff Layton <jlayton@kernel.org> wrote:
>> > On Wed, 2018-01-03 at 17:14 +0200, Amir Goldstein wrote:
>> > > The time values in stat and inode may differ for overlayfs and stat time
>> > > values are the correct ones to use. This is also consistent with the fact
>> > > that fill_post_wcc() also stores stat time values.
>> > >
>> > > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
>> > > ---
>> > >  fs/nfsd/nfs3xdr.c | 31 ++++++++++++++++++++++++++++++-
>> > >  fs/nfsd/nfs4xdr.c |  2 +-
>> > >  fs/nfsd/nfsfh.h   | 28 ++++++----------------------
>> > >  3 files changed, 37 insertions(+), 24 deletions(-)
>> > >
>> > > diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
>> > > index 2758480555fa..1a70581e1cb2 100644
>> > > --- a/fs/nfsd/nfs3xdr.c
>> > > +++ b/fs/nfsd/nfs3xdr.c
>> > > @@ -251,6 +251,34 @@ encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
>> > >  }
>> > >
>> > >  /*
>> > > + * Fill in the pre_op attr for the wcc data
>> > > + */
>> > > +void fill_pre_wcc(struct svc_fh *fhp)
>> > > +{
>> > > +     struct inode    *inode;
>> > > +     struct kstat    stat;
>> > > +     __be32 err;
>> > > +
>> > > +     if (fhp->fh_pre_saved)
>> > > +             return;
>> > > +
>> > > +     inode = d_inode(fhp->fh_dentry);
>> > > +     err = fh_getattr(fhp, &stat);
>> > > +     if (err) {
>> > > +             /* Grab the times from inode anyway */
>> > > +             stat.mtime = inode->i_mtime;
>> > > +             stat.ctime = inode->i_ctime;
>> > > +             stat.size  = inode->i_size;
>> > > +     }
>> > > +
>> >
>> > Might be best to instead just not supply pre/post op attrs if the
>> > getattr fails? They are technically optional with v3 -- we can just set
>> > the attributes_follow bit to false there.
>>
>> I considered to set fh_pre_saved = false on error just like
>> fill_post_wcc() does, but wasn't sure of the consequences or how to test
>> for that matter, so I chose a more delicate fallback instead.
>>
>
> Take care with the BUG_ON in set_change_info if you do that.
>
> Note that all of this is really just to handle weak cache consistency in
> v3, and the change_info4 value in v4. When the info is not reliable, the
> client doesn't trust its cache, for the most part. Getting it wrong
> shouldn't be fatal in most cases.
>
> For v3 you can just clear the attributes_follow bit when you fill out
> the info, and leave it zeroed out. I had a patch a few years ago that
> did this on a per-export basis that you're welcome to take an run with:
>
>     https://patchwork.kernel.org/patch/7159311/
>
> Obviously, the conditions for doing this here are different.
>
> For v4, I think we can just try to scrape what we have like you're doing
> here, and simply ensure that the "atomic" field being encoded in
> encode_cinfo is false when this occurs.
>

Honestly, Jeff, at this point I am so far out into the woods with overlay NFS
export, that I would like to remain focused on correctness and leave
performance for later time.

So if I understand you correctly, patch 2/2 is not needed for correctness?
Meaning that if overlay inode times are not uptodate, nothing fatal will
happen? Or did you mean that I must take care of signalling the client
that time values are not reliable for overlayfs?

If patch 2/2 is indeed not a must, then I would like to ask you to ACK
patch 1/2. It seems quite simple, trivial and harmless to me even  without
diving deep into NFS protocols. I think patch 1/2 should be enough for
first implementation - it certainly is enough to fix the nfstest_posix failures.

Thanks!
Amir.
Jeff Layton Jan. 4, 2018, 1:26 p.m. UTC | #5
On Wed, 2018-01-03 at 23:03 +0200, Amir Goldstein wrote:
> On Wed, Jan 3, 2018 at 8:45 PM, Jeff Layton <jlayton@kernel.org> wrote:
> > On Wed, 2018-01-03 at 17:48 +0200, Amir Goldstein wrote:
> > > On Wed, Jan 3, 2018 at 5:41 PM, Jeff Layton <jlayton@kernel.org> wrote:
> > > > On Wed, 2018-01-03 at 17:14 +0200, Amir Goldstein wrote:
> > > > > The time values in stat and inode may differ for overlayfs and stat time
> > > > > values are the correct ones to use. This is also consistent with the fact
> > > > > that fill_post_wcc() also stores stat time values.
> > > > > 
> > > > > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> > > > > ---
> > > > >  fs/nfsd/nfs3xdr.c | 31 ++++++++++++++++++++++++++++++-
> > > > >  fs/nfsd/nfs4xdr.c |  2 +-
> > > > >  fs/nfsd/nfsfh.h   | 28 ++++++----------------------
> > > > >  3 files changed, 37 insertions(+), 24 deletions(-)
> > > > > 
> > > > > diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> > > > > index 2758480555fa..1a70581e1cb2 100644
> > > > > --- a/fs/nfsd/nfs3xdr.c
> > > > > +++ b/fs/nfsd/nfs3xdr.c
> > > > > @@ -251,6 +251,34 @@ encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
> > > > >  }
> > > > > 
> > > > >  /*
> > > > > + * Fill in the pre_op attr for the wcc data
> > > > > + */
> > > > > +void fill_pre_wcc(struct svc_fh *fhp)
> > > > > +{
> > > > > +     struct inode    *inode;
> > > > > +     struct kstat    stat;
> > > > > +     __be32 err;
> > > > > +
> > > > > +     if (fhp->fh_pre_saved)
> > > > > +             return;
> > > > > +
> > > > > +     inode = d_inode(fhp->fh_dentry);
> > > > > +     err = fh_getattr(fhp, &stat);
> > > > > +     if (err) {
> > > > > +             /* Grab the times from inode anyway */
> > > > > +             stat.mtime = inode->i_mtime;
> > > > > +             stat.ctime = inode->i_ctime;
> > > > > +             stat.size  = inode->i_size;
> > > > > +     }
> > > > > +
> > > > 
> > > > Might be best to instead just not supply pre/post op attrs if the
> > > > getattr fails? They are technically optional with v3 -- we can just set
> > > > the attributes_follow bit to false there.
> > > 
> > > I considered to set fh_pre_saved = false on error just like
> > > fill_post_wcc() does, but wasn't sure of the consequences or how to test
> > > for that matter, so I chose a more delicate fallback instead.
> > > 
> > 
> > Take care with the BUG_ON in set_change_info if you do that.
> > 
> > Note that all of this is really just to handle weak cache consistency in
> > v3, and the change_info4 value in v4. When the info is not reliable, the
> > client doesn't trust its cache, for the most part. Getting it wrong
> > shouldn't be fatal in most cases.
> > 
> > For v3 you can just clear the attributes_follow bit when you fill out
> > the info, and leave it zeroed out. I had a patch a few years ago that
> > did this on a per-export basis that you're welcome to take an run with:
> > 
> >     https://patchwork.kernel.org/patch/7159311/
> > 
> > Obviously, the conditions for doing this here are different.
> > 
> > For v4, I think we can just try to scrape what we have like you're doing
> > here, and simply ensure that the "atomic" field being encoded in
> > encode_cinfo is false when this occurs.
> > 
> 
> Honestly, Jeff, at this point I am so far out into the woods with overlay NFS
> export, that I would like to remain focused on correctness and leave
> performance for later time.
> 

:)

> So if I understand you correctly, patch 2/2 is not needed for correctness?
> Meaning that if overlay inode times are not uptodate, nothing fatal will
> happen? Or did you mean that I must take care of signalling the client
> that time values are not reliable for overlayfs?
> 
> If patch 2/2 is indeed not a must, then I would like to ask you to ACK
> patch 1/2. It seems quite simple, trivial and harmless to me even  without
> diving deep into NFS protocols. I think patch 1/2 should be enough for
> first implementation - it certainly is enough to fix the nfstest_posix failures.
> 
> Thanks!
> Amir.

Patch #1 looks fine. I think we ought to wait on #2.

We really should be doing getattrs like this, but when that fails we
should probably just zero out the wcc / change_info4 at the end rather
than pretending that it's valid.

I think Bruce or I can take care of that bit after patch #1 is merged.
Amir Goldstein Jan. 4, 2018, 1:47 p.m. UTC | #6
On Thu, Jan 4, 2018 at 3:26 PM, Jeff Layton <jlayton@kernel.org> wrote:
> On Wed, 2018-01-03 at 23:03 +0200, Amir Goldstein wrote:
[...]

>
>> So if I understand you correctly, patch 2/2 is not needed for correctness?
>> Meaning that if overlay inode times are not uptodate, nothing fatal will
>> happen? Or did you mean that I must take care of signalling the client
>> that time values are not reliable for overlayfs?
>>
>> If patch 2/2 is indeed not a must, then I would like to ask you to ACK
>> patch 1/2. It seems quite simple, trivial and harmless to me even  without
>> diving deep into NFS protocols. I think patch 1/2 should be enough for
>> first implementation - it certainly is enough to fix the nfstest_posix failures.
>>
>> Thanks!
>> Amir.
>
> Patch #1 looks fine. I think we ought to wait on #2.
>
> We really should be doing getattrs like this, but when that fails we
> should probably just zero out the wcc / change_info4 at the end rather
> than pretending that it's valid.
>
> I think Bruce or I can take care of that bit after patch #1 is merged.

Great!

Then I'll stack patch #1 along with my overlay nfs export series
with your ACK.

Thanks!
Amir.
J. Bruce Fields Jan. 4, 2018, 11:05 p.m. UTC | #7
On Thu, Jan 04, 2018 at 08:26:43AM -0500, Jeff Layton wrote:
> On Wed, 2018-01-03 at 23:03 +0200, Amir Goldstein wrote:
> > On Wed, Jan 3, 2018 at 8:45 PM, Jeff Layton <jlayton@kernel.org> wrote:
> > > On Wed, 2018-01-03 at 17:48 +0200, Amir Goldstein wrote:
> > > > On Wed, Jan 3, 2018 at 5:41 PM, Jeff Layton <jlayton@kernel.org> wrote:
> > > > > On Wed, 2018-01-03 at 17:14 +0200, Amir Goldstein wrote:
> > > > > > The time values in stat and inode may differ for overlayfs and stat time
> > > > > > values are the correct ones to use. This is also consistent with the fact
> > > > > > that fill_post_wcc() also stores stat time values.
> > > > > > 
> > > > > > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> > > > > > ---
> > > > > >  fs/nfsd/nfs3xdr.c | 31 ++++++++++++++++++++++++++++++-
> > > > > >  fs/nfsd/nfs4xdr.c |  2 +-
> > > > > >  fs/nfsd/nfsfh.h   | 28 ++++++----------------------
> > > > > >  3 files changed, 37 insertions(+), 24 deletions(-)
> > > > > > 
> > > > > > diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> > > > > > index 2758480555fa..1a70581e1cb2 100644
> > > > > > --- a/fs/nfsd/nfs3xdr.c
> > > > > > +++ b/fs/nfsd/nfs3xdr.c
> > > > > > @@ -251,6 +251,34 @@ encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
> > > > > >  }
> > > > > > 
> > > > > >  /*
> > > > > > + * Fill in the pre_op attr for the wcc data
> > > > > > + */
> > > > > > +void fill_pre_wcc(struct svc_fh *fhp)
> > > > > > +{
> > > > > > +     struct inode    *inode;
> > > > > > +     struct kstat    stat;
> > > > > > +     __be32 err;
> > > > > > +
> > > > > > +     if (fhp->fh_pre_saved)
> > > > > > +             return;
> > > > > > +
> > > > > > +     inode = d_inode(fhp->fh_dentry);
> > > > > > +     err = fh_getattr(fhp, &stat);
> > > > > > +     if (err) {
> > > > > > +             /* Grab the times from inode anyway */
> > > > > > +             stat.mtime = inode->i_mtime;
> > > > > > +             stat.ctime = inode->i_ctime;
> > > > > > +             stat.size  = inode->i_size;
> > > > > > +     }
> > > > > > +
> > > > > 
> > > > > Might be best to instead just not supply pre/post op attrs if the
> > > > > getattr fails? They are technically optional with v3 -- we can just set
> > > > > the attributes_follow bit to false there.
> > > > 
> > > > I considered to set fh_pre_saved = false on error just like
> > > > fill_post_wcc() does, but wasn't sure of the consequences or how to test
> > > > for that matter, so I chose a more delicate fallback instead.
> > > > 
> > > 
> > > Take care with the BUG_ON in set_change_info if you do that.
> > > 
> > > Note that all of this is really just to handle weak cache consistency in
> > > v3, and the change_info4 value in v4. When the info is not reliable, the
> > > client doesn't trust its cache, for the most part. Getting it wrong
> > > shouldn't be fatal in most cases.
> > > 
> > > For v3 you can just clear the attributes_follow bit when you fill out
> > > the info, and leave it zeroed out. I had a patch a few years ago that
> > > did this on a per-export basis that you're welcome to take an run with:
> > > 
> > >     https://patchwork.kernel.org/patch/7159311/
> > > 
> > > Obviously, the conditions for doing this here are different.
> > > 
> > > For v4, I think we can just try to scrape what we have like you're doing
> > > here, and simply ensure that the "atomic" field being encoded in
> > > encode_cinfo is false when this occurs.
> > > 
> > 
> > Honestly, Jeff, at this point I am so far out into the woods with overlay NFS
> > export, that I would like to remain focused on correctness and leave
> > performance for later time.
> > 
> 
> :)
> 
> > So if I understand you correctly, patch 2/2 is not needed for correctness?
> > Meaning that if overlay inode times are not uptodate, nothing fatal will
> > happen? Or did you mean that I must take care of signalling the client
> > that time values are not reliable for overlayfs?
> > 
> > If patch 2/2 is indeed not a must, then I would like to ask you to ACK
> > patch 1/2. It seems quite simple, trivial and harmless to me even  without
> > diving deep into NFS protocols. I think patch 1/2 should be enough for
> > first implementation - it certainly is enough to fix the nfstest_posix failures.
> > 
> > Thanks!
> > Amir.
> 
> Patch #1 looks fine. I think we ought to wait on #2.
> 
> We really should be doing getattrs like this, but when that fails we
> should probably just zero out the wcc / change_info4 at the end rather
> than pretending that it's valid.

I can believe that, but that's what the current behavior is, so I don't
think this patch needs to (or should) make that change too.  I'd rather
fix the one overlayfs problem (as this patch does) and then handle the
change in behavior in the stat failure case separately.

(Though maybe a comment there to remind future selves of this dicussion
would be helpful.)

--b.
Amir Goldstein Jan. 5, 2018, 2:45 p.m. UTC | #8
On Fri, Jan 5, 2018 at 1:05 AM, J . Bruce Fields <bfields@fieldses.org> wrote:
> On Thu, Jan 04, 2018 at 08:26:43AM -0500, Jeff Layton wrote:
>> On Wed, 2018-01-03 at 23:03 +0200, Amir Goldstein wrote:
>> > On Wed, Jan 3, 2018 at 8:45 PM, Jeff Layton <jlayton@kernel.org> wrote:
>> > > On Wed, 2018-01-03 at 17:48 +0200, Amir Goldstein wrote:
>> > > > On Wed, Jan 3, 2018 at 5:41 PM, Jeff Layton <jlayton@kernel.org> wrote:
>> > > > > On Wed, 2018-01-03 at 17:14 +0200, Amir Goldstein wrote:
>> > > > > > The time values in stat and inode may differ for overlayfs and stat time
>> > > > > > values are the correct ones to use. This is also consistent with the fact
>> > > > > > that fill_post_wcc() also stores stat time values.
>> > > > > >
>> > > > > > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
>> > > > > > ---
>> > > > > >  fs/nfsd/nfs3xdr.c | 31 ++++++++++++++++++++++++++++++-
>> > > > > >  fs/nfsd/nfs4xdr.c |  2 +-
>> > > > > >  fs/nfsd/nfsfh.h   | 28 ++++++----------------------
>> > > > > >  3 files changed, 37 insertions(+), 24 deletions(-)
>> > > > > >
>> > > > > > diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
>> > > > > > index 2758480555fa..1a70581e1cb2 100644
>> > > > > > --- a/fs/nfsd/nfs3xdr.c
>> > > > > > +++ b/fs/nfsd/nfs3xdr.c
>> > > > > > @@ -251,6 +251,34 @@ encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
>> > > > > >  }
>> > > > > >
>> > > > > >  /*
>> > > > > > + * Fill in the pre_op attr for the wcc data
>> > > > > > + */
>> > > > > > +void fill_pre_wcc(struct svc_fh *fhp)
>> > > > > > +{
>> > > > > > +     struct inode    *inode;
>> > > > > > +     struct kstat    stat;
>> > > > > > +     __be32 err;
>> > > > > > +
>> > > > > > +     if (fhp->fh_pre_saved)
>> > > > > > +             return;
>> > > > > > +
>> > > > > > +     inode = d_inode(fhp->fh_dentry);
>> > > > > > +     err = fh_getattr(fhp, &stat);
>> > > > > > +     if (err) {
>> > > > > > +             /* Grab the times from inode anyway */
>> > > > > > +             stat.mtime = inode->i_mtime;
>> > > > > > +             stat.ctime = inode->i_ctime;
>> > > > > > +             stat.size  = inode->i_size;
>> > > > > > +     }
>> > > > > > +
>> > > > >
>> > > > > Might be best to instead just not supply pre/post op attrs if the
>> > > > > getattr fails? They are technically optional with v3 -- we can just set
>> > > > > the attributes_follow bit to false there.
>> > > >
>> > > > I considered to set fh_pre_saved = false on error just like
>> > > > fill_post_wcc() does, but wasn't sure of the consequences or how to test
>> > > > for that matter, so I chose a more delicate fallback instead.
>> > > >
>> > >
>> > > Take care with the BUG_ON in set_change_info if you do that.
>> > >
>> > > Note that all of this is really just to handle weak cache consistency in
>> > > v3, and the change_info4 value in v4. When the info is not reliable, the
>> > > client doesn't trust its cache, for the most part. Getting it wrong
>> > > shouldn't be fatal in most cases.
>> > >
>> > > For v3 you can just clear the attributes_follow bit when you fill out
>> > > the info, and leave it zeroed out. I had a patch a few years ago that
>> > > did this on a per-export basis that you're welcome to take an run with:
>> > >
>> > >     https://patchwork.kernel.org/patch/7159311/
>> > >
>> > > Obviously, the conditions for doing this here are different.
>> > >
>> > > For v4, I think we can just try to scrape what we have like you're doing
>> > > here, and simply ensure that the "atomic" field being encoded in
>> > > encode_cinfo is false when this occurs.
>> > >
>> >
>> > Honestly, Jeff, at this point I am so far out into the woods with overlay NFS
>> > export, that I would like to remain focused on correctness and leave
>> > performance for later time.
>> >
>>
>> :)
>>
>> > So if I understand you correctly, patch 2/2 is not needed for correctness?
>> > Meaning that if overlay inode times are not uptodate, nothing fatal will
>> > happen? Or did you mean that I must take care of signalling the client
>> > that time values are not reliable for overlayfs?
>> >
>> > If patch 2/2 is indeed not a must, then I would like to ask you to ACK
>> > patch 1/2. It seems quite simple, trivial and harmless to me even  without
>> > diving deep into NFS protocols. I think patch 1/2 should be enough for
>> > first implementation - it certainly is enough to fix the nfstest_posix failures.
>> >
>> > Thanks!
>> > Amir.
>>
>> Patch #1 looks fine. I think we ought to wait on #2.
>>
>> We really should be doing getattrs like this, but when that fails we
>> should probably just zero out the wcc / change_info4 at the end rather
>> than pretending that it's valid.
>
> I can believe that, but that's what the current behavior is, so I don't
> think this patch needs to (or should) make that change too.  I'd rather
> fix the one overlayfs problem (as this patch does) and then handle the
> change in behavior in the stat failure case separately.
>
> (Though maybe a comment there to remind future selves of this dicussion
> would be helpful.)
>

So can I assume you will be taking both patches through your tree?
and add said comment to your future self?
Since patches are independent of overlayfs work, it does not matter if they
get merged before or after overlayfs work, just please let us know what you
intend to do, because right now I posted patch 1/2 at the end of the
overlayfs series for Miklos to take.

Which reminds me... I never got any feedback from tmpfs/vfs folks about the
tmpfs patch: https://patchwork.kernel.org/patch/10055177/
but IIRC, both you and Jeff where in favor of the change.
Do you feel confident enough about this patch to carry it though your tree?
Or should I nudge someone else about it?
The test for decoding a file handle of an unlinked file has already been
merged to xfstest generic/467 and the test is failing with tmpfs without this
change.

Thanks!
Amir.
J. Bruce Fields Jan. 5, 2018, 3:30 p.m. UTC | #9
On Fri, Jan 05, 2018 at 04:45:46PM +0200, Amir Goldstein wrote:
> So can I assume you will be taking both patches through your tree?
> and add said comment to your future self?

I'm OK with that (if Jeff is).

> Since patches are independent of overlayfs work, it does not matter if they
> get merged before or after overlayfs work, just please let us know what you
> intend to do, because right now I posted patch 1/2 at the end of the
> overlayfs series for Miklos to take.

I'd be fine with that or with Miklos taking either or both (feel free to
add an Acked-by: tag for me if you take that route).

But I'll assume I'm taking them unless I hear otherwise.

> Which reminds me... I never got any feedback from tmpfs/vfs folks about the
> tmpfs patch: https://patchwork.kernel.org/patch/10055177/
> but IIRC, both you and Jeff where in favor of the change.
> Do you feel confident enough about this patch to carry it though your tree?

No, I'd recommend resending.  Looks like it should go to Hugh Dickins
<hughd@google.com> and linux-mm@kvack.org ?

--b.

> Or should I nudge someone else about it?
> The test for decoding a file handle of an unlinked file has already been
> merged to xfstest generic/467 and the test is failing with tmpfs without this
> change.
> 
> Thanks!
> Amir.
J. Bruce Fields Jan. 19, 2018, 10:03 p.m. UTC | #10
On Fri, Jan 05, 2018 at 10:30:18AM -0500, J . Bruce Fields wrote:
> On Fri, Jan 05, 2018 at 04:45:46PM +0200, Amir Goldstein wrote:
> > So can I assume you will be taking both patches through your tree?
> > and add said comment to your future self?
> 
> I'm OK with that (if Jeff is).
> 
> > Since patches are independent of overlayfs work, it does not matter if they
> > get merged before or after overlayfs work, just please let us know what you
> > intend to do, because right now I posted patch 1/2 at the end of the
> > overlayfs series for Miklos to take.
> 
> I'd be fine with that or with Miklos taking either or both (feel free to
> add an Acked-by: tag for me if you take that route).
> 
> But I'll assume I'm taking them unless I hear otherwise.

By the way, I just got back to this: committed to nfsd-next at
git://linux-nfs.org/~bfields/linux.git, with just one extra paragraph on
#2's changelog.

--b.
diff mbox

Patch

diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 2758480555fa..1a70581e1cb2 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -251,6 +251,34 @@  encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
 }
 
 /*
+ * Fill in the pre_op attr for the wcc data
+ */
+void fill_pre_wcc(struct svc_fh *fhp)
+{
+	struct inode    *inode;
+	struct kstat	stat;
+	__be32 err;
+
+	if (fhp->fh_pre_saved)
+		return;
+
+	inode = d_inode(fhp->fh_dentry);
+	err = fh_getattr(fhp, &stat);
+	if (err) {
+		/* Grab the times from inode anyway */
+		stat.mtime = inode->i_mtime;
+		stat.ctime = inode->i_ctime;
+		stat.size  = inode->i_size;
+	}
+
+	fhp->fh_pre_mtime = stat.mtime;
+	fhp->fh_pre_ctime = stat.ctime;
+	fhp->fh_pre_size  = stat.size;
+	fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
+	fhp->fh_pre_saved = true;
+}
+
+/*
  * Fill in the post_op attr for the wcc data
  */
 void fill_post_wcc(struct svc_fh *fhp)
@@ -261,7 +289,8 @@  void fill_post_wcc(struct svc_fh *fhp)
 		printk("nfsd: inode locked twice during operation.\n");
 
 	err = fh_getattr(fhp, &fhp->fh_post_attr);
-	fhp->fh_post_change = nfsd4_change_attribute(d_inode(fhp->fh_dentry));
+	fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr,
+						     d_inode(fhp->fh_dentry));
 	if (err) {
 		fhp->fh_post_saved = false;
 		/* Grab the ctime anyway - set_change_info might use it */
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 2c61c6b8ae09..fbe11ad9b9ff 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1991,7 +1991,7 @@  static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
 		*p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
 		*p++ = 0;
 	} else if (IS_I_VERSION(inode)) {
-		p = xdr_encode_hyper(p, nfsd4_change_attribute(inode));
+		p = xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode));
 	} else {
 		*p++ = cpu_to_be32(stat->ctime.tv_sec);
 		*p++ = cpu_to_be32(stat->ctime.tv_nsec);
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 43f31cf49bae..99be87b50ebe 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -252,36 +252,20 @@  fh_clear_wcc(struct svc_fh *fhp)
  * By using both ctime and the i_version counter we guarantee that as
  * long as time doesn't go backwards we never reuse an old value.
  */
-static inline u64 nfsd4_change_attribute(struct inode *inode)
+static inline u64 nfsd4_change_attribute(struct kstat *stat,
+					 struct inode *inode)
 {
 	u64 chattr;
 
-	chattr =  inode->i_ctime.tv_sec;
+	chattr =  stat->ctime.tv_sec;
 	chattr <<= 30;
-	chattr += inode->i_ctime.tv_nsec;
+	chattr += stat->ctime.tv_nsec;
 	chattr += inode->i_version;
 	return chattr;
 }
 
-/*
- * Fill in the pre_op attr for the wcc data
- */
-static inline void
-fill_pre_wcc(struct svc_fh *fhp)
-{
-	struct inode    *inode;
-
-	inode = d_inode(fhp->fh_dentry);
-	if (!fhp->fh_pre_saved) {
-		fhp->fh_pre_mtime = inode->i_mtime;
-		fhp->fh_pre_ctime = inode->i_ctime;
-		fhp->fh_pre_size  = inode->i_size;
-		fhp->fh_pre_change = nfsd4_change_attribute(inode);
-		fhp->fh_pre_saved = true;
-	}
-}
-
-extern void fill_post_wcc(struct svc_fh *);
+extern void fill_pre_wcc(struct svc_fh *fhp);
+extern void fill_post_wcc(struct svc_fh *fhp);
 #else
 #define fh_clear_wcc(ignored)
 #define fill_pre_wcc(ignored)