diff mbox series

nfsd: Don't reset the write verifier on a commit EAGAIN

Message ID 20230911184357.11739-1-trond.myklebust@hammerspace.com (mailing list archive)
State New, archived
Headers show
Series nfsd: Don't reset the write verifier on a commit EAGAIN | expand

Commit Message

Trond Myklebust Sept. 11, 2023, 6:43 p.m. UTC
From: Trond Myklebust <trond.myklebust@hammerspace.com>

If fsync() is returning EAGAIN, then we can assume that the filesystem
being exported is something like NFS with the 'softerr' mount option
enabled, and that it is just asking us to replay the fsync() operation
at a later date.
If we see an ESTALE, then ditto: the file is gone, so there is no danger
of losing the error.
For those cases, do not reset the write verifier.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfsd/vfs.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

Comments

Trond Myklebust Sept. 11, 2023, 8:54 p.m. UTC | #1
On Mon, 2023-09-11 at 16:14 -0400, Chuck Lever wrote:
> On Mon, Sep 11, 2023 at 02:43:57PM -0400, trondmy@gmail.com wrote:
> > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > 
> > If fsync() is returning EAGAIN, then we can assume that the
> > filesystem
> > being exported is something like NFS with the 'softerr' mount
> > option
> > enabled, and that it is just asking us to replay the fsync()
> > operation
> > at a later date.
> > If we see an ESTALE, then ditto: the file is gone, so there is no
> > danger
> > of losing the error.
> > For those cases, do not reset the write verifier.
> 
> Out of interest, what's the hazard in a write verifier change in
> these cases? There could be a slight performance penalty, I imagine,
> but how frequently does this happen?

When re-exporting to NFSv4 clients, it should be less of a problem,
since any REMOVE will result in a sillyrenamed file that only
disappears once the file is closed. However with NFSv3 clients, that is
circumvented by the fact that the filecache closes the files when they
are inactive. We've seen this occur frequently with VMware vmdks: their
lock files appear to generate a lot of these phantom ESTALE writes.

As for EAGAIN, I just pushed out a 2 patch client series that makes it
a lot more frequent when re-exporting NFSv4 with 'softerr'.

Finally, it is worth noting that a write verifier change has a global
effect, causing retransmission by all clients of all uncommitted
unstable writes for all files, so is worth mitigating where possible.

> 
> One more below.
> 
> 
> > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> > ---
> >  fs/nfsd/vfs.c | 29 +++++++++++++++++++----------
> >  1 file changed, 19 insertions(+), 10 deletions(-)
> > 
> > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > index 98fa4fd0556d..31daf9f63572 100644
> > --- a/fs/nfsd/vfs.c
> > +++ b/fs/nfsd/vfs.c
> > @@ -337,6 +337,20 @@ nfsd_lookup(struct svc_rqst *rqstp, struct
> > svc_fh *fhp, const char *name,
> >         return err;
> >  }
> >  
> > +static void
> > +commit_reset_write_verifier(struct nfsd_net *nn, struct svc_rqst
> > *rqstp,
> > +                           int err)
> > +{
> > +       switch (err) {
> > +       case -EAGAIN:
> > +       case -ESTALE:
> > +               break;
> > +       default:
> > +               nfsd_reset_write_verifier(nn);
> > +               trace_nfsd_writeverf_reset(nn, rqstp, err);
> > +       }
> > +}
> > +
> >  /*
> >   * Commit metadata changes to stable storage.
> >   */
> > @@ -647,8 +661,7 @@ __be32 nfsd4_clone_file_range(struct svc_rqst
> > *rqstp,
> >                                         &nfsd4_get_cstate(rqstp)-
> > >current_fh,
> >                                         dst_pos,
> >                                         count, status);
> > -                       nfsd_reset_write_verifier(nn);
> > -                       trace_nfsd_writeverf_reset(nn, rqstp,
> > status);
> > +                       commit_reset_write_verifier(nn, rqstp,
> > status);
> >                         ret = nfserrno(status);
> >                 }
> >         }
> > @@ -1170,8 +1183,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct
> > svc_fh *fhp, struct nfsd_file *nf,
> >         host_err = vfs_iter_write(file, &iter, &pos, flags);
> >         file_end_write(file);
> >         if (host_err < 0) {
> > -               nfsd_reset_write_verifier(nn);
> > -               trace_nfsd_writeverf_reset(nn, rqstp, host_err);
> > +               commit_reset_write_verifier(nn, rqstp, host_err);
> 
> Can generic_file_write_iter() or its brethren return STALE or AGAIN
> before they get to the generic_write_sync() call ?

The call to nfs_revalidate_file_size(), which can occur when you are
appending to the file (whether or not O_APPEND is set) could indeed
return ESTALE.
With the new patchset mentioned above, it could also return EAGAIN.

> 
> 
> >                 goto out_nfserr;
> >         }
> >         *cnt = host_err;
> > @@ -1183,10 +1195,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp,
> > struct svc_fh *fhp, struct nfsd_file *nf,
> >  
> >         if (stable && use_wgather) {
> >                 host_err = wait_for_concurrent_writes(file);
> > -               if (host_err < 0) {
> > -                       nfsd_reset_write_verifier(nn);
> > -                       trace_nfsd_writeverf_reset(nn, rqstp,
> > host_err);
> > -               }
> > +               if (host_err < 0)
> > +                       commit_reset_write_verifier(nn, rqstp,
> > host_err);
> >         }
> >  
> >  out_nfserr:
> > @@ -1329,8 +1339,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct
> > svc_fh *fhp, struct nfsd_file *nf,
> >                         err = nfserr_notsupp;
> >                         break;
> >                 default:
> > -                       nfsd_reset_write_verifier(nn);
> > -                       trace_nfsd_writeverf_reset(nn, rqstp,
> > err2);
> > +                       commit_reset_write_verifier(nn, rqstp,
> > err2);
> >                         err = nfserrno(err2);
> >                 }
> >         } else
> > -- 
> > 2.41.0
> > 
>
Chuck Lever Sept. 11, 2023, 10:10 p.m. UTC | #2
> On Sep 11, 2023, at 4:54 PM, Trond Myklebust <trondmy@hammerspace.com> wrote:
> 
> On Mon, 2023-09-11 at 16:14 -0400, Chuck Lever wrote:
>> On Mon, Sep 11, 2023 at 02:43:57PM -0400, trondmy@gmail.com wrote:
>>> From: Trond Myklebust <trond.myklebust@hammerspace.com>
>>> 
>>> If fsync() is returning EAGAIN, then we can assume that the
>>> filesystem
>>> being exported is something like NFS with the 'softerr' mount
>>> option
>>> enabled, and that it is just asking us to replay the fsync()
>>> operation
>>> at a later date.
>>> If we see an ESTALE, then ditto: the file is gone, so there is no
>>> danger
>>> of losing the error.
>>> For those cases, do not reset the write verifier.
>> 
>> Out of interest, what's the hazard in a write verifier change in
>> these cases? There could be a slight performance penalty, I imagine,
>> but how frequently does this happen?
> 
> When re-exporting to NFSv4 clients, it should be less of a problem,
> since any REMOVE will result in a sillyrenamed file that only
> disappears once the file is closed. However with NFSv3 clients, that is
> circumvented by the fact that the filecache closes the files when they
> are inactive. We've seen this occur frequently with VMware vmdks: their
> lock files appear to generate a lot of these phantom ESTALE writes.
> 
> As for EAGAIN, I just pushed out a 2 patch client series that makes it
> a lot more frequent when re-exporting NFSv4 with 'softerr'.
> 
> Finally, it is worth noting that a write verifier change has a global
> effect, causing retransmission by all clients of all uncommitted
> unstable writes for all files, so is worth mitigating where possible.

Good info. I've added some of this to the patch description.


>> One more below.
>> 
>> 
>>> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
>>> ---
>>>  fs/nfsd/vfs.c | 29 +++++++++++++++++++----------
>>>  1 file changed, 19 insertions(+), 10 deletions(-)
>>> 
>>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
>>> index 98fa4fd0556d..31daf9f63572 100644
>>> --- a/fs/nfsd/vfs.c
>>> +++ b/fs/nfsd/vfs.c
>>> @@ -337,6 +337,20 @@ nfsd_lookup(struct svc_rqst *rqstp, struct
>>> svc_fh *fhp, const char *name,
>>>         return err;
>>>  }
>>>  
>>> +static void
>>> +commit_reset_write_verifier(struct nfsd_net *nn, struct svc_rqst
>>> *rqstp,
>>> +                           int err)
>>> +{
>>> +       switch (err) {
>>> +       case -EAGAIN:
>>> +       case -ESTALE:
>>> +               break;
>>> +       default:
>>> +               nfsd_reset_write_verifier(nn);
>>> +               trace_nfsd_writeverf_reset(nn, rqstp, err);
>>> +       }
>>> +}
>>> +
>>>  /*
>>>   * Commit metadata changes to stable storage.
>>>   */
>>> @@ -647,8 +661,7 @@ __be32 nfsd4_clone_file_range(struct svc_rqst
>>> *rqstp,
>>>                                         &nfsd4_get_cstate(rqstp)-
>>>> current_fh,
>>>                                         dst_pos,
>>>                                         count, status);
>>> -                       nfsd_reset_write_verifier(nn);
>>> -                       trace_nfsd_writeverf_reset(nn, rqstp,
>>> status);
>>> +                       commit_reset_write_verifier(nn, rqstp,
>>> status);
>>>                         ret = nfserrno(status);
>>>                 }
>>>         }
>>> @@ -1170,8 +1183,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct
>>> svc_fh *fhp, struct nfsd_file *nf,
>>>         host_err = vfs_iter_write(file, &iter, &pos, flags);
>>>         file_end_write(file);
>>>         if (host_err < 0) {
>>> -               nfsd_reset_write_verifier(nn);
>>> -               trace_nfsd_writeverf_reset(nn, rqstp, host_err);
>>> +               commit_reset_write_verifier(nn, rqstp, host_err);
>> 
>> Can generic_file_write_iter() or its brethren return STALE or AGAIN
>> before they get to the generic_write_sync() call ?
> 
> The call to nfs_revalidate_file_size(), which can occur when you are
> appending to the file (whether or not O_APPEND is set) could indeed
> return ESTALE.
> With the new patchset mentioned above, it could also return EAGAIN.

Sounds like I should drop this hunk when applying this fix.


>>>                 goto out_nfserr;
>>>         }
>>>         *cnt = host_err;
>>> @@ -1183,10 +1195,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp,
>>> struct svc_fh *fhp, struct nfsd_file *nf,
>>>  
>>>         if (stable && use_wgather) {
>>>                 host_err = wait_for_concurrent_writes(file);
>>> -               if (host_err < 0) {
>>> -                       nfsd_reset_write_verifier(nn);
>>> -                       trace_nfsd_writeverf_reset(nn, rqstp,
>>> host_err);
>>> -               }
>>> +               if (host_err < 0)
>>> +                       commit_reset_write_verifier(nn, rqstp,
>>> host_err);
>>>         }
>>>  
>>>  out_nfserr:
>>> @@ -1329,8 +1339,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct
>>> svc_fh *fhp, struct nfsd_file *nf,
>>>                         err = nfserr_notsupp;
>>>                         break;
>>>                 default:
>>> -                       nfsd_reset_write_verifier(nn);
>>> -                       trace_nfsd_writeverf_reset(nn, rqstp,
>>> err2);
>>> +                       commit_reset_write_verifier(nn, rqstp,
>>> err2);
>>>                         err = nfserrno(err2);
>>>                 }
>>>         } else
>>> -- 
>>> 2.41.0
>>> 
>> 
> 
> -- 
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com


--
Chuck Lever
Trond Myklebust Sept. 11, 2023, 11:42 p.m. UTC | #3
On Mon, 2023-09-11 at 22:10 +0000, Chuck Lever III wrote:
> 
> > On Sep 11, 2023, at 4:54 PM, Trond Myklebust
> > <trondmy@hammerspace.com> wrote:
> > 
> > On Mon, 2023-09-11 at 16:14 -0400, Chuck Lever wrote:
> > > On Mon, Sep 11, 2023 at 02:43:57PM -0400,
> > > trondmy@gmail.com wrote:
> > > > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > > 
> > > > If fsync() is returning EAGAIN, then we can assume that the
> > > > filesystem
> > > > being exported is something like NFS with the 'softerr' mount
> > > > option
> > > > enabled, and that it is just asking us to replay the fsync()
> > > > operation
> > > > at a later date.
> > > > If we see an ESTALE, then ditto: the file is gone, so there is
> > > > no
> > > > danger
> > > > of losing the error.
> > > > For those cases, do not reset the write verifier.
> > > 
> > > Out of interest, what's the hazard in a write verifier change in
> > > these cases? There could be a slight performance penalty, I
> > > imagine,
> > > but how frequently does this happen?
> > 
> > When re-exporting to NFSv4 clients, it should be less of a problem,
> > since any REMOVE will result in a sillyrenamed file that only
> > disappears once the file is closed. However with NFSv3 clients,
> > that is
> > circumvented by the fact that the filecache closes the files when
> > they
> > are inactive. We've seen this occur frequently with VMware vmdks:
> > their
> > lock files appear to generate a lot of these phantom ESTALE writes.
> > 
> > As for EAGAIN, I just pushed out a 2 patch client series that makes
> > it
> > a lot more frequent when re-exporting NFSv4 with 'softerr'.
> > 
> > Finally, it is worth noting that a write verifier change has a
> > global
> > effect, causing retransmission by all clients of all uncommitted
> > unstable writes for all files, so is worth mitigating where
> > possible.
> 
> Good info. I've added some of this to the patch description.
> 
> 
> > > One more below.
> > > 
> > > 
> > > > Signed-off-by: Trond Myklebust
> > > > <trond.myklebust@hammerspace.com>
> > > > ---
> > > >  fs/nfsd/vfs.c | 29 +++++++++++++++++++----------
> > > >  1 file changed, 19 insertions(+), 10 deletions(-)
> > > > 
> > > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > > index 98fa4fd0556d..31daf9f63572 100644
> > > > --- a/fs/nfsd/vfs.c
> > > > +++ b/fs/nfsd/vfs.c
> > > > @@ -337,6 +337,20 @@ nfsd_lookup(struct svc_rqst *rqstp, struct
> > > > svc_fh *fhp, const char *name,
> > > >         return err;
> > > >  }
> > > >  
> > > > +static void
> > > > +commit_reset_write_verifier(struct nfsd_net *nn, struct
> > > > svc_rqst
> > > > *rqstp,
> > > > +                           int err)
> > > > +{
> > > > +       switch (err) {
> > > > +       case -EAGAIN:
> > > > +       case -ESTALE:
> > > > +               break;
> > > > +       default:
> > > > +               nfsd_reset_write_verifier(nn);
> > > > +               trace_nfsd_writeverf_reset(nn, rqstp, err);
> > > > +       }
> > > > +}
> > > > +
> > > >  /*
> > > >   * Commit metadata changes to stable storage.
> > > >   */
> > > > @@ -647,8 +661,7 @@ __be32 nfsd4_clone_file_range(struct
> > > > svc_rqst
> > > > *rqstp,
> > > >                                        
> > > > &nfsd4_get_cstate(rqstp)-
> > > > > current_fh,
> > > >                                         dst_pos,
> > > >                                         count, status);
> > > > -                       nfsd_reset_write_verifier(nn);
> > > > -                       trace_nfsd_writeverf_reset(nn, rqstp,
> > > > status);
> > > > +                       commit_reset_write_verifier(nn, rqstp,
> > > > status);
> > > >                         ret = nfserrno(status);
> > > >                 }
> > > >         }
> > > > @@ -1170,8 +1183,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp,
> > > > struct
> > > > svc_fh *fhp, struct nfsd_file *nf,
> > > >         host_err = vfs_iter_write(file, &iter, &pos, flags);
> > > >         file_end_write(file);
> > > >         if (host_err < 0) {
> > > > -               nfsd_reset_write_verifier(nn);
> > > > -               trace_nfsd_writeverf_reset(nn, rqstp,
> > > > host_err);
> > > > +               commit_reset_write_verifier(nn, rqstp,
> > > > host_err);
> > > 
> > > Can generic_file_write_iter() or its brethren return STALE or
> > > AGAIN
> > > before they get to the generic_write_sync() call ?
> > 
> > The call to nfs_revalidate_file_size(), which can occur when you
> > are
> > appending to the file (whether or not O_APPEND is set) could indeed
> > return ESTALE.
> > With the new patchset mentioned above, it could also return EAGAIN.
> 
> Sounds like I should drop this hunk when applying this fix.

I'm not understanding. Why would you not keep it? 

> 
> 
> > > >                 goto out_nfserr;
> > > >         }
> > > >         *cnt = host_err;
> > > > @@ -1183,10 +1195,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp,
> > > > struct svc_fh *fhp, struct nfsd_file *nf,
> > > >  
> > > >         if (stable && use_wgather) {
> > > >                 host_err = wait_for_concurrent_writes(file);
> > > > -               if (host_err < 0) {
> > > > -                       nfsd_reset_write_verifier(nn);
> > > > -                       trace_nfsd_writeverf_reset(nn, rqstp,
> > > > host_err);
> > > > -               }
> > > > +               if (host_err < 0)
> > > > +                       commit_reset_write_verifier(nn, rqstp,
> > > > host_err);
> > > >         }
> > > >  
> > > >  out_nfserr:
> > > > @@ -1329,8 +1339,7 @@ nfsd_commit(struct svc_rqst *rqstp,
> > > > struct
> > > > svc_fh *fhp, struct nfsd_file *nf,
> > > >                         err = nfserr_notsupp;
> > > >                         break;
> > > >                 default:
> > > > -                       nfsd_reset_write_verifier(nn);
> > > > -                       trace_nfsd_writeverf_reset(nn, rqstp,
> > > > err2);
> > > > +                       commit_reset_write_verifier(nn, rqstp,
> > > > err2);
> > > >                         err = nfserrno(err2);
> > > >                 }
> > > >         } else
> > > > -- 
> > > > 2.41.0
> > > > 
> > > 
> > 
> > -- 
> > Trond Myklebust
> > Linux NFS client maintainer, Hammerspace
> > trond.myklebust@hammerspace.com
> 
> 
> --
> Chuck Lever
> 
>
Chuck Lever Sept. 12, 2023, 12:45 a.m. UTC | #4
> On Sep 11, 2023, at 7:42 PM, Trond Myklebust <trondmy@hammerspace.com> wrote:
> 
> On Mon, 2023-09-11 at 22:10 +0000, Chuck Lever III wrote:
>> 
>>> On Sep 11, 2023, at 4:54 PM, Trond Myklebust
>>> <trondmy@hammerspace.com> wrote:
>>> 
>>> On Mon, 2023-09-11 at 16:14 -0400, Chuck Lever wrote:
>>>> On Mon, Sep 11, 2023 at 02:43:57PM -0400,
>>>> trondmy@gmail.com wrote:
>>>>> From: Trond Myklebust <trond.myklebust@hammerspace.com>
>>>>> 
>>>>> If fsync() is returning EAGAIN, then we can assume that the
>>>>> filesystem
>>>>> being exported is something like NFS with the 'softerr' mount
>>>>> option
>>>>> enabled, and that it is just asking us to replay the fsync()
>>>>> operation
>>>>> at a later date.
>>>>> If we see an ESTALE, then ditto: the file is gone, so there is
>>>>> no
>>>>> danger
>>>>> of losing the error.
>>>>> For those cases, do not reset the write verifier.
>>>> 
>>>> Out of interest, what's the hazard in a write verifier change in
>>>> these cases? There could be a slight performance penalty, I
>>>> imagine,
>>>> but how frequently does this happen?
>>> 
>>> When re-exporting to NFSv4 clients, it should be less of a problem,
>>> since any REMOVE will result in a sillyrenamed file that only
>>> disappears once the file is closed. However with NFSv3 clients,
>>> that is
>>> circumvented by the fact that the filecache closes the files when
>>> they
>>> are inactive. We've seen this occur frequently with VMware vmdks:
>>> their
>>> lock files appear to generate a lot of these phantom ESTALE writes.
>>> 
>>> As for EAGAIN, I just pushed out a 2 patch client series that makes
>>> it
>>> a lot more frequent when re-exporting NFSv4 with 'softerr'.
>>> 
>>> Finally, it is worth noting that a write verifier change has a
>>> global
>>> effect, causing retransmission by all clients of all uncommitted
>>> unstable writes for all files, so is worth mitigating where
>>> possible.
>> 
>> Good info. I've added some of this to the patch description.
>> 
>> 
>>>> One more below.
>>>> 
>>>> 
>>>>> Signed-off-by: Trond Myklebust
>>>>> <trond.myklebust@hammerspace.com>
>>>>> ---
>>>>>  fs/nfsd/vfs.c | 29 +++++++++++++++++++----------
>>>>>  1 file changed, 19 insertions(+), 10 deletions(-)
>>>>> 
>>>>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
>>>>> index 98fa4fd0556d..31daf9f63572 100644
>>>>> --- a/fs/nfsd/vfs.c
>>>>> +++ b/fs/nfsd/vfs.c
>>>>> @@ -337,6 +337,20 @@ nfsd_lookup(struct svc_rqst *rqstp, struct
>>>>> svc_fh *fhp, const char *name,
>>>>>         return err;
>>>>>  }
>>>>>  
>>>>> +static void
>>>>> +commit_reset_write_verifier(struct nfsd_net *nn, struct
>>>>> svc_rqst
>>>>> *rqstp,
>>>>> +                           int err)
>>>>> +{
>>>>> +       switch (err) {
>>>>> +       case -EAGAIN:
>>>>> +       case -ESTALE:
>>>>> +               break;
>>>>> +       default:
>>>>> +               nfsd_reset_write_verifier(nn);
>>>>> +               trace_nfsd_writeverf_reset(nn, rqstp, err);
>>>>> +       }
>>>>> +}
>>>>> +
>>>>>  /*
>>>>>   * Commit metadata changes to stable storage.
>>>>>   */
>>>>> @@ -647,8 +661,7 @@ __be32 nfsd4_clone_file_range(struct
>>>>> svc_rqst
>>>>> *rqstp,
>>>>>                                        
>>>>> &nfsd4_get_cstate(rqstp)-
>>>>>> current_fh,
>>>>>                                         dst_pos,
>>>>>                                         count, status);
>>>>> -                       nfsd_reset_write_verifier(nn);
>>>>> -                       trace_nfsd_writeverf_reset(nn, rqstp,
>>>>> status);
>>>>> +                       commit_reset_write_verifier(nn, rqstp,
>>>>> status);
>>>>>                         ret = nfserrno(status);
>>>>>                 }
>>>>>         }
>>>>> @@ -1170,8 +1183,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp,
>>>>> struct
>>>>> svc_fh *fhp, struct nfsd_file *nf,
>>>>>         host_err = vfs_iter_write(file, &iter, &pos, flags);
>>>>>         file_end_write(file);
>>>>>         if (host_err < 0) {
>>>>> -               nfsd_reset_write_verifier(nn);
>>>>> -               trace_nfsd_writeverf_reset(nn, rqstp,
>>>>> host_err);
>>>>> +               commit_reset_write_verifier(nn, rqstp,
>>>>> host_err);
>>>> 
>>>> Can generic_file_write_iter() or its brethren return STALE or
>>>> AGAIN
>>>> before they get to the generic_write_sync() call ?
>>> 
>>> The call to nfs_revalidate_file_size(), which can occur when you
>>> are
>>> appending to the file (whether or not O_APPEND is set) could indeed
>>> return ESTALE.
>>> With the new patchset mentioned above, it could also return EAGAIN.
>> 
>> Sounds like I should drop this hunk when applying this fix.
> 
> I'm not understanding. Why would you not keep it?

generic_file_write_iter() and its brethren are two calls in
one, if I'm following this correctly:

1. write
2. sync

All the other places you change are "sync" only, so it's
fairly obvious that those callers get a return code that
reflects a failure of "sync".

I asked above if it's possible for the "write" part of
generic_file_write_iter() to fail with STALE/AGAIN before the
sync part is even called.

You seemed to be answering "yes, the 'write' part can fail
that way" but I may have misunderstood your response.

If the "write" step can fail, isn't that something that should
be reflected in a write verifier change? If yes, I don't see
how this particular call site can distinguish between a "write"
failure versus a "sync" failure.

Or, if the vfs_iter_write() call here is guaranteed to never
be a sync write request, then again, I think we want to reflect
all failures here with a write verifier change.

However, if STALE and AGAIN have the exact same semantics
for "write" as they do for "sync", those failures can be
thrown away too, and I can keep this hunk. Are you saying
this is the case?

(this is /only/ for the vfs_iter_write() call site. The others
look OK to me).


>>>>>                 goto out_nfserr;
>>>>>         }
>>>>>         *cnt = host_err;
>>>>> @@ -1183,10 +1195,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp,
>>>>> struct svc_fh *fhp, struct nfsd_file *nf,
>>>>>  
>>>>>         if (stable && use_wgather) {
>>>>>                 host_err = wait_for_concurrent_writes(file);
>>>>> -               if (host_err < 0) {
>>>>> -                       nfsd_reset_write_verifier(nn);
>>>>> -                       trace_nfsd_writeverf_reset(nn, rqstp,
>>>>> host_err);
>>>>> -               }
>>>>> +               if (host_err < 0)
>>>>> +                       commit_reset_write_verifier(nn, rqstp,
>>>>> host_err);
>>>>>         }
>>>>>  
>>>>>  out_nfserr:
>>>>> @@ -1329,8 +1339,7 @@ nfsd_commit(struct svc_rqst *rqstp,
>>>>> struct
>>>>> svc_fh *fhp, struct nfsd_file *nf,
>>>>>                         err = nfserr_notsupp;
>>>>>                         break;
>>>>>                 default:
>>>>> -                       nfsd_reset_write_verifier(nn);
>>>>> -                       trace_nfsd_writeverf_reset(nn, rqstp,
>>>>> err2);
>>>>> +                       commit_reset_write_verifier(nn, rqstp,
>>>>> err2);
>>>>>                         err = nfserrno(err2);
>>>>>                 }
>>>>>         } else
>>>>> -- 
>>>>> 2.41.0
>>>>> 
>>>> 
>>> 
>>> -- 
>>> Trond Myklebust
>>> Linux NFS client maintainer, Hammerspace
>>> trond.myklebust@hammerspace.com
>> 
>> 
>> --
>> Chuck Lever
>> 
>> 
> 
> -- 
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com
> 
> 

--
Chuck Lever
Trond Myklebust Sept. 12, 2023, 1:11 a.m. UTC | #5
On Tue, 2023-09-12 at 00:45 +0000, Chuck Lever III wrote:
> 
> 
> > On Sep 11, 2023, at 7:42 PM, Trond Myklebust
> > <trondmy@hammerspace.com> wrote:
> > 
> > On Mon, 2023-09-11 at 22:10 +0000, Chuck Lever III wrote:
> > > 
> > > > On Sep 11, 2023, at 4:54 PM, Trond Myklebust
> > > > <trondmy@hammerspace.com> wrote:
> > > > 
> > > > On Mon, 2023-09-11 at 16:14 -0400, Chuck Lever wrote:
> > > > > On Mon, Sep 11, 2023 at 02:43:57PM -0400,
> > > > > trondmy@gmail.com wrote:
> > > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > > > > 
> > > > > > If fsync() is returning EAGAIN, then we can assume that the
> > > > > > filesystem
> > > > > > being exported is something like NFS with the 'softerr'
> > > > > > mount
> > > > > > option
> > > > > > enabled, and that it is just asking us to replay the
> > > > > > fsync()
> > > > > > operation
> > > > > > at a later date.
> > > > > > If we see an ESTALE, then ditto: the file is gone, so there
> > > > > > is
> > > > > > no
> > > > > > danger
> > > > > > of losing the error.
> > > > > > For those cases, do not reset the write verifier.
> > > > > 
> > > > > Out of interest, what's the hazard in a write verifier change
> > > > > in
> > > > > these cases? There could be a slight performance penalty, I
> > > > > imagine,
> > > > > but how frequently does this happen?
> > > > 
> > > > When re-exporting to NFSv4 clients, it should be less of a
> > > > problem,
> > > > since any REMOVE will result in a sillyrenamed file that only
> > > > disappears once the file is closed. However with NFSv3 clients,
> > > > that is
> > > > circumvented by the fact that the filecache closes the files
> > > > when
> > > > they
> > > > are inactive. We've seen this occur frequently with VMware
> > > > vmdks:
> > > > their
> > > > lock files appear to generate a lot of these phantom ESTALE
> > > > writes.
> > > > 
> > > > As for EAGAIN, I just pushed out a 2 patch client series that
> > > > makes
> > > > it
> > > > a lot more frequent when re-exporting NFSv4 with 'softerr'.
> > > > 
> > > > Finally, it is worth noting that a write verifier change has a
> > > > global
> > > > effect, causing retransmission by all clients of all
> > > > uncommitted
> > > > unstable writes for all files, so is worth mitigating where
> > > > possible.
> > > 
> > > Good info. I've added some of this to the patch description.
> > > 
> > > 
> > > > > One more below.
> > > > > 
> > > > > 
> > > > > > Signed-off-by: Trond Myklebust
> > > > > > <trond.myklebust@hammerspace.com>
> > > > > > ---
> > > > > >  fs/nfsd/vfs.c | 29 +++++++++++++++++++----------
> > > > > >  1 file changed, 19 insertions(+), 10 deletions(-)
> > > > > > 
> > > > > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > > > > index 98fa4fd0556d..31daf9f63572 100644
> > > > > > --- a/fs/nfsd/vfs.c
> > > > > > +++ b/fs/nfsd/vfs.c
> > > > > > @@ -337,6 +337,20 @@ nfsd_lookup(struct svc_rqst *rqstp,
> > > > > > struct
> > > > > > svc_fh *fhp, const char *name,
> > > > > >         return err;
> > > > > >  }
> > > > > >  
> > > > > > +static void
> > > > > > +commit_reset_write_verifier(struct nfsd_net *nn, struct
> > > > > > svc_rqst
> > > > > > *rqstp,
> > > > > > +                           int err)
> > > > > > +{
> > > > > > +       switch (err) {
> > > > > > +       case -EAGAIN:
> > > > > > +       case -ESTALE:
> > > > > > +               break;
> > > > > > +       default:
> > > > > > +               nfsd_reset_write_verifier(nn);
> > > > > > +               trace_nfsd_writeverf_reset(nn, rqstp, err);
> > > > > > +       }
> > > > > > +}
> > > > > > +
> > > > > >  /*
> > > > > >   * Commit metadata changes to stable storage.
> > > > > >   */
> > > > > > @@ -647,8 +661,7 @@ __be32 nfsd4_clone_file_range(struct
> > > > > > svc_rqst
> > > > > > *rqstp,
> > > > > >                                        
> > > > > > &nfsd4_get_cstate(rqstp)-
> > > > > > > current_fh,
> > > > > >                                         dst_pos,
> > > > > >                                         count, status);
> > > > > > -                       nfsd_reset_write_verifier(nn);
> > > > > > -                       trace_nfsd_writeverf_reset(nn,
> > > > > > rqstp,
> > > > > > status);
> > > > > > +                       commit_reset_write_verifier(nn,
> > > > > > rqstp,
> > > > > > status);
> > > > > >                         ret = nfserrno(status);
> > > > > >                 }
> > > > > >         }
> > > > > > @@ -1170,8 +1183,7 @@ nfsd_vfs_write(struct svc_rqst
> > > > > > *rqstp,
> > > > > > struct
> > > > > > svc_fh *fhp, struct nfsd_file *nf,
> > > > > >         host_err = vfs_iter_write(file, &iter, &pos,
> > > > > > flags);
> > > > > >         file_end_write(file);
> > > > > >         if (host_err < 0) {
> > > > > > -               nfsd_reset_write_verifier(nn);
> > > > > > -               trace_nfsd_writeverf_reset(nn, rqstp,
> > > > > > host_err);
> > > > > > +               commit_reset_write_verifier(nn, rqstp,
> > > > > > host_err);
> > > > > 
> > > > > Can generic_file_write_iter() or its brethren return STALE or
> > > > > AGAIN
> > > > > before they get to the generic_write_sync() call ?
> > > > 
> > > > The call to nfs_revalidate_file_size(), which can occur when
> > > > you
> > > > are
> > > > appending to the file (whether or not O_APPEND is set) could
> > > > indeed
> > > > return ESTALE.
> > > > With the new patchset mentioned above, it could also return
> > > > EAGAIN.
> > > 
> > > Sounds like I should drop this hunk when applying this fix.
> > 
> > I'm not understanding. Why would you not keep it?
> 
> generic_file_write_iter() and its brethren are two calls in
> one, if I'm following this correctly:
> 
> 1. write
> 2. sync
> 
> All the other places you change are "sync" only, so it's
> fairly obvious that those callers get a return code that
> reflects a failure of "sync".
> 
> I asked above if it's possible for the "write" part of
> generic_file_write_iter() to fail with STALE/AGAIN before the
> sync part is even called.
> 
> You seemed to be answering "yes, the 'write' part can fail
> that way" but I may have misunderstood your response.
> 
> If the "write" step can fail, isn't that something that should
> be reflected in a write verifier change? If yes, I don't see
> how this particular call site can distinguish between a "write"
> failure versus a "sync" failure.

The point of EAGAIN is that just like NFS4ERR_DELAY, it implies no
stateful (read "non-idempotent") changes occurred during the operation
that returned it.
The call to nfs_revalidate_file_size() will happen before you've
changed the page cache, so if it fails with EAGAIN, it won't leave the
file in a state where unrecorded changes happened, nor will it cause
existing page cache changes to be lost.

If, OTOH, the nfs_revalidate_file_size() returns ESTALE, then the page
cache changes indeed will not be recorded, but we don't care because
the point is that the file is no more.

> 
> Or, if the vfs_iter_write() call here is guaranteed to never
> be a sync write request, then again, I think we want to reflect
> all failures here with a write verifier change.
> 
> However, if STALE and AGAIN have the exact same semantics
> for "write" as they do for "sync", those failures can be
> thrown away too, and I can keep this hunk. Are you saying
> this is the case?
> 
> (this is /only/ for the vfs_iter_write() call site. The others
> look OK to me).
> 
> 
> > > > > >                 goto out_nfserr;
> > > > > >         }
> > > > > >         *cnt = host_err;
> > > > > > @@ -1183,10 +1195,8 @@ nfsd_vfs_write(struct svc_rqst
> > > > > > *rqstp,
> > > > > > struct svc_fh *fhp, struct nfsd_file *nf,
> > > > > >  
> > > > > >         if (stable && use_wgather) {
> > > > > >                 host_err =
> > > > > > wait_for_concurrent_writes(file);
> > > > > > -               if (host_err < 0) {
> > > > > > -                       nfsd_reset_write_verifier(nn);
> > > > > > -                       trace_nfsd_writeverf_reset(nn,
> > > > > > rqstp,
> > > > > > host_err);
> > > > > > -               }
> > > > > > +               if (host_err < 0)
> > > > > > +                       commit_reset_write_verifier(nn,
> > > > > > rqstp,
> > > > > > host_err);
> > > > > >         }
> > > > > >  
> > > > > >  out_nfserr:
> > > > > > @@ -1329,8 +1339,7 @@ nfsd_commit(struct svc_rqst *rqstp,
> > > > > > struct
> > > > > > svc_fh *fhp, struct nfsd_file *nf,
> > > > > >                         err = nfserr_notsupp;
> > > > > >                         break;
> > > > > >                 default:
> > > > > > -                       nfsd_reset_write_verifier(nn);
> > > > > > -                       trace_nfsd_writeverf_reset(nn,
> > > > > > rqstp,
> > > > > > err2);
> > > > > > +                       commit_reset_write_verifier(nn,
> > > > > > rqstp,
> > > > > > err2);
> > > > > >                         err = nfserrno(err2);
> > > > > >                 }
> > > > > >         } else
> > > > > > -- 
> > > > > > 2.41.0
> > > > > > 
> > > > > 
> > > > 
> > > > -- 
> > > > Trond Myklebust
> > > > Linux NFS client maintainer, Hammerspace
> > > > trond.myklebust@hammerspace.com
> > > 
> > > 
> > > --
> > > Chuck Lever
> > > 
> > > 
> > 
> > -- 
> > Trond Myklebust
> > Linux NFS client maintainer, Hammerspace
> > trond.myklebust@hammerspace.com
> > 
> > 
> 
> --
> Chuck Lever
> 
>
Jeffrey Layton Sept. 12, 2023, 10:54 a.m. UTC | #6
On Mon, 2023-09-11 at 14:43 -0400, trondmy@gmail.com wrote:
> From: Trond Myklebust <trond.myklebust@hammerspace.com>
> 
> If fsync() is returning EAGAIN, then we can assume that the filesystem
> being exported is something like NFS with the 'softerr' mount option
> enabled, and that it is just asking us to replay the fsync() operation
> at a later date.
> If we see an ESTALE, then ditto: the file is gone, so there is no danger
> of losing the error.
> For those cases, do not reset the write verifier.
> 
> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> ---
>  fs/nfsd/vfs.c | 29 +++++++++++++++++++----------
>  1 file changed, 19 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 98fa4fd0556d..31daf9f63572 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -337,6 +337,20 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
>  	return err;
>  }
>  
> +static void
> +commit_reset_write_verifier(struct nfsd_net *nn, struct svc_rqst *rqstp,
> +			    int err)
> +{
> +	switch (err) {
> +	case -EAGAIN:
> +	case -ESTALE:
> +		break;
> +	default:
> +		nfsd_reset_write_verifier(nn);
> +		trace_nfsd_writeverf_reset(nn, rqstp, err);
> +	}
> +}
> +
>  /*
>   * Commit metadata changes to stable storage.
>   */
> @@ -647,8 +661,7 @@ __be32 nfsd4_clone_file_range(struct svc_rqst *rqstp,
>  					&nfsd4_get_cstate(rqstp)->current_fh,
>  					dst_pos,
>  					count, status);
> -			nfsd_reset_write_verifier(nn);
> -			trace_nfsd_writeverf_reset(nn, rqstp, status);
> +			commit_reset_write_verifier(nn, rqstp, status);
>  			ret = nfserrno(status);
>  		}
>  	}
> @@ -1170,8 +1183,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
>  	host_err = vfs_iter_write(file, &iter, &pos, flags);
>  	file_end_write(file);
>  	if (host_err < 0) {
> -		nfsd_reset_write_verifier(nn);
> -		trace_nfsd_writeverf_reset(nn, rqstp, host_err);
> +		commit_reset_write_verifier(nn, rqstp, host_err);
>  		goto out_nfserr;
>  	}
>  	*cnt = host_err;
> @@ -1183,10 +1195,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
>  
>  	if (stable && use_wgather) {
>  		host_err = wait_for_concurrent_writes(file);
> -		if (host_err < 0) {
> -			nfsd_reset_write_verifier(nn);
> -			trace_nfsd_writeverf_reset(nn, rqstp, host_err);
> -		}
> +		if (host_err < 0)
> +			commit_reset_write_verifier(nn, rqstp, host_err);
>  	}
>  
>  out_nfserr:
> @@ -1329,8 +1339,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
>  			err = nfserr_notsupp;
>  			break;
>  		default:
> -			nfsd_reset_write_verifier(nn);
> -			trace_nfsd_writeverf_reset(nn, rqstp, err2);
> +			commit_reset_write_verifier(nn, rqstp, err2);
>  			err = nfserrno(err2);
>  		}
>  	} else

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Chuck Lever Sept. 12, 2023, 1:19 p.m. UTC | #7
On Tue, Sep 12, 2023 at 01:11:28AM +0000, Trond Myklebust wrote:
> On Tue, 2023-09-12 at 00:45 +0000, Chuck Lever III wrote:
> > 
> > 
> > > On Sep 11, 2023, at 7:42 PM, Trond Myklebust
> > > <trondmy@hammerspace.com> wrote:
> > > 
> > > On Mon, 2023-09-11 at 22:10 +0000, Chuck Lever III wrote:
> > > > 
> > > > > On Sep 11, 2023, at 4:54 PM, Trond Myklebust
> > > > > <trondmy@hammerspace.com> wrote:
> > > > > 
> > > > > On Mon, 2023-09-11 at 16:14 -0400, Chuck Lever wrote:
> > > > > > On Mon, Sep 11, 2023 at 02:43:57PM -0400,
> > > > > > trondmy@gmail.com wrote:
> > > > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > > > > > 
> > > > > > > If fsync() is returning EAGAIN, then we can assume that the
> > > > > > > filesystem
> > > > > > > being exported is something like NFS with the 'softerr'
> > > > > > > mount
> > > > > > > option
> > > > > > > enabled, and that it is just asking us to replay the
> > > > > > > fsync()
> > > > > > > operation
> > > > > > > at a later date.
> > > > > > > If we see an ESTALE, then ditto: the file is gone, so there
> > > > > > > is
> > > > > > > no
> > > > > > > danger
> > > > > > > of losing the error.
> > > > > > > For those cases, do not reset the write verifier.
> > > > > > 
> > > > > > Out of interest, what's the hazard in a write verifier change
> > > > > > in
> > > > > > these cases? There could be a slight performance penalty, I
> > > > > > imagine,
> > > > > > but how frequently does this happen?
> > > > > 
> > > > > When re-exporting to NFSv4 clients, it should be less of a
> > > > > problem,
> > > > > since any REMOVE will result in a sillyrenamed file that only
> > > > > disappears once the file is closed. However with NFSv3 clients,
> > > > > that is
> > > > > circumvented by the fact that the filecache closes the files
> > > > > when
> > > > > they
> > > > > are inactive. We've seen this occur frequently with VMware
> > > > > vmdks:
> > > > > their
> > > > > lock files appear to generate a lot of these phantom ESTALE
> > > > > writes.
> > > > > 
> > > > > As for EAGAIN, I just pushed out a 2 patch client series that
> > > > > makes
> > > > > it
> > > > > a lot more frequent when re-exporting NFSv4 with 'softerr'.
> > > > > 
> > > > > Finally, it is worth noting that a write verifier change has a
> > > > > global
> > > > > effect, causing retransmission by all clients of all
> > > > > uncommitted
> > > > > unstable writes for all files, so is worth mitigating where
> > > > > possible.
> > > > 
> > > > Good info. I've added some of this to the patch description.
> > > > 
> > > > 
> > > > > > One more below.
> > > > > > 
> > > > > > 
> > > > > > > Signed-off-by: Trond Myklebust
> > > > > > > <trond.myklebust@hammerspace.com>
> > > > > > > ---
> > > > > > >  fs/nfsd/vfs.c | 29 +++++++++++++++++++----------
> > > > > > >  1 file changed, 19 insertions(+), 10 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > > > > > index 98fa4fd0556d..31daf9f63572 100644
> > > > > > > --- a/fs/nfsd/vfs.c
> > > > > > > +++ b/fs/nfsd/vfs.c
> > > > > > > @@ -337,6 +337,20 @@ nfsd_lookup(struct svc_rqst *rqstp,
> > > > > > > struct
> > > > > > > svc_fh *fhp, const char *name,
> > > > > > >         return err;
> > > > > > >  }
> > > > > > >  
> > > > > > > +static void
> > > > > > > +commit_reset_write_verifier(struct nfsd_net *nn, struct
> > > > > > > svc_rqst
> > > > > > > *rqstp,
> > > > > > > +                           int err)
> > > > > > > +{
> > > > > > > +       switch (err) {
> > > > > > > +       case -EAGAIN:
> > > > > > > +       case -ESTALE:
> > > > > > > +               break;
> > > > > > > +       default:
> > > > > > > +               nfsd_reset_write_verifier(nn);
> > > > > > > +               trace_nfsd_writeverf_reset(nn, rqstp, err);
> > > > > > > +       }
> > > > > > > +}
> > > > > > > +
> > > > > > >  /*
> > > > > > >   * Commit metadata changes to stable storage.
> > > > > > >   */
> > > > > > > @@ -647,8 +661,7 @@ __be32 nfsd4_clone_file_range(struct
> > > > > > > svc_rqst
> > > > > > > *rqstp,
> > > > > > >                                        
> > > > > > > &nfsd4_get_cstate(rqstp)-
> > > > > > > > current_fh,
> > > > > > >                                         dst_pos,
> > > > > > >                                         count, status);
> > > > > > > -                       nfsd_reset_write_verifier(nn);
> > > > > > > -                       trace_nfsd_writeverf_reset(nn,
> > > > > > > rqstp,
> > > > > > > status);
> > > > > > > +                       commit_reset_write_verifier(nn,
> > > > > > > rqstp,
> > > > > > > status);
> > > > > > >                         ret = nfserrno(status);
> > > > > > >                 }
> > > > > > >         }
> > > > > > > @@ -1170,8 +1183,7 @@ nfsd_vfs_write(struct svc_rqst
> > > > > > > *rqstp,
> > > > > > > struct
> > > > > > > svc_fh *fhp, struct nfsd_file *nf,
> > > > > > >         host_err = vfs_iter_write(file, &iter, &pos,
> > > > > > > flags);
> > > > > > >         file_end_write(file);
> > > > > > >         if (host_err < 0) {
> > > > > > > -               nfsd_reset_write_verifier(nn);
> > > > > > > -               trace_nfsd_writeverf_reset(nn, rqstp,
> > > > > > > host_err);
> > > > > > > +               commit_reset_write_verifier(nn, rqstp,
> > > > > > > host_err);
> > > > > > 
> > > > > > Can generic_file_write_iter() or its brethren return STALE or
> > > > > > AGAIN
> > > > > > before they get to the generic_write_sync() call ?
> > > > > 
> > > > > The call to nfs_revalidate_file_size(), which can occur when
> > > > > you
> > > > > are
> > > > > appending to the file (whether or not O_APPEND is set) could
> > > > > indeed
> > > > > return ESTALE.
> > > > > With the new patchset mentioned above, it could also return
> > > > > EAGAIN.
> > > > 
> > > > Sounds like I should drop this hunk when applying this fix.
> > > 
> > > I'm not understanding. Why would you not keep it?
> > 
> > generic_file_write_iter() and its brethren are two calls in
> > one, if I'm following this correctly:
> > 
> > 1. write
> > 2. sync
> > 
> > All the other places you change are "sync" only, so it's
> > fairly obvious that those callers get a return code that
> > reflects a failure of "sync".
> > 
> > I asked above if it's possible for the "write" part of
> > generic_file_write_iter() to fail with STALE/AGAIN before the
> > sync part is even called.
> > 
> > You seemed to be answering "yes, the 'write' part can fail
> > that way" but I may have misunderstood your response.
> > 
> > If the "write" step can fail, isn't that something that should
> > be reflected in a write verifier change? If yes, I don't see
> > how this particular call site can distinguish between a "write"
> > failure versus a "sync" failure.
> 
> The point of EAGAIN is that just like NFS4ERR_DELAY, it implies no
> stateful (read "non-idempotent") changes occurred during the operation
> that returned it.
> The call to nfs_revalidate_file_size() will happen before you've
> changed the page cache, so if it fails with EAGAIN, it won't leave the
> file in a state where unrecorded changes happened, nor will it cause
> existing page cache changes to be lost.
> 
> If, OTOH, the nfs_revalidate_file_size() returns ESTALE, then the page
> cache changes indeed will not be recorded, but we don't care because
> the point is that the file is no more.

Thanks. Applied to nfsd-next.


> > Or, if the vfs_iter_write() call here is guaranteed to never
> > be a sync write request, then again, I think we want to reflect
> > all failures here with a write verifier change.
> > 
> > However, if STALE and AGAIN have the exact same semantics
> > for "write" as they do for "sync", those failures can be
> > thrown away too, and I can keep this hunk. Are you saying
> > this is the case?
> > 
> > (this is /only/ for the vfs_iter_write() call site. The others
> > look OK to me).
> > 
> > 
> > > > > > >                 goto out_nfserr;
> > > > > > >         }
> > > > > > >         *cnt = host_err;
> > > > > > > @@ -1183,10 +1195,8 @@ nfsd_vfs_write(struct svc_rqst
> > > > > > > *rqstp,
> > > > > > > struct svc_fh *fhp, struct nfsd_file *nf,
> > > > > > >  
> > > > > > >         if (stable && use_wgather) {
> > > > > > >                 host_err =
> > > > > > > wait_for_concurrent_writes(file);
> > > > > > > -               if (host_err < 0) {
> > > > > > > -                       nfsd_reset_write_verifier(nn);
> > > > > > > -                       trace_nfsd_writeverf_reset(nn,
> > > > > > > rqstp,
> > > > > > > host_err);
> > > > > > > -               }
> > > > > > > +               if (host_err < 0)
> > > > > > > +                       commit_reset_write_verifier(nn,
> > > > > > > rqstp,
> > > > > > > host_err);
> > > > > > >         }
> > > > > > >  
> > > > > > >  out_nfserr:
> > > > > > > @@ -1329,8 +1339,7 @@ nfsd_commit(struct svc_rqst *rqstp,
> > > > > > > struct
> > > > > > > svc_fh *fhp, struct nfsd_file *nf,
> > > > > > >                         err = nfserr_notsupp;
> > > > > > >                         break;
> > > > > > >                 default:
> > > > > > > -                       nfsd_reset_write_verifier(nn);
> > > > > > > -                       trace_nfsd_writeverf_reset(nn,
> > > > > > > rqstp,
> > > > > > > err2);
> > > > > > > +                       commit_reset_write_verifier(nn,
> > > > > > > rqstp,
> > > > > > > err2);
> > > > > > >                         err = nfserrno(err2);
> > > > > > >                 }
> > > > > > >         } else
> > > > > > > -- 
> > > > > > > 2.41.0
> > > > > > > 
> > > > > > 
> > > > > 
> > > > > -- 
> > > > > Trond Myklebust
> > > > > Linux NFS client maintainer, Hammerspace
> > > > > trond.myklebust@hammerspace.com
> > > > 
> > > > 
> > > > --
> > > > Chuck Lever
> > > > 
> > > > 
> > > 
> > > -- 
> > > Trond Myklebust
> > > Linux NFS client maintainer, Hammerspace
> > > trond.myklebust@hammerspace.com
> > > 
> > > 
> > 
> > --
> > Chuck Lever
> > 
> > 
> 
> -- 
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com
> 
>
diff mbox series

Patch

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 98fa4fd0556d..31daf9f63572 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -337,6 +337,20 @@  nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
 	return err;
 }
 
+static void
+commit_reset_write_verifier(struct nfsd_net *nn, struct svc_rqst *rqstp,
+			    int err)
+{
+	switch (err) {
+	case -EAGAIN:
+	case -ESTALE:
+		break;
+	default:
+		nfsd_reset_write_verifier(nn);
+		trace_nfsd_writeverf_reset(nn, rqstp, err);
+	}
+}
+
 /*
  * Commit metadata changes to stable storage.
  */
@@ -647,8 +661,7 @@  __be32 nfsd4_clone_file_range(struct svc_rqst *rqstp,
 					&nfsd4_get_cstate(rqstp)->current_fh,
 					dst_pos,
 					count, status);
-			nfsd_reset_write_verifier(nn);
-			trace_nfsd_writeverf_reset(nn, rqstp, status);
+			commit_reset_write_verifier(nn, rqstp, status);
 			ret = nfserrno(status);
 		}
 	}
@@ -1170,8 +1183,7 @@  nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
 	host_err = vfs_iter_write(file, &iter, &pos, flags);
 	file_end_write(file);
 	if (host_err < 0) {
-		nfsd_reset_write_verifier(nn);
-		trace_nfsd_writeverf_reset(nn, rqstp, host_err);
+		commit_reset_write_verifier(nn, rqstp, host_err);
 		goto out_nfserr;
 	}
 	*cnt = host_err;
@@ -1183,10 +1195,8 @@  nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
 
 	if (stable && use_wgather) {
 		host_err = wait_for_concurrent_writes(file);
-		if (host_err < 0) {
-			nfsd_reset_write_verifier(nn);
-			trace_nfsd_writeverf_reset(nn, rqstp, host_err);
-		}
+		if (host_err < 0)
+			commit_reset_write_verifier(nn, rqstp, host_err);
 	}
 
 out_nfserr:
@@ -1329,8 +1339,7 @@  nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
 			err = nfserr_notsupp;
 			break;
 		default:
-			nfsd_reset_write_verifier(nn);
-			trace_nfsd_writeverf_reset(nn, rqstp, err2);
+			commit_reset_write_verifier(nn, rqstp, err2);
 			err = nfserrno(err2);
 		}
 	} else