diff mbox series

[v2,3/3] nfsd: start non-blocking writeback after adding nfsd_file to the LRU

Message ID 20221027215213.138304-4-jlayton@kernel.org (mailing list archive)
State New, archived
Headers show
Series nfsd: clean up refcounting in filecache | expand

Commit Message

Jeff Layton Oct. 27, 2022, 9:52 p.m. UTC
When a GC entry gets added to the LRU, kick off SYNC_NONE writeback
so that we can be ready to close it out when the time comes.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/filecache.c | 37 +++++++++++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 6 deletions(-)

Comments

NeilBrown Oct. 27, 2022, 10:25 p.m. UTC | #1
On Fri, 28 Oct 2022, Jeff Layton wrote:
> When a GC entry gets added to the LRU, kick off SYNC_NONE writeback
> so that we can be ready to close it out when the time comes.
> 
> Signed-off-by: Jeff Layton <jlayton@kernel.org>

This looks sensible.
Reviewed-by: NeilBrown <neilb@suse.de>

Thanks,
NeilBrown


> ---
>  fs/nfsd/filecache.c | 37 +++++++++++++++++++++++++++++++------
>  1 file changed, 31 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index d2bbded805d4..491d3d9a1870 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -316,7 +316,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
>  }
>  
>  static void
> -nfsd_file_flush(struct nfsd_file *nf)
> +nfsd_file_fsync(struct nfsd_file *nf)
>  {
>  	struct file *file = nf->nf_file;
>  
> @@ -327,6 +327,22 @@ nfsd_file_flush(struct nfsd_file *nf)
>  		nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
>  }
>  
> +static void
> +nfsd_file_flush(struct nfsd_file *nf)
> +{
> +	struct file *file = nf->nf_file;
> +	unsigned long nrpages;
> +
> +	if (!file || !(file->f_mode & FMODE_WRITE))
> +		return;
> +
> +	nrpages = file->f_mapping->nrpages;
> +	if (nrpages) {
> +		this_cpu_add(nfsd_file_pages_flushed, nrpages);
> +		filemap_flush(file->f_mapping);
> +	}
> +}
> +
>  static void
>  nfsd_file_free(struct nfsd_file *nf)
>  {
> @@ -337,7 +353,7 @@ nfsd_file_free(struct nfsd_file *nf)
>  	this_cpu_inc(nfsd_file_releases);
>  	this_cpu_add(nfsd_file_total_age, age);
>  
> -	nfsd_file_flush(nf);
> +	nfsd_file_fsync(nf);
>  
>  	if (nf->nf_mark)
>  		nfsd_file_mark_put(nf->nf_mark);
> @@ -500,12 +516,21 @@ nfsd_file_put(struct nfsd_file *nf)
>  
>  	if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
>  		/*
> -		 * If this is the last reference (nf_ref == 1), then transfer
> -		 * it to the LRU. If the add to the LRU fails, just put it as
> -		 * usual.
> +		 * If this is the last reference (nf_ref == 1), then try
> +		 * to transfer it to the LRU.
> +		 */
> +		if (refcount_dec_not_one(&nf->nf_ref))
> +			return;
> +
> +		/*
> +		 * If the add to the list succeeds, try to kick off SYNC_NONE
> +		 * writeback. If the add fails, then just fall through to
> +		 * decrement as usual.
>  		 */
> -		if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> +		if (nfsd_file_lru_add(nf)) {
> +			nfsd_file_flush(nf);
>  			return;
> +		}
>  	}
>  	__nfsd_file_put(nf);
>  }
> -- 
> 2.37.3
> 
>
Jeff Layton Oct. 28, 2022, 1:01 p.m. UTC | #2
On Fri, 2022-10-28 at 09:25 +1100, NeilBrown wrote:
> On Fri, 28 Oct 2022, Jeff Layton wrote:
> > When a GC entry gets added to the LRU, kick off SYNC_NONE writeback
> > so that we can be ready to close it out when the time comes.
> > 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> 
> This looks sensible.
> Reviewed-by: NeilBrown <neilb@suse.de>
> 
> Thanks,
> NeilBrown
> 
> 
> > ---
> >  fs/nfsd/filecache.c | 37 +++++++++++++++++++++++++++++++------
> >  1 file changed, 31 insertions(+), 6 deletions(-)
> > 
> > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > index d2bbded805d4..491d3d9a1870 100644
> > --- a/fs/nfsd/filecache.c
> > +++ b/fs/nfsd/filecache.c
> > @@ -316,7 +316,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> >  }
> >  
> >  static void
> > -nfsd_file_flush(struct nfsd_file *nf)
> > +nfsd_file_fsync(struct nfsd_file *nf)
> >  {
> >  	struct file *file = nf->nf_file;
> >  
> > @@ -327,6 +327,22 @@ nfsd_file_flush(struct nfsd_file *nf)
> >  		nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> >  }
> >  
> > +static void
> > +nfsd_file_flush(struct nfsd_file *nf)
> > +{
> > +	struct file *file = nf->nf_file;
> > +	unsigned long nrpages;
> > +
> > +	if (!file || !(file->f_mode & FMODE_WRITE))
> > +		return;
> > +
> > +	nrpages = file->f_mapping->nrpages;
> > +	if (nrpages) {
> > 

I may change this to:

    if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {

I'm not sure here... Does nrpages count all of the pages in the mapping,
or just the dirty ones? I'm wondering if we're overcounting in
nfsd_file_pages_flushed?

> > +		this_cpu_add(nfsd_file_pages_flushed, nrpages);
> > +		filemap_flush(file->f_mapping);
> > +	}
> > +}
> > +
> >  static void
> >  nfsd_file_free(struct nfsd_file *nf)
> >  {
> > @@ -337,7 +353,7 @@ nfsd_file_free(struct nfsd_file *nf)
> >  	this_cpu_inc(nfsd_file_releases);
> >  	this_cpu_add(nfsd_file_total_age, age);
> >  
> > -	nfsd_file_flush(nf);
> > +	nfsd_file_fsync(nf);
> >  
> >  	if (nf->nf_mark)
> >  		nfsd_file_mark_put(nf->nf_mark);
> > @@ -500,12 +516,21 @@ nfsd_file_put(struct nfsd_file *nf)
> >  
> >  	if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> >  		/*
> > -		 * If this is the last reference (nf_ref == 1), then transfer
> > -		 * it to the LRU. If the add to the LRU fails, just put it as
> > -		 * usual.
> > +		 * If this is the last reference (nf_ref == 1), then try
> > +		 * to transfer it to the LRU.
> > +		 */
> > +		if (refcount_dec_not_one(&nf->nf_ref))
> > +			return;
> > +
> > +		/*
> > +		 * If the add to the list succeeds, try to kick off SYNC_NONE
> > +		 * writeback. If the add fails, then just fall through to
> > +		 * decrement as usual.
> >  		 */
> > -		if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> > +		if (nfsd_file_lru_add(nf)) {
> > +			nfsd_file_flush(nf);
> >  			return;
> > +		}
> >  	}
> >  	__nfsd_file_put(nf);
> >  }
> > -- 
> > 2.37.3
> > 
> >
Chuck Lever III Oct. 28, 2022, 1:16 p.m. UTC | #3
> On Oct 27, 2022, at 5:52 PM, Jeff Layton <jlayton@kernel.org> wrote:
> 
> When a GC entry gets added to the LRU, kick off SYNC_NONE writeback
> so that we can be ready to close it out when the time comes.

For a large file, a background flush still has to walk the file's
pages to see if they are dirty, and that consumes time, CPU, and
memory bandwidth. We're talking hundreds of microseconds for a
large file.

Then the final flush does all that again.

Basically, two (or more!) passes through the file for exactly the
same amount of work. Is there any measured improvement in latency
or throughput?

And then... for a GC file, no-one is waiting on data persistence
during nfsd_file_put() so I'm not sure what is gained by taking
control of the flushing process away from the underlying filesystem.


Remind me why the filecache is flushing? Shouldn't NFSD rely on
COMMIT operations for that? (It's not obvious reading the code,
maybe there should be a documenting comment somewhere that
explains this arrangement).


> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
> fs/nfsd/filecache.c | 37 +++++++++++++++++++++++++++++++------
> 1 file changed, 31 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index d2bbded805d4..491d3d9a1870 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -316,7 +316,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> }
> 
> static void
> -nfsd_file_flush(struct nfsd_file *nf)
> +nfsd_file_fsync(struct nfsd_file *nf)
> {
> 	struct file *file = nf->nf_file;
> 
> @@ -327,6 +327,22 @@ nfsd_file_flush(struct nfsd_file *nf)
> 		nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> }
> 
> +static void
> +nfsd_file_flush(struct nfsd_file *nf)
> +{
> +	struct file *file = nf->nf_file;
> +	unsigned long nrpages;
> +
> +	if (!file || !(file->f_mode & FMODE_WRITE))
> +		return;
> +
> +	nrpages = file->f_mapping->nrpages;
> +	if (nrpages) {
> +		this_cpu_add(nfsd_file_pages_flushed, nrpages);
> +		filemap_flush(file->f_mapping);
> +	}
> +}
> +
> static void
> nfsd_file_free(struct nfsd_file *nf)
> {
> @@ -337,7 +353,7 @@ nfsd_file_free(struct nfsd_file *nf)
> 	this_cpu_inc(nfsd_file_releases);
> 	this_cpu_add(nfsd_file_total_age, age);
> 
> -	nfsd_file_flush(nf);
> +	nfsd_file_fsync(nf);
> 
> 	if (nf->nf_mark)
> 		nfsd_file_mark_put(nf->nf_mark);
> @@ -500,12 +516,21 @@ nfsd_file_put(struct nfsd_file *nf)
> 
> 	if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> 		/*
> -		 * If this is the last reference (nf_ref == 1), then transfer
> -		 * it to the LRU. If the add to the LRU fails, just put it as
> -		 * usual.
> +		 * If this is the last reference (nf_ref == 1), then try
> +		 * to transfer it to the LRU.
> +		 */
> +		if (refcount_dec_not_one(&nf->nf_ref))
> +			return;
> +
> +		/*
> +		 * If the add to the list succeeds, try to kick off SYNC_NONE
> +		 * writeback. If the add fails, then just fall through to
> +		 * decrement as usual.

These comments simply repeat what the code does, so they seem
redundant to me. Could they instead explain why?


> 		 */
> -		if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> +		if (nfsd_file_lru_add(nf)) {
> +			nfsd_file_flush(nf);
> 			return;
> +		}
> 	}
> 	__nfsd_file_put(nf);
> }
> -- 
> 2.37.3
> 

--
Chuck Lever
Jeff Layton Oct. 28, 2022, 3:05 p.m. UTC | #4
On Fri, 2022-10-28 at 13:16 +0000, Chuck Lever III wrote:
> 
> > On Oct 27, 2022, at 5:52 PM, Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > When a GC entry gets added to the LRU, kick off SYNC_NONE writeback
> > so that we can be ready to close it out when the time comes.
> 
> For a large file, a background flush still has to walk the file's
> pages to see if they are dirty, and that consumes time, CPU, and
> memory bandwidth. We're talking hundreds of microseconds for a
> large file.
> 
> Then the final flush does all that again.
> 
> Basically, two (or more!) passes through the file for exactly the
> same amount of work. Is there any measured improvement in latency
> or throughput?
> 
> And then... for a GC file, no-one is waiting on data persistence
> during nfsd_file_put() so I'm not sure what is gained by taking
> control of the flushing process away from the underlying filesystem.
> 
> 
> Remind me why the filecache is flushing? Shouldn't NFSD rely on
> COMMIT operations for that? (It's not obvious reading the code,
> maybe there should be a documenting comment somewhere that
> explains this arrangement).
> 


Fair point. I was trying to replicate the behaviors introduced in these
patches:

b6669305d35a nfsd: Reduce the number of calls to nfsd_file_gc()
6b8a94332ee4 nfsd: Fix a write performance regression

AFAICT, the fsync is there to catch writeback errors so that we can
reset the write verifiers (AFAICT). The rationale for that is described
here:

055b24a8f230 nfsd: Don't garbage collect files that might contain write errors

The problem with not calling vfs_fsync is that we might miss writeback
errors. The nfsd_file could get reaped before a v3 COMMIT ever comes in.
nfsd would eventually reopen the file but it could miss seeing the error
if it got opened locally in the interim.

I'm not sure we need to worry about that so much for v4 though. Maybe we
should just do this for GC files?

> 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> > fs/nfsd/filecache.c | 37 +++++++++++++++++++++++++++++++------
> > 1 file changed, 31 insertions(+), 6 deletions(-)
> > 
> > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > index d2bbded805d4..491d3d9a1870 100644
> > --- a/fs/nfsd/filecache.c
> > +++ b/fs/nfsd/filecache.c
> > @@ -316,7 +316,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> > }
> > 
> > static void
> > -nfsd_file_flush(struct nfsd_file *nf)
> > +nfsd_file_fsync(struct nfsd_file *nf)
> > {
> > 	struct file *file = nf->nf_file;
> > 
> > @@ -327,6 +327,22 @@ nfsd_file_flush(struct nfsd_file *nf)
> > 		nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> > }
> > 
> > +static void
> > +nfsd_file_flush(struct nfsd_file *nf)
> > +{
> > +	struct file *file = nf->nf_file;
> > +	unsigned long nrpages;
> > +
> > +	if (!file || !(file->f_mode & FMODE_WRITE))
> > +		return;
> > +
> > +	nrpages = file->f_mapping->nrpages;
> > +	if (nrpages) {
> > +		this_cpu_add(nfsd_file_pages_flushed, nrpages);
> > +		filemap_flush(file->f_mapping);
> > +	}
> > +}
> > +
> > static void
> > nfsd_file_free(struct nfsd_file *nf)
> > {
> > @@ -337,7 +353,7 @@ nfsd_file_free(struct nfsd_file *nf)
> > 	this_cpu_inc(nfsd_file_releases);
> > 	this_cpu_add(nfsd_file_total_age, age);
> > 
> > -	nfsd_file_flush(nf);
> > +	nfsd_file_fsync(nf);
> > 
> > 	if (nf->nf_mark)
> > 		nfsd_file_mark_put(nf->nf_mark);
> > @@ -500,12 +516,21 @@ nfsd_file_put(struct nfsd_file *nf)
> > 
> > 	if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> > 		/*
> > -		 * If this is the last reference (nf_ref == 1), then transfer
> > -		 * it to the LRU. If the add to the LRU fails, just put it as
> > -		 * usual.
> > +		 * If this is the last reference (nf_ref == 1), then try
> > +		 * to transfer it to the LRU.
> > +		 */
> > +		if (refcount_dec_not_one(&nf->nf_ref))
> > +			return;
> > +
> > +		/*
> > +		 * If the add to the list succeeds, try to kick off SYNC_NONE
> > +		 * writeback. If the add fails, then just fall through to
> > +		 * decrement as usual.
> 
> These comments simply repeat what the code does, so they seem
> redundant to me. Could they instead explain why?
> 
> 
> > 		 */
> > -		if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> > +		if (nfsd_file_lru_add(nf)) {
> > +			nfsd_file_flush(nf);
> > 			return;
> > +		}
> > 	}
> > 	__nfsd_file_put(nf);
> > }
> > -- 
> > 2.37.3
> > 
> 
> --
> Chuck Lever
> 
> 
>
Chuck Lever III Oct. 28, 2022, 3:29 p.m. UTC | #5
> On Oct 28, 2022, at 11:05 AM, Jeff Layton <jlayton@kernel.org> wrote:
> 
> On Fri, 2022-10-28 at 13:16 +0000, Chuck Lever III wrote:
>> 
>>> On Oct 27, 2022, at 5:52 PM, Jeff Layton <jlayton@kernel.org> wrote:
>>> 
>>> When a GC entry gets added to the LRU, kick off SYNC_NONE writeback
>>> so that we can be ready to close it out when the time comes.
>> 
>> For a large file, a background flush still has to walk the file's
>> pages to see if they are dirty, and that consumes time, CPU, and
>> memory bandwidth. We're talking hundreds of microseconds for a
>> large file.
>> 
>> Then the final flush does all that again.
>> 
>> Basically, two (or more!) passes through the file for exactly the
>> same amount of work. Is there any measured improvement in latency
>> or throughput?
>> 
>> And then... for a GC file, no-one is waiting on data persistence
>> during nfsd_file_put() so I'm not sure what is gained by taking
>> control of the flushing process away from the underlying filesystem.
>> 
>> 
>> Remind me why the filecache is flushing? Shouldn't NFSD rely on
>> COMMIT operations for that? (It's not obvious reading the code,
>> maybe there should be a documenting comment somewhere that
>> explains this arrangement).
>> 
> 
> 
> Fair point. I was trying to replicate the behaviors introduced in these
> patches:
> 
> b6669305d35a nfsd: Reduce the number of calls to nfsd_file_gc()
> 6b8a94332ee4 nfsd: Fix a write performance regression
> 
> AFAICT, the fsync is there to catch writeback errors so that we can
> reset the write verifiers (AFAICT). The rationale for that is described
> here:
> 
> 055b24a8f230 nfsd: Don't garbage collect files that might contain write errors

Yes, I've been confused about this since then :-)

So, the patch description says:

    If a file may contain unstable writes that can error out, then we want
    to avoid garbage collecting the struct nfsd_file that may be
    tracking those errors.

That doesn't explain why that's a problem, it just says what we plan to
do about it.


> The problem with not calling vfs_fsync is that we might miss writeback
> errors. The nfsd_file could get reaped before a v3 COMMIT ever comes in.
> nfsd would eventually reopen the file but it could miss seeing the error
> if it got opened locally in the interim.

That helps. So we're surfacing writeback errors for local writers?

I guess I would like this flushing to interfere as little as possible
with the server's happy zone, since it's not something clients need to
wait for, and an error is exceptionally rare.

But also, we can't let writeback errors hold onto a bunch of memory
indefinitely. How much nfsd_file and page cache memory might be
pinned by a writeback error, and for how long?


> I'm not sure we need to worry about that so much for v4 though. Maybe we
> should just do this for GC files?

I'm not caffeinated yet. Why is it not a problem for v4? Is it because
an open or delegation stateid will prevent the nfsd_file from going
away?

Sorry for the noise. It's all a little subtle.


>>> Signed-off-by: Jeff Layton <jlayton@kernel.org>
>>> ---
>>> fs/nfsd/filecache.c | 37 +++++++++++++++++++++++++++++++------
>>> 1 file changed, 31 insertions(+), 6 deletions(-)
>>> 
>>> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
>>> index d2bbded805d4..491d3d9a1870 100644
>>> --- a/fs/nfsd/filecache.c
>>> +++ b/fs/nfsd/filecache.c
>>> @@ -316,7 +316,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
>>> }
>>> 
>>> static void
>>> -nfsd_file_flush(struct nfsd_file *nf)
>>> +nfsd_file_fsync(struct nfsd_file *nf)
>>> {
>>> 	struct file *file = nf->nf_file;
>>> 
>>> @@ -327,6 +327,22 @@ nfsd_file_flush(struct nfsd_file *nf)
>>> 		nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
>>> }
>>> 
>>> +static void
>>> +nfsd_file_flush(struct nfsd_file *nf)
>>> +{
>>> +	struct file *file = nf->nf_file;
>>> +	unsigned long nrpages;
>>> +
>>> +	if (!file || !(file->f_mode & FMODE_WRITE))
>>> +		return;
>>> +
>>> +	nrpages = file->f_mapping->nrpages;
>>> +	if (nrpages) {
>>> +		this_cpu_add(nfsd_file_pages_flushed, nrpages);
>>> +		filemap_flush(file->f_mapping);
>>> +	}
>>> +}
>>> +
>>> static void
>>> nfsd_file_free(struct nfsd_file *nf)
>>> {
>>> @@ -337,7 +353,7 @@ nfsd_file_free(struct nfsd_file *nf)
>>> 	this_cpu_inc(nfsd_file_releases);
>>> 	this_cpu_add(nfsd_file_total_age, age);
>>> 
>>> -	nfsd_file_flush(nf);
>>> +	nfsd_file_fsync(nf);
>>> 
>>> 	if (nf->nf_mark)
>>> 		nfsd_file_mark_put(nf->nf_mark);
>>> @@ -500,12 +516,21 @@ nfsd_file_put(struct nfsd_file *nf)
>>> 
>>> 	if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
>>> 		/*
>>> -		 * If this is the last reference (nf_ref == 1), then transfer
>>> -		 * it to the LRU. If the add to the LRU fails, just put it as
>>> -		 * usual.
>>> +		 * If this is the last reference (nf_ref == 1), then try
>>> +		 * to transfer it to the LRU.
>>> +		 */
>>> +		if (refcount_dec_not_one(&nf->nf_ref))
>>> +			return;
>>> +
>>> +		/*
>>> +		 * If the add to the list succeeds, try to kick off SYNC_NONE
>>> +		 * writeback. If the add fails, then just fall through to
>>> +		 * decrement as usual.
>> 
>> These comments simply repeat what the code does, so they seem
>> redundant to me. Could they instead explain why?
>> 
>> 
>>> 		 */
>>> -		if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
>>> +		if (nfsd_file_lru_add(nf)) {
>>> +			nfsd_file_flush(nf);
>>> 			return;
>>> +		}
>>> 	}
>>> 	__nfsd_file_put(nf);
>>> }
>>> -- 
>>> 2.37.3
>>> 
>> 
>> --
>> Chuck Lever
>> 
>> 
>> 
> 
> -- 
> Jeff Layton <jlayton@kernel.org>

--
Chuck Lever
Jeff Layton Oct. 28, 2022, 3:51 p.m. UTC | #6
On Fri, 2022-10-28 at 15:29 +0000, Chuck Lever III wrote:
> 
> > On Oct 28, 2022, at 11:05 AM, Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > On Fri, 2022-10-28 at 13:16 +0000, Chuck Lever III wrote:
> > > 
> > > > On Oct 27, 2022, at 5:52 PM, Jeff Layton <jlayton@kernel.org> wrote:
> > > > 
> > > > When a GC entry gets added to the LRU, kick off SYNC_NONE writeback
> > > > so that we can be ready to close it out when the time comes.
> > > 
> > > For a large file, a background flush still has to walk the file's
> > > pages to see if they are dirty, and that consumes time, CPU, and
> > > memory bandwidth. We're talking hundreds of microseconds for a
> > > large file.
> > > 
> > > Then the final flush does all that again.
> > > 
> > > Basically, two (or more!) passes through the file for exactly the
> > > same amount of work. Is there any measured improvement in latency
> > > or throughput?
> > > 
> > > And then... for a GC file, no-one is waiting on data persistence
> > > during nfsd_file_put() so I'm not sure what is gained by taking
> > > control of the flushing process away from the underlying filesystem.
> > > 
> > > 
> > > Remind me why the filecache is flushing? Shouldn't NFSD rely on
> > > COMMIT operations for that? (It's not obvious reading the code,
> > > maybe there should be a documenting comment somewhere that
> > > explains this arrangement).
> > > 
> > 
> > 
> > Fair point. I was trying to replicate the behaviors introduced in these
> > patches:
> > 
> > b6669305d35a nfsd: Reduce the number of calls to nfsd_file_gc()
> > 6b8a94332ee4 nfsd: Fix a write performance regression
> > 
> > AFAICT, the fsync is there to catch writeback errors so that we can
> > reset the write verifiers (AFAICT). The rationale for that is described
> > here:
> > 
> > 055b24a8f230 nfsd: Don't garbage collect files that might contain write errors
> 
> Yes, I've been confused about this since then :-)
> 
> So, the patch description says:
> 
>     If a file may contain unstable writes that can error out, then we want
>     to avoid garbage collecting the struct nfsd_file that may be
>     tracking those errors.
> 
> That doesn't explain why that's a problem, it just says what we plan to
> do about it.
> 
> 
> > The problem with not calling vfs_fsync is that we might miss writeback
> > errors. The nfsd_file could get reaped before a v3 COMMIT ever comes in.
> > nfsd would eventually reopen the file but it could miss seeing the error
> > if it got opened locally in the interim.
> 
> That helps. So we're surfacing writeback errors for local writers?
> 

Well for non-v3 writers anyway. I suppose you could hit the same
scenario with a mixed v3 and v4 workload if you were unlucky enough, or
mixed v3 and ksmbd workload, etc...

> I guess I would like this flushing to interfere as little as possible
> with the server's happy zone, since it's not something clients need to
> wait for, and an error is exceptionally rare.
> 
> But also, we can't let writeback errors hold onto a bunch of memory
> indefinitely. How much nfsd_file and page cache memory might be
> pinned by a writeback error, and for how long?
> 

You mean if we were to stop trying to fsync out when closing? We don't
keep files in the cache indefinitely, even if they have writeback
errors.

In general, the kernel attempts to write things out, and if it fails it
sets a writeback error in the mapping and marks the pages clean. So if
we're talking about files that are no longer being used (since they're
being GC'ed), we only block reaping them for as long as writeback is in
progress.

Once writeback ends and it's eligible for reaping, we'll call vfs_fsync
a final time, grab the error and reset the write verifier when it's
non-zero.

If we stop doing fsyncs, then that model sort of breaks down. I'm not
clear on what you'd like to do instead.

> 
> > I'm not sure we need to worry about that so much for v4 though. Maybe we
> > should just do this for GC files?
> 
> I'm not caffeinated yet. Why is it not a problem for v4? Is it because
> an open or delegation stateid will prevent the nfsd_file from going
> away?
> 


Yeah, more or less.

I think that for a error to be lost with v4, it would require the client
to have an application access pattern that would expose it to that
possibility on a local filesystem as well. I don't think we have any
obligation to do more there.

Maybe that's a false assumption though.

> Sorry for the noise. It's all a little subtle.
> 

Very subtle. The more we can get this fleshed out into comments the
better, so I welcome the questions.

> 
> > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > ---
> > > > fs/nfsd/filecache.c | 37 +++++++++++++++++++++++++++++++------
> > > > 1 file changed, 31 insertions(+), 6 deletions(-)
> > > > 
> > > > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > > > index d2bbded805d4..491d3d9a1870 100644
> > > > --- a/fs/nfsd/filecache.c
> > > > +++ b/fs/nfsd/filecache.c
> > > > @@ -316,7 +316,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> > > > }
> > > > 
> > > > static void
> > > > -nfsd_file_flush(struct nfsd_file *nf)
> > > > +nfsd_file_fsync(struct nfsd_file *nf)
> > > > {
> > > > 	struct file *file = nf->nf_file;
> > > > 
> > > > @@ -327,6 +327,22 @@ nfsd_file_flush(struct nfsd_file *nf)
> > > > 		nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> > > > }
> > > > 
> > > > +static void
> > > > +nfsd_file_flush(struct nfsd_file *nf)
> > > > +{
> > > > +	struct file *file = nf->nf_file;
> > > > +	unsigned long nrpages;
> > > > +
> > > > +	if (!file || !(file->f_mode & FMODE_WRITE))
> > > > +		return;
> > > > +
> > > > +	nrpages = file->f_mapping->nrpages;
> > > > +	if (nrpages) {
> > > > +		this_cpu_add(nfsd_file_pages_flushed, nrpages);
> > > > +		filemap_flush(file->f_mapping);
> > > > +	}
> > > > +}
> > > > +
> > > > static void
> > > > nfsd_file_free(struct nfsd_file *nf)
> > > > {
> > > > @@ -337,7 +353,7 @@ nfsd_file_free(struct nfsd_file *nf)
> > > > 	this_cpu_inc(nfsd_file_releases);
> > > > 	this_cpu_add(nfsd_file_total_age, age);
> > > > 
> > > > -	nfsd_file_flush(nf);
> > > > +	nfsd_file_fsync(nf);
> > > > 
> > > > 	if (nf->nf_mark)
> > > > 		nfsd_file_mark_put(nf->nf_mark);
> > > > @@ -500,12 +516,21 @@ nfsd_file_put(struct nfsd_file *nf)
> > > > 
> > > > 	if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> > > > 		/*
> > > > -		 * If this is the last reference (nf_ref == 1), then transfer
> > > > -		 * it to the LRU. If the add to the LRU fails, just put it as
> > > > -		 * usual.
> > > > +		 * If this is the last reference (nf_ref == 1), then try
> > > > +		 * to transfer it to the LRU.
> > > > +		 */
> > > > +		if (refcount_dec_not_one(&nf->nf_ref))
> > > > +			return;
> > > > +
> > > > +		/*
> > > > +		 * If the add to the list succeeds, try to kick off SYNC_NONE
> > > > +		 * writeback. If the add fails, then just fall through to
> > > > +		 * decrement as usual.
> > > 
> > > These comments simply repeat what the code does, so they seem
> > > redundant to me. Could they instead explain why?
> > > 
> > > 
> > > > 		 */
> > > > -		if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> > > > +		if (nfsd_file_lru_add(nf)) {
> > > > +			nfsd_file_flush(nf);
> > > > 			return;
> > > > +		}
> > > > 	}
> > > > 	__nfsd_file_put(nf);
> > > > }
> > > > -- 
> > > > 2.37.3
> > > > 
> > > 
> > > --
> > > Chuck Lever
> > > 
> > > 
> > > 
> > 
> > -- 
> > Jeff Layton <jlayton@kernel.org>
> 
> --
> Chuck Lever
> 
> 
>
Chuck Lever III Oct. 28, 2022, 5:21 p.m. UTC | #7
> On Oct 28, 2022, at 11:51 AM, Jeff Layton <jlayton@kernel.org> wrote:
> 
> On Fri, 2022-10-28 at 15:29 +0000, Chuck Lever III wrote:
>> 
>>> On Oct 28, 2022, at 11:05 AM, Jeff Layton <jlayton@kernel.org> wrote:
>>> 
>>> On Fri, 2022-10-28 at 13:16 +0000, Chuck Lever III wrote:
>>>> 
>>>>> On Oct 27, 2022, at 5:52 PM, Jeff Layton <jlayton@kernel.org> wrote:
>>>>> 
>>>>> When a GC entry gets added to the LRU, kick off SYNC_NONE writeback
>>>>> so that we can be ready to close it out when the time comes.
>>>> 
>>>> For a large file, a background flush still has to walk the file's
>>>> pages to see if they are dirty, and that consumes time, CPU, and
>>>> memory bandwidth. We're talking hundreds of microseconds for a
>>>> large file.
>>>> 
>>>> Then the final flush does all that again.
>>>> 
>>>> Basically, two (or more!) passes through the file for exactly the
>>>> same amount of work. Is there any measured improvement in latency
>>>> or throughput?
>>>> 
>>>> And then... for a GC file, no-one is waiting on data persistence
>>>> during nfsd_file_put() so I'm not sure what is gained by taking
>>>> control of the flushing process away from the underlying filesystem.
>>>> 
>>>> 
>>>> Remind me why the filecache is flushing? Shouldn't NFSD rely on
>>>> COMMIT operations for that? (It's not obvious reading the code,
>>>> maybe there should be a documenting comment somewhere that
>>>> explains this arrangement).
>>>> 
>>> 
>>> 
>>> Fair point. I was trying to replicate the behaviors introduced in these
>>> patches:
>>> 
>>> b6669305d35a nfsd: Reduce the number of calls to nfsd_file_gc()
>>> 6b8a94332ee4 nfsd: Fix a write performance regression
>>> 
>>> AFAICT, the fsync is there to catch writeback errors so that we can
>>> reset the write verifiers (AFAICT). The rationale for that is described
>>> here:
>>> 
>>> 055b24a8f230 nfsd: Don't garbage collect files that might contain write errors
>> 
>> Yes, I've been confused about this since then :-)
>> 
>> So, the patch description says:
>> 
>>    If a file may contain unstable writes that can error out, then we want
>>    to avoid garbage collecting the struct nfsd_file that may be
>>    tracking those errors.
>> 
>> That doesn't explain why that's a problem, it just says what we plan to
>> do about it.
>> 
>> 
>>> The problem with not calling vfs_fsync is that we might miss writeback
>>> errors. The nfsd_file could get reaped before a v3 COMMIT ever comes in.
>>> nfsd would eventually reopen the file but it could miss seeing the error
>>> if it got opened locally in the interim.
>> 
>> That helps. So we're surfacing writeback errors for local writers?
>> 
> 
> Well for non-v3 writers anyway. I suppose you could hit the same
> scenario with a mixed v3 and v4 workload if you were unlucky enough, or
> mixed v3 and ksmbd workload, etc...
> 
>> I guess I would like this flushing to interfere as little as possible
>> with the server's happy zone, since it's not something clients need to
>> wait for, and an error is exceptionally rare.
>> 
>> But also, we can't let writeback errors hold onto a bunch of memory
>> indefinitely. How much nfsd_file and page cache memory might be
>> pinned by a writeback error, and for how long?
>> 
> 
> You mean if we were to stop trying to fsync out when closing? We don't
> keep files in the cache indefinitely, even if they have writeback
> errors.
> 
> In general, the kernel attempts to write things out, and if it fails it
> sets a writeback error in the mapping and marks the pages clean. So if
> we're talking about files that are no longer being used (since they're
> being GC'ed), we only block reaping them for as long as writeback is in
> progress.
> 
> Once writeback ends and it's eligible for reaping, we'll call vfs_fsync
> a final time, grab the error and reset the write verifier when it's
> non-zero.
> 
> If we stop doing fsyncs, then that model sort of breaks down. I'm not
> clear on what you'd like to do instead.

I'm not clear either. I think I just have some hand-wavy requirements.

I think keeping the flushes in the nfsd threads and away from single-
threaded garbage collection makes sense. Keep I/O in nfsd context, not
in the filecache garbage collector. I'm not sure that's guaranteed
if the garbage collection thread does an nfsd_file_put() that flushes.

And, we need to ensure that an nfsd_file isn't pinned forever -- the
flush has to make forward progress so that the nfsd_file is eventually
released. Otherwise, writeback errors become a DoS vector.

But, back to the topic of this patch: my own experiments with background
syncing showed that it introduces significant overhead and it wasn't
really worth the trouble. Basically, on intensive workloads, the garbage
collector must not stall or live-lock because it's walking through
millions of pages trying to figure out which ones are dirty.


>>> I'm not sure we need to worry about that so much for v4 though. Maybe we
>>> should just do this for GC files?
>> 
>> I'm not caffeinated yet. Why is it not a problem for v4? Is it because
>> an open or delegation stateid will prevent the nfsd_file from going
>> away?
>> 
> 
> 
> Yeah, more or less.
> 
> I think that for an error to be lost with v4, it would require the client
> to have an application access pattern that would expose it to that
> possibility on a local filesystem as well. I don't think we have any
> obligation to do more there.
> 
> Maybe that's a false assumption though.
> 
>> Sorry for the noise. It's all a little subtle.
>> 
> 
> Very subtle. The more we can get this fleshed out into comments the
> better, so I welcome the questions.
> 
>> 
>>>>> Signed-off-by: Jeff Layton <jlayton@kernel.org>
>>>>> ---
>>>>> fs/nfsd/filecache.c | 37 +++++++++++++++++++++++++++++++------
>>>>> 1 file changed, 31 insertions(+), 6 deletions(-)
>>>>> 
>>>>> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
>>>>> index d2bbded805d4..491d3d9a1870 100644
>>>>> --- a/fs/nfsd/filecache.c
>>>>> +++ b/fs/nfsd/filecache.c
>>>>> @@ -316,7 +316,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
>>>>> }
>>>>> 
>>>>> static void
>>>>> -nfsd_file_flush(struct nfsd_file *nf)
>>>>> +nfsd_file_fsync(struct nfsd_file *nf)
>>>>> {
>>>>> 	struct file *file = nf->nf_file;
>>>>> 
>>>>> @@ -327,6 +327,22 @@ nfsd_file_flush(struct nfsd_file *nf)
>>>>> 		nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
>>>>> }
>>>>> 
>>>>> +static void
>>>>> +nfsd_file_flush(struct nfsd_file *nf)
>>>>> +{
>>>>> +	struct file *file = nf->nf_file;
>>>>> +	unsigned long nrpages;
>>>>> +
>>>>> +	if (!file || !(file->f_mode & FMODE_WRITE))
>>>>> +		return;
>>>>> +
>>>>> +	nrpages = file->f_mapping->nrpages;
>>>>> +	if (nrpages) {
>>>>> +		this_cpu_add(nfsd_file_pages_flushed, nrpages);
>>>>> +		filemap_flush(file->f_mapping);
>>>>> +	}
>>>>> +}
>>>>> +
>>>>> static void
>>>>> nfsd_file_free(struct nfsd_file *nf)
>>>>> {
>>>>> @@ -337,7 +353,7 @@ nfsd_file_free(struct nfsd_file *nf)
>>>>> 	this_cpu_inc(nfsd_file_releases);
>>>>> 	this_cpu_add(nfsd_file_total_age, age);
>>>>> 
>>>>> -	nfsd_file_flush(nf);
>>>>> +	nfsd_file_fsync(nf);
>>>>> 
>>>>> 	if (nf->nf_mark)
>>>>> 		nfsd_file_mark_put(nf->nf_mark);
>>>>> @@ -500,12 +516,21 @@ nfsd_file_put(struct nfsd_file *nf)
>>>>> 
>>>>> 	if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
>>>>> 		/*
>>>>> -		 * If this is the last reference (nf_ref == 1), then transfer
>>>>> -		 * it to the LRU. If the add to the LRU fails, just put it as
>>>>> -		 * usual.
>>>>> +		 * If this is the last reference (nf_ref == 1), then try
>>>>> +		 * to transfer it to the LRU.
>>>>> +		 */
>>>>> +		if (refcount_dec_not_one(&nf->nf_ref))
>>>>> +			return;
>>>>> +
>>>>> +		/*
>>>>> +		 * If the add to the list succeeds, try to kick off SYNC_NONE
>>>>> +		 * writeback. If the add fails, then just fall through to
>>>>> +		 * decrement as usual.
>>>> 
>>>> These comments simply repeat what the code does, so they seem
>>>> redundant to me. Could they instead explain why?
>>>> 
>>>> 
>>>>> 		 */
>>>>> -		if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
>>>>> +		if (nfsd_file_lru_add(nf)) {
>>>>> +			nfsd_file_flush(nf);
>>>>> 			return;
>>>>> +		}
>>>>> 	}
>>>>> 	__nfsd_file_put(nf);
>>>>> }
>>>>> -- 
>>>>> 2.37.3
>>>>> 
>>>> 
>>>> --
>>>> Chuck Lever
>>>> 
>>>> 
>>>> 
>>> 
>>> -- 
>>> Jeff Layton <jlayton@kernel.org>
>> 
>> --
>> Chuck Lever
>> 
>> 
>> 
> 
> -- 
> Jeff Layton <jlayton@kernel.org>

--
Chuck Lever
Jeff Layton Oct. 28, 2022, 5:43 p.m. UTC | #8
On Fri, 2022-10-28 at 17:21 +0000, Chuck Lever III wrote:
> 
> > On Oct 28, 2022, at 11:51 AM, Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > On Fri, 2022-10-28 at 15:29 +0000, Chuck Lever III wrote:
> > > 
> > > > On Oct 28, 2022, at 11:05 AM, Jeff Layton <jlayton@kernel.org> wrote:
> > > > 
> > > > On Fri, 2022-10-28 at 13:16 +0000, Chuck Lever III wrote:
> > > > > 
> > > > > > On Oct 27, 2022, at 5:52 PM, Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > 
> > > > > > When a GC entry gets added to the LRU, kick off SYNC_NONE writeback
> > > > > > so that we can be ready to close it out when the time comes.
> > > > > 
> > > > > For a large file, a background flush still has to walk the file's
> > > > > pages to see if they are dirty, and that consumes time, CPU, and
> > > > > memory bandwidth. We're talking hundreds of microseconds for a
> > > > > large file.
> > > > > 
> > > > > Then the final flush does all that again.
> > > > > 
> > > > > Basically, two (or more!) passes through the file for exactly the
> > > > > same amount of work. Is there any measured improvement in latency
> > > > > or throughput?
> > > > > 
> > > > > And then... for a GC file, no-one is waiting on data persistence
> > > > > during nfsd_file_put() so I'm not sure what is gained by taking
> > > > > control of the flushing process away from the underlying filesystem.
> > > > > 
> > > > > 
> > > > > Remind me why the filecache is flushing? Shouldn't NFSD rely on
> > > > > COMMIT operations for that? (It's not obvious reading the code,
> > > > > maybe there should be a documenting comment somewhere that
> > > > > explains this arrangement).
> > > > > 
> > > > 
> > > > 
> > > > Fair point. I was trying to replicate the behaviors introduced in these
> > > > patches:
> > > > 
> > > > b6669305d35a nfsd: Reduce the number of calls to nfsd_file_gc()
> > > > 6b8a94332ee4 nfsd: Fix a write performance regression
> > > > 
> > > > AFAICT, the fsync is there to catch writeback errors so that we can
> > > > reset the write verifiers (AFAICT). The rationale for that is described
> > > > here:
> > > > 
> > > > 055b24a8f230 nfsd: Don't garbage collect files that might contain write errors
> > > 
> > > Yes, I've been confused about this since then :-)
> > > 
> > > So, the patch description says:
> > > 
> > >    If a file may contain unstable writes that can error out, then we want
> > >    to avoid garbage collecting the struct nfsd_file that may be
> > >    tracking those errors.
> > > 
> > > That doesn't explain why that's a problem, it just says what we plan to
> > > do about it.
> > > 
> > > 
> > > > The problem with not calling vfs_fsync is that we might miss writeback
> > > > errors. The nfsd_file could get reaped before a v3 COMMIT ever comes in.
> > > > nfsd would eventually reopen the file but it could miss seeing the error
> > > > if it got opened locally in the interim.
> > > 
> > > That helps. So we're surfacing writeback errors for local writers?
> > > 
> > 
> > Well for non-v3 writers anyway. I suppose you could hit the same
> > scenario with a mixed v3 and v4 workload if you were unlucky enough, or
> > mixed v3 and ksmbd workload, etc...
> > 
> > > I guess I would like this flushing to interfere as little as possible
> > > with the server's happy zone, since it's not something clients need to
> > > wait for, and an error is exceptionally rare.
> > > 
> > > But also, we can't let writeback errors hold onto a bunch of memory
> > indefinitely. How much nfsd_file and page cache memory might be
> > > pinned by a writeback error, and for how long?
> > > 
> > 
> > You mean if we were to stop trying to fsync out when closing? We don't
> > keep files in the cache indefinitely, even if they have writeback
> > errors.
> > 
> > In general, the kernel attempts to write things out, and if it fails it
> > sets a writeback error in the mapping and marks the pages clean. So if
> > we're talking about files that are no longer being used (since they're
> > being GC'ed), we only block reaping them for as long as writeback is in
> > progress.
> > 
> > Once writeback ends and it's eligible for reaping, we'll call vfs_fsync
> > a final time, grab the error and reset the write verifier when it's
> > non-zero.
> > 
> > If we stop doing fsyncs, then that model sort of breaks down. I'm not
> > clear on what you'd like to do instead.
> 
> I'm not clear either. I think I just have some hand-wavy requirements.
> 
> I think keeping the flushes in the nfsd threads and away from single-
> threaded garbage collection makes sense. Keep I/O in nfsd context, not
> in the filecache garbage collector. I'm not sure that's guaranteed
> if the garbage collection thread does an nfsd_file_put() that flushes.
> 

The garbage collector doesn't call nfsd_file_put directly, though it
will call nfsd_file_free, which now does a vfs_fsync.

> 

> And, we need to ensure that an nfsd_file isn't pinned forever -- the
> flush has to make forward progress so that the nfsd_file is eventually
> released. Otherwise, writeback errors become a DoS vector.
> 

IDGI. An outright writeback _failure_ won't block anything. Stuck (hung)
writeback could cause a nfsd_file to be pinned indefinitely, but that's
really no different than the situation with stuck read requests.

> But, back to the topic of this patch: my own experiments with background
> syncing showed that it introduces significant overhead and it wasn't
> really worth the trouble. Basically, on intensive workloads, the garbage
> collector must not stall or live-lock because it's walking through
> millions of pages trying to figure out which ones are dirty.
> 

If this is what you want, then kicking off SYNC_NONE writeback when we
put it on the LRU is the right thing to do.

We want to ensure that when the thing is reaped from the LRU, that the
final vfs_fsync has to write nothing back. The penultimate put that adds
it to the LRU is almost certainly going to come in the context of an
nfsd thread, so kicking off background writeback at that point seems
reasonable.

Only files that aren't touched again get reaped off the LRU eventually,
so there should be no danger of nfsd redirtying it again. By the time we
get to reaping it, everything should be written back and the inode will
be ready to close with little delay.

> 
> > > > I'm not sure we need to worry about that so much for v4 though. Maybe we
> > > > should just do this for GC files?
> > > 
> > > I'm not caffeinated yet. Why is it not a problem for v4? Is it because
> > > an open or delegation stateid will prevent the nfsd_file from going
> > > away?
> > > 
> > 
> > 
> > Yeah, more or less.
> > 
> > I think that for an error to be lost with v4, it would require the client
> > to have an application access pattern that would expose it to that
> > possibility on a local filesystem as well. I don't think we have any
> > obligation to do more there.
> > 
> > Maybe that's a false assumption though.
> > 
> > > Sorry for the noise. It's all a little subtle.
> > > 
> > 
> > Very subtle. The more we can get this fleshed out into comments the
> > better, so I welcome the questions.
> > 
> > > 
> > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > > ---
> > > > > > fs/nfsd/filecache.c | 37 +++++++++++++++++++++++++++++++------
> > > > > > 1 file changed, 31 insertions(+), 6 deletions(-)
> > > > > > 
> > > > > > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > > > > > index d2bbded805d4..491d3d9a1870 100644
> > > > > > --- a/fs/nfsd/filecache.c
> > > > > > +++ b/fs/nfsd/filecache.c
> > > > > > @@ -316,7 +316,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> > > > > > }
> > > > > > 
> > > > > > static void
> > > > > > -nfsd_file_flush(struct nfsd_file *nf)
> > > > > > +nfsd_file_fsync(struct nfsd_file *nf)
> > > > > > {
> > > > > > 	struct file *file = nf->nf_file;
> > > > > > 
> > > > > > @@ -327,6 +327,22 @@ nfsd_file_flush(struct nfsd_file *nf)
> > > > > > 		nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> > > > > > }
> > > > > > 
> > > > > > +static void
> > > > > > +nfsd_file_flush(struct nfsd_file *nf)
> > > > > > +{
> > > > > > +	struct file *file = nf->nf_file;
> > > > > > +	unsigned long nrpages;
> > > > > > +
> > > > > > +	if (!file || !(file->f_mode & FMODE_WRITE))
> > > > > > +		return;
> > > > > > +
> > > > > > +	nrpages = file->f_mapping->nrpages;
> > > > > > +	if (nrpages) {
> > > > > > +		this_cpu_add(nfsd_file_pages_flushed, nrpages);
> > > > > > +		filemap_flush(file->f_mapping);
> > > > > > +	}
> > > > > > +}
> > > > > > +
> > > > > > static void
> > > > > > nfsd_file_free(struct nfsd_file *nf)
> > > > > > {
> > > > > > @@ -337,7 +353,7 @@ nfsd_file_free(struct nfsd_file *nf)
> > > > > > 	this_cpu_inc(nfsd_file_releases);
> > > > > > 	this_cpu_add(nfsd_file_total_age, age);
> > > > > > 
> > > > > > -	nfsd_file_flush(nf);
> > > > > > +	nfsd_file_fsync(nf);
> > > > > > 
> > > > > > 	if (nf->nf_mark)
> > > > > > 		nfsd_file_mark_put(nf->nf_mark);
> > > > > > @@ -500,12 +516,21 @@ nfsd_file_put(struct nfsd_file *nf)
> > > > > > 
> > > > > > 	if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> > > > > > 		/*
> > > > > > -		 * If this is the last reference (nf_ref == 1), then transfer
> > > > > > -		 * it to the LRU. If the add to the LRU fails, just put it as
> > > > > > -		 * usual.
> > > > > > +		 * If this is the last reference (nf_ref == 1), then try
> > > > > > +		 * to transfer it to the LRU.
> > > > > > +		 */
> > > > > > +		if (refcount_dec_not_one(&nf->nf_ref))
> > > > > > +			return;
> > > > > > +
> > > > > > +		/*
> > > > > > +		 * If the add to the list succeeds, try to kick off SYNC_NONE
> > > > > > +		 * writeback. If the add fails, then just fall through to
> > > > > > +		 * decrement as usual.
> > > > > 
> > > > > These comments simply repeat what the code does, so they seem
> > > > > redundant to me. Could they instead explain why?
> > > > > 
> > > > > 
> > > > > > 		 */
> > > > > > -		if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> > > > > > +		if (nfsd_file_lru_add(nf)) {
> > > > > > +			nfsd_file_flush(nf);
> > > > > > 			return;
> > > > > > +		}
> > > > > > 	}
> > > > > > 	__nfsd_file_put(nf);
> > > > > > }
> > > > > > -- 
> > > > > > 2.37.3
> > > > > > 
> > > > > 
> > > > > --
> > > > > Chuck Lever
> > > > > 
> > > > > 
> > > > > 
> > > > 
> > > > -- 
> > > > Jeff Layton <jlayton@kernel.org>
> > > 
> > > --
> > > Chuck Lever
> > > 
> > > 
> > > 
> > 
> > -- 
> > Jeff Layton <jlayton@kernel.org>
> 
> --
> Chuck Lever
> 
> 
>
Chuck Lever III Oct. 28, 2022, 6:53 p.m. UTC | #9
> On Oct 28, 2022, at 1:43 PM, Jeff Layton <jlayton@kernel.org> wrote:
> 
> On Fri, 2022-10-28 at 17:21 +0000, Chuck Lever III wrote:
>> 
>>> On Oct 28, 2022, at 11:51 AM, Jeff Layton <jlayton@kernel.org> wrote:
>>> 
>>> On Fri, 2022-10-28 at 15:29 +0000, Chuck Lever III wrote:
>>>> 
>>>>> On Oct 28, 2022, at 11:05 AM, Jeff Layton <jlayton@kernel.org> wrote:
>>>>> 
>>>>> The problem with not calling vfs_fsync is that we might miss writeback
>>>>> errors. The nfsd_file could get reaped before a v3 COMMIT ever comes in.
>>>>> nfsd would eventually reopen the file but it could miss seeing the error
>>>>> if it got opened locally in the interim.
>>>> 
>>>> That helps. So we're surfacing writeback errors for local writers?
>>>> 
>>> 
>>> Well for non-v3 writers anyway. I suppose you could hit the same
>>> scenario with a mixed v3 and v4 workload if you were unlucky enough, or
>>> mixed v3 and ksmbd workload, etc...
>>> 
>>>> I guess I would like this flushing to interfere as little as possible
>>>> with the server's happy zone, since it's not something clients need to
>>>> wait for, and an error is exceptionally rare.
>>>> 
>>>> But also, we can't let writeback errors hold onto a bunch of memory
>>>> indefinitely. How much nfsd_file and page cache memory might be
>>>> pinned by a writeback error, and for how long?
>>>> 
>>> 
>>> You mean if we were to stop trying to fsync out when closing? We don't
>>> keep files in the cache indefinitely, even if they have writeback
>>> errors.
>>> 
>>> In general, the kernel attempts to write things out, and if it fails it
>>> sets a writeback error in the mapping and marks the pages clean. So if
>>> we're talking about files that are no longer being used (since they're
>>> being GC'ed), we only block reaping them for as long as writeback is in
>>> progress.
>>> 
>>> Once writeback ends and it's eligible for reaping, we'll call vfs_fsync
>>> a final time, grab the error and reset the write verifier when it's
>>> non-zero.
>>> 
>>> If we stop doing fsyncs, then that model sort of breaks down. I'm not
>>> clear on what you'd like to do instead.
>> 
>> I'm not clear either. I think I just have some hand-wavy requirements.
>> 
>> I think keeping the flushes in the nfsd threads and away from single-
>> threaded garbage collection makes sense. Keep I/O in nfsd context, not
>> in the filecache garbage collector. I'm not sure that's guaranteed
>> if the garbage collection thread does an nfsd_file_put() that flushes.
>> 
> 
> The garbage collector doesn't call nfsd_file_put directly, though it
> will call nfsd_file_free, which now does a vfs_fsync.

OK, thought I saw some email fly by that suggested using nfsd_file_put
in the garbage collector. But... the vfs_fsync you mention can possibly
trigger I/O and wait for it (it's not the SYNC_NONE flush) in the GC
thread. Rare, but I'd rather not have even that possibility if we can
avoid it.


>> But, back to the topic of this patch: my own experiments with background
>> syncing showed that it introduces significant overhead and it wasn't
>> really worth the trouble. Basically, on intensive workloads, the garbage
>> collector must not stall or live-lock because it's walking through
>> millions of pages trying to figure out which ones are dirty.
>> 
> 
> If this is what you want, then kicking off SYNC_NONE writeback when we
> put it on the LRU is the right thing to do.
> 
> We want to ensure that when the thing is reaped from the LRU, that the
> final vfs_fsync has to write nothing back. The penultimate put that adds
> it to the LRU is almost certainly going to come in the context of an
> nfsd thread, so kicking off background writeback at that point seems
> reasonable.

IIUC the opposing idea is to do a synchronous writeback in nfsd_file_put
and then nothing in nfsd_file_free. Why isn't that adequate to achieve
the same result ?

Thinking aloud:

- Suppose a client does some UNSTABLE NFSv3 WRITEs to a file
- The client then waits long enough for the nfsd_file to get aged out
  of the filecache
- A local writer on the server triggers a writeback error on the file
- The error is cleared by other activity
- The client sends a COMMIT

Wouldn't the server miss the chance to bump its write verifier in that
case?


> Only files that aren't touched again get reaped off the LRU eventually,
> so there should be no danger of nfsd redirtying it again.

At the risk of rat-holing... IIUC the only case we should care about
is if there are outstanding NFSv3 WRITEs that haven't been COMMITed.
Seems like NFSv3-specific code, and not the filecache, should deal
with that case, and leave nfsd_file_put/free out of it...? Again, no
clear idea how it would, but just thinking about the layering here.


> By the time we
> get to reaping it, everything should be written back and the inode will
> be ready to close with little delay.



--
Chuck Lever
Jeff Layton Oct. 28, 2022, 8:04 p.m. UTC | #10
On Fri, 2022-10-28 at 18:53 +0000, Chuck Lever III wrote:
> 
> > On Oct 28, 2022, at 1:43 PM, Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > On Fri, 2022-10-28 at 17:21 +0000, Chuck Lever III wrote:
> > > 
> > > > On Oct 28, 2022, at 11:51 AM, Jeff Layton <jlayton@kernel.org> wrote:
> > > > 
> > > > On Fri, 2022-10-28 at 15:29 +0000, Chuck Lever III wrote:
> > > > > 
> > > > > > On Oct 28, 2022, at 11:05 AM, Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > 
> > > > > > The problem with not calling vfs_fsync is that we might miss writeback
> > > > > > errors. The nfsd_file could get reaped before a v3 COMMIT ever comes in.
> > > > > > nfsd would eventually reopen the file but it could miss seeing the error
> > > > > > if it got opened locally in the interim.
> > > > > 
> > > > > That helps. So we're surfacing writeback errors for local writers?
> > > > > 
> > > > 
> > > > Well for non-v3 writers anyway. I suppose you could hit the same
> > > > scenario with a mixed v3 and v4 workload if you were unlucky enough, or
> > > > mixed v3 and ksmbd workload, etc...
> > > > 
> > > > > I guess I would like this flushing to interfere as little as possible
> > > > > with the server's happy zone, since it's not something clients need to
> > > > > wait for, and an error is exceptionally rare.
> > > > > 
> > > > > But also, we can't let writeback errors hold onto a bunch of memory
> > > > > indefinitely. How much nfsd_file and page cache memory might be
> > > > > pinned by a writeback error, and for how long?
> > > > > 
> > > > 
> > > > You mean if we were to stop trying to fsync out when closing? We don't
> > > > keep files in the cache indefinitely, even if they have writeback
> > > > errors.
> > > > 
> > > > In general, the kernel attempts to write things out, and if it fails it
> > > > sets a writeback error in the mapping and marks the pages clean. So if
> > > > we're talking about files that are no longer being used (since they're
> > > > being GC'ed), we only block reaping them for as long as writeback is in
> > > > progress.
> > > > 
> > > > Once writeback ends and it's eligible for reaping, we'll call vfs_fsync
> > > > a final time, grab the error and reset the write verifier when it's
> > > > non-zero.
> > > > 
> > > > If we stop doing fsyncs, then that model sort of breaks down. I'm not
> > > > clear on what you'd like to do instead.
> > > 
> > > I'm not clear either. I think I just have some hand-wavy requirements.
> > > 
> > > I think keeping the flushes in the nfsd threads and away from single-
> > > threaded garbage collection makes sense. Keep I/O in nfsd context, not
> > > in the filecache garbage collector. I'm not sure that's guaranteed
> > > if the garbage collection thread does an nfsd_file_put() that flushes.
> > > 
> > 
> > The garbage collector doesn't call nfsd_file_put directly, though it
> > will call nfsd_file_free, which now does a vfs_fsync.
> 
> OK, thought I saw some email fly by that suggested using nfsd_file_put
> in the garbage collector. But... the vfs_fsync you mention can possibly
> trigger I/O and wait for it (it's not the SYNC_NONE flush) in the GC
> thread. Rare, but I'd rather not have even that possibility if we can
> avoid it.
> 
> 
> > > But, back to the topic of this patch: my own experiments with background
> > > syncing showed that it introduces significant overhead and it wasn't
> > > really worth the trouble. Basically, on intensive workloads, the garbage
> > > collector must not stall or live-lock because it's walking through
> > > millions of pages trying to figure out which ones are dirty.
> > > 
> > 
> > If this is what you want, then kicking off SYNC_NONE writeback when we
> > put it on the LRU is the right thing to do.
> > 
> > We want to ensure that when the thing is reaped from the LRU, that the
> > final vfs_fsync has to write nothing back. The penultimate put that adds
> > it to the LRU is almost certainly going to come in the context of an
> > nfsd thread, so kicking off background writeback at that point seems
> > reasonable.
> 
> IIUC the opposing idea is to do a synchronous writeback in nfsd_file_put
> and then nothing in nfsd_file_free. Why isn't that adequate to achieve
> the same result ?
> 

To make sure I understand:

For the GC case (v3), you basically want to do a vfs_fsync after we put
it onto the LRU list? We'd also do a vfs_fsync after the refcount goes
to 0 in nfsd_file_put.

That seems...worse than what I'm proposing. We'll end up with a bunch of
blocked nfsd threads for no reason. The writeback in most cases could
proceed asynchronously, and we'll be idling an nfsd thread for the sole
purpose of getting the result of that writeback.

I see no need to block an nfsd thread for this. If we kick off
WB_SYNC_NONE writeback when we put it on the list, then by the time we
get around to calling vfs_fsync for reaping the thing, it'll basically
be a no-op. PAGECACHE_TAG_DIRTY should be clear and vfs_fsync will just
scrape the wb error code and return without walking anything.

I get the goal of not idling the garbage collector for too long, but
some of that may just be unavoidable. Tearing down an nfsd_file can just
take a significant amount of time, between flushing data and close.

> Thinking aloud:
> 
> - Suppose a client does some UNSTABLE NFSv3 WRITEs to a file
> - The client then waits long enough for the nfsd_file to get aged out
>   of the filecache
> - A local writer on the server triggers a writeback error on the file
> - The error is cleared by other activity
> - The client sends a COMMIT
> 
> Wouldn't the server miss the chance to bump its write verifier in that
> case?
> 

Yep. That is the danger.

> 
> > Only files that aren't touched again get reaped off the LRU eventually,
> > so there should be no danger of nfsd redirtying it again.
> 
> At the risk of rat-holing... IIUC the only case we should care about
> is if there are outstanding NFSv3 WRITEs that haven't been COMMITed.
> Seems like NFSv3-specific code, and not the filecache, should deal
> with that case, and leave nfsd_file_put/free out of it...? Again, no
> clear idea how it would, but just thinking about the layering here.
> 

No idea how we'd do that. The filecache is what gives us persistent
"state" across disparate RPCs with v3. I think this is where the
solution has to be handled.


> 
> > By the time we
> > get to reaping it, everything should be written back and the inode will
> > be ready to close with little delay.
> 
> 
> 
> --
> Chuck Lever
> 
> 
>
diff mbox series

Patch

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index d2bbded805d4..491d3d9a1870 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -316,7 +316,7 @@  nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
 }
 
 static void
-nfsd_file_flush(struct nfsd_file *nf)
+nfsd_file_fsync(struct nfsd_file *nf)
 {
 	struct file *file = nf->nf_file;
 
@@ -327,6 +327,22 @@  nfsd_file_flush(struct nfsd_file *nf)
 		nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
 }
 
+static void
+nfsd_file_flush(struct nfsd_file *nf)
+{
+	struct file *file = nf->nf_file;
+	unsigned long nrpages;
+
+	if (!file || !(file->f_mode & FMODE_WRITE))
+		return;
+
+	nrpages = file->f_mapping->nrpages;
+	if (nrpages) {
+		this_cpu_add(nfsd_file_pages_flushed, nrpages);
+		filemap_flush(file->f_mapping);
+	}
+}
+
 static void
 nfsd_file_free(struct nfsd_file *nf)
 {
@@ -337,7 +353,7 @@  nfsd_file_free(struct nfsd_file *nf)
 	this_cpu_inc(nfsd_file_releases);
 	this_cpu_add(nfsd_file_total_age, age);
 
-	nfsd_file_flush(nf);
+	nfsd_file_fsync(nf);
 
 	if (nf->nf_mark)
 		nfsd_file_mark_put(nf->nf_mark);
@@ -500,12 +516,21 @@  nfsd_file_put(struct nfsd_file *nf)
 
 	if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
 		/*
-		 * If this is the last reference (nf_ref == 1), then transfer
-		 * it to the LRU. If the add to the LRU fails, just put it as
-		 * usual.
+		 * If this is the last reference (nf_ref == 1), then try
+		 * to transfer it to the LRU.
+		 */
+		if (refcount_dec_not_one(&nf->nf_ref))
+			return;
+
+		/*
+		 * If the add to the list succeeds, try to kick off SYNC_NONE
+		 * writeback. If the add fails, then just fall through to
+		 * decrement as usual.
 		 */
-		if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
+		if (nfsd_file_lru_add(nf)) {
+			nfsd_file_flush(nf);
 			return;
+		}
 	}
 	__nfsd_file_put(nf);
 }