diff mbox series

[v3,3/4] nfsd: close race between unhashing and LRU addition

Message ID 20221028185712.79863-4-jlayton@kernel.org (mailing list archive)
State New, archived
Headers show
Series nfsd: clean up refcounting in the filecache | expand

Commit Message

Jeff Layton Oct. 28, 2022, 6:57 p.m. UTC
The list_lru_add and list_lru_del functions use list_empty checks to see
whether the object is already on the LRU. That's fine in most cases, but
we occasionally repurpose nf_lru after unhashing. It's possible for an
LRU removal to remove it from a different list altogether if we lose a
race.

Add a new NFSD_FILE_LRU flag, which indicates that the object actually
resides on the LRU and not some other list. Use that when adding and
removing it from the LRU instead of just relying on list_empty checks.

Add an extra HASHED check after adding the entry to the LRU. If it's now
clear, just remove it from the LRU again and put the reference if that
remove is successful.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/filecache.c | 44 +++++++++++++++++++++++++++++---------------
 fs/nfsd/filecache.h |  1 +
 2 files changed, 30 insertions(+), 15 deletions(-)

Comments

Chuck Lever III Oct. 28, 2022, 7:50 p.m. UTC | #1
> On Oct 28, 2022, at 2:57 PM, Jeff Layton <jlayton@kernel.org> wrote:
> 
> The list_lru_add and list_lru_del functions use list_empty checks to see
> whether the object is already on the LRU. That's fine in most cases, but
> we occasionally repurpose nf_lru after unhashing. It's possible for an
> LRU removal to remove it from a different list altogether if we lose a
> race.

I've never seen that happen. lru field re-use is actually used in other
places in the kernel. Shouldn't we try to find and fix such races?

Wasn't the idea to reduce the complexity of nfsd_file_put ?


> Add a new NFSD_FILE_LRU flag, which indicates that the object actually
> resides on the LRU and not some other list. Use that when adding and
> removing it from the LRU instead of just relying on list_empty checks.
> 
> Add an extra HASHED check after adding the entry to the LRU. If it's now
> clear, just remove it from the LRU again and put the reference if that
> remove is successful.
> 
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
> fs/nfsd/filecache.c | 44 +++++++++++++++++++++++++++++---------------
> fs/nfsd/filecache.h |  1 +
> 2 files changed, 30 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index d928c5e38eeb..47cdc6129a7b 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -403,18 +403,22 @@ nfsd_file_check_writeback(struct nfsd_file *nf)
> static bool nfsd_file_lru_add(struct nfsd_file *nf)
> {
> 	set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
> -	if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
> -		trace_nfsd_file_lru_add(nf);
> -		return true;
> +	if (!test_and_set_bit(NFSD_FILE_LRU, &nf->nf_flags)) {
> +		if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
> +			trace_nfsd_file_lru_add(nf);
> +			return true;
> +		}
> 	}
> 	return false;
> }
> 
> static bool nfsd_file_lru_remove(struct nfsd_file *nf)
> {
> -	if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
> -		trace_nfsd_file_lru_del(nf);
> -		return true;
> +	if (test_and_clear_bit(NFSD_FILE_LRU, &nf->nf_flags)) {
> +		if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
> +			trace_nfsd_file_lru_del(nf);
> +			return true;
> +		}
> 	}
> 	return false;
> }
> @@ -469,20 +473,30 @@ nfsd_file_put(struct nfsd_file *nf)
> {
> 	trace_nfsd_file_put(nf);
> 
> -	/*
> -	 * The HASHED check is racy. We may end up with the occasional
> -	 * unhashed entry on the LRU, but they should get cleaned up
> -	 * like any other.
> -	 */
> 	if (test_bit(NFSD_FILE_GC, &nf->nf_flags) &&
> 	    test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> 		/*
> -		 * If this is the last reference (nf_ref == 1), then transfer
> -		 * it to the LRU. If the add to the LRU fails, just put it as
> -		 * usual.
> +		 * If this is the last reference (nf_ref == 1), then try to
> +		 * transfer it to the LRU.
> 		 */
> -		if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> +		if (refcount_dec_not_one(&nf->nf_ref))
> 			return;
> +
> +		/* Try to add it to the LRU.  If that fails, decrement. */
> +		if (nfsd_file_lru_add(nf)) {
> +			/* If it's still hashed, we're done */
> +			if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
> +				return;
> +
> +			/*
> +			 * We're racing with unhashing, so try to remove it from
> +			 * the LRU. If removal fails, then someone else already
> +			 * has our reference and we're done. If it succeeds,
> +			 * fall through to decrement.
> +			 */
> +			if (!nfsd_file_lru_remove(nf))
> +				return;
> +		}
> 	}
> 	if (refcount_dec_and_test(&nf->nf_ref))
> 		nfsd_file_free(nf);
> diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
> index b7efb2c3ddb1..e52ab7d5a44c 100644
> --- a/fs/nfsd/filecache.h
> +++ b/fs/nfsd/filecache.h
> @@ -39,6 +39,7 @@ struct nfsd_file {
> #define NFSD_FILE_PENDING	(1)
> #define NFSD_FILE_REFERENCED	(2)
> #define NFSD_FILE_GC		(3)
> +#define NFSD_FILE_LRU		(4)	/* file is on LRU */
> 	unsigned long		nf_flags;
> 	struct inode		*nf_inode;	/* don't deref */
> 	refcount_t		nf_ref;
> -- 
> 2.37.3
> 

--
Chuck Lever
Jeff Layton Oct. 28, 2022, 8:04 p.m. UTC | #2
On Fri, 2022-10-28 at 19:50 +0000, Chuck Lever III wrote:
> 
> > On Oct 28, 2022, at 2:57 PM, Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > The list_lru_add and list_lru_del functions use list_empty checks to see
> > whether the object is already on the LRU. That's fine in most cases, but
> > we occasionally repurpose nf_lru after unhashing. It's possible for an
> > LRU removal to remove it from a different list altogether if we lose a
> > race.
> 
> I've never seen that happen. lru field re-use is actually used in other
> places in the kernel. Shouldn't we try to find and fix such races?
> 
> Wasn't the idea to reduce the complexity of nfsd_file_put ?
> 

It certainly seems theoretically possible here. Maybe those other places
have other ways to ensure that it doesn't occur. We are dealing with RCU
freed structures here, so we can't always rely on locks to keep things
nice and serialized.


FWIW, I left this as a separate patch just to illustrate the race and
fix, but we probably would want to squash this one into the first.

> > Add a new NFSD_FILE_LRU flag, which indicates that the object actually
> > resides on the LRU and not some other list. Use that when adding and
> > removing it from the LRU instead of just relying on list_empty checks.
> > 
> > Add an extra HASHED check after adding the entry to the LRU. If it's now
> > clear, just remove it from the LRU again and put the reference if that
> > remove is successful.
> > 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> > fs/nfsd/filecache.c | 44 +++++++++++++++++++++++++++++---------------
> > fs/nfsd/filecache.h |  1 +
> > 2 files changed, 30 insertions(+), 15 deletions(-)
> > 
> > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > index d928c5e38eeb..47cdc6129a7b 100644
> > --- a/fs/nfsd/filecache.c
> > +++ b/fs/nfsd/filecache.c
> > @@ -403,18 +403,22 @@ nfsd_file_check_writeback(struct nfsd_file *nf)
> > static bool nfsd_file_lru_add(struct nfsd_file *nf)
> > {
> > 	set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
> > -	if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
> > -		trace_nfsd_file_lru_add(nf);
> > -		return true;
> > +	if (!test_and_set_bit(NFSD_FILE_LRU, &nf->nf_flags)) {
> > +		if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
> > +			trace_nfsd_file_lru_add(nf);
> > +			return true;
> > +		}
> > 	}
> > 	return false;
> > }
> > 
> > static bool nfsd_file_lru_remove(struct nfsd_file *nf)
> > {
> > -	if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
> > -		trace_nfsd_file_lru_del(nf);
> > -		return true;
> > +	if (test_and_clear_bit(NFSD_FILE_LRU, &nf->nf_flags)) {
> > +		if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
> > +			trace_nfsd_file_lru_del(nf);
> > +			return true;
> > +		}
> > 	}
> > 	return false;
> > }
> > @@ -469,20 +473,30 @@ nfsd_file_put(struct nfsd_file *nf)
> > {
> > 	trace_nfsd_file_put(nf);
> > 
> > -	/*
> > -	 * The HASHED check is racy. We may end up with the occasional
> > -	 * unhashed entry on the LRU, but they should get cleaned up
> > -	 * like any other.
> > -	 */
> > 	if (test_bit(NFSD_FILE_GC, &nf->nf_flags) &&
> > 	    test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> > 		/*
> > -		 * If this is the last reference (nf_ref == 1), then transfer
> > -		 * it to the LRU. If the add to the LRU fails, just put it as
> > -		 * usual.
> > +		 * If this is the last reference (nf_ref == 1), then try to
> > +		 * transfer it to the LRU.
> > 		 */
> > -		if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> > +		if (refcount_dec_not_one(&nf->nf_ref))
> > 			return;
> > +
> > +		/* Try to add it to the LRU.  If that fails, decrement. */
> > +		if (nfsd_file_lru_add(nf)) {
> > +			/* If it's still hashed, we're done */
> > +			if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
> > +				return;
> > +
> > +			/*
> > +			 * We're racing with unhashing, so try to remove it from
> > +			 * the LRU. If removal fails, then someone else already
> > +			 * has our reference and we're done. If it succeeds,
> > +			 * fall through to decrement.
> > +			 */
> > +			if (!nfsd_file_lru_remove(nf))
> > +				return;
> > +		}
> > 	}
> > 	if (refcount_dec_and_test(&nf->nf_ref))
> > 		nfsd_file_free(nf);
> > diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
> > index b7efb2c3ddb1..e52ab7d5a44c 100644
> > --- a/fs/nfsd/filecache.h
> > +++ b/fs/nfsd/filecache.h
> > @@ -39,6 +39,7 @@ struct nfsd_file {
> > #define NFSD_FILE_PENDING	(1)
> > #define NFSD_FILE_REFERENCED	(2)
> > #define NFSD_FILE_GC		(3)
> > +#define NFSD_FILE_LRU		(4)	/* file is on LRU */
> > 	unsigned long		nf_flags;
> > 	struct inode		*nf_inode;	/* don't deref */
> > 	refcount_t		nf_ref;
> > -- 
> > 2.37.3
> > 
> 
> --
> Chuck Lever
> 
> 
>
NeilBrown Oct. 30, 2022, 9:45 p.m. UTC | #3
On Sat, 29 Oct 2022, Chuck Lever III wrote:
> 
> > On Oct 28, 2022, at 2:57 PM, Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > The list_lru_add and list_lru_del functions use list_empty checks to see
> > whether the object is already on the LRU. That's fine in most cases, but
> > we occasionally repurpose nf_lru after unhashing. It's possible for an
> > LRU removal to remove it from a different list altogether if we lose a
> > race.
> 
> I've never seen that happen. lru field re-use is actually used in other
> places in the kernel. Shouldn't we try to find and fix such races?
> 
> Wasn't the idea to reduce the complexity of nfsd_file_put ?
> 

I think the nfsd filecache is different from those "other places"
because of nfsd_file_close_inode() and related code.  If I understand
correctly, nfsd can remove a file from the filecache while it is still
in use.  In this case it needs to be unhashed and removed from the lru -
and then added to a dispose list - while it might still be active for
some IO request.

Prior to Commit 8d0d254b15cc ("nfsd: fix nfsd_file_unhash_and_dispose")
unhash_and_dispose() wouldn't add to the dispose list unless the
refcount was one.  I'm not sure how racy this test was, but it would
mean that it is unlikely for an nfsd_file to be added to the dispose list
if it was still in use.

After that commit it seems more likely that a nfsd_file will be added to
a dispose list while it is in use.
As we add/remove things to a dispose list without a lock, we need to be
careful that no other thread will add the nfsd_file to an lru at the
same time.  refcounts will no longer provide that protection.  Maybe we
should restore the refcount protection, or maybe we need a bit as Jeff
has added here.

NeilBrown
Chuck Lever III Oct. 31, 2022, 2:51 a.m. UTC | #4
> On Oct 30, 2022, at 5:45 PM, NeilBrown <neilb@suse.de> wrote:
> 
> On Sat, 29 Oct 2022, Chuck Lever III wrote:
>> 
>>> On Oct 28, 2022, at 2:57 PM, Jeff Layton <jlayton@kernel.org> wrote:
>>> 
>>> The list_lru_add and list_lru_del functions use list_empty checks to see
>>> whether the object is already on the LRU. That's fine in most cases, but
>>> we occasionally repurpose nf_lru after unhashing. It's possible for an
>>> LRU removal to remove it from a different list altogether if we lose a
>>> race.

Can that issue be resolved by simply adding a "struct list_head nf_dispose"
field? That might be more straightforward than adding conditional logic.


>> I've never seen that happen. lru field re-use is actually used in other
>> places in the kernel. Shouldn't we try to find and fix such races?
>> 
>> Wasn't the idea to reduce the complexity of nfsd_file_put ?
>> 
> 
> I think the nfsd filecache is different from those "other places"
> because of nfsd_file_close_inode() and related code.  If I understand
> correctly, nfsd can remove a file from the filecache while it is still
> in use.

Not sure about that; I think nfsd_file_close_inode() is invoked only
when a file is deleted. I could be remembering incorrectly, but that
seems like a really difficult race to hit.


> In this case it needs to be unhashed and removed from the lru -
> and then added to a dispose list - while it might still be active for
> some IO request.
> 
> Prior to Commit 8d0d254b15cc ("nfsd: fix nfsd_file_unhash_and_dispose")
> unhash_and_dispose() wouldn't add to the dispose list unless the
> refcount was one.  I'm not sure how racy this test was, but it would
> mean that it is unlikely for an nfsd_file to be added to the dispose list
> if it was still in use.
> 
> After that commit it seems more likely that a nfsd_file will be added to
> a dispose list while it is in use.

After it's linked to a dispose list via nf_lru, list_lru_add won't put
it on the LRU -- it becomes a no-op because nf_lru is now !empty. I
think we would have seen LRU corruption pretty quickly. Re-reading
Jeff's patch description, that might not be the problem he's trying
to address here.

But, it would be easy to do some reality testing. I think you could
add a WARN_ON or tracepoint in nfsd_file_free() or somewhere in the
dispose-list path to catch an in-use nfsd_file?


> As we add/remove things to a dispose list without a lock, we need to be
> careful that no other thread will add the nfsd_file to an lru at the
> same time.  refcounts will no longer provide that protection.  Maybe we
> should restore the refcount protection, or maybe we need a bit as Jeff
> has added here.

I'm not opposed to defensive changes, in general. This one seems to be
adding significant complexity without a clear hazard. I'd like to have
a better understanding of exactly what misbehavior is being addressed.


--
Chuck Lever
Jeff Layton Oct. 31, 2022, 10:01 a.m. UTC | #5
On Mon, 2022-10-31 at 08:45 +1100, NeilBrown wrote:
> On Sat, 29 Oct 2022, Chuck Lever III wrote:
> > 
> > > On Oct 28, 2022, at 2:57 PM, Jeff Layton <jlayton@kernel.org> wrote:
> > > 
> > > The list_lru_add and list_lru_del functions use list_empty checks to see
> > > whether the object is already on the LRU. That's fine in most cases, but
> > > we occasionally repurpose nf_lru after unhashing. It's possible for an
> > > LRU removal to remove it from a different list altogether if we lose a
> > > race.
> > 
> > I've never seen that happen. lru field re-use is actually used in other
> > places in the kernel. Shouldn't we try to find and fix such races?
> > 
> > Wasn't the idea to reduce the complexity of nfsd_file_put ?
> > 
> 
> I think the nfsd filecache is different from those "other places"
> because of nfsd_file_close_inode() and related code.  If I understand
> correctly, nfsd can remove a file from the filecache while it is still
> in use.  In this case it needs to be unhashed and removed from the lru -
> and then added to a dispose list - while it might still be active for
> some IO request.
> 
> Prior to Commit 8d0d254b15cc ("nfsd: fix nfsd_file_unhash_and_dispose")
> unhash_and_dispose() wouldn't add to the dispose list unless the
> refcount was one.  I'm not sure how racy this test was, but it would
> mean that it is unlikely for an nfsd_file to be added to the dispose list
> if it was still in use.
> 

I'm pretty sure that was racy. By the time you go to put it on the
dispose list, the refcount may no longer be one since it can be
incremented at any time.

> After that commit it seems more likely that a nfsd_file will be added to
> a dispose list while it is in use.
> As we add/remove things to a dispose list without a lock, we need to be
> careful that no other thread will add the nfsd_file to an lru at the
> same time.  refcounts will no longer provide that protection.  Maybe we
> should restore the refcount protection, or maybe we need a bit as Jeff
> has added here.
> 

If we have an entry on a list, then it can't be added to the LRU (since
the list_empty check would fail). The real danger is that we could end
up trying to remove it from the LRU and inadvertantly remove it from a
private dispose list instead. Since that is done without a lock, we
could easily corrupt memory.
Jeff Layton Oct. 31, 2022, 10:08 a.m. UTC | #6
On Mon, 2022-10-31 at 02:51 +0000, Chuck Lever III wrote:
> 
> > On Oct 30, 2022, at 5:45 PM, NeilBrown <neilb@suse.de> wrote:
> > 
> > On Sat, 29 Oct 2022, Chuck Lever III wrote:
> > > 
> > > > On Oct 28, 2022, at 2:57 PM, Jeff Layton <jlayton@kernel.org> wrote:
> > > > 
> > > > The list_lru_add and list_lru_del functions use list_empty checks to see
> > > > whether the object is already on the LRU. That's fine in most cases, but
> > > > we occasionally repurpose nf_lru after unhashing. It's possible for an
> > > > LRU removal to remove it from a different list altogether if we lose a
> > > > race.
> 
> Can that issue be resolved by simply adding a "struct list_head nf_dispose"
> field? That might be more straightforward than adding conditional logic.
> 

Yes, though that would take more memory.

> 
> > > I've never seen that happen. lru field re-use is actually used in other
> > > places in the kernel. Shouldn't we try to find and fix such races?
> > > 
> > > Wasn't the idea to reduce the complexity of nfsd_file_put ?
> > > 
> > 
> > I think the nfsd filecache is different from those "other places"
> > because of nfsd_file_close_inode() and related code.  If I understand
> > correctly, nfsd can remove a file from the filecache while it is still
> > in use.
> 
> Not sure about that; I think nfsd_file_close_inode() is invoked only
> when a file is deleted. I could be remembering incorrectly, but that
> seems like a really difficult race to hit.
> 

I think that's correct. We have a notifier set up that tells us when the
link count goes to zero. At that point we want to unhash the thing and
try to get it out of the cache ASAP.

FWIW, the main impetus for this was NFS reexport. Keeping unlinked files
in the cache could cause the reexporting server to do a bunch of
unnecessary silly-renaming.

> > In this case it needs to be unhashed and removed from the lru -
> > and then added to a dispose list - while it might still be active for
> > some IO request.
> > 
> > Prior to Commit 8d0d254b15cc ("nfsd: fix nfsd_file_unhash_and_dispose")
> > unhash_and_dispose() wouldn't add to the dispose list unless the
> > refcount was one.  I'm not sure how racy this test was, but it would
> > mean that it is unlikely for an nfsd_file to be added to the dispose list
> > if it was still in use.
> > 
> > After that commit it seems more likely that a nfsd_file will be added to
> > a dispose list while it is in use.
> 
> After it's linked to a dispose list via nf_lru, list_lru_add won't put
> it on the LRU -- it becomes a no-op because nf_lru is now !empty. I
> think we would have seen LRU corruption pretty quickly. Re-reading
> Jeff's patch description, that might not be the problem he's trying
> to address here.
> 
> But, it would be easy to do some reality testing. I think you could
> add a WARN_ON or tracepoint in nfsd_file_free() or somewhere in the
> dispose-list path to catch an in-use nfsd_file?
> 
> 
> > As we add/remove things to a dispose list without a lock, we need to be
> > careful that no other thread will add the nfsd_file to an lru at the
> > same time.  refcounts will no longer provide that protection.  Maybe we
> > should restore the refcount protection, or maybe we need a bit as Jeff
> > has added here.
> 
> I'm not opposed to defensive changes, in general. This one seems to be
> adding significant complexity without a clear hazard. I'd like to have
> a better understanding of exactly what misbehavior is being addressed.
> 

The real danger is that we could end up trying to remove an entry from
the LRU, when it's actually sitting on a dispose list. Without some way
to distinguish where it is, we could cause memory corruption.
Chuck Lever III Oct. 31, 2022, 1:14 p.m. UTC | #7
> On Oct 31, 2022, at 6:08 AM, Jeff Layton <jlayton@kernel.org> wrote:
> 
> On Mon, 2022-10-31 at 02:51 +0000, Chuck Lever III wrote:
>> 
>>> On Oct 30, 2022, at 5:45 PM, NeilBrown <neilb@suse.de> wrote:
>>> 
>>> On Sat, 29 Oct 2022, Chuck Lever III wrote:
>>>> 
>>>>> On Oct 28, 2022, at 2:57 PM, Jeff Layton <jlayton@kernel.org> wrote:
>>>>> 
>>>>> The list_lru_add and list_lru_del functions use list_empty checks to see
>>>>> whether the object is already on the LRU. That's fine in most cases, but
>>>>> we occasionally repurpose nf_lru after unhashing. It's possible for an
>>>>> LRU removal to remove it from a different list altogether if we lose a
>>>>> race.
>> 
>> Can that issue be resolved by simply adding a "struct list_head nf_dispose"
>> field? That might be more straightforward than adding conditional logic.
>> 
> 
> Yes, though that would take more memory.

Not really. pahole says struct nfsd_file is currently 40 bytes short
of two cache lines. So adding a list_head field should not push the
size of nfsd_file to the point where kmalloc would have to allocate
more memory per object.

I'm wondering if a separate list_head field would help simplify
nfsd_file_put() ?


--
Chuck Lever
Jeff Layton Oct. 31, 2022, 1:28 p.m. UTC | #8
On Mon, 2022-10-31 at 13:14 +0000, Chuck Lever III wrote:
> 
> > On Oct 31, 2022, at 6:08 AM, Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > On Mon, 2022-10-31 at 02:51 +0000, Chuck Lever III wrote:
> > > 
> > > > On Oct 30, 2022, at 5:45 PM, NeilBrown <neilb@suse.de> wrote:
> > > > 
> > > > On Sat, 29 Oct 2022, Chuck Lever III wrote:
> > > > > 
> > > > > > On Oct 28, 2022, at 2:57 PM, Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > 
> > > > > > The list_lru_add and list_lru_del functions use list_empty checks to see
> > > > > > whether the object is already on the LRU. That's fine in most cases, but
> > > > > > we occasionally repurpose nf_lru after unhashing. It's possible for an
> > > > > > LRU removal to remove it from a different list altogether if we lose a
> > > > > > race.
> > > 
> > > Can that issue be resolved by simply adding a "struct list_head nf_dispose"
> > > field? That might be more straightforward than adding conditional logic.
> > > 
> > 
> > Yes, though that would take more memory.
> 
> Not really. pahole says struct nfsd_file is currently 40 bytes short
> of two cache lines. So adding a list_head field should not push the
> size of nfsd_file to the point where kmalloc would have to allocate
> more memory per object.
> 
> I'm wondering if a separate list_head field would help simplify
> nfsd_file_put() ?
> 

Probably not. It wouldn't need the flag anymore, but the logic would
still be roughly the same. We still have to check for the race between
unhashing and adding to the LRU either way.
diff mbox series

Patch

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index d928c5e38eeb..47cdc6129a7b 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -403,18 +403,22 @@  nfsd_file_check_writeback(struct nfsd_file *nf)
 static bool nfsd_file_lru_add(struct nfsd_file *nf)
 {
 	set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
-	if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
-		trace_nfsd_file_lru_add(nf);
-		return true;
+	if (!test_and_set_bit(NFSD_FILE_LRU, &nf->nf_flags)) {
+		if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
+			trace_nfsd_file_lru_add(nf);
+			return true;
+		}
 	}
 	return false;
 }
 
 static bool nfsd_file_lru_remove(struct nfsd_file *nf)
 {
-	if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
-		trace_nfsd_file_lru_del(nf);
-		return true;
+	if (test_and_clear_bit(NFSD_FILE_LRU, &nf->nf_flags)) {
+		if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
+			trace_nfsd_file_lru_del(nf);
+			return true;
+		}
 	}
 	return false;
 }
@@ -469,20 +473,30 @@  nfsd_file_put(struct nfsd_file *nf)
 {
 	trace_nfsd_file_put(nf);
 
-	/*
-	 * The HASHED check is racy. We may end up with the occasional
-	 * unhashed entry on the LRU, but they should get cleaned up
-	 * like any other.
-	 */
 	if (test_bit(NFSD_FILE_GC, &nf->nf_flags) &&
 	    test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
 		/*
-		 * If this is the last reference (nf_ref == 1), then transfer
-		 * it to the LRU. If the add to the LRU fails, just put it as
-		 * usual.
+		 * If this is the last reference (nf_ref == 1), then try to
+		 * transfer it to the LRU.
 		 */
-		if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
+		if (refcount_dec_not_one(&nf->nf_ref))
 			return;
+
+		/* Try to add it to the LRU.  If that fails, decrement. */
+		if (nfsd_file_lru_add(nf)) {
+			/* If it's still hashed, we're done */
+			if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
+				return;
+
+			/*
+			 * We're racing with unhashing, so try to remove it from
+			 * the LRU. If removal fails, then someone else already
+			 * has our reference and we're done. If it succeeds,
+			 * fall through to decrement.
+			 */
+			if (!nfsd_file_lru_remove(nf))
+				return;
+		}
 	}
 	if (refcount_dec_and_test(&nf->nf_ref))
 		nfsd_file_free(nf);
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index b7efb2c3ddb1..e52ab7d5a44c 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -39,6 +39,7 @@  struct nfsd_file {
 #define NFSD_FILE_PENDING	(1)
 #define NFSD_FILE_REFERENCED	(2)
 #define NFSD_FILE_GC		(3)
+#define NFSD_FILE_LRU		(4)	/* file is on LRU */
 	unsigned long		nf_flags;
 	struct inode		*nf_inode;	/* don't deref */
 	refcount_t		nf_ref;