diff mbox series

Leaked nfsd_file due to race condition and early unhash (fs/nfsd/filecache.c)

Message ID CADpNCvYGqA3a51OH=AcqmKyAmnx3yoZjYPo7US+qk-OMX789vA@mail.gmail.com (mailing list archive)
State New
Headers show
Series Leaked nfsd_file due to race condition and early unhash (fs/nfsd/filecache.c) | expand

Commit Message

Youzhong Yang July 3, 2024, 2:12 p.m. UTC
Hello,

I'd like to report a nfsd_file leaking issue and propose a fix for it.

When I tested Linux kernel 6.8 and 6.6, I noticed nfsd_file leaks
which led to undestroyable file systems (zfs), here are some examples:

crash> struct nfsd_file -x ffff88e160db0460
struct nfsd_file {
  nf_rlist = {
    rhead = {
      next = 0xffff8921fa2392f1
    },
    next = 0x0
  },
  nf_inode = 0xffff8882bc312ef8,
  nf_file = 0xffff88e2015b1500,
  nf_cred = 0xffff88e3ab0e7800,
  nf_net = 0xffffffff83d41600 <init_net>,
  nf_flags = 0x8,
  nf_ref = {
    refs = {
      counter = 0xc0000000
    }
  },
  nf_may = 0x4,
  nf_mark = 0xffff88e1bddfb320,
  nf_lru = {
    next = 0xffff88e160db04a8,
    prev = 0xffff88e160db04a8
  },
  nf_rcu = {
    next = 0x10000000000,
    func = 0x0
  },
  nf_birthtime = 0x73d22fc1728
}

crash> struct nfsd_file.nf_flags,nf_ref.refs.counter,nf_lru,nf_file -x
ffff88839a53d850
  nf_flags = 0x8,
  nf_ref.refs.counter = 0x0
  nf_lru = {
    next = 0xffff88839a53d898,
    prev = 0xffff88839a53d898
  },
  nf_file = 0xffff88810ede8700,

crash> struct nfsd_file.nf_flags,nf_ref.refs.counter,nf_lru,nf_file -x
ffff88c32b11e850
  nf_flags = 0x8,
  nf_ref.refs.counter = 0x0
  nf_lru = {
    next = 0xffff88c32b11e898,
    prev = 0xffff88c32b11e898
  },
  nf_file = 0xffff88c20a701c00,

crash> struct nfsd_file.nf_flags,nf_ref.refs.counter,nf_lru,nf_file -x
ffff88e372709700
  nf_flags = 0xc,
  nf_ref.refs.counter = 0x0
  nf_lru = {
    next = 0xffff88e372709748,
    prev = 0xffff88e372709748
  },
  nf_file = 0xffff88e0725e6400,

crash> struct nfsd_file.nf_flags,nf_ref.refs.counter,nf_lru,nf_file -x
ffff8982864944d0
  nf_flags = 0xc,
  nf_ref.refs.counter = 0x0
  nf_lru = {
    next = 0xffff898286494518,
    prev = 0xffff898286494518
  },
  nf_file = 0xffff89803c0ff700,

The leak occurs when nfsd_file_put() races with nfsd_file_cond_queue()
or nfsd_file_lru_cb(). With the following patch, I haven't observed
any leak after a few days heavy nfs load:

  /* If we can't get a reference, ignore it */
@@ -590,6 +601,7 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct
list_head *dispose)
  /* If refcount goes to 0, then put on the dispose list */
  if (refcount_sub_and_test(decrement, &nf->nf_ref)) {
  list_add(&nf->nf_lru, dispose);
+ nfsd_file_unhash(nf);
  trace_nfsd_file_closing(nf);
  }
 }

Please kindly review the patch and let me know if it makes sense.

Thanks,

-Youzhong

Comments

Chuck Lever July 3, 2024, 6:20 p.m. UTC | #1
On Wed, Jul 03, 2024 at 10:12:33AM -0400, Youzhong Yang wrote:
> Hello,
> 
> I'd like to report a nfsd_file leaking issue and propose a fix for it.
> 
> When I tested Linux kernel 6.8 and 6.6, I noticed nfsd_file leaks
> which led to undestroyable file systems (zfs),

Thanks for the report. Some initial comments:

- Do you have a specific reproducer? In other words, what is the
  simplest program that can run on an NFS client that will trigger
  this leak, and can you post it?

- "zfs" is an out-of-tree file system, so it's not directly
  supported for NFSD.

- The guidelines for patch submission require us to fix issues in
  upstream Linux first (currently that's v6.10-rc6). Then that fix
  can be backported to older stable kernels like 6.6.

Can you reproduce the leak with one of the in-kernel filesystems
(either xfs or btrfs would be great) and with NFSD in 6.10-rc6?

One more comment below.


> here are some examples:
> 
> crash> struct nfsd_file -x ffff88e160db0460
> struct nfsd_file {
>   nf_rlist = {
>     rhead = {
>       next = 0xffff8921fa2392f1
>     },
>     next = 0x0
>   },
>   nf_inode = 0xffff8882bc312ef8,
>   nf_file = 0xffff88e2015b1500,
>   nf_cred = 0xffff88e3ab0e7800,
>   nf_net = 0xffffffff83d41600 <init_net>,
>   nf_flags = 0x8,
>   nf_ref = {
>     refs = {
>       counter = 0xc0000000
>     }
>   },
>   nf_may = 0x4,
>   nf_mark = 0xffff88e1bddfb320,
>   nf_lru = {
>     next = 0xffff88e160db04a8,
>     prev = 0xffff88e160db04a8
>   },
>   nf_rcu = {
>     next = 0x10000000000,
>     func = 0x0
>   },
>   nf_birthtime = 0x73d22fc1728
> }
> 
> crash> struct nfsd_file.nf_flags,nf_ref.refs.counter,nf_lru,nf_file -x
> ffff88839a53d850
>   nf_flags = 0x8,
>   nf_ref.refs.counter = 0x0
>   nf_lru = {
>     next = 0xffff88839a53d898,
>     prev = 0xffff88839a53d898
>   },
>   nf_file = 0xffff88810ede8700,
> 
> crash> struct nfsd_file.nf_flags,nf_ref.refs.counter,nf_lru,nf_file -x
> ffff88c32b11e850
>   nf_flags = 0x8,
>   nf_ref.refs.counter = 0x0
>   nf_lru = {
>     next = 0xffff88c32b11e898,
>     prev = 0xffff88c32b11e898
>   },
>   nf_file = 0xffff88c20a701c00,
> 
> crash> struct nfsd_file.nf_flags,nf_ref.refs.counter,nf_lru,nf_file -x
> ffff88e372709700
>   nf_flags = 0xc,
>   nf_ref.refs.counter = 0x0
>   nf_lru = {
>     next = 0xffff88e372709748,
>     prev = 0xffff88e372709748
>   },
>   nf_file = 0xffff88e0725e6400,
> 
> crash> struct nfsd_file.nf_flags,nf_ref.refs.counter,nf_lru,nf_file -x
> ffff8982864944d0
>   nf_flags = 0xc,
>   nf_ref.refs.counter = 0x0
>   nf_lru = {
>     next = 0xffff898286494518,
>     prev = 0xffff898286494518
>   },
>   nf_file = 0xffff89803c0ff700,
> 
> The leak occurs when nfsd_file_put() races with nfsd_file_cond_queue()
> or nfsd_file_lru_cb(). With the following patch, I haven't observed
> any leak after a few days heavy nfs load:

Our patch submission guidelines require a Signed-off-by:
line at the end of the patch description. See the "Sign your work -
the Developer's Certificate of Origin" section of

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/submitting-patches.rst?h=v6.10-rc6

(Needed here in case your fix is acceptable).


> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index 1a6d5d000b85..2323829f7208 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -389,6 +389,17 @@ nfsd_file_put(struct nfsd_file *nf)
>   if (!nfsd_file_lru_remove(nf))
>   return;
>   }
> + /*
> + * Racing with nfsd_file_cond_queue() or nfsd_file_lru_cb(),
> + * it's unhashed but then removed from the dispose list,
> + * so we need to free it.
> + */
> + if (refcount_read(&nf->nf_ref) == 0 &&
> +     !test_bit(NFSD_FILE_HASHED, &nf->nf_flags) &&
> +     list_empty(&nf->nf_lru)) {
> + nfsd_file_free(nf);
> + return;
> + }
>   }
>   if (refcount_dec_and_test(&nf->nf_ref))
>   nfsd_file_free(nf);
> @@ -576,7 +587,7 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct
> list_head *dispose)
>   int decrement = 1;
> 
>   /* If we raced with someone else unhashing, ignore it */
> - if (!nfsd_file_unhash(nf))
> + if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
>   return;
> 
>   /* If we can't get a reference, ignore it */
> @@ -590,6 +601,7 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct
> list_head *dispose)
>   /* If refcount goes to 0, then put on the dispose list */
>   if (refcount_sub_and_test(decrement, &nf->nf_ref)) {
>   list_add(&nf->nf_lru, dispose);
> + nfsd_file_unhash(nf);
>   trace_nfsd_file_closing(nf);
>   }
>  }
> 
> Please kindly review the patch and let me know if it makes sense.
> 
> Thanks,
> 
> -Youzhong
>
Youzhong Yang July 3, 2024, 8:46 p.m. UTC | #2
Thank you Chuck. Here are my quick answers to your comments:

- I don't have a quick reproducer. I reproduced it by using hundreds
of nfs clients generating +600K ops under our workload in the testing
environment. Theoretically it should be possible to simplify the
reproduction but I am still working on it.

-  I understand zfs is an out-of-tree file system. That's fine. But
this leaking can happen to any file system, and leaking is not a good
thing no matter what file system it is.

-  I will try to come up with a reproducer using xfs or btrfs if possible.

Now back to the problem itself, here are my findings:

- nfsd_file_put() in one thread can race with another thread doing
garbage collection (running nfsd_file_gc() -> list_lru_walk() ->
nfsd_file_lru_cb()):

  * In nfsd_file_put(), nf->nf_ref is 1, so it tries to do nfsd_file_lru_add().
  * nfsd_file_lru_add() returns true (thus NFSD_FILE_REFERENCED bit
set for nf->nf_flags)
  * garbage collector kicks in, nfsd_file_lru_cb() clears REFERENCED
bit and returns LRU_ROTATE.
  * garbage collector kicks in again, nfsd_file_lru_cb() now
decrements nf->nf_ref to 0, runs nfsd_file_unhash(), removes it from
the LRU and adds to the dispose list [list_lru_isolate_move(lru,
&nf->nf_lru, head);]
  * nfsd_file_put() detects NFSD_FILE_HASHED bit is cleared, so it
tries to remove the 'nf' from the LRU [if (!nfsd_file_lru_remove(nf))]
  * The 'nf' has been added to the 'dispose' list by
nfsd_file_lru_cb(), so nfsd_file_lru_remove(nf) simply treats it as
part of the LRU and removes it, which leads it to be removed from the
'dispose' list.
  * At this moment, nf->nf_ref is 0, it's unhashed, and not on the
LRU. nfsd_file_put() continues its execution [if
(refcount_dec_and_test(&nf->nf_ref))], as nf->nf_ref is already 0, now
bad thing happens: nf->nf_ref is set to REFCOUNT_SATURATED, and the
'nf' is leaked.

To make this happen, the right timing is crucial. It can be reproduced
by adding artifical delays in filecache.c, or hammering the nfsd with
tons of ops.

- Let's see how nfsd_file_put() can race with nfsd_file_cond_queue():
  * In nfsd_file_put(), nf->nf_ref is 1, so it tries to do nfsd_file_lru_add().
  * nfsd_file_lru_add() sets REFERENCED bit and returns true.
  * 'exportfs -f' or something like that triggers
__nfsd_file_cache_purge() -> nfsd_file_cond_queue().
  * In nfsd_file_cond_queue(), it runs [if (!nfsd_file_unhash(nf))],
unhash is done successfully.
  * nfsd_file_cond_queue() runs [if (!nfsd_file_get(nf))], now
nf->nf_ref goes to 2.
  * nfsd_file_cond_queue() runs [if (nfsd_file_lru_remove(nf))], it succeeds.
  * nfsd_file_cond_queue() runs [if (refcount_sub_and_test(decrement,
&nf->nf_ref))] (with "decrement" being 2), so the nf->nf_ref goes to
0, the 'nf' is added to the dispost list [list_add(&nf->nf_lru,
dispose)]
  * nfsd_file_put() detects NFSD_FILE_HASHED bit is cleared, so it
tries to remove the 'nf' from the LRU [if
(!nfsd_file_lru_remove(nf))], although the 'nf' is not in the LRU, but
it is linked in the 'dispose' list, nfsd_file_lru_remove() simply
treats it as part of the LRU and removes it. This leads to its removal
from the 'dispose' list!
  * Now nf->ref is 0, unhashed. nfsd_file_put() continues its
execution and sets nf->nf_ref to REFCOUNT_SATURATED.

The purpose of nf->nf_lru is problematic. As you can see, it is used
for the LRU list, and also the 'dispose' list. Adding another 'struct
list_head' specifically for the 'dispose' list seems to be a better
way of fixing this race condition. Either way works for me.

Would you agree my above analysis makes sense? Thanks.

Here is my patch with signed-off-by:

From: Youzhong Yang <youzhong@gmail.com>
Date: Mon, 1 Jul 2024 06:45:22 -0400
Subject: [PATCH] nfsd: fix nfsd_file leaking due to race condition and early
 unhash

Signed-off-by: Youzhong Yang <youzhong@gmail.com>
---
 fs/nfsd/filecache.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 1a6d5d000b85..2323829f7208 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -389,6 +389,17 @@ nfsd_file_put(struct nfsd_file *nf)
                        if (!nfsd_file_lru_remove(nf))
                                return;
                }
+               /*
+                * Racing with nfsd_file_cond_queue() or nfsd_file_lru_cb(),
+                * it's unhashed but then removed from the dispose list,
+                * so we need to free it.
+                */
+               if (refcount_read(&nf->nf_ref) == 0 &&
+                   !test_bit(NFSD_FILE_HASHED, &nf->nf_flags) &&
+                   list_empty(&nf->nf_lru)) {
+                       nfsd_file_free(nf);
+                       return;
+               }
        }
        if (refcount_dec_and_test(&nf->nf_ref))
                nfsd_file_free(nf);
@@ -576,7 +587,7 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct
list_head *dispose)
        int decrement = 1;

        /* If we raced with someone else unhashing, ignore it */
-       if (!nfsd_file_unhash(nf))
+       if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
                return;

        /* If we can't get a reference, ignore it */
@@ -590,6 +601,7 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct
list_head *dispose)
        /* If refcount goes to 0, then put on the dispose list */
        if (refcount_sub_and_test(decrement, &nf->nf_ref)) {
                list_add(&nf->nf_lru, dispose);
+               nfsd_file_unhash(nf);
                trace_nfsd_file_closing(nf);
        }
 }
--
2.43.0

On Wed, Jul 3, 2024 at 2:21 PM Chuck Lever <chuck.lever@oracle.com> wrote:
>
> On Wed, Jul 03, 2024 at 10:12:33AM -0400, Youzhong Yang wrote:
> > Hello,
> >
> > I'd like to report a nfsd_file leaking issue and propose a fix for it.
> >
> > When I tested Linux kernel 6.8 and 6.6, I noticed nfsd_file leaks
> > which led to undestroyable file systems (zfs),
>
> Thanks for the report. Some initial comments:
>
> - Do you have a specific reproducer? In other words, what is the
>   simplest program that can run on an NFS client that will trigger
>   this leak, and can you post it?
>
> - "zfs" is an out-of-tree file system, so it's not directly
>   supported for NFSD.
>
> - The guidelines for patch submission require us to fix issues in
>   upstream Linux first (currently that's v6.10-rc6). Then that fix
>   can be backported to older stable kernels like 6.6.
>
> Can you reproduce the leak with one of the in-kernel filesystems
> (either xfs or btrfs would be great) and with NFSD in 6.10-rc6?
>
> One more comment below.
>
>
> > here are some examples:
> >
> > crash> struct nfsd_file -x ffff88e160db0460
> > struct nfsd_file {
> >   nf_rlist = {
> >     rhead = {
> >       next = 0xffff8921fa2392f1
> >     },
> >     next = 0x0
> >   },
> >   nf_inode = 0xffff8882bc312ef8,
> >   nf_file = 0xffff88e2015b1500,
> >   nf_cred = 0xffff88e3ab0e7800,
> >   nf_net = 0xffffffff83d41600 <init_net>,
> >   nf_flags = 0x8,
> >   nf_ref = {
> >     refs = {
> >       counter = 0xc0000000
> >     }
> >   },
> >   nf_may = 0x4,
> >   nf_mark = 0xffff88e1bddfb320,
> >   nf_lru = {
> >     next = 0xffff88e160db04a8,
> >     prev = 0xffff88e160db04a8
> >   },
> >   nf_rcu = {
> >     next = 0x10000000000,
> >     func = 0x0
> >   },
> >   nf_birthtime = 0x73d22fc1728
> > }
> >
> > crash> struct nfsd_file.nf_flags,nf_ref.refs.counter,nf_lru,nf_file -x
> > ffff88839a53d850
> >   nf_flags = 0x8,
> >   nf_ref.refs.counter = 0x0
> >   nf_lru = {
> >     next = 0xffff88839a53d898,
> >     prev = 0xffff88839a53d898
> >   },
> >   nf_file = 0xffff88810ede8700,
> >
> > crash> struct nfsd_file.nf_flags,nf_ref.refs.counter,nf_lru,nf_file -x
> > ffff88c32b11e850
> >   nf_flags = 0x8,
> >   nf_ref.refs.counter = 0x0
> >   nf_lru = {
> >     next = 0xffff88c32b11e898,
> >     prev = 0xffff88c32b11e898
> >   },
> >   nf_file = 0xffff88c20a701c00,
> >
> > crash> struct nfsd_file.nf_flags,nf_ref.refs.counter,nf_lru,nf_file -x
> > ffff88e372709700
> >   nf_flags = 0xc,
> >   nf_ref.refs.counter = 0x0
> >   nf_lru = {
> >     next = 0xffff88e372709748,
> >     prev = 0xffff88e372709748
> >   },
> >   nf_file = 0xffff88e0725e6400,
> >
> > crash> struct nfsd_file.nf_flags,nf_ref.refs.counter,nf_lru,nf_file -x
> > ffff8982864944d0
> >   nf_flags = 0xc,
> >   nf_ref.refs.counter = 0x0
> >   nf_lru = {
> >     next = 0xffff898286494518,
> >     prev = 0xffff898286494518
> >   },
> >   nf_file = 0xffff89803c0ff700,
> >
> > The leak occurs when nfsd_file_put() races with nfsd_file_cond_queue()
> > or nfsd_file_lru_cb(). With the following patch, I haven't observed
> > any leak after a few days heavy nfs load:
>
> Our patch submission guidelines require a Signed-off-by:
> line at the end of the patch description. See the "Sign your work -
> the Developer's Certificate of Origin" section of
>
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/submitting-patches.rst?h=v6.10-rc6
>
> (Needed here in case your fix is acceptable).
>
>
> > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > index 1a6d5d000b85..2323829f7208 100644
> > --- a/fs/nfsd/filecache.c
> > +++ b/fs/nfsd/filecache.c
> > @@ -389,6 +389,17 @@ nfsd_file_put(struct nfsd_file *nf)
> >   if (!nfsd_file_lru_remove(nf))
> >   return;
> >   }
> > + /*
> > + * Racing with nfsd_file_cond_queue() or nfsd_file_lru_cb(),
> > + * it's unhashed but then removed from the dispose list,
> > + * so we need to free it.
> > + */
> > + if (refcount_read(&nf->nf_ref) == 0 &&
> > +     !test_bit(NFSD_FILE_HASHED, &nf->nf_flags) &&
> > +     list_empty(&nf->nf_lru)) {
> > + nfsd_file_free(nf);
> > + return;
> > + }
> >   }
> >   if (refcount_dec_and_test(&nf->nf_ref))
> >   nfsd_file_free(nf);
> > @@ -576,7 +587,7 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct
> > list_head *dispose)
> >   int decrement = 1;
> >
> >   /* If we raced with someone else unhashing, ignore it */
> > - if (!nfsd_file_unhash(nf))
> > + if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
> >   return;
> >
> >   /* If we can't get a reference, ignore it */
> > @@ -590,6 +601,7 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct
> > list_head *dispose)
> >   /* If refcount goes to 0, then put on the dispose list */
> >   if (refcount_sub_and_test(decrement, &nf->nf_ref)) {
> >   list_add(&nf->nf_lru, dispose);
> > + nfsd_file_unhash(nf);
> >   trace_nfsd_file_closing(nf);
> >   }
> >  }
> >
> > Please kindly review the patch and let me know if it makes sense.
> >
> > Thanks,
> >
> > -Youzhong
> >
>
> --
> Chuck Lever
Chuck Lever July 3, 2024, 8:57 p.m. UTC | #3
> On Jul 3, 2024, at 4:46 PM, Youzhong Yang <youzhong@gmail.com> wrote:
> 
> Thank you Chuck. Here are my quick answers to your comments:
> 
> - I don't have a quick reproducer. I reproduced it by using hundreds
> of nfs clients generating +600K ops under our workload in the testing
> environment. Theoretically it should be possible to simplify the
> reproduction but I am still working on it.
> 
> -  I understand zfs is an out-of-tree file system. That's fine. But
> this leaking can happen to any file system, and leaking is not a good
> thing no matter what file system it is.

No doubt, but we need to establish that this is not an issue
in the Linux port of zfs.


> -  I will try to come up with a reproducer using xfs or btrfs if possible.

Thanks. A reliable reproducer will demonstrate very clearly that
the bug is addressed when the (final) fix has been applied.


> Now back to the problem itself, here are my findings:
> 
> - nfsd_file_put() in one thread can race with another thread doing
> garbage collection (running nfsd_file_gc() -> list_lru_walk() ->
> nfsd_file_lru_cb()):
> 
>  * In nfsd_file_put(), nf->nf_ref is 1, so it tries to do nfsd_file_lru_add().
>  * nfsd_file_lru_add() returns true (thus NFSD_FILE_REFERENCED bit
> set for nf->nf_flags)
>  * garbage collector kicks in, nfsd_file_lru_cb() clears REFERENCED
> bit and returns LRU_ROTATE.
>  * garbage collector kicks in again, nfsd_file_lru_cb() now
> decrements nf->nf_ref to 0, runs nfsd_file_unhash(), removes it from
> the LRU and adds to the dispose list [list_lru_isolate_move(lru,
> &nf->nf_lru, head);]
>  * nfsd_file_put() detects NFSD_FILE_HASHED bit is cleared, so it
> tries to remove the 'nf' from the LRU [if (!nfsd_file_lru_remove(nf))]
>  * The 'nf' has been added to the 'dispose' list by
> nfsd_file_lru_cb(), so nfsd_file_lru_remove(nf) simply treats it as
> part of the LRU and removes it, which leads it to be removed from the
> 'dispose' list.
>  * At this moment, nf->nf_ref is 0, it's unhashed, and not on the
> LRU. nfsd_file_put() continues its execution [if
> (refcount_dec_and_test(&nf->nf_ref))], as nf->nf_ref is already 0, now
> bad thing happens: nf->nf_ref is set to REFCOUNT_SATURATED, and the
> 'nf' is leaked.
> 
> To make this happen, the right timing is crucial. It can be reproduced
> by adding artifical delays in filecache.c, or hammering the nfsd with
> tons of ops.
> 
> - Let's see how nfsd_file_put() can race with nfsd_file_cond_queue():
>  * In nfsd_file_put(), nf->nf_ref is 1, so it tries to do nfsd_file_lru_add().
>  * nfsd_file_lru_add() sets REFERENCED bit and returns true.
>  * 'exportfs -f' or something like that triggers
> __nfsd_file_cache_purge() -> nfsd_file_cond_queue().
>  * In nfsd_file_cond_queue(), it runs [if (!nfsd_file_unhash(nf))],
> unhash is done successfully.
>  * nfsd_file_cond_queue() runs [if (!nfsd_file_get(nf))], now
> nf->nf_ref goes to 2.
>  * nfsd_file_cond_queue() runs [if (nfsd_file_lru_remove(nf))], it succeeds.
>  * nfsd_file_cond_queue() runs [if (refcount_sub_and_test(decrement,
> &nf->nf_ref))] (with "decrement" being 2), so the nf->nf_ref goes to
> 0, the 'nf' is added to the dispost list [list_add(&nf->nf_lru,
> dispose)]
>  * nfsd_file_put() detects NFSD_FILE_HASHED bit is cleared, so it
> tries to remove the 'nf' from the LRU [if
> (!nfsd_file_lru_remove(nf))], although the 'nf' is not in the LRU, but
> it is linked in the 'dispose' list, nfsd_file_lru_remove() simply
> treats it as part of the LRU and removes it. This leads to its removal
> from the 'dispose' list!
>  * Now nf->ref is 0, unhashed. nfsd_file_put() continues its
> execution and sets nf->nf_ref to REFCOUNT_SATURATED.
> 
> The purpose of nf->nf_lru is problematic. As you can see, it is used
> for the LRU list, and also the 'dispose' list. Adding another 'struct
> list_head' specifically for the 'dispose' list seems to be a better
> way of fixing this race condition. Either way works for me.

Generally this double-use of nf_lru has been determined to be
safe. If there is a bug in the list manipulation logic, it will
still be there if there are two list_head fields; it will simply
be masked.


> Would you agree my above analysis makes sense? Thanks.

Jeff is the expert in this area. I will also study it.


> Here is my patch with signed-off-by:
> 
> From: Youzhong Yang <youzhong@gmail.com>
> Date: Mon, 1 Jul 2024 06:45:22 -0400
> Subject: [PATCH] nfsd: fix nfsd_file leaking due to race condition and early
> unhash

Much better, but you also need a patch description. Two
paragraphs that explain the bug and the proposed solution
should be adequate.


> Signed-off-by: Youzhong Yang <youzhong@gmail.com>
> ---
> fs/nfsd/filecache.c | 14 +++++++++++++-
> 1 file changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index 1a6d5d000b85..2323829f7208 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -389,6 +389,17 @@ nfsd_file_put(struct nfsd_file *nf)
>                        if (!nfsd_file_lru_remove(nf))
>                                return;
>                }
> +               /*
> +                * Racing with nfsd_file_cond_queue() or nfsd_file_lru_cb(),
> +                * it's unhashed but then removed from the dispose list,
> +                * so we need to free it.
> +                */
> +               if (refcount_read(&nf->nf_ref) == 0 &&
> +                   !test_bit(NFSD_FILE_HASHED, &nf->nf_flags) &&
> +                   list_empty(&nf->nf_lru)) {
> +                       nfsd_file_free(nf);
> +                       return;
> +               }
>        }
>        if (refcount_dec_and_test(&nf->nf_ref))
>                nfsd_file_free(nf);
> @@ -576,7 +587,7 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct
> list_head *dispose)
>        int decrement = 1;
> 
>        /* If we raced with someone else unhashing, ignore it */
> -       if (!nfsd_file_unhash(nf))
> +       if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
>                return;
> 
>        /* If we can't get a reference, ignore it */
> @@ -590,6 +601,7 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct
> list_head *dispose)
>        /* If refcount goes to 0, then put on the dispose list */
>        if (refcount_sub_and_test(decrement, &nf->nf_ref)) {
>                list_add(&nf->nf_lru, dispose);
> +               nfsd_file_unhash(nf);
>                trace_nfsd_file_closing(nf);
>        }
> }
> --
> 2.43.0
> 
> On Wed, Jul 3, 2024 at 2:21 PM Chuck Lever <chuck.lever@oracle.com> wrote:
>> 
>> On Wed, Jul 03, 2024 at 10:12:33AM -0400, Youzhong Yang wrote:
>>> Hello,
>>> 
>>> I'd like to report a nfsd_file leaking issue and propose a fix for it.
>>> 
>>> When I tested Linux kernel 6.8 and 6.6, I noticed nfsd_file leaks
>>> which led to undestroyable file systems (zfs),
>> 
>> Thanks for the report. Some initial comments:
>> 
>> - Do you have a specific reproducer? In other words, what is the
>>  simplest program that can run on an NFS client that will trigger
>>  this leak, and can you post it?
>> 
>> - "zfs" is an out-of-tree file system, so it's not directly
>>  supported for NFSD.
>> 
>> - The guidelines for patch submission require us to fix issues in
>>  upstream Linux first (currently that's v6.10-rc6). Then that fix
>>  can be backported to older stable kernels like 6.6.
>> 
>> Can you reproduce the leak with one of the in-kernel filesystems
>> (either xfs or btrfs would be great) and with NFSD in 6.10-rc6?
>> 
>> One more comment below.
>> 
>> 
>>> here are some examples:
>>> 
>>> crash> struct nfsd_file -x ffff88e160db0460
>>> struct nfsd_file {
>>>  nf_rlist = {
>>>    rhead = {
>>>      next = 0xffff8921fa2392f1
>>>    },
>>>    next = 0x0
>>>  },
>>>  nf_inode = 0xffff8882bc312ef8,
>>>  nf_file = 0xffff88e2015b1500,
>>>  nf_cred = 0xffff88e3ab0e7800,
>>>  nf_net = 0xffffffff83d41600 <init_net>,
>>>  nf_flags = 0x8,
>>>  nf_ref = {
>>>    refs = {
>>>      counter = 0xc0000000
>>>    }
>>>  },
>>>  nf_may = 0x4,
>>>  nf_mark = 0xffff88e1bddfb320,
>>>  nf_lru = {
>>>    next = 0xffff88e160db04a8,
>>>    prev = 0xffff88e160db04a8
>>>  },
>>>  nf_rcu = {
>>>    next = 0x10000000000,
>>>    func = 0x0
>>>  },
>>>  nf_birthtime = 0x73d22fc1728
>>> }
>>> 
>>> crash> struct nfsd_file.nf_flags,nf_ref.refs.counter,nf_lru,nf_file -x
>>> ffff88839a53d850
>>>  nf_flags = 0x8,
>>>  nf_ref.refs.counter = 0x0
>>>  nf_lru = {
>>>    next = 0xffff88839a53d898,
>>>    prev = 0xffff88839a53d898
>>>  },
>>>  nf_file = 0xffff88810ede8700,
>>> 
>>> crash> struct nfsd_file.nf_flags,nf_ref.refs.counter,nf_lru,nf_file -x
>>> ffff88c32b11e850
>>>  nf_flags = 0x8,
>>>  nf_ref.refs.counter = 0x0
>>>  nf_lru = {
>>>    next = 0xffff88c32b11e898,
>>>    prev = 0xffff88c32b11e898
>>>  },
>>>  nf_file = 0xffff88c20a701c00,
>>> 
>>> crash> struct nfsd_file.nf_flags,nf_ref.refs.counter,nf_lru,nf_file -x
>>> ffff88e372709700
>>>  nf_flags = 0xc,
>>>  nf_ref.refs.counter = 0x0
>>>  nf_lru = {
>>>    next = 0xffff88e372709748,
>>>    prev = 0xffff88e372709748
>>>  },
>>>  nf_file = 0xffff88e0725e6400,
>>> 
>>> crash> struct nfsd_file.nf_flags,nf_ref.refs.counter,nf_lru,nf_file -x
>>> ffff8982864944d0
>>>  nf_flags = 0xc,
>>>  nf_ref.refs.counter = 0x0
>>>  nf_lru = {
>>>    next = 0xffff898286494518,
>>>    prev = 0xffff898286494518
>>>  },
>>>  nf_file = 0xffff89803c0ff700,
>>> 
>>> The leak occurs when nfsd_file_put() races with nfsd_file_cond_queue()
>>> or nfsd_file_lru_cb(). With the following patch, I haven't observed
>>> any leak after a few days heavy nfs load:
>> 
>> Our patch submission guidelines require a Signed-off-by:
>> line at the end of the patch description. See the "Sign your work -
>> the Developer's Certificate of Origin" section of
>> 
>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/submitting-patches.rst?h=v6.10-rc6
>> 
>> (Needed here in case your fix is acceptable).
>> 
>> 
>>> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
>>> index 1a6d5d000b85..2323829f7208 100644
>>> --- a/fs/nfsd/filecache.c
>>> +++ b/fs/nfsd/filecache.c
>>> @@ -389,6 +389,17 @@ nfsd_file_put(struct nfsd_file *nf)
>>>  if (!nfsd_file_lru_remove(nf))
>>>  return;
>>>  }
>>> + /*
>>> + * Racing with nfsd_file_cond_queue() or nfsd_file_lru_cb(),
>>> + * it's unhashed but then removed from the dispose list,
>>> + * so we need to free it.
>>> + */
>>> + if (refcount_read(&nf->nf_ref) == 0 &&
>>> +     !test_bit(NFSD_FILE_HASHED, &nf->nf_flags) &&
>>> +     list_empty(&nf->nf_lru)) {
>>> + nfsd_file_free(nf);
>>> + return;
>>> + }
>>>  }
>>>  if (refcount_dec_and_test(&nf->nf_ref))
>>>  nfsd_file_free(nf);
>>> @@ -576,7 +587,7 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct
>>> list_head *dispose)
>>>  int decrement = 1;
>>> 
>>>  /* If we raced with someone else unhashing, ignore it */
>>> - if (!nfsd_file_unhash(nf))
>>> + if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
>>>  return;
>>> 
>>>  /* If we can't get a reference, ignore it */
>>> @@ -590,6 +601,7 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct
>>> list_head *dispose)
>>>  /* If refcount goes to 0, then put on the dispose list */
>>>  if (refcount_sub_and_test(decrement, &nf->nf_ref)) {
>>>  list_add(&nf->nf_lru, dispose);
>>> + nfsd_file_unhash(nf);
>>>  trace_nfsd_file_closing(nf);
>>>  }
>>> }
>>> 
>>> Please kindly review the patch and let me know if it makes sense.
>>> 
>>> Thanks,
>>> 
>>> -Youzhong
>>> 
>> 
>> --
>> Chuck Lever

--
Chuck Lever
Jeff Layton July 4, 2024, 11:14 a.m. UTC | #4
On Wed, 2024-07-03 at 16:46 -0400, Youzhong Yang wrote:
> Thank you Chuck. Here are my quick answers to your comments:
> 
> - I don't have a quick reproducer. I reproduced it by using hundreds
> of nfs clients generating +600K ops under our workload in the testing
> environment. Theoretically it should be possible to simplify the
> reproduction but I am still working on it.
> 
> -  I understand zfs is an out-of-tree file system. That's fine. But
> this leaking can happen to any file system, and leaking is not a good
> thing no matter what file system it is.
> 
> -  I will try to come up with a reproducer using xfs or btrfs if possible.
> 
> Now back to the problem itself, here are my findings:
> 
> - nfsd_file_put() in one thread can race with another thread doing
> garbage collection (running nfsd_file_gc() -> list_lru_walk() ->
> nfsd_file_lru_cb()):
> 
>   * In nfsd_file_put(), nf->nf_ref is 1, so it tries to do nfsd_file_lru_add().
>   * nfsd_file_lru_add() returns true (thus NFSD_FILE_REFERENCED bit
> set for nf->nf_flags)
>   * garbage collector kicks in, nfsd_file_lru_cb() clears REFERENCED
> bit and returns LRU_ROTATE.
>   * garbage collector kicks in again, nfsd_file_lru_cb() now
> decrements nf->nf_ref to 0, runs nfsd_file_unhash(), removes it from
> the LRU and adds to the dispose list [list_lru_isolate_move(lru,
> &nf->nf_lru, head);]
>   * nfsd_file_put() detects NFSD_FILE_HASHED bit is cleared, so it
> tries to remove the 'nf' from the LRU [if (!nfsd_file_lru_remove(nf))]
>   * The 'nf' has been added to the 'dispose' list by
> nfsd_file_lru_cb(), so nfsd_file_lru_remove(nf) simply treats it as
> part of the LRU and removes it, which leads it to be removed from the
> 'dispose' list.
>   * At this moment, nf->nf_ref is 0, it's unhashed, and not on the
> LRU. nfsd_file_put() continues its execution [if
> (refcount_dec_and_test(&nf->nf_ref))], as nf->nf_ref is already 0, now
> bad thing happens: nf->nf_ref is set to REFCOUNT_SATURATED, and the
> 'nf' is leaked.
> 
> To make this happen, the right timing is crucial. It can be reproduced
> by adding artifical delays in filecache.c, or hammering the nfsd with
> tons of ops.
> 
> - Let's see how nfsd_file_put() can race with nfsd_file_cond_queue():
>   * In nfsd_file_put(), nf->nf_ref is 1, so it tries to do nfsd_file_lru_add().
>   * nfsd_file_lru_add() sets REFERENCED bit and returns true.
>   * 'exportfs -f' or something like that triggers
> __nfsd_file_cache_purge() -> nfsd_file_cond_queue().
>   * In nfsd_file_cond_queue(), it runs [if (!nfsd_file_unhash(nf))],
> unhash is done successfully.
>   * nfsd_file_cond_queue() runs [if (!nfsd_file_get(nf))], now
> nf->nf_ref goes to 2.
>   * nfsd_file_cond_queue() runs [if (nfsd_file_lru_remove(nf))], it succeeds.
>   * nfsd_file_cond_queue() runs [if (refcount_sub_and_test(decrement,
> &nf->nf_ref))] (with "decrement" being 2), so the nf->nf_ref goes to
> 0, the 'nf' is added to the dispost list [list_add(&nf->nf_lru,
> dispose)]
>   * nfsd_file_put() detects NFSD_FILE_HASHED bit is cleared, so it
> tries to remove the 'nf' from the LRU [if
> (!nfsd_file_lru_remove(nf))], although the 'nf' is not in the LRU, but
> it is linked in the 'dispose' list, nfsd_file_lru_remove() simply
> treats it as part of the LRU and removes it. This leads to its removal
> from the 'dispose' list!
>   * Now nf->ref is 0, unhashed. nfsd_file_put() continues its
> execution and sets nf->nf_ref to REFCOUNT_SATURATED.
> 
> The purpose of nf->nf_lru is problematic. As you can see, it is used
> for the LRU list, and also the 'dispose' list. Adding another 'struct
> list_head' specifically for the 'dispose' list seems to be a better
> way of fixing this race condition. Either way works for me.
> 
> Would you agree my above analysis makes sense? Thanks.
> 

I think so. It's been a while since I've done much work in this code,
but it does sound like there is a race in the LRU handling.


Like Chuck said, the nf->nf_lru list should be safe to use for multiple
purposes, but that's only the case if we're not using that list as an
indicator.

The list_lru code does check this:

    if (!list_empty(item)) {

...so if we ever check this while it's sitting on the dispose list, it
will handle it incorrectly. It sounds like that's the root cause of the
problem you're seeing?

If so, then maybe a separate list_head for disposal would be better.

> Here is my patch with signed-off-by:
> 
> From: Youzhong Yang <youzhong@gmail.com>
> Date: Mon, 1 Jul 2024 06:45:22 -0400
> Subject: [PATCH] nfsd: fix nfsd_file leaking due to race condition and early
>  unhash
> 
> Signed-off-by: Youzhong Yang <youzhong@gmail.com>
> ---
>  fs/nfsd/filecache.c | 14 +++++++++++++-
>  1 file changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index 1a6d5d000b85..2323829f7208 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -389,6 +389,17 @@ nfsd_file_put(struct nfsd_file *nf)
>                         if (!nfsd_file_lru_remove(nf))
>                                 return;
>                 }
> +               /*
> +                * Racing with nfsd_file_cond_queue() or nfsd_file_lru_cb(),
> +                * it's unhashed but then removed from the dispose list,
> +                * so we need to free it.
> +                */
> +               if (refcount_read(&nf->nf_ref) == 0 &&

A refcount_read in this path is a red flag to me. Anytime you're just
looking at the refcount without changing anything just screams out
"race condition".

In this case, what guarantee is there that this won't run afoul of the
timing? We could check this and find out it's 1 just before it goes to
0 and you check the other conditions.

Does anything prevent that?

> +                   !test_bit(NFSD_FILE_HASHED, &nf->nf_flags) &&
> +                   list_empty(&nf->nf_lru)) {
> +                       nfsd_file_free(nf);
> +                       return;
> +               }
>         }
>         if (refcount_dec_and_test(&nf->nf_ref))
>                 nfsd_file_free(nf);
> @@ -576,7 +587,7 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct
> list_head *dispose)
>         int decrement = 1;
> 
>         /* If we raced with someone else unhashing, ignore it */
> -       if (!nfsd_file_unhash(nf))
> +       if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
>                 return;

Same here: you're just testing for the HASHED bit, but could this also
race with someone who is setting it just after you get here. Why is
that not a problem?

> 
>         /* If we can't get a reference, ignore it */
> @@ -590,6 +601,7 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct
> list_head *dispose)
>         /* If refcount goes to 0, then put on the dispose list */
>         if (refcount_sub_and_test(decrement, &nf->nf_ref)) {
>                 list_add(&nf->nf_lru, dispose);
> +               nfsd_file_unhash(nf);
>                 trace_nfsd_file_closing(nf);
>         }
>  }
> --
> 2.43.0
> 
> On Wed, Jul 3, 2024 at 2:21 PM Chuck Lever <chuck.lever@oracle.com> wrote:
> > 
> > On Wed, Jul 03, 2024 at 10:12:33AM -0400, Youzhong Yang wrote:
> > > Hello,
> > > 
> > > I'd like to report a nfsd_file leaking issue and propose a fix for it.
> > > 
> > > When I tested Linux kernel 6.8 and 6.6, I noticed nfsd_file leaks
> > > which led to undestroyable file systems (zfs),
> > 
> > Thanks for the report. Some initial comments:
> > 
> > - Do you have a specific reproducer? In other words, what is the
> >   simplest program that can run on an NFS client that will trigger
> >   this leak, and can you post it?
> > 
> > - "zfs" is an out-of-tree file system, so it's not directly
> >   supported for NFSD.
> > 
> > - The guidelines for patch submission require us to fix issues in
> >   upstream Linux first (currently that's v6.10-rc6). Then that fix
> >   can be backported to older stable kernels like 6.6.
> > 
> > Can you reproduce the leak with one of the in-kernel filesystems
> > (either xfs or btrfs would be great) and with NFSD in 6.10-rc6?
> > 
> > One more comment below.
> > 
> > 
> > > here are some examples:
> > > 
> > > crash> struct nfsd_file -x ffff88e160db0460
> > > struct nfsd_file {
> > >   nf_rlist = {
> > >     rhead = {
> > >       next = 0xffff8921fa2392f1
> > >     },
> > >     next = 0x0
> > >   },
> > >   nf_inode = 0xffff8882bc312ef8,
> > >   nf_file = 0xffff88e2015b1500,
> > >   nf_cred = 0xffff88e3ab0e7800,
> > >   nf_net = 0xffffffff83d41600 <init_net>,
> > >   nf_flags = 0x8,
> > >   nf_ref = {
> > >     refs = {
> > >       counter = 0xc0000000
> > >     }
> > >   },
> > >   nf_may = 0x4,
> > >   nf_mark = 0xffff88e1bddfb320,
> > >   nf_lru = {
> > >     next = 0xffff88e160db04a8,
> > >     prev = 0xffff88e160db04a8
> > >   },
> > >   nf_rcu = {
> > >     next = 0x10000000000,
> > >     func = 0x0
> > >   },
> > >   nf_birthtime = 0x73d22fc1728
> > > }
> > > 
> > > crash> struct nfsd_file.nf_flags,nf_ref.refs.counter,nf_lru,nf_file -x
> > > ffff88839a53d850
> > >   nf_flags = 0x8,
> > >   nf_ref.refs.counter = 0x0
> > >   nf_lru = {
> > >     next = 0xffff88839a53d898,
> > >     prev = 0xffff88839a53d898
> > >   },
> > >   nf_file = 0xffff88810ede8700,
> > > 
> > > crash> struct nfsd_file.nf_flags,nf_ref.refs.counter,nf_lru,nf_file -x
> > > ffff88c32b11e850
> > >   nf_flags = 0x8,
> > >   nf_ref.refs.counter = 0x0
> > >   nf_lru = {
> > >     next = 0xffff88c32b11e898,
> > >     prev = 0xffff88c32b11e898
> > >   },
> > >   nf_file = 0xffff88c20a701c00,
> > > 
> > > crash> struct nfsd_file.nf_flags,nf_ref.refs.counter,nf_lru,nf_file -x
> > > ffff88e372709700
> > >   nf_flags = 0xc,
> > >   nf_ref.refs.counter = 0x0
> > >   nf_lru = {
> > >     next = 0xffff88e372709748,
> > >     prev = 0xffff88e372709748
> > >   },
> > >   nf_file = 0xffff88e0725e6400,
> > > 
> > > crash> struct nfsd_file.nf_flags,nf_ref.refs.counter,nf_lru,nf_file -x
> > > ffff8982864944d0
> > >   nf_flags = 0xc,
> > >   nf_ref.refs.counter = 0x0
> > >   nf_lru = {
> > >     next = 0xffff898286494518,
> > >     prev = 0xffff898286494518
> > >   },
> > >   nf_file = 0xffff89803c0ff700,
> > > 
> > > The leak occurs when nfsd_file_put() races with nfsd_file_cond_queue()
> > > or nfsd_file_lru_cb(). With the following patch, I haven't observed
> > > any leak after a few days heavy nfs load:
> > 
> > Our patch submission guidelines require a Signed-off-by:
> > line at the end of the patch description. See the "Sign your work -
> > the Developer's Certificate of Origin" section of
> > 
> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/submitting-patches.rst?h=v6.10-rc6
> > 
> > (Needed here in case your fix is acceptable).
> > 
> > 
> > > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > > index 1a6d5d000b85..2323829f7208 100644
> > > --- a/fs/nfsd/filecache.c
> > > +++ b/fs/nfsd/filecache.c
> > > @@ -389,6 +389,17 @@ nfsd_file_put(struct nfsd_file *nf)
> > >   if (!nfsd_file_lru_remove(nf))
> > >   return;
> > >   }
> > > + /*
> > > + * Racing with nfsd_file_cond_queue() or nfsd_file_lru_cb(),
> > > + * it's unhashed but then removed from the dispose list,
> > > + * so we need to free it.
> > > + */
> > > + if (refcount_read(&nf->nf_ref) == 0 &&
> > > +     !test_bit(NFSD_FILE_HASHED, &nf->nf_flags) &&
> > > +     list_empty(&nf->nf_lru)) {
> > > + nfsd_file_free(nf);
> > > + return;
> > > + }
> > >   }
> > >   if (refcount_dec_and_test(&nf->nf_ref))
> > >   nfsd_file_free(nf);
> > > @@ -576,7 +587,7 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct
> > > list_head *dispose)
> > >   int decrement = 1;
> > > 
> > >   /* If we raced with someone else unhashing, ignore it */
> > > - if (!nfsd_file_unhash(nf))
> > > + if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
> > >   return;
> > > 
> > >   /* If we can't get a reference, ignore it */
> > > @@ -590,6 +601,7 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct
> > > list_head *dispose)
> > >   /* If refcount goes to 0, then put on the dispose list */
> > >   if (refcount_sub_and_test(decrement, &nf->nf_ref)) {
> > >   list_add(&nf->nf_lru, dispose);
> > > + nfsd_file_unhash(nf);
> > >   trace_nfsd_file_closing(nf);
> > >   }
> > >  }
> > > 
> > > Please kindly review the patch and let me know if it makes sense.
> > > 
> > > Thanks,
> > > 
> > > -Youzhong
> > > 
> > 
> > --
> > Chuck Lever
>
diff mbox series

Patch

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 1a6d5d000b85..2323829f7208 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -389,6 +389,17 @@  nfsd_file_put(struct nfsd_file *nf)
  if (!nfsd_file_lru_remove(nf))
  return;
  }
+ /*
+ * Racing with nfsd_file_cond_queue() or nfsd_file_lru_cb(),
+ * it's unhashed but then removed from the dispose list,
+ * so we need to free it.
+ */
+ if (refcount_read(&nf->nf_ref) == 0 &&
+     !test_bit(NFSD_FILE_HASHED, &nf->nf_flags) &&
+     list_empty(&nf->nf_lru)) {
+ nfsd_file_free(nf);
+ return;
+ }
  }
  if (refcount_dec_and_test(&nf->nf_ref))
  nfsd_file_free(nf);
@@ -576,7 +587,7 @@  nfsd_file_cond_queue(struct nfsd_file *nf, struct
list_head *dispose)
  int decrement = 1;

  /* If we raced with someone else unhashing, ignore it */
- if (!nfsd_file_unhash(nf))
+ if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
  return;