diff mbox series

vfs: Don't evict inode under the inode lru traversing context

Message ID 20240805013446.814357-1-chengzhihao@huaweicloud.com (mailing list archive)
State New
Headers show
Series vfs: Don't evict inode under the inode lru traversing context | expand

Commit Message

Zhihao Cheng Aug. 5, 2024, 1:34 a.m. UTC
From: Zhihao Cheng <chengzhihao1@huawei.com>

The inode reclaiming process(See function prune_icache_sb) collects all
reclaimable inodes and mark them with I_FREEING flag at first, at that
time, other processes will be stuck if they try getting these inodes
(See function find_inode_fast), then the reclaiming process destroy the
inodes by function dispose_list(). Some filesystems(eg. ext4 with
ea_inode feature, ubifs with xattr) may do inode lookup in the inode
evicting callback function, if the inode lookup is operated under the
inode lru traversing context, deadlock problems may happen.

Case 1: In function ext4_evict_inode(), the ea inode lookup could happen
        if ea_inode feature is enabled, the lookup process will be stuck
	under the evicting context like this:

 1. File A has inode i_reg and an ea inode i_ea
 2. getfattr(A, xattr_buf) // i_ea is added into lru // lru->i_ea
 3. Then, following three processes running like this:

    PA                              PB
 echo 2 > /proc/sys/vm/drop_caches
  shrink_slab
   prune_dcache_sb
   // i_reg is added into lru, lru->i_ea->i_reg
   prune_icache_sb
    list_lru_walk_one
     inode_lru_isolate
      i_ea->i_state |= I_FREEING // set inode state
     inode_lru_isolate
      __iget(i_reg)
      spin_unlock(&i_reg->i_lock)
      spin_unlock(lru_lock)
                                     rm file A
                                      i_reg->nlink = 0
      iput(i_reg) // i_reg->nlink is 0, do evict
       ext4_evict_inode
        ext4_xattr_delete_inode
         ext4_xattr_inode_dec_ref_all
          ext4_xattr_inode_iget
           ext4_iget(i_ea->i_ino)
            iget_locked
             find_inode_fast
              __wait_on_freeing_inode(i_ea) ----→ AA deadlock
    dispose_list // cannot be executed by prune_icache_sb
     wake_up_bit(&i_ea->i_state)

Case 2: In deleted inode writing function ubifs_jnl_write_inode(), file
        deleting process holds BASEHD's wbuf->io_mutex while getting the
	xattr inode, which could race with inode reclaiming process(The
        reclaiming process could try locking BASEHD's wbuf->io_mutex in
	inode evicting function), then an ABBA deadlock problem would
	happen as following:

 1. File A has inode ia and a xattr(with inode ixa), regular file B has
    inode ib and a xattr.
 2. getfattr(A, xattr_buf) // ixa is added into lru // lru->ixa
 3. Then, following three processes running like this:

        PA                PB                        PC
                echo 2 > /proc/sys/vm/drop_caches
                 shrink_slab
                  prune_dcache_sb
                  // ib and ia are added into lru, lru->ixa->ib->ia
                  prune_icache_sb
                   list_lru_walk_one
                    inode_lru_isolate
                     ixa->i_state |= I_FREEING // set inode state
                    inode_lru_isolate
                     __iget(ib)
                     spin_unlock(&ib->i_lock)
                     spin_unlock(lru_lock)
                                                   rm file B
                                                    ib->nlink = 0
 rm file A
  iput(ia)
   ubifs_evict_inode(ia)
    ubifs_jnl_delete_inode(ia)
     ubifs_jnl_write_inode(ia)
      make_reservation(BASEHD) // Lock wbuf->io_mutex
      ubifs_iget(ixa->i_ino)
       iget_locked
        find_inode_fast
         __wait_on_freeing_inode(ixa)
          |          iput(ib) // ib->nlink is 0, do evict
          |           ubifs_evict_inode
          |            ubifs_jnl_delete_inode(ib)
          ↓             ubifs_jnl_write_inode
     ABBA deadlock ←-----make_reservation(BASEHD)
                   dispose_list // cannot be executed by prune_icache_sb
                    wake_up_bit(&ixa->i_state)

Fix it by forbidding inode evicting under the inode lru traversing
context. In details, we import a new inode state flag 'I_LRU_ISOLATING'
to pin inode without holding i_count under the inode lru traversing
context, the inode evicting process will wait until this flag is
cleared from i_state.

Link: https://lore.kernel.org/all/37c29c42-7685-d1f0-067d-63582ffac405@huaweicloud.com/
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219022
Fixes: e50e5129f384 ("ext4: xattr-in-inode support")
Fixes: 7959cf3a7506 ("ubifs: journal: Handle xattrs like files")
Cc: stable@vger.kernel.org
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Suggested-by: Jan Kara <jack@suse.cz>
Suggested-by: Mateusz Guzik <mjguzik@gmail.com>
---
 fs/inode.c         | 37 +++++++++++++++++++++++++++++++++++--
 include/linux/fs.h |  5 +++++
 2 files changed, 40 insertions(+), 2 deletions(-)

Comments

Jan Kara Aug. 5, 2024, 3:30 p.m. UTC | #1
On Mon 05-08-24 09:34:46, Zhihao Cheng wrote:
> From: Zhihao Cheng <chengzhihao1@huawei.com>
> 
> The inode reclaiming process(See function prune_icache_sb) collects all
> reclaimable inodes and mark them with I_FREEING flag at first, at that
> time, other processes will be stuck if they try getting these inodes
> (See function find_inode_fast), then the reclaiming process destroy the
> inodes by function dispose_list(). Some filesystems(eg. ext4 with
> ea_inode feature, ubifs with xattr) may do inode lookup in the inode
> evicting callback function, if the inode lookup is operated under the
> inode lru traversing context, deadlock problems may happen.
> 
> Case 1: In function ext4_evict_inode(), the ea inode lookup could happen
>         if ea_inode feature is enabled, the lookup process will be stuck
> 	under the evicting context like this:
> 
>  1. File A has inode i_reg and an ea inode i_ea
>  2. getfattr(A, xattr_buf) // i_ea is added into lru // lru->i_ea
>  3. Then, following three processes running like this:
> 
>     PA                              PB
>  echo 2 > /proc/sys/vm/drop_caches
>   shrink_slab
>    prune_dcache_sb
>    // i_reg is added into lru, lru->i_ea->i_reg
>    prune_icache_sb
>     list_lru_walk_one
>      inode_lru_isolate
>       i_ea->i_state |= I_FREEING // set inode state
>      inode_lru_isolate
>       __iget(i_reg)
>       spin_unlock(&i_reg->i_lock)
>       spin_unlock(lru_lock)
>                                      rm file A
>                                       i_reg->nlink = 0
>       iput(i_reg) // i_reg->nlink is 0, do evict
>        ext4_evict_inode
>         ext4_xattr_delete_inode
>          ext4_xattr_inode_dec_ref_all
>           ext4_xattr_inode_iget
>            ext4_iget(i_ea->i_ino)
>             iget_locked
>              find_inode_fast
>               __wait_on_freeing_inode(i_ea) ----→ AA deadlock
>     dispose_list // cannot be executed by prune_icache_sb
>      wake_up_bit(&i_ea->i_state)
> 
> Case 2: In deleted inode writing function ubifs_jnl_write_inode(), file
>         deleting process holds BASEHD's wbuf->io_mutex while getting the
> 	xattr inode, which could race with inode reclaiming process(The
>         reclaiming process could try locking BASEHD's wbuf->io_mutex in
> 	inode evicting function), then an ABBA deadlock problem would
> 	happen as following:
> 
>  1. File A has inode ia and a xattr(with inode ixa), regular file B has
>     inode ib and a xattr.
>  2. getfattr(A, xattr_buf) // ixa is added into lru // lru->ixa
>  3. Then, following three processes running like this:
> 
>         PA                PB                        PC
>                 echo 2 > /proc/sys/vm/drop_caches
>                  shrink_slab
>                   prune_dcache_sb
>                   // ib and ia are added into lru, lru->ixa->ib->ia
>                   prune_icache_sb
>                    list_lru_walk_one
>                     inode_lru_isolate
>                      ixa->i_state |= I_FREEING // set inode state
>                     inode_lru_isolate
>                      __iget(ib)
>                      spin_unlock(&ib->i_lock)
>                      spin_unlock(lru_lock)
>                                                    rm file B
>                                                     ib->nlink = 0
>  rm file A
>   iput(ia)
>    ubifs_evict_inode(ia)
>     ubifs_jnl_delete_inode(ia)
>      ubifs_jnl_write_inode(ia)
>       make_reservation(BASEHD) // Lock wbuf->io_mutex
>       ubifs_iget(ixa->i_ino)
>        iget_locked
>         find_inode_fast
>          __wait_on_freeing_inode(ixa)
>           |          iput(ib) // ib->nlink is 0, do evict
>           |           ubifs_evict_inode
>           |            ubifs_jnl_delete_inode(ib)
>           ↓             ubifs_jnl_write_inode
>      ABBA deadlock ←-----make_reservation(BASEHD)
>                    dispose_list // cannot be executed by prune_icache_sb
>                     wake_up_bit(&ixa->i_state)
> 
> Fix it by forbidding inode evicting under the inode lru traversing
> context. In details, we import a new inode state flag 'I_LRU_ISOLATING'
> to pin inode without holding i_count under the inode lru traversing
> context, the inode evicting process will wait until this flag is
> cleared from i_state.

Thanks for the patch and sorry for not getting to this myself!  Let me
rephrase the above paragraph a bit for better readability:

Fix the possible deadlock by using new inode state flag I_LRU_ISOLATING to
pin the inode in memory while inode_lru_isolate() reclaims its pages
instead of using ordinary inode reference. This way inode deletion cannot
be triggered from inode_lru_isolate() thus avoiding the deadlock. evict()
is made to wait for I_LRU_ISOLATING to be cleared before proceeding with
inode cleanup.

> @@ -488,6 +488,36 @@ static void inode_lru_list_del(struct inode *inode)
>  		this_cpu_dec(nr_unused);
>  }
>  
> +static void inode_lru_isolating(struct inode *inode)

Perhaps call this inode_pin_lru_isolating()

> +{
> +	BUG_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE));
> +	inode->i_state |= I_LRU_ISOLATING;
> +}
> +
> +static void inode_lru_finish_isolating(struct inode *inode)

And call this inode_unpin_lru_isolating()?

Otherwise the patch looks good so feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza
Mateusz Guzik Aug. 5, 2024, 4:10 p.m. UTC | #2
On Mon, Aug 5, 2024 at 3:24 AM Zhihao Cheng <chengzhihao@huaweicloud.com> wrote:
>
> From: Zhihao Cheng <chengzhihao1@huawei.com>
>
> The inode reclaiming process(See function prune_icache_sb) collects all
> reclaimable inodes and mark them with I_FREEING flag at first, at that
> time, other processes will be stuck if they try getting these inodes
> (See function find_inode_fast), then the reclaiming process destroy the
> inodes by function dispose_list(). Some filesystems(eg. ext4 with
> ea_inode feature, ubifs with xattr) may do inode lookup in the inode
> evicting callback function, if the inode lookup is operated under the
> inode lru traversing context, deadlock problems may happen.
>
> Case 1: In function ext4_evict_inode(), the ea inode lookup could happen
>         if ea_inode feature is enabled, the lookup process will be stuck
>         under the evicting context like this:
>
>  1. File A has inode i_reg and an ea inode i_ea
>  2. getfattr(A, xattr_buf) // i_ea is added into lru // lru->i_ea
>  3. Then, following three processes running like this:
>
>     PA                              PB
>  echo 2 > /proc/sys/vm/drop_caches
>   shrink_slab
>    prune_dcache_sb
>    // i_reg is added into lru, lru->i_ea->i_reg
>    prune_icache_sb
>     list_lru_walk_one
>      inode_lru_isolate
>       i_ea->i_state |= I_FREEING // set inode state
>      inode_lru_isolate
>       __iget(i_reg)
>       spin_unlock(&i_reg->i_lock)
>       spin_unlock(lru_lock)
>                                      rm file A
>                                       i_reg->nlink = 0
>       iput(i_reg) // i_reg->nlink is 0, do evict
>        ext4_evict_inode
>         ext4_xattr_delete_inode
>          ext4_xattr_inode_dec_ref_all
>           ext4_xattr_inode_iget
>            ext4_iget(i_ea->i_ino)
>             iget_locked
>              find_inode_fast
>               __wait_on_freeing_inode(i_ea) ----→ AA deadlock
>     dispose_list // cannot be executed by prune_icache_sb
>      wake_up_bit(&i_ea->i_state)
>
> Case 2: In deleted inode writing function ubifs_jnl_write_inode(), file
>         deleting process holds BASEHD's wbuf->io_mutex while getting the
>         xattr inode, which could race with inode reclaiming process(The
>         reclaiming process could try locking BASEHD's wbuf->io_mutex in
>         inode evicting function), then an ABBA deadlock problem would
>         happen as following:
>
>  1. File A has inode ia and a xattr(with inode ixa), regular file B has
>     inode ib and a xattr.
>  2. getfattr(A, xattr_buf) // ixa is added into lru // lru->ixa
>  3. Then, following three processes running like this:
>
>         PA                PB                        PC
>                 echo 2 > /proc/sys/vm/drop_caches
>                  shrink_slab
>                   prune_dcache_sb
>                   // ib and ia are added into lru, lru->ixa->ib->ia
>                   prune_icache_sb
>                    list_lru_walk_one
>                     inode_lru_isolate
>                      ixa->i_state |= I_FREEING // set inode state
>                     inode_lru_isolate
>                      __iget(ib)
>                      spin_unlock(&ib->i_lock)
>                      spin_unlock(lru_lock)
>                                                    rm file B
>                                                     ib->nlink = 0
>  rm file A
>   iput(ia)
>    ubifs_evict_inode(ia)
>     ubifs_jnl_delete_inode(ia)
>      ubifs_jnl_write_inode(ia)
>       make_reservation(BASEHD) // Lock wbuf->io_mutex
>       ubifs_iget(ixa->i_ino)
>        iget_locked
>         find_inode_fast
>          __wait_on_freeing_inode(ixa)
>           |          iput(ib) // ib->nlink is 0, do evict
>           |           ubifs_evict_inode
>           |            ubifs_jnl_delete_inode(ib)
>           ↓             ubifs_jnl_write_inode
>      ABBA deadlock ←-----make_reservation(BASEHD)
>                    dispose_list // cannot be executed by prune_icache_sb
>                     wake_up_bit(&ixa->i_state)
>
> Fix it by forbidding inode evicting under the inode lru traversing
> context. In details, we import a new inode state flag 'I_LRU_ISOLATING'
> to pin inode without holding i_count under the inode lru traversing
> context, the inode evicting process will wait until this flag is
> cleared from i_state.
>
> Link: https://lore.kernel.org/all/37c29c42-7685-d1f0-067d-63582ffac405@huaweicloud.com/
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=219022
> Fixes: e50e5129f384 ("ext4: xattr-in-inode support")
> Fixes: 7959cf3a7506 ("ubifs: journal: Handle xattrs like files")

I only have some tidy-ups with stuff I neglected to mention when
typing up my proposal.

> ---
>  fs/inode.c         | 37 +++++++++++++++++++++++++++++++++++--
>  include/linux/fs.h |  5 +++++
>  2 files changed, 40 insertions(+), 2 deletions(-)
>
> diff --git a/fs/inode.c b/fs/inode.c
> index 86670941884b..f1c6e8072f39 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -488,6 +488,36 @@ static void inode_lru_list_del(struct inode *inode)
>                 this_cpu_dec(nr_unused);
>  }
>
> +static void inode_lru_isolating(struct inode *inode)
> +{

        lockdep_assert_held(&inode->i_lock);

> +       BUG_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE));
> +       inode->i_state |= I_LRU_ISOLATING;
> +}
> +
> +static void inode_lru_finish_isolating(struct inode *inode)
> +{
> +       spin_lock(&inode->i_lock);
> +       BUG_ON(!(inode->i_state & I_LRU_ISOLATING));
> +       inode->i_state &= ~I_LRU_ISOLATING;
> +       wake_up_bit(&inode->i_state, __I_LRU_ISOLATING);
> +       spin_unlock(&inode->i_lock);
> +}
> +
> +static void inode_wait_for_lru_isolating(struct inode *inode)
> +{
> +       DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING);
> +       wait_queue_head_t *wqh;
> +

Top of evict() asserts on I_FREEING being set, used to decide it's not
legit to pin above. This dependency can be documented it in the
routine as well:

BUG_ON(!(inode->i_state & I_FREEING));

> +       spin_lock(&inode->i_lock);

This lock acquire is avoidable, which is always nice to do for
single-threaded perf. Probably can be also done for the writeback code
below. Maybe I'll massage it myself after the patch lands.

> +       wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING);
> +       while (inode->i_state & I_LRU_ISOLATING) {
> +               spin_unlock(&inode->i_lock);
> +               __wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE);
> +               spin_lock(&inode->i_lock);
> +       }
> +       spin_unlock(&inode->i_lock);
> +}

So new arrivals *are* blocked by this point thanks to I_FREEING being
set on entry to evict(). This also means the flag can show up at most
once.

Thus instead of looping this should merely go to sleep once and assert
the I_LRU_ISOLATING flag is no longer set after waking up.

> +
>  /**
>   * inode_sb_list_add - add inode to the superblock list of inodes
>   * @inode: inode to add
> @@ -657,6 +687,9 @@ static void evict(struct inode *inode)
>
>         inode_sb_list_del(inode);
>
> +       /* Wait for LRU isolating to finish. */

I don't think this comment adds anything given the name of the func.

> +       inode_wait_for_lru_isolating(inode);
> +
>         /*
>          * Wait for flusher thread to be done with the inode so that filesystem
>          * does not start destroying it while writeback is still running. Since
> @@ -855,7 +888,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
>          * be under pressure before the cache inside the highmem zone.
>          */
>         if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
> -               __iget(inode);
> +               inode_lru_isolating(inode);
>                 spin_unlock(&inode->i_lock);
>                 spin_unlock(lru_lock);
>                 if (remove_inode_buffers(inode)) {
> @@ -867,7 +900,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
>                                 __count_vm_events(PGINODESTEAL, reap);
>                         mm_account_reclaimed_pages(reap);
>                 }
> -               iput(inode);
> +               inode_lru_finish_isolating(inode);
>                 spin_lock(lru_lock);
>                 return LRU_RETRY;
>         }
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index fd34b5755c0b..fb0426f349fc 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2392,6 +2392,9 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
>   *
>   * I_PINNING_FSCACHE_WB        Inode is pinning an fscache object for writeback.
>   *
> + * I_LRU_ISOLATING     Inode is pinned being isolated from LRU without holding
> + *                     i_count.
> + *
>   * Q: What is the difference between I_WILL_FREE and I_FREEING?
>   */
>  #define I_DIRTY_SYNC           (1 << 0)
> @@ -2415,6 +2418,8 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
>  #define I_DONTCACHE            (1 << 16)
>  #define I_SYNC_QUEUED          (1 << 17)
>  #define I_PINNING_NETFS_WB     (1 << 18)
> +#define __I_LRU_ISOLATING      19
> +#define I_LRU_ISOLATING                (1 << __I_LRU_ISOLATING)
>
>  #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
>  #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
> --
> 2.39.2
>
Zhihao Cheng Aug. 6, 2024, 3:31 a.m. UTC | #3
在 2024/8/5 23:30, Jan Kara 写道:
> On Mon 05-08-24 09:34:46, Zhihao Cheng wrote:
>> From: Zhihao Cheng <chengzhihao1@huawei.com>
>>
>> The inode reclaiming process(See function prune_icache_sb) collects all
>> reclaimable inodes and mark them with I_FREEING flag at first, at that
>> time, other processes will be stuck if they try getting these inodes
>> (See function find_inode_fast), then the reclaiming process destroy the
>> inodes by function dispose_list(). Some filesystems(eg. ext4 with
>> ea_inode feature, ubifs with xattr) may do inode lookup in the inode
>> evicting callback function, if the inode lookup is operated under the
>> inode lru traversing context, deadlock problems may happen.
>>
>> Case 1: In function ext4_evict_inode(), the ea inode lookup could happen
>>          if ea_inode feature is enabled, the lookup process will be stuck
>> 	under the evicting context like this:
>>
>>   1. File A has inode i_reg and an ea inode i_ea
>>   2. getfattr(A, xattr_buf) // i_ea is added into lru // lru->i_ea
>>   3. Then, following three processes running like this:
>>
>>      PA                              PB
>>   echo 2 > /proc/sys/vm/drop_caches
>>    shrink_slab
>>     prune_dcache_sb
>>     // i_reg is added into lru, lru->i_ea->i_reg
>>     prune_icache_sb
>>      list_lru_walk_one
>>       inode_lru_isolate
>>        i_ea->i_state |= I_FREEING // set inode state
>>       inode_lru_isolate
>>        __iget(i_reg)
>>        spin_unlock(&i_reg->i_lock)
>>        spin_unlock(lru_lock)
>>                                       rm file A
>>                                        i_reg->nlink = 0
>>        iput(i_reg) // i_reg->nlink is 0, do evict
>>         ext4_evict_inode
>>          ext4_xattr_delete_inode
>>           ext4_xattr_inode_dec_ref_all
>>            ext4_xattr_inode_iget
>>             ext4_iget(i_ea->i_ino)
>>              iget_locked
>>               find_inode_fast
>>                __wait_on_freeing_inode(i_ea) ----→ AA deadlock
>>      dispose_list // cannot be executed by prune_icache_sb
>>       wake_up_bit(&i_ea->i_state)
>>
>> Case 2: In deleted inode writing function ubifs_jnl_write_inode(), file
>>          deleting process holds BASEHD's wbuf->io_mutex while getting the
>> 	xattr inode, which could race with inode reclaiming process(The
>>          reclaiming process could try locking BASEHD's wbuf->io_mutex in
>> 	inode evicting function), then an ABBA deadlock problem would
>> 	happen as following:
>>
>>   1. File A has inode ia and a xattr(with inode ixa), regular file B has
>>      inode ib and a xattr.
>>   2. getfattr(A, xattr_buf) // ixa is added into lru // lru->ixa
>>   3. Then, following three processes running like this:
>>
>>          PA                PB                        PC
>>                  echo 2 > /proc/sys/vm/drop_caches
>>                   shrink_slab
>>                    prune_dcache_sb
>>                    // ib and ia are added into lru, lru->ixa->ib->ia
>>                    prune_icache_sb
>>                     list_lru_walk_one
>>                      inode_lru_isolate
>>                       ixa->i_state |= I_FREEING // set inode state
>>                      inode_lru_isolate
>>                       __iget(ib)
>>                       spin_unlock(&ib->i_lock)
>>                       spin_unlock(lru_lock)
>>                                                     rm file B
>>                                                      ib->nlink = 0
>>   rm file A
>>    iput(ia)
>>     ubifs_evict_inode(ia)
>>      ubifs_jnl_delete_inode(ia)
>>       ubifs_jnl_write_inode(ia)
>>        make_reservation(BASEHD) // Lock wbuf->io_mutex
>>        ubifs_iget(ixa->i_ino)
>>         iget_locked
>>          find_inode_fast
>>           __wait_on_freeing_inode(ixa)
>>            |          iput(ib) // ib->nlink is 0, do evict
>>            |           ubifs_evict_inode
>>            |            ubifs_jnl_delete_inode(ib)
>>            ↓             ubifs_jnl_write_inode
>>       ABBA deadlock ←-----make_reservation(BASEHD)
>>                     dispose_list // cannot be executed by prune_icache_sb
>>                      wake_up_bit(&ixa->i_state)
>>
>> Fix it by forbidding inode evicting under the inode lru traversing
>> context. In details, we import a new inode state flag 'I_LRU_ISOLATING'
>> to pin inode without holding i_count under the inode lru traversing
>> context, the inode evicting process will wait until this flag is
>> cleared from i_state.
> 
> Thanks for the patch and sorry for not getting to this myself!  Let me
> rephrase the above paragraph a bit for better readability:
> 
> Fix the possible deadlock by using new inode state flag I_LRU_ISOLATING to
> pin the inode in memory while inode_lru_isolate() reclaims its pages
> instead of using ordinary inode reference. This way inode deletion cannot
> be triggered from inode_lru_isolate() thus avoiding the deadlock. evict()
> is made to wait for I_LRU_ISOLATING to be cleared before proceeding with
> inode cleanup.
> 

Looks clearer, thanks for the rephrasing, you're really nice.
>> @@ -488,6 +488,36 @@ static void inode_lru_list_del(struct inode *inode)
>>   		this_cpu_dec(nr_unused);
>>   }
>>   
>> +static void inode_lru_isolating(struct inode *inode)
> 
> Perhaps call this inode_pin_lru_isolating()

Adopt. Will change in v2.
> 
>> +{
>> +	BUG_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE));
>> +	inode->i_state |= I_LRU_ISOLATING;
>> +}
>> +
>> +static void inode_lru_finish_isolating(struct inode *inode)
> 
> And call this inode_unpin_lru_isolating()?

Adopt. Will change in v2.
> 
> Otherwise the patch looks good so feel free to add:
> 
> Reviewed-by: Jan Kara <jack@suse.cz>
> 
> 								Honza
>
Zhihao Cheng Aug. 6, 2024, 3:48 a.m. UTC | #4
在 2024/8/6 0:10, Mateusz Guzik 写道:
> On Mon, Aug 5, 2024 at 3:24 AM Zhihao Cheng <chengzhihao@huaweicloud.com> wrote:
>>
>> From: Zhihao Cheng <chengzhihao1@huawei.com>
>>
>> The inode reclaiming process(See function prune_icache_sb) collects all
>> reclaimable inodes and mark them with I_FREEING flag at first, at that
>> time, other processes will be stuck if they try getting these inodes
>> (See function find_inode_fast), then the reclaiming process destroy the
>> inodes by function dispose_list(). Some filesystems(eg. ext4 with
>> ea_inode feature, ubifs with xattr) may do inode lookup in the inode
>> evicting callback function, if the inode lookup is operated under the
>> inode lru traversing context, deadlock problems may happen.
>>
[...]
> 
> I only have some tidy-ups with stuff I neglected to mention when
> typing up my proposal.
> 
>> ---
>>   fs/inode.c         | 37 +++++++++++++++++++++++++++++++++++--
>>   include/linux/fs.h |  5 +++++
>>   2 files changed, 40 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/inode.c b/fs/inode.c
>> index 86670941884b..f1c6e8072f39 100644
>> --- a/fs/inode.c
>> +++ b/fs/inode.c
>> @@ -488,6 +488,36 @@ static void inode_lru_list_del(struct inode *inode)
>>                  this_cpu_dec(nr_unused);
>>   }
>>
>> +static void inode_lru_isolating(struct inode *inode)
>> +{
> 
>          lockdep_assert_held(&inode->i_lock);

Adopt, will change in v2.
> 
>> +       BUG_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE));
>> +       inode->i_state |= I_LRU_ISOLATING;
>> +}
>> +
>> +static void inode_lru_finish_isolating(struct inode *inode)
>> +{
>> +       spin_lock(&inode->i_lock);
>> +       BUG_ON(!(inode->i_state & I_LRU_ISOLATING));
>> +       inode->i_state &= ~I_LRU_ISOLATING;
>> +       wake_up_bit(&inode->i_state, __I_LRU_ISOLATING);
>> +       spin_unlock(&inode->i_lock);
>> +}
>> +
>> +static void inode_wait_for_lru_isolating(struct inode *inode)
>> +{
>> +       DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING);
>> +       wait_queue_head_t *wqh;
>> +
> 
> Top of evict() asserts on I_FREEING being set, used to decide it's not
> legit to pin above. This dependency can be documented it in the
> routine as well:
> 
> BUG_ON(!(inode->i_state & I_FREEING));

Sorry, I don't understand it, do you mean add the 
'BUG_ON(!(inode->i_state & I_FREEING))' in 
inode_wait_for_lru_isolating()? Just like you talked, evict() already 
has one. So far, the inode_wait_for_lru_isolating() is called only in 
evict(), we can add the BUG_ON if it is called more than one places in 
future.>
>> +       spin_lock(&inode->i_lock);
> 
> This lock acquire is avoidable, which is always nice to do for
> single-threaded perf. Probably can be also done for the writeback code
> below. Maybe I'll massage it myself after the patch lands.

Yes, maybe inode_wait_for_writeback() and inode_wait_for_lru_isolating() 
can be lockless, and a smp_rmb() is needed, I think.
> 
>> +       wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING);
>> +       while (inode->i_state & I_LRU_ISOLATING) {
>> +               spin_unlock(&inode->i_lock);
>> +               __wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE);
>> +               spin_lock(&inode->i_lock);
>> +       }
>> +       spin_unlock(&inode->i_lock);
>> +}
> 
> So new arrivals *are* blocked by this point thanks to I_FREEING being
> set on entry to evict(). This also means the flag can show up at most
> once.
> 
> Thus instead of looping this should merely go to sleep once and assert
> the I_LRU_ISOLATING flag is no longer set after waking up.

Good improvement, will change in v2.
> 
>> +
>>   /**
>>    * inode_sb_list_add - add inode to the superblock list of inodes
>>    * @inode: inode to add
>> @@ -657,6 +687,9 @@ static void evict(struct inode *inode)
>>
>>          inode_sb_list_del(inode);
>>
>> +       /* Wait for LRU isolating to finish. */
> 
> I don't think this comment adds anything given the name of the func.

Adopt, will be deleted in v2.
> 
>> +       inode_wait_for_lru_isolating(inode);
>> +
>>          /*
>>           * Wait for flusher thread to be done with the inode so that filesystem
>>           * does not start destroying it while writeback is still running. Since
>> @@ -855,7 +888,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
>>           * be under pressure before the cache inside the highmem zone.
>>           */
>>          if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
>> -               __iget(inode);
>> +               inode_lru_isolating(inode);
>>                  spin_unlock(&inode->i_lock);
>>                  spin_unlock(lru_lock);
>>                  if (remove_inode_buffers(inode)) {
>> @@ -867,7 +900,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
>>                                  __count_vm_events(PGINODESTEAL, reap);
>>                          mm_account_reclaimed_pages(reap);
>>                  }
>> -               iput(inode);
>> +               inode_lru_finish_isolating(inode);
>>                  spin_lock(lru_lock);
>>                  return LRU_RETRY;
>>          }
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index fd34b5755c0b..fb0426f349fc 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -2392,6 +2392,9 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
>>    *
>>    * I_PINNING_FSCACHE_WB        Inode is pinning an fscache object for writeback.
>>    *
>> + * I_LRU_ISOLATING     Inode is pinned being isolated from LRU without holding
>> + *                     i_count.
>> + *
>>    * Q: What is the difference between I_WILL_FREE and I_FREEING?
>>    */
>>   #define I_DIRTY_SYNC           (1 << 0)
>> @@ -2415,6 +2418,8 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
>>   #define I_DONTCACHE            (1 << 16)
>>   #define I_SYNC_QUEUED          (1 << 17)
>>   #define I_PINNING_NETFS_WB     (1 << 18)
>> +#define __I_LRU_ISOLATING      19
>> +#define I_LRU_ISOLATING                (1 << __I_LRU_ISOLATING)
>>
>>   #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
>>   #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
>> --
>> 2.39.2
>>
> 
>
diff mbox series

Patch

diff --git a/fs/inode.c b/fs/inode.c
index 86670941884b..f1c6e8072f39 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -488,6 +488,36 @@  static void inode_lru_list_del(struct inode *inode)
 		this_cpu_dec(nr_unused);
 }
 
+static void inode_lru_isolating(struct inode *inode)
+{
+	BUG_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE));
+	inode->i_state |= I_LRU_ISOLATING;
+}
+
+static void inode_lru_finish_isolating(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	BUG_ON(!(inode->i_state & I_LRU_ISOLATING));
+	inode->i_state &= ~I_LRU_ISOLATING;
+	wake_up_bit(&inode->i_state, __I_LRU_ISOLATING);
+	spin_unlock(&inode->i_lock);
+}
+
+static void inode_wait_for_lru_isolating(struct inode *inode)
+{
+	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING);
+	wait_queue_head_t *wqh;
+
+	spin_lock(&inode->i_lock);
+	wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING);
+	while (inode->i_state & I_LRU_ISOLATING) {
+		spin_unlock(&inode->i_lock);
+		__wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE);
+		spin_lock(&inode->i_lock);
+	}
+	spin_unlock(&inode->i_lock);
+}
+
 /**
  * inode_sb_list_add - add inode to the superblock list of inodes
  * @inode: inode to add
@@ -657,6 +687,9 @@  static void evict(struct inode *inode)
 
 	inode_sb_list_del(inode);
 
+	/* Wait for LRU isolating to finish. */
+	inode_wait_for_lru_isolating(inode);
+
 	/*
 	 * Wait for flusher thread to be done with the inode so that filesystem
 	 * does not start destroying it while writeback is still running. Since
@@ -855,7 +888,7 @@  static enum lru_status inode_lru_isolate(struct list_head *item,
 	 * be under pressure before the cache inside the highmem zone.
 	 */
 	if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
-		__iget(inode);
+		inode_lru_isolating(inode);
 		spin_unlock(&inode->i_lock);
 		spin_unlock(lru_lock);
 		if (remove_inode_buffers(inode)) {
@@ -867,7 +900,7 @@  static enum lru_status inode_lru_isolate(struct list_head *item,
 				__count_vm_events(PGINODESTEAL, reap);
 			mm_account_reclaimed_pages(reap);
 		}
-		iput(inode);
+		inode_lru_finish_isolating(inode);
 		spin_lock(lru_lock);
 		return LRU_RETRY;
 	}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd34b5755c0b..fb0426f349fc 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2392,6 +2392,9 @@  static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
  *
  * I_PINNING_FSCACHE_WB	Inode is pinning an fscache object for writeback.
  *
+ * I_LRU_ISOLATING	Inode is pinned being isolated from LRU without holding
+ *			i_count.
+ *
  * Q: What is the difference between I_WILL_FREE and I_FREEING?
  */
 #define I_DIRTY_SYNC		(1 << 0)
@@ -2415,6 +2418,8 @@  static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
 #define I_DONTCACHE		(1 << 16)
 #define I_SYNC_QUEUED		(1 << 17)
 #define I_PINNING_NETFS_WB	(1 << 18)
+#define __I_LRU_ISOLATING	19
+#define I_LRU_ISOLATING		(1 << __I_LRU_ISOLATING)
 
 #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
 #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)