
mm: fix a race scenario in folio_isolate_lru

Message ID: 20240314083921.1146937-1-zhaoyang.huang@unisoc.com
State New
Series: mm: fix a race scenario in folio_isolate_lru

Commit Message

zhaoyang.huang March 14, 2024, 8:39 a.m. UTC
From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>

A panic[1] was reported that was caused by corruption of an lruvec list. Fix
the race between folio_isolate_lru() and release_pages().

race condition:
release_pages() can encounter a no-longer-referenced folio that escaped being
deleted from the LRU and is then added to another list_head
  #0 folio_isolate_lru          #1 release_pages
  if (folio_test_clear_lru())
                                   if (folio_put_testzero())
                                   if (!folio_test_lru())
                                    <failed to delete folio from LRU>
    folio_get(folio)
                                       list_add(&folio->lru,)
    list_del(&folio->lru,)

fix action 1:
moving folio_get() prior to folio_test_clear_lru() is not enough, as a
concurrent folio_put() (e.g. from filemap_remove_folio()) can still let
release_pages() pass the refcount check yet fail to delete the folio from
the LRU

  #0 folio_isolate_lru          #1 release_pages
  folio_get(folio)
  if (folio_test_clear_lru())
                                   if (folio_put_testzero())
                                   if (!folio_test_lru())
                                    <failed to delete folio from LRU>
                                       list_add(&folio->lru,)
    list_del(&folio->lru,)

fix action 2:
folio_test_clear_lru() should be considered part of the lruvec critical
section and must therefore be performed under lruvec->lock.

  #0 folio_isolate_lru          #1 release_pages
  spin_lock(lruvec->lock)
  folio_get(folio)
  if (folio_test_clear_lru())
    list_del(&folio->lru,)
    <delete folio from LRU>
  spin_unlock(lruvec->lock)        spin_lock(lruvec->lock)
                                   if (folio_put_testzero())
                                   if (!folio_test_lru())
                                       list_add(&folio->lru,)

[1]
[   37.562326] pc : __list_del_entry_valid_or_report+0xec/0xf0
[   37.562344] lr : __list_del_entry_valid_or_report+0xec/0xf0
[   37.562351] sp : ffffffc085953990
[   37.562355] x29: ffffffc0859539d0 x28: ffffffc082144000 x27: 000000000000000f
[   37.562367] x26: 000000000000000d x25: 000000000000000d x24: 00000000ffffffff
[   37.562377] x23: ffffffc085953a08 x22: ffffff8080389000 x21: ffffff8080389000
[   37.562388] x20: fffffffe05c54180 x19: ffffffc085953b30 x18: ffffffc085989098
[   37.562399] x17: 20747562202c3838 x16: ffffffffffffffff x15: 0000000000000004
[   37.562409] x14: ffffff8176980000 x13: 0000000000007fff x12: 0000000000000003
[   37.562420] x11: 00000000ffff7fff x10: ffffffc0820f51c4 x9 : 53b71233d5d50e00
[   37.562431] x8 : 53b71233d5d50e00 x7 : ffffffc081161ff0 x6 : 0000000000000000
[   37.562441] x5 : 0000000000000001 x4 : 0000000000000001 x3 : 0000000000000000
[   37.562451] x2 : ffffff817f2c4178 x1 : ffffff817f2b71c8 x0 : 000000000000006d
[   37.562461] Call trace:
[   37.562465]  __list_del_entry_valid_or_report+0xec/0xf0
[   37.562472]  release_pages+0x410/0x4c0
[   37.562482]  __folio_batch_release+0x34/0x4c
[   37.562490]  truncate_inode_pages_range+0x368/0x63c
[   37.562497]  truncate_inode_pages+0x14/0x24
[   37.562504]  blkdev_flush_mapping+0x60/0x120
[   37.562513]  blkdev_put+0x114/0x298
[   37.562520]  blkdev_release+0x28/0x40
[   37.562526]  __fput+0xf8/0x2a8
[   37.562533]  ____fput+0x10/0x20
[   37.562539]  task_work_run+0xc4/0xec
[   37.562546]  do_exit+0x32c/0xa3c
[   37.562554]  do_group_exit+0x98/0x9c
[   37.562561]  __arm64_sys_exit_group+0x18/0x1c
[   37.562568]  invoke_syscall+0x58/0x114
[   37.562575]  el0_svc_common+0xac/0xe0
[   37.562582]  do_el0_svc+0x1c/0x28
[   37.562588]  el0_svc+0x50/0xe4
[   37.562593]  el0t_64_sync_handler+0x68/0xbc
[   37.562599]  el0t_64_sync+0x1a8/0x1ac

Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
---
 mm/swap.c   | 25 +++++++++++++++++--------
 mm/vmscan.c | 25 +++++++++++++++++++------
 2 files changed, 36 insertions(+), 14 deletions(-)

Comments

Zhaoyang Huang March 15, 2024, 11:03 a.m. UTC | #1
On Thu, Mar 14, 2024 at 4:39 PM zhaoyang.huang
<zhaoyang.huang@unisoc.com> wrote:
>
> From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
>
> Panic[1] reported which is caused by lruvec->list break. Fix the race
> between folio_isolate_lru and release_pages.
>
> race condition:
> release_pages could meet a non-refered folio which escaped from being
> deleted from LRU but add to another list_head
>   #0 folio_isolate_lru          #1 release_pages
>   if (folio_test_clear_lru())
>                                    if (folio_put_testzero())
>                                    if (!folio_test_lru())
>                                     <failed to delete folio from LRU>
>     folio_get(folio)
>                                        list_add(&folio->lru,)
>     list_del(&folio->lru,)
>
> fix action 1:
> have folio_get prior to folio_test_clear_lru is not enough as there
> could be concurrent folio_put(filemap_remove_folios) to make
> release_pages pass refcnt check and failed in delete from LRU
>
>   #0 folio_isolate_lru          #1 release_pages
>   folio_get(folio)
>   if (folio_test_clear_lru())
>                                    if (folio_put_testzero())
>                                    if (!folio_test_lru())
>                                     <failed to delete folio from LRU>
>                                        list_add(&folio->lru,)
>     list_del(&folio->lru,)

Correcting the timing description of fix action 1: moving folio_get() prior
to folio_test_clear_lru() cannot guarantee that it happens before the
folio_put_testzero() in release_pages():
   #0 folio_isolate_lru          #1 release_pages
BUG_ON(!folio_refcnt)
                                         if (folio_put_testzero())
   folio_get(folio)
   if (folio_test_clear_lru())
                                         if (!folio_test_lru())
                                          <failed to delete folio from LRU>
                                            list_add(&folio->lru,)
     list_del(&folio->lru,)

>
> fix action 2:
> folio_test_clear_lru should be considered as part of critical section of
> lruvec which require be within lruvec->lock.
>
>   #0 folio_isolate_lru          #1 release_pages
>   spin_lock(lruvec->lock)
>   folio_get(folio)
>   if (folio_test_clear_lru())
>     list_del(&folio->lru,)
>     <delete folio from LRU>
>   spin_unlock(lruvec->lock)        spin_lock(lruvec->lock)
>                                    if (folio_put_testzero())
>                                    if (!folio_test_lru())
>                                        list_add(&folio->lru,)
>
> [1]
> [   37.562326] pc : __list_del_entry_valid_or_report+0xec/0xf0
> [   37.562344] lr : __list_del_entry_valid_or_report+0xec/0xf0
> [   37.562351] sp : ffffffc085953990
> [   37.562355] x29: ffffffc0859539d0 x28: ffffffc082144000 x27: 000000000000000f
> [   37.562367] x26: 000000000000000d x25: 000000000000000d x24: 00000000ffffffff
> [   37.562377] x23: ffffffc085953a08 x22: ffffff8080389000 x21: ffffff8080389000
> [   37.562388] x20: fffffffe05c54180 x19: ffffffc085953b30 x18: ffffffc085989098
> [   37.562399] x17: 20747562202c3838 x16: ffffffffffffffff x15: 0000000000000004
> [   37.562409] x14: ffffff8176980000 x13: 0000000000007fff x12: 0000000000000003
> [   37.562420] x11: 00000000ffff7fff x10: ffffffc0820f51c4 x9 : 53b71233d5d50e00
> [   37.562431] x8 : 53b71233d5d50e00 x7 : ffffffc081161ff0 x6 : 0000000000000000
> [   37.562441] x5 : 0000000000000001 x4 : 0000000000000001 x3 : 0000000000000000
> [   37.562451] x2 : ffffff817f2c4178 x1 : ffffff817f2b71c8 x0 : 000000000000006d
> [   37.562461] Call trace:
> [   37.562465]  __list_del_entry_valid_or_report+0xec/0xf0
> [   37.562472]  release_pages+0x410/0x4c0
> [   37.562482]  __folio_batch_release+0x34/0x4c
> [   37.562490]  truncate_inode_pages_range+0x368/0x63c
> [   37.562497]  truncate_inode_pages+0x14/0x24
> [   37.562504]  blkdev_flush_mapping+0x60/0x120
> [   37.562513]  blkdev_put+0x114/0x298
> [   37.562520]  blkdev_release+0x28/0x40
> [   37.562526]  __fput+0xf8/0x2a8
> [   37.562533]  ____fput+0x10/0x20
> [   37.562539]  task_work_run+0xc4/0xec
> [   37.562546]  do_exit+0x32c/0xa3c
> [   37.562554]  do_group_exit+0x98/0x9c
> [   37.562561]  __arm64_sys_exit_group+0x18/0x1c
> [   37.562568]  invoke_syscall+0x58/0x114
> [   37.562575]  el0_svc_common+0xac/0xe0
> [   37.562582]  do_el0_svc+0x1c/0x28
> [   37.562588]  el0_svc+0x50/0xe4
> [   37.562593]  el0t_64_sync_handler+0x68/0xbc
> [   37.562599]  el0t_64_sync+0x1a8/0x1ac
>
> Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> ---
>  mm/swap.c   | 25 +++++++++++++++++--------
>  mm/vmscan.c | 25 +++++++++++++++++++------
>  2 files changed, 36 insertions(+), 14 deletions(-)
>
> diff --git a/mm/swap.c b/mm/swap.c
> index cd8f0150ba3a..287cf7379927 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -968,6 +968,7 @@ void release_pages(release_pages_arg arg, int nr)
>
>         for (i = 0; i < nr; i++) {
>                 struct folio *folio;
> +               struct lruvec *prev_lruvec;
>
>                 /* Turn any of the argument types into a folio */
>                 folio = page_folio(encoded_page_ptr(encoded[i]));
> @@ -996,9 +997,24 @@ void release_pages(release_pages_arg arg, int nr)
>                                 free_zone_device_page(&folio->page);
>                         continue;
>                 }
> +               /*
> +                * lruvec->lock need to be prior to folio_put_testzero to
> +                * prevent race with folio_isolate_lru
> +                */
> +               prev_lruvec = lruvec;
> +               lruvec = folio_lruvec_relock_irqsave(folio, lruvec,
> +                               &flags);
> +
> +               if (prev_lruvec != lruvec)
> +                       lock_batch = 0;
>
> -               if (!folio_put_testzero(folio))
> +               if (!folio_put_testzero(folio)) {
> +                       if (lruvec) {
> +                               unlock_page_lruvec_irqrestore(lruvec, flags);
> +                               lruvec = NULL;
> +                       }
>                         continue;
> +               }
>
>                 if (folio_test_large(folio)) {
>                         if (lruvec) {
> @@ -1010,13 +1026,6 @@ void release_pages(release_pages_arg arg, int nr)
>                 }
>
>                 if (folio_test_lru(folio)) {
> -                       struct lruvec *prev_lruvec = lruvec;
> -
> -                       lruvec = folio_lruvec_relock_irqsave(folio, lruvec,
> -                                                                       &flags);
> -                       if (prev_lruvec != lruvec)
> -                               lock_batch = 0;
> -
>                         lruvec_del_folio(lruvec, folio);
>                         __folio_clear_lru_flags(folio);
>                 }
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 4255619a1a31..13a4a716c67a 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1721,18 +1721,31 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
>  bool folio_isolate_lru(struct folio *folio)
>  {
>         bool ret = false;
> +       struct lruvec *lruvec;
>
>         VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio);
>
> -       if (folio_test_clear_lru(folio)) {
> -               struct lruvec *lruvec;
> +       /*
> +        * The folio_get needs to be prior to clear lru for list integrity.
> +        * Otherwise:
> +        *   #0  folio_isolate_lru            #1 release_pages
> +        *   if (folio_test_clear_lru())
> +        *                                    if (folio_put_testzero())
> +        *                                    if (!folio_test_lru())
> +        *                                     <failed to del folio from LRU>
> +        *     folio_get(folio)
> +        *                                        list_add(&folio->lru,)
> +        *     list_del(&folio->lru,)
> +        */
> +       lruvec = folio_lruvec_lock_irq(folio);
> +       folio_get(folio);
>
> -               folio_get(folio);
> -               lruvec = folio_lruvec_lock_irq(folio);
> +       if (folio_test_clear_lru(folio)) {
>                 lruvec_del_folio(lruvec, folio);
> -               unlock_page_lruvec_irq(lruvec);
>                 ret = true;
> -       }
> +       } else
> +               folio_put(folio);
> +       unlock_page_lruvec_irq(lruvec);
>
>         return ret;
>  }
> --
> 2.25.1
>
Matthew Wilcox (Oracle) March 15, 2024, 12:46 p.m. UTC | #2
On Thu, Mar 14, 2024 at 04:39:21PM +0800, zhaoyang.huang wrote:
> From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> 
> Panic[1] reported which is caused by lruvec->list break. Fix the race
> between folio_isolate_lru and release_pages.
> 
> race condition:
> release_pages could meet a non-refered folio which escaped from being
> deleted from LRU but add to another list_head

I don't think the bug is in folio_isolate_lru() but rather in its
caller.

 * Context:
 *
 * (1) Must be called with an elevated refcount on the folio. This is a
 *     fundamental difference from isolate_lru_folios() (which is called
 *     without a stable reference).

So when release_pages() runs, it must not see a refcount decremented to
zero, because the caller of folio_isolate_lru() is supposed to hold one.

Your stack trace is for the thread which is calling release_pages(), not
the one calling folio_isolate_lru(), so I can't help you debug further.
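
To make that contract concrete, here is a minimal, hypothetical caller sketch
(not taken from any real call site; filemap_get_folio() is used purely for
illustration) that satisfies the elevated-refcount requirement:

/* Hypothetical example: the caller owns a reference before isolating. */
static void example_isolate_one(struct address_space *mapping, pgoff_t index)
{
	struct folio *folio = filemap_get_folio(mapping, index); /* takes a ref */

	if (IS_ERR(folio))
		return;
	if (folio_isolate_lru(folio)) {
		/* folio is off the LRU; isolation took its own reference */
		folio_putback_lru(folio);	/* put it back, dropping that ref */
	}
	folio_put(folio);			/* drop the lookup reference */
}

With such a reference held, a concurrent release_pages() can never see the
refcount drop to zero while the folio is being isolated.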
Zhaoyang Huang March 16, 2024, 8:53 a.m. UTC | #3
On Fri, Mar 15, 2024 at 8:46 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Thu, Mar 14, 2024 at 04:39:21PM +0800, zhaoyang.huang wrote:
> > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> >
> > Panic[1] reported which is caused by lruvec->list break. Fix the race
> > between folio_isolate_lru and release_pages.
> >
> > race condition:
> > release_pages could meet a non-refered folio which escaped from being
> > deleted from LRU but add to another list_head
>
> I don't think the bug is in folio_isolate_lru() but rather in its
> caller.
>
>  * Context:
>  *
>  * (1) Must be called with an elevated refcount on the folio. This is a
>  *     fundamental difference from isolate_lru_folios() (which is called
>  *     without a stable reference).
>
> So when release_pages() runs, it must not see a refcount decremented to
> zero, because the caller of folio_isolate_lru() is supposed to hold one.
>
> Your stack trace is for the thread which is calling release_pages(), not
> the one calling folio_isolate_lru(), so I can't help you debug further.
Thanks for the comments.  According to my understanding, folio_put_testzero()
does the decrement before the test, which makes it possible for release_pages()
to see the refcount reach zero and proceed further (the folio_get() in
folio_isolate_lru() has not run yet).

   #0 folio_isolate_lru          #1 release_pages
BUG_ON(!folio_refcnt)
                                         if (folio_put_testzero())
   folio_get(folio)
   if (folio_test_clear_lru())
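
For reference, folio_put_testzero() is roughly the following (paraphrased from
include/linux/mm.h and include/linux/page_ref.h); the decrement happens before
the zero test:

/* Paraphrased sketch: the reference count is decremented first, then tested. */
static inline int folio_put_testzero(struct folio *folio)
{
	VM_BUG_ON_FOLIO(folio_ref_count(folio) == 0, folio);
	return folio_ref_dec_and_test(folio);	/* true if the count is now zero */
}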
Matthew Wilcox (Oracle) March 16, 2024, 2:59 p.m. UTC | #4
On Sat, Mar 16, 2024 at 04:53:09PM +0800, Zhaoyang Huang wrote:
> On Fri, Mar 15, 2024 at 8:46 PM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > On Thu, Mar 14, 2024 at 04:39:21PM +0800, zhaoyang.huang wrote:
> > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > >
> > > Panic[1] reported which is caused by lruvec->list break. Fix the race
> > > between folio_isolate_lru and release_pages.
> > >
> > > race condition:
> > > release_pages could meet a non-refered folio which escaped from being
> > > deleted from LRU but add to another list_head
> >
> > I don't think the bug is in folio_isolate_lru() but rather in its
> > caller.
> >
> >  * Context:
> >  *
> >  * (1) Must be called with an elevated refcount on the folio. This is a
> >  *     fundamental difference from isolate_lru_folios() (which is called
> >  *     without a stable reference).
> >
> > So when release_pages() runs, it must not see a refcount decremented to
> > zero, because the caller of folio_isolate_lru() is supposed to hold one.
> >
> > Your stack trace is for the thread which is calling release_pages(), not
> > the one calling folio_isolate_lru(), so I can't help you debug further.
> Thanks for the comments.  According to my understanding,
> folio_put_testzero does the decrement before test which makes it
> possible to have release_pages see refcnt equal zero and proceed
> further(folio_get in folio_isolate_lru has not run yet).

No, that's not possible.

In the scenario below, at entry to folio_isolate_lru(), the folio has
refcount 2.  It has one refcount from thread 0 (because it must own one
before calling folio_isolate_lru()) and it has one refcount from thread 1
(because it's about to call release_pages()).  If release_pages() were
not running, the folio would have refcount 3 when folio_isolate_lru()
returned.

>    #0 folio_isolate_lru          #1 release_pages
> BUG_ON(!folio_refcnt)
>                                          if (folio_put_testzero())
>    folio_get(folio)
>    if (folio_test_clear_lru())
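
Spelling out the refcounts in that interleaving (assuming the caller of
folio_isolate_lru() holds one reference, as the kernel-doc requires):

   #0 folio_isolate_lru             #1 release_pages                 refcount
   (caller already holds a ref)                                      2
   VM_BUG_ON(!folio_ref_count())                                     2
                                    folio_put_testzero() -> false    1 (folio skipped)
   folio_get(folio)                                                  2
   if (folio_test_clear_lru())
     list_del(&folio->lru)                                           LRU stays intact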
Zhaoyang Huang March 17, 2024, 4:07 a.m. UTC | #5
On Sat, Mar 16, 2024 at 10:59 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Sat, Mar 16, 2024 at 04:53:09PM +0800, Zhaoyang Huang wrote:
> > On Fri, Mar 15, 2024 at 8:46 PM Matthew Wilcox <willy@infradead.org> wrote:
> > >
> > > On Thu, Mar 14, 2024 at 04:39:21PM +0800, zhaoyang.huang wrote:
> > > > From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
> > > >
> > > > Panic[1] reported which is caused by lruvec->list break. Fix the race
> > > > between folio_isolate_lru and release_pages.
> > > >
> > > > race condition:
> > > > release_pages could meet a non-refered folio which escaped from being
> > > > deleted from LRU but add to another list_head
> > >
> > > I don't think the bug is in folio_isolate_lru() but rather in its
> > > caller.
> > >
> > >  * Context:
> > >  *
> > >  * (1) Must be called with an elevated refcount on the folio. This is a
> > >  *     fundamental difference from isolate_lru_folios() (which is called
> > >  *     without a stable reference).
> > >
> > > So when release_pages() runs, it must not see a refcount decremented to
> > > zero, because the caller of folio_isolate_lru() is supposed to hold one.
> > >
> > > Your stack trace is for the thread which is calling release_pages(), not
> > > the one calling folio_isolate_lru(), so I can't help you debug further.
> > Thanks for the comments.  According to my understanding,
> > folio_put_testzero does the decrement before test which makes it
> > possible to have release_pages see refcnt equal zero and proceed
> > further(folio_get in folio_isolate_lru has not run yet).
>
> No, that's not possible.
>
> In the scenario below, at entry to folio_isolate_lru(), the folio has
> refcount 2.  It has one refcount from thread 0 (because it must own one
> before calling folio_isolate_lru()) and it has one refcount from thread 1
> (because it's about to call release_pages()).  If release_pages() were
> not running, the folio would have refcount 3 when folio_isolate_lru()
> returned.
Could it be this scenario, where the folio is reached via a pte (thread 0), a
local fbatch (thread 1) and the page cache (thread 2) concurrently, and the
three proceed intermixed without any lock protection?  Actually, IMO, thread 1
could also see the folio with refcnt == 1, since it does not care whether the
page is in the page cache or not.

madvise_cold_and_pageout does no explicit folio_get since the folio comes
from a pte, which implies it holds one refcount from the page cache

The three threads proceed concurrently; to avoid gmail's line wrapping, the
chart is split into two pairwise timelines.

#thread 0 (madvise_cold_and_pageout)    #1 (lru_add_drain->fbatch_release_pages)
refcnt == 1 (represents the page cache)
refcnt == 2 (another one represents the LRU)
folio_isolate_lru                       release_pages
                                          folio_put_testzero == true
                                          <No lruvec_del_folio>
                                          list_add(folio->lru, pages_to_free)
// current folio will break the LRU's integrity since it has not been deleted

#1 (lru_add_drain->fbatch_release_pages)  #2 (read_pages->filemap_remove_folio)
refcnt == 2 (another one represents the LRU)
                                            folio comes from the page cache
release_pages                               filemap_free_folio
                                            refcnt == 1 (the page cache ref is dropped)
folio_put_testzero == true
<No lruvec_del_folio>
list_add(folio->lru, pages_to_free)
// current folio will break the LRU's integrity since it has not been deleted
>
> >    #0 folio_isolate_lru          #1 release_pages
> > BUG_ON(!folio_refcnt)
> >                                          if (folio_put_testzero())
> >    folio_get(folio)
> >    if (folio_test_clear_lru())
Matthew Wilcox (Oracle) March 17, 2024, 8:46 p.m. UTC | #6
On Sun, Mar 17, 2024 at 12:07:40PM +0800, Zhaoyang Huang wrote:
> Could it be this scenario, where folio comes from pte(thread 0), local
> fbatch(thread 1) and page cache(thread 2) concurrently and proceed
> intermixed without lock's protection? Actually, IMO, thread 1 also
> could see the folio with refcnt==1 since it doesn't care if the page
> is on the page cache or not.
> 
> madivise_cold_and_pageout does no explicit folio_get thing since the
> folio comes from pte which implies it has one refcnt from pagecache

Mmm, no.  It's implicit, but madvise_cold_or_pageout_pte_range()
does guarantee that the folio has at least one refcount.

Since we get the folio from vm_normal_folio(vma, addr, ptent);
we know that there is at least one mapcount on the folio.  refcount
is always >= mapcount.  Since we hold pte_offset_map_lock(), we
know that mapcount (and therefore refcount) cannot be decremented 
until we call pte_unmap_unlock(), which we don't do until we have
called folio_isolate_lru().
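
For illustration, the relevant structure of madvise_cold_or_pageout_pte_range()
is roughly the following (heavily abridged from mm/madvise.c; error handling
and the cold vs. pageout details are omitted):

	/* Abridged sketch: the PTL is held across folio_isolate_lru(), so the
	 * mapcount (and therefore at least one refcount) cannot go away. */
	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr < end; pte++, addr += PAGE_SIZE) {
		ptent = ptep_get(pte);
		folio = vm_normal_folio(vma, addr, ptent);  /* mapped => refcount >= 1 */
		if (!folio)
			continue;
		...
		if (pageout && folio_isolate_lru(folio))
			list_add(&folio->lru, &folio_list);
	}
	pte_unmap_unlock(start_pte, ptl);  /* only now can the mapping (and its ref) go away */
	...
	reclaim_pages(&folio_list);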

Good try though, took me a few minutes of looking at it to convince
myself that it was safe.

Something to bear in mind is that if the race you outline is real,
failing to hold a refcount on the folio leaves the caller susceptible
to the VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio); if the other
thread calls folio_put().

I can't understand any of the scenarios you outline below.
Please try again without relying on indentation.

> #thread 0(madivise_cold_and_pageout)        #1
> (lru_add_drain->fbatch_release_pages)
> #2(read_pages->filemap_remove_folios)
> refcnt == 1(represent page cache)
> 
> refcnt==2(another one represent LRU)
>    folio comes from page cache
> folio_isolate_lru
> release_pages
>                  filemap_free_folio
> 
> 
>                              refcnt==1(decrease the one of page cache)
> 
>  folio_put_testzero == true
> 
>   <No lruvec_del_folio>
> 
>  list_add(folio->lru, pages_to_free) //current folio will break LRU's
> integrity since it has not been deleted
> 
> In case of gmail's wrap, split above chart to two parts
> 
> #thread 0(madivise_cold_and_pageout)        #1
> (lru_add_drain->fbatch_release_pages)
> refcnt == 1(represent page cache)
> 
> refcnt==2(another one represent LRU)
> folio_isolate_lru                                               release_pages
> 
>  folio_put_testzero == true
> 
>   <No lruvec_del_folio>
> 
>  list_add(folio->lru, pages_to_free)
> 
>  //current folio will break LRU's integrity since it has not been
> deleted
> 
> #1 (lru_add_drain->fbatch_release_pages)
> #2(read_pages->filemap_remove_folios)
> refcnt==2(another one represent LRU)
>    folio comes from page cache
> release_pages
>                  filemap_free_folio
> 
>                             refcnt==1(decrease the one of page cache)
>  folio_put_testzero == true
>  <No lruvec_del_folio>
>  list_add(folio->lru, pages_to_free)
> //current folio will break LRU's integrity since it has not been deleted
> >
> > >    #0 folio_isolate_lru          #1 release_pages
> > > BUG_ON(!folio_refcnt)
> > >                                          if (folio_put_testzero())
> > >    folio_get(folio)
> > >    if (folio_test_clear_lru())

Patch

diff --git a/mm/swap.c b/mm/swap.c
index cd8f0150ba3a..287cf7379927 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -968,6 +968,7 @@  void release_pages(release_pages_arg arg, int nr)
 
 	for (i = 0; i < nr; i++) {
 		struct folio *folio;
+		struct lruvec *prev_lruvec;
 
 		/* Turn any of the argument types into a folio */
 		folio = page_folio(encoded_page_ptr(encoded[i]));
@@ -996,9 +997,24 @@  void release_pages(release_pages_arg arg, int nr)
 				free_zone_device_page(&folio->page);
 			continue;
 		}
+		/*
+		 * lruvec->lock need to be prior to folio_put_testzero to
+		 * prevent race with folio_isolate_lru
+		 */
+		prev_lruvec = lruvec;
+		lruvec = folio_lruvec_relock_irqsave(folio, lruvec,
+				&flags);
+
+		if (prev_lruvec != lruvec)
+			lock_batch = 0;
 
-		if (!folio_put_testzero(folio))
+		if (!folio_put_testzero(folio)) {
+			if (lruvec) {
+				unlock_page_lruvec_irqrestore(lruvec, flags);
+				lruvec = NULL;
+			}
 			continue;
+		}
 
 		if (folio_test_large(folio)) {
 			if (lruvec) {
@@ -1010,13 +1026,6 @@  void release_pages(release_pages_arg arg, int nr)
 		}
 
 		if (folio_test_lru(folio)) {
-			struct lruvec *prev_lruvec = lruvec;
-
-			lruvec = folio_lruvec_relock_irqsave(folio, lruvec,
-									&flags);
-			if (prev_lruvec != lruvec)
-				lock_batch = 0;
-
 			lruvec_del_folio(lruvec, folio);
 			__folio_clear_lru_flags(folio);
 		}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4255619a1a31..13a4a716c67a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1721,18 +1721,31 @@  static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
 bool folio_isolate_lru(struct folio *folio)
 {
 	bool ret = false;
+	struct lruvec *lruvec;
 
 	VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio);
 
-	if (folio_test_clear_lru(folio)) {
-		struct lruvec *lruvec;
+	/*
+	 * The folio_get needs to be prior to clear lru for list integrity.
+	 * Otherwise:
+	 *   #0  folio_isolate_lru	      #1 release_pages
+	 *   if (folio_test_clear_lru())
+	 *				      if (folio_put_testzero())
+	 *				      if (!folio_test_lru())
+	 *				       <failed to del folio from LRU>
+	 *     folio_get(folio)
+	 *                                        list_add(&folio->lru,)
+	 *     list_del(&folio->lru,)
+	 */
+	lruvec = folio_lruvec_lock_irq(folio);
+	folio_get(folio);
 
-		folio_get(folio);
-		lruvec = folio_lruvec_lock_irq(folio);
+	if (folio_test_clear_lru(folio)) {
 		lruvec_del_folio(lruvec, folio);
-		unlock_page_lruvec_irq(lruvec);
 		ret = true;
-	}
+	} else
+		folio_put(folio);
+	unlock_page_lruvec_irq(lruvec);
 
 	return ret;
 }