
[v16,16/22] mm/mlock: reorder isolation sequence during munlock

Message ID 1594429136-20002-17-git-send-email-alex.shi@linux.alibaba.com (mailing list archive)
State New, archived
Series: per memcg lru_lock

Commit Message

Alex Shi July 11, 2020, 12:58 a.m. UTC
This patch reorders the isolation steps during munlock: move the lru lock
to guard each page and unfold the __munlock_isolate_lru_page() func, in
preparation for the lru lock change.

__split_huge_page_refcount doesn't exist any more, but we still have to guard
PageMlocked and PageLRU for the tail pages in __split_huge_page_tail().

[lkp@intel.com: found a sleeping function bug ... at mm/rmap.c]
Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org
---
 mm/mlock.c | 93 ++++++++++++++++++++++++++++++++++----------------------------
 1 file changed, 51 insertions(+), 42 deletions(-)

Comments

Alexander Duyck July 17, 2020, 8:30 p.m. UTC | #1
On Fri, Jul 10, 2020 at 5:59 PM Alex Shi <alex.shi@linux.alibaba.com> wrote:
>
> This patch reorders the isolation steps during munlock: move the lru lock
> to guard each page and unfold the __munlock_isolate_lru_page() func, in
> preparation for the lru lock change.
>
> __split_huge_page_refcount doesn't exist any more, but we still have to guard
> PageMlocked and PageLRU for the tail pages in __split_huge_page_tail().
>
> [lkp@intel.com: found a sleeping function bug ... at mm/rmap.c]
> Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com>
> Cc: Kirill A. Shutemov <kirill@shutemov.name>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: linux-mm@kvack.org
> Cc: linux-kernel@vger.kernel.org
> ---
>  mm/mlock.c | 93 ++++++++++++++++++++++++++++++++++----------------------------
>  1 file changed, 51 insertions(+), 42 deletions(-)
>
> diff --git a/mm/mlock.c b/mm/mlock.c
> index 228ba5a8e0a5..0bdde88b4438 100644
> --- a/mm/mlock.c
> +++ b/mm/mlock.c
> @@ -103,25 +103,6 @@ void mlock_vma_page(struct page *page)
>  }
>
>  /*
> - * Isolate a page from LRU with optional get_page() pin.
> - * Assumes lru_lock already held and page already pinned.
> - */
> -static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
> -{
> -       if (TestClearPageLRU(page)) {
> -               struct lruvec *lruvec;
> -
> -               lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> -               if (getpage)
> -                       get_page(page);
> -               del_page_from_lru_list(page, lruvec, page_lru(page));
> -               return true;
> -       }
> -
> -       return false;
> -}
> -
> -/*
>   * Finish munlock after successful page isolation
>   *
>   * Page must be locked. This is a wrapper for try_to_munlock()
> @@ -181,6 +162,7 @@ static void __munlock_isolation_failed(struct page *page)
>  unsigned int munlock_vma_page(struct page *page)
>  {
>         int nr_pages;
> +       bool clearlru = false;
>         pg_data_t *pgdat = page_pgdat(page);
>
>         /* For try_to_munlock() and to serialize with page migration */
> @@ -189,32 +171,42 @@ unsigned int munlock_vma_page(struct page *page)
>         VM_BUG_ON_PAGE(PageTail(page), page);
>
>         /*
> -        * Serialize with any parallel __split_huge_page_refcount() which
> +        * Serialize split tail pages in __split_huge_page_tail() which
>          * might otherwise copy PageMlocked to part of the tail pages before
>          * we clear it in the head page. It also stabilizes hpage_nr_pages().
>          */
> +       get_page(page);

I don't think this get_page() call needs to be up here. It could be
left down before we delete the page from the LRU list as it is really
needed to take a reference on the page before we call
__munlock_isolated_page(), or at least that is the way it looks to me.
By doing that you can avoid a bunch of cleanup in these exception
cases.

> +       clearlru = TestClearPageLRU(page);

I'm not sure I fully understand the reason for moving this here. By
clearing this flag before you clear Mlocked does this give you some
sort of extra protection? I don't see how since Mlocked doesn't
necessarily imply the page is on LRU.

>         spin_lock_irq(&pgdat->lru_lock);
>
>         if (!TestClearPageMlocked(page)) {
> -               /* Potentially, PTE-mapped THP: do not skip the rest PTEs */
> -               nr_pages = 1;
> -               goto unlock_out;
> +               if (clearlru)
> +                       SetPageLRU(page);
> +               /*
> +                * Potentially, PTE-mapped THP: do not skip the rest PTEs
> +                * Reuse lock as memory barrier for release_pages racing.
> +                */
> +               spin_unlock_irq(&pgdat->lru_lock);
> +               put_page(page);
> +               return 0;
>         }
>
>         nr_pages = hpage_nr_pages(page);
>         __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
>
> -       if (__munlock_isolate_lru_page(page, true)) {
> +       if (clearlru) {
> +               struct lruvec *lruvec;
> +

You could just place the get_page() call here.

> +               lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> +               del_page_from_lru_list(page, lruvec, page_lru(page));
>                 spin_unlock_irq(&pgdat->lru_lock);
>                 __munlock_isolated_page(page);
> -               goto out;
> +       } else {
> +               spin_unlock_irq(&pgdat->lru_lock);
> +               put_page(page);
> +               __munlock_isolation_failed(page);

If you move the get_page() as I suggested above there wouldn't be a
need for the put_page(). It then becomes possible to simplify the code
a bit by merging the unlock paths and doing an if/else with the
__munlock functions like so:
if (clearlru) {
    ...
    del_page_from_lru..
}

spin_unlock_irq()

if (clearlru)
    __munlock_isolated_page();
else
    __munlock_isolation_failed();

>         }
> -       __munlock_isolation_failed(page);
> -
> -unlock_out:
> -       spin_unlock_irq(&pgdat->lru_lock);
>
> -out:
>         return nr_pages - 1;
>  }
>
> @@ -297,34 +289,51 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
>         pagevec_init(&pvec_putback);
>
>         /* Phase 1: page isolation */
> -       spin_lock_irq(&zone->zone_pgdat->lru_lock);
>         for (i = 0; i < nr; i++) {
>                 struct page *page = pvec->pages[i];
> +               struct lruvec *lruvec;
> +               bool clearlru;
>
> -               if (TestClearPageMlocked(page)) {
> -                       /*
> -                        * We already have pin from follow_page_mask()
> -                        * so we can spare the get_page() here.
> -                        */
> -                       if (__munlock_isolate_lru_page(page, false))
> -                               continue;
> -                       else
> -                               __munlock_isolation_failed(page);
> -               } else {
> +               clearlru = TestClearPageLRU(page);
> +               spin_lock_irq(&zone->zone_pgdat->lru_lock);

I still don't see what you are gaining by moving the bit test up to
this point. Seems like it would be better left below with the lock
just being used to prevent a possible race while you are pulling the
page out of the LRU list.

> +
> +               if (!TestClearPageMlocked(page)) {
>                         delta_munlocked++;
> +                       if (clearlru)
> +                               SetPageLRU(page);
> +                       goto putback;
> +               }
> +
> +               if (!clearlru) {
> +                       __munlock_isolation_failed(page);
> +                       goto putback;
>                 }

With the other function you were processing this outside of the lock;
here you are doing it inside. It would probably make more sense here
to follow similar logic and take care of the del_page_from_lru_list()
if clearlru is set, unlock, and then if clearlru is set continue, else
track the isolation failure. That way you can avoid having to use as
many jump labels.
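
Roughly, something like this untested sketch (leaving aside my ordering
comment above, adding a local "mlocked" flag, and reusing the helpers
already in this patch):

		bool clearlru, mlocked;

		clearlru = TestClearPageLRU(page);
		spin_lock_irq(&zone->zone_pgdat->lru_lock);
		mlocked = TestClearPageMlocked(page);

		if (mlocked && clearlru) {
			/* isolate; we keep the follow_page_mask() pin */
			lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
			del_page_from_lru_list(page, lruvec, page_lru(page));
		} else if (clearlru) {
			/* not mlocked after all, leave it on the LRU */
			SetPageLRU(page);
		}

		spin_unlock_irq(&zone->zone_pgdat->lru_lock);

		if (mlocked && clearlru)
			continue;	/* munlock it in phase 2 */

		if (mlocked)
			__munlock_isolation_failed(page);
		else
			delta_munlocked++;

		/* putback: release the follow_page_mask() pin later */
		pagevec_add(&pvec_putback, pvec->pages[i]);
		pvec->pages[i] = NULL;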

>                 /*
> +                * Isolate this page.
> +                * We already have pin from follow_page_mask()
> +                * so we can spare the get_page() here.
> +                */
> +               lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> +               del_page_from_lru_list(page, lruvec, page_lru(page));
> +               spin_unlock_irq(&zone->zone_pgdat->lru_lock);
> +               continue;
> +
> +               /*
>                  * We won't be munlocking this page in the next phase
>                  * but we still need to release the follow_page_mask()
>                  * pin. We cannot do it under lru_lock however. If it's
>                  * the last pin, __page_cache_release() would deadlock.
>                  */
> +putback:
> +               spin_unlock_irq(&zone->zone_pgdat->lru_lock);
>                 pagevec_add(&pvec_putback, pvec->pages[i]);
>                 pvec->pages[i] = NULL;
>         }
> +       /* temporarily disable irq, will remove later */
> +       local_irq_disable();
>         __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
> -       spin_unlock_irq(&zone->zone_pgdat->lru_lock);
> +       local_irq_enable();
>
>         /* Now we can release pins of pages that we are not munlocking */
>         pagevec_release(&pvec_putback);
> --
> 1.8.3.1
>
>
Alex Shi July 19, 2020, 3:55 a.m. UTC | #2
在 2020/7/18 上午4:30, Alexander Duyck 写道:
> On Fri, Jul 10, 2020 at 5:59 PM Alex Shi <alex.shi@linux.alibaba.com> wrote:
>>
>> This patch reorders the isolation steps during munlock: move the lru lock
>> to guard each page and unfold the __munlock_isolate_lru_page() func, in
>> preparation for the lru lock change.
>>
>> __split_huge_page_refcount doesn't exist any more, but we still have to guard
>> PageMlocked and PageLRU for the tail pages in __split_huge_page_tail().
>>
>> [lkp@intel.com: found a sleeping function bug ... at mm/rmap.c]
>> Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com>
>> Cc: Kirill A. Shutemov <kirill@shutemov.name>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Johannes Weiner <hannes@cmpxchg.org>
>> Cc: Matthew Wilcox <willy@infradead.org>
>> Cc: Hugh Dickins <hughd@google.com>
>> Cc: linux-mm@kvack.org
>> Cc: linux-kernel@vger.kernel.org
>> ---
>>  mm/mlock.c | 93 ++++++++++++++++++++++++++++++++++----------------------------
>>  1 file changed, 51 insertions(+), 42 deletions(-)
>>
>> diff --git a/mm/mlock.c b/mm/mlock.c
>> index 228ba5a8e0a5..0bdde88b4438 100644
>> --- a/mm/mlock.c
>> +++ b/mm/mlock.c
>> @@ -103,25 +103,6 @@ void mlock_vma_page(struct page *page)
>>  }
>>
>>  /*
>> - * Isolate a page from LRU with optional get_page() pin.
>> - * Assumes lru_lock already held and page already pinned.
>> - */
>> -static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
>> -{
>> -       if (TestClearPageLRU(page)) {
>> -               struct lruvec *lruvec;
>> -
>> -               lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
>> -               if (getpage)
>> -                       get_page(page);
>> -               del_page_from_lru_list(page, lruvec, page_lru(page));
>> -               return true;
>> -       }
>> -
>> -       return false;
>> -}
>> -
>> -/*
>>   * Finish munlock after successful page isolation
>>   *
>>   * Page must be locked. This is a wrapper for try_to_munlock()
>> @@ -181,6 +162,7 @@ static void __munlock_isolation_failed(struct page *page)
>>  unsigned int munlock_vma_page(struct page *page)
>>  {
>>         int nr_pages;
>> +       bool clearlru = false;
>>         pg_data_t *pgdat = page_pgdat(page);
>>
>>         /* For try_to_munlock() and to serialize with page migration */
>> @@ -189,32 +171,42 @@ unsigned int munlock_vma_page(struct page *page)
>>         VM_BUG_ON_PAGE(PageTail(page), page);
>>
>>         /*
>> -        * Serialize with any parallel __split_huge_page_refcount() which
>> +        * Serialize split tail pages in __split_huge_page_tail() which
>>          * might otherwise copy PageMlocked to part of the tail pages before
>>          * we clear it in the head page. It also stabilizes hpage_nr_pages().
>>          */
>> +       get_page(page);
> 
> I don't think this get_page() call needs to be up here. It could be
> left down before we delete the page from the LRU list as it is really
> needed to take a reference on the page before we call
> __munlock_isolated_page(), or at least that is the way it looks to me.
> By doing that you can avoid a bunch of cleanup in these exception
> cases.

Uh, it seems unlikely that !page->_refcount happens here and we then get
into release_pages(); if so, get_page() could indeed move down.
Thanks

> 
>> +       clearlru = TestClearPageLRU(page);
> 
> I'm not sure I fully understand the reason for moving this here. By
> clearing this flag before you clear Mlocked does this give you some
> sort of extra protection? I don't see how since Mlocked doesn't
> necessarily imply the page is on LRU.
> 

The comments above give the reason for the lru_lock usage:
>> +        * Serialize split tail pages in __split_huge_page_tail() which
>>          * might otherwise copy PageMlocked to part of the tail pages before
>>          * we clear it in the head page. It also stabilizes hpage_nr_pages().

Looking into __split_huge_page_tail(), there is a tiny gap between the tail
page getting PG_mlocked and it being added into the lru list.
The TestClearPageLRU could block memcg changes of the page by stopping
isolate_lru_page().
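
Roughly, the window I mean looks like this (trimmed from my reading of
__split_huge_page_tail(), not the exact code):

	__split_huge_page_tail(head, tail, lruvec, list):
		page_tail->flags |= (head->flags &
				     (... | (1L << PG_mlocked) | ...));
		...
		/* <-- tiny gap: the tail already carries PG_mlocked,
		 *     but it is not on any lru list yet */
		lru_add_page_tail(head, page_tail, lruvec, list);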


>>         spin_lock_irq(&pgdat->lru_lock);
>>
>>         if (!TestClearPageMlocked(page)) {
>> -               /* Potentially, PTE-mapped THP: do not skip the rest PTEs */
>> -               nr_pages = 1;
>> -               goto unlock_out;
>> +               if (clearlru)
>> +                       SetPageLRU(page);
>> +               /*
>> +                * Potentially, PTE-mapped THP: do not skip the rest PTEs
>> +                * Reuse lock as memory barrier for release_pages racing.
>> +                */
>> +               spin_unlock_irq(&pgdat->lru_lock);
>> +               put_page(page);
>> +               return 0;
>>         }
>>
>>         nr_pages = hpage_nr_pages(page);
>>         __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
>>
>> -       if (__munlock_isolate_lru_page(page, true)) {
>> +       if (clearlru) {
>> +               struct lruvec *lruvec;
>> +
> 
> You could just place the get_page() call here.
> 
>> +               lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
>> +               del_page_from_lru_list(page, lruvec, page_lru(page));
>>                 spin_unlock_irq(&pgdat->lru_lock);
>>                 __munlock_isolated_page(page);
>> -               goto out;
>> +       } else {
>> +               spin_unlock_irq(&pgdat->lru_lock);
>> +               put_page(page);
>> +               __munlock_isolation_failed(page);
> 
> If you move the get_page() as I suggested above there wouldn't be a
> need for the put_page(). It then becomes possible to simplify the code
> a bit by merging the unlock paths and doing an if/else with the
> __munlock functions like so:
> if (clearlru) {
>     ...
>     del_page_from_lru..
> }
> 
> spin_unlock_irq()
> 
> if (clearlru)
>     __munlock_isolated_page();
> else
>     __munlock_isolation_failed();
> 
>>         }
>> -       __munlock_isolation_failed(page);
>> -
>> -unlock_out:
>> -       spin_unlock_irq(&pgdat->lru_lock);
>>
>> -out:
>>         return nr_pages - 1;
>>  }
>>
>> @@ -297,34 +289,51 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
>>         pagevec_init(&pvec_putback);
>>
>>         /* Phase 1: page isolation */
>> -       spin_lock_irq(&zone->zone_pgdat->lru_lock);
>>         for (i = 0; i < nr; i++) {
>>                 struct page *page = pvec->pages[i];
>> +               struct lruvec *lruvec;
>> +               bool clearlru;
>>
>> -               if (TestClearPageMlocked(page)) {
>> -                       /*
>> -                        * We already have pin from follow_page_mask()
>> -                        * so we can spare the get_page() here.
>> -                        */
>> -                       if (__munlock_isolate_lru_page(page, false))
>> -                               continue;
>> -                       else
>> -                               __munlock_isolation_failed(page);
>> -               } else {
>> +               clearlru = TestClearPageLRU(page);
>> +               spin_lock_irq(&zone->zone_pgdat->lru_lock);
> 
> I still don't see what you are gaining by moving the bit test up to
> this point. Seems like it would be better left below with the lock
> just being used to prevent a possible race while you are pulling the
> page out of the LRU list.
> 

The same reason as the above comments mention: the __split_huge_page_tail()
issue.

>> +
>> +               if (!TestClearPageMlocked(page)) {
>>                         delta_munlocked++;
>> +                       if (clearlru)
>> +                               SetPageLRU(page);
>> +                       goto putback;
>> +               }
>> +
>> +               if (!clearlru) {
>> +                       __munlock_isolation_failed(page);
>> +                       goto putback;
>>                 }
> 
> With the other function you were processing this outside of the lock;
> here you are doing it inside. It would probably make more sense here
> to follow similar logic and take care of the del_page_from_lru_list()
> if clearlru is set, unlock, and then if clearlru is set continue, else
> track the isolation failure. That way you can avoid having to use as
> many jump labels.
> 
>>                 /*
>> +                * Isolate this page.
>> +                * We already have pin from follow_page_mask()
>> +                * so we can spare the get_page() here.
>> +                */
>> +               lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
>> +               del_page_from_lru_list(page, lruvec, page_lru(page));
>> +               spin_unlock_irq(&zone->zone_pgdat->lru_lock);
>> +               continue;
>> +
>> +               /*
>>                  * We won't be munlocking this page in the next phase
>>                  * but we still need to release the follow_page_mask()
>>                  * pin. We cannot do it under lru_lock however. If it's
>>                  * the last pin, __page_cache_release() would deadlock.
>>                  */
>> +putback:
>> +               spin_unlock_irq(&zone->zone_pgdat->lru_lock);
>>                 pagevec_add(&pvec_putback, pvec->pages[i]);
>>                 pvec->pages[i] = NULL;
>>         }
>> +       /* temporarily disable irq, will remove later */
>> +       local_irq_disable();
>>         __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
>> -       spin_unlock_irq(&zone->zone_pgdat->lru_lock);
>> +       local_irq_enable();
>>
>>         /* Now we can release pins of pages that we are not munlocking */
>>         pagevec_release(&pvec_putback);
>> --
>> 1.8.3.1
>>
>>
Alexander Duyck July 20, 2020, 6:51 p.m. UTC | #3
On Sat, Jul 18, 2020 at 8:56 PM Alex Shi <alex.shi@linux.alibaba.com> wrote:
>
>
>
> 在 2020/7/18 上午4:30, Alexander Duyck 写道:
> > On Fri, Jul 10, 2020 at 5:59 PM Alex Shi <alex.shi@linux.alibaba.com> wrote:
> >>
> >> This patch reorders the isolation steps during munlock: move the lru lock
> >> to guard each page and unfold the __munlock_isolate_lru_page() func, in
> >> preparation for the lru lock change.
> >>
> >> __split_huge_page_refcount doesn't exist any more, but we still have to guard
> >> PageMlocked and PageLRU for the tail pages in __split_huge_page_tail().
> >>
> >> [lkp@intel.com: found a sleeping function bug ... at mm/rmap.c]
> >> Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com>
> >> Cc: Kirill A. Shutemov <kirill@shutemov.name>
> >> Cc: Andrew Morton <akpm@linux-foundation.org>
> >> Cc: Johannes Weiner <hannes@cmpxchg.org>
> >> Cc: Matthew Wilcox <willy@infradead.org>
> >> Cc: Hugh Dickins <hughd@google.com>
> >> Cc: linux-mm@kvack.org
> >> Cc: linux-kernel@vger.kernel.org
> >> ---
> >>  mm/mlock.c | 93 ++++++++++++++++++++++++++++++++++----------------------------
> >>  1 file changed, 51 insertions(+), 42 deletions(-)
> >>
> >> diff --git a/mm/mlock.c b/mm/mlock.c
> >> index 228ba5a8e0a5..0bdde88b4438 100644
> >> --- a/mm/mlock.c
> >> +++ b/mm/mlock.c
> >> @@ -103,25 +103,6 @@ void mlock_vma_page(struct page *page)
> >>  }
> >>
> >>  /*
> >> - * Isolate a page from LRU with optional get_page() pin.
> >> - * Assumes lru_lock already held and page already pinned.
> >> - */
> >> -static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
> >> -{
> >> -       if (TestClearPageLRU(page)) {
> >> -               struct lruvec *lruvec;
> >> -
> >> -               lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> >> -               if (getpage)
> >> -                       get_page(page);
> >> -               del_page_from_lru_list(page, lruvec, page_lru(page));
> >> -               return true;
> >> -       }
> >> -
> >> -       return false;
> >> -}
> >> -
> >> -/*
> >>   * Finish munlock after successful page isolation
> >>   *
> >>   * Page must be locked. This is a wrapper for try_to_munlock()
> >> @@ -181,6 +162,7 @@ static void __munlock_isolation_failed(struct page *page)
> >>  unsigned int munlock_vma_page(struct page *page)
> >>  {
> >>         int nr_pages;
> >> +       bool clearlru = false;
> >>         pg_data_t *pgdat = page_pgdat(page);
> >>
> >>         /* For try_to_munlock() and to serialize with page migration */
> >> @@ -189,32 +171,42 @@ unsigned int munlock_vma_page(struct page *page)
> >>         VM_BUG_ON_PAGE(PageTail(page), page);
> >>
> >>         /*
> >> -        * Serialize with any parallel __split_huge_page_refcount() which
> >> +        * Serialize split tail pages in __split_huge_page_tail() which
> >>          * might otherwise copy PageMlocked to part of the tail pages before
> >>          * we clear it in the head page. It also stabilizes hpage_nr_pages().
> >>          */
> >> +       get_page(page);
> >
> > I don't think this get_page() call needs to be up here. It could be
> > left down before we delete the page from the LRU list as it is really
> > needed to take a reference on the page before we call
> > __munlock_isolated_page(), or at least that is the way it looks to me.
> > By doing that you can avoid a bunch of cleanup in these exception
> > cases.
>
> Uh, it seems unlikely that !page->_refcount happens here and we then get
> into release_pages(); if so, get_page() could indeed move down.
> Thanks
>
> >
> >> +       clearlru = TestClearPageLRU(page);
> >
> > I'm not sure I fully understand the reason for moving this here. By
> > clearing this flag before you clear Mlocked does this give you some
> > sort of extra protection? I don't see how since Mlocked doesn't
> > necessarily imply the page is on LRU.
> >
>
> The comments above give the reason for the lru_lock usage,

I think things are getting confused here. The problem is that clearing
the page LRU flag is not the same as acquiring the LRU lock.

I was looking through patch 22 and it occured to me that the
documentation in __pagevec_lru_add_fn was never updated. My worry is
that it might have been overlooked, either that or maybe you discussed
it previously and I missed the discussion. There it calls out that you
either have to hold onto the LRU lock, or you have to test PageLRU
after clearing the Mlocked flag otherwise you risk introducing a race.
It seems to me like you could potentially just collapse the lock down
further if you are using it more in line with the 2b case as defined
there, rather than trying to still use it to protect the Mlocked flag
even though you have already pulled the LRU bit before taking the
lock. Either that, or this is more like pagevec_lru_move_fn(), in
which case you are already holding the LRU lock, so you just need to
do the test-and-clear before trying to pull the page off of the LRU
list.
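
If I am remembering that comment right, the ordering it requires for the
2b case is roughly this (paraphrased from memory, not an exact quote):

	__pagevec_lru_add_fn():			clear_page_mlock() (case 2b):
	SetPageLRU(page)			TestClearPageMlocked(page)
	smp_mb()				smp_mb() /* implied by the atomic */
	if (PageMlocked(page))			if (PageLRU(page))
		keep it off the evictable lists		isolate and rescue the page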

> >> +        * Serialize split tail pages in __split_huge_page_tail() which
> >>          * might otherwise copy PageMlocked to part of the tail pages before
> >>          * we clear it in the head page. It also stabilizes hpage_nr_pages().
>
> Looking into __split_huge_page_tail(), there is a tiny gap between the tail
> page getting PG_mlocked and it being added into the lru list.
> The TestClearPageLRU could block memcg changes of the page by stopping
> isolate_lru_page().

I get that there is a gap between the two in __split_huge_page_tail.
My concern is more the fact that you are pulling the bit testing
outside of the locked region when I don't think it needs to be. The
lock is being taken unconditionally, so why pull the testing out when
you could just do it inside the lock anyway? My worry is that you
might be addressing __split_huge_page_tail but in the process you
might be introducing a new race with something like
__pagevec_lru_add_fn.

If I am not mistaken the Mlocked flag can still be cleared regardless
of whether the LRU bit is set or not. So you can still clear the LRU bit
before you pull the page out of the list, but it can be done after
clearing the Mlocked flag instead of before you have even taken the
LRU lock. In that way it would function more similar to how you
handled pagevec_lru_move_fn() as all this function is really doing is
moving the pages out of the unevictable list into one of the other LRU
lists anyway since the Mlocked flag was cleared.
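
In other words, something like this (untested sketch, keeping the
existing helpers and return values from the patch):

	spin_lock_irq(&pgdat->lru_lock);

	if (!TestClearPageMlocked(page)) {
		/* Potentially, PTE-mapped THP: do not skip the rest PTEs */
		spin_unlock_irq(&pgdat->lru_lock);
		return 0;
	}

	nr_pages = hpage_nr_pages(page);
	__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);

	if (TestClearPageLRU(page)) {
		struct lruvec *lruvec;

		lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
		get_page(page);
		del_page_from_lru_list(page, lruvec, page_lru(page));
		spin_unlock_irq(&pgdat->lru_lock);
		__munlock_isolated_page(page);
	} else {
		spin_unlock_irq(&pgdat->lru_lock);
		__munlock_isolation_failed(page);
	}

	return nr_pages - 1;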

> >>         spin_lock_irq(&pgdat->lru_lock);
> >>
> >>         if (!TestClearPageMlocked(page)) {
> >> -               /* Potentially, PTE-mapped THP: do not skip the rest PTEs */
> >> -               nr_pages = 1;
> >> -               goto unlock_out;
> >> +               if (clearlru)
> >> +                       SetPageLRU(page);
> >> +               /*
> >> +                * Potentially, PTE-mapped THP: do not skip the rest PTEs
> >> +                * Reuse lock as memory barrier for release_pages racing.
> >> +                */
> >> +               spin_unlock_irq(&pgdat->lru_lock);
> >> +               put_page(page);
> >> +               return 0;
> >>         }
> >>
> >>         nr_pages = hpage_nr_pages(page);
> >>         __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
> >>
> >> -       if (__munlock_isolate_lru_page(page, true)) {
> >> +       if (clearlru) {
> >> +               struct lruvec *lruvec;
> >> +
> >
> > You could just place the get_page() call here.
> >
> >> +               lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> >> +               del_page_from_lru_list(page, lruvec, page_lru(page));
> >>                 spin_unlock_irq(&pgdat->lru_lock);
> >>                 __munlock_isolated_page(page);
> >> -               goto out;
> >> +       } else {
> >> +               spin_unlock_irq(&pgdat->lru_lock);
> >> +               put_page(page);
> >> +               __munlock_isolation_failed(page);
> >
> > If you move the get_page() as I suggested above there wouldn't be a
> > need for the put_page(). It then becomes possible to simplify the code
> > a bit by merging the unlock paths and doing an if/else with the
> > __munlock functions like so:
> > if (clearlru) {
> >     ...
> >     del_page_from_lru..
> > }
> >
> > spin_unlock_irq()
> >
> > if (clearlru)
> >     __munlock_isolated_page();
> > else
> >     __munlock_isolation_failed();
> >
> >>         }
> >> -       __munlock_isolation_failed(page);
> >> -
> >> -unlock_out:
> >> -       spin_unlock_irq(&pgdat->lru_lock);
> >>
> >> -out:
> >>         return nr_pages - 1;
> >>  }
> >>
> >> @@ -297,34 +289,51 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
> >>         pagevec_init(&pvec_putback);
> >>
> >>         /* Phase 1: page isolation */
> >> -       spin_lock_irq(&zone->zone_pgdat->lru_lock);
> >>         for (i = 0; i < nr; i++) {
> >>                 struct page *page = pvec->pages[i];
> >> +               struct lruvec *lruvec;
> >> +               bool clearlru;
> >>
> >> -               if (TestClearPageMlocked(page)) {
> >> -                       /*
> >> -                        * We already have pin from follow_page_mask()
> >> -                        * so we can spare the get_page() here.
> >> -                        */
> >> -                       if (__munlock_isolate_lru_page(page, false))
> >> -                               continue;
> >> -                       else
> >> -                               __munlock_isolation_failed(page);
> >> -               } else {
> >> +               clearlru = TestClearPageLRU(page);
> >> +               spin_lock_irq(&zone->zone_pgdat->lru_lock);
> >
> > I still don't see what you are gaining by moving the bit test up to
> > this point. Seems like it would be better left below with the lock
> > just being used to prevent a possible race while you are pulling the
> > page out of the LRU list.
> >
>
> The same reason as the above comments mention: the __split_huge_page_tail()
> issue.

I have the same argument here as above. The LRU lock is being used to
protect the Mlocked flag, as such there isn't a need to move the
get_page and clearing of the LRU flag up.  The get_page() call isn't
needed until just before we delete the page from the LRU list, and the
clearing isn't really needed until after we have already cleared the
Mlocked flag to see if we even have any work that we have to do, but
we do need to clear it before we are allowed to delete the page from
the LRU list.
Alex Shi July 21, 2020, 9:26 a.m. UTC | #4
在 2020/7/21 上午2:51, Alexander Duyck 写道:
>> Looking into __split_huge_page_tail(), there is a tiny gap between the tail
>> page getting PG_mlocked and it being added into the lru list.
>> The TestClearPageLRU could block memcg changes of the page by stopping
>> isolate_lru_page().
> I get that there is a gap between the two in __split_huge_page_tail.
> My concern is more the fact that you are pulling the bit testing
> outside of the locked region when I don't think it needs to be. The
> lock is being taken unconditionally, so why pull the testing out when
> you could just do it inside the lock anyway? My worry is that you
> might be addressing __split_huge_page_tail but in the process you
> might be introducing a new race with something like
> __pagevec_lru_add_fn.

Yes, the page may be interfered with by clear_page_mlock() and get added to
the wrong lru list.

> 
> If I am not mistaken the Mlocked flag can still be cleared regardless
> of if the LRU bit is set or not. So you can still clear the LRU bit
> before you pull the page out of the list, but it can be done after
> clearing the Mlocked flag instead of before you have even taken the
> LRU lock. In that way it would function more similar to how you
> handled pagevec_lru_move_fn() as all this function is really doing is
> moving the pages out of the unevictable list into one of the other LRU
> lists anyway since the Mlocked flag was cleared.
> 

Without the lru bit guard, the page may be moved between memcgs; luckily,
lock_page() would stop mem_cgroup_move_account() at the cost of a BUSY state.
The whole new change would look like the following; I will test and resend it again.

Thanks!
Alex

@@ -182,7 +179,7 @@ static void __munlock_isolation_failed(struct page *page)
 unsigned int munlock_vma_page(struct page *page)
 {
        int nr_pages;
-       pg_data_t *pgdat = page_pgdat(page);
+       struct lruvec *lruvec;

        /* For try_to_munlock() and to serialize with page migration */
        BUG_ON(!PageLocked(page));
@@ -190,11 +187,11 @@ unsigned int munlock_vma_page(struct page *page)
        VM_BUG_ON_PAGE(PageTail(page), page);

        /*
-        * Serialize with any parallel __split_huge_page_refcount() which
+        * Serialize split tail pages in __split_huge_page_tail() which
         * might otherwise copy PageMlocked to part of the tail pages before
         * we clear it in the head page. It also stabilizes hpage_nr_pages().
         */
-       spin_lock_irq(&pgdat->lru_lock);
+       lruvec = lock_page_lruvec_irq(page);

        if (!TestClearPageMlocked(page)) {
                /* Potentially, PTE-mapped THP: do not skip the rest PTEs */
@@ -205,15 +202,15 @@ unsigned int munlock_vma_page(struct page *page)
        nr_pages = hpage_nr_pages(page);
        __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);

-       if (__munlock_isolate_lru_page(page, true)) {
-               spin_unlock_irq(&pgdat->lru_lock);
+       if (__munlock_isolate_lru_page(page, lruvec, true)) {
+               unlock_page_lruvec_irq(lruvec);
                __munlock_isolated_page(page);
                goto out;
        }
        __munlock_isolation_failed(page);

 unlock_out:
-       spin_unlock_irq(&pgdat->lru_lock);
+       unlock_page_lruvec_irq(lruvec);

 out:
        return nr_pages - 1;
@@ -293,23 +290,27 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
        int nr = pagevec_count(pvec);
        int delta_munlocked = -nr;
        struct pagevec pvec_putback;
+       struct lruvec *lruvec = NULL;
        int pgrescued = 0;

        pagevec_init(&pvec_putback);

        /* Phase 1: page isolation */
-       spin_lock_irq(&zone->zone_pgdat->lru_lock);
        for (i = 0; i < nr; i++) {
                struct page *page = pvec->pages[i];

+               /* block memcg change in mem_cgroup_move_account */
+               lock_page(page);
+               lruvec = relock_page_lruvec_irq(page, lruvec);
                if (TestClearPageMlocked(page)) {
                        /*
                         * We already have pin from follow_page_mask()
                         * so we can spare the get_page() here.
                         */
-                       if (__munlock_isolate_lru_page(page, false))
+                       if (__munlock_isolate_lru_page(page, lruvec, false)) {
+                               unlock_page(page);
                                continue;
-                       else
+                       } else
                                __munlock_isolation_failed(page);
                } else {
                        delta_munlocked++;
@@ -321,11 +322,14 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
                 * pin. We cannot do it under lru_lock however. If it's
                 * the last pin, __page_cache_release() would deadlock.
                 */
+               unlock_page(page);
                pagevec_add(&pvec_putback, pvec->pages[i]);
                pvec->pages[i] = NULL;
        }
-       __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
-       spin_unlock_irq(&zone->zone_pgdat->lru_lock);
+       if (lruvec) {
+               __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
+               unlock_page_lruvec_irq(lruvec);
+       }

        /* Now we can release pins of pages that we are not munlocking */
        pagevec_release(&pvec_putback);
Alex Shi July 21, 2020, 1:51 p.m. UTC | #5
在 2020/7/21 下午5:26, Alex Shi 写道:
> 
> 
> 在 2020/7/21 上午2:51, Alexander Duyck 写道:
>>> Looking into __split_huge_page_tail(), there is a tiny gap between the tail
>>> page getting PG_mlocked and it being added into the lru list.
>>> The TestClearPageLRU could block memcg changes of the page by stopping
>>> isolate_lru_page().
>> I get that there is a gap between the two in __split_huge_page_tail.
>> My concern is more the fact that you are pulling the bit testing
>> outside of the locked region when I don't think it needs to be. The
>> lock is being taken unconditionally, so why pull the testing out when
>> you could just do it inside the lock anyway? My worry is that you
>> might be addressing __split_huge_page_tail but in the process you
>> might be introducing a new race with something like
>> __pagevec_lru_add_fn.
> 
> Yes, the page may be interfered with by clear_page_mlock() and get added to
> the wrong lru list.
> 
>>
>> If I am not mistaken the Mlocked flag can still be cleared regardless
>> of if the LRU bit is set or not. So you can still clear the LRU bit
>> before you pull the page out of the list, but it can be done after
>> clearing the Mlocked flag instead of before you have even taken the
>> LRU lock. In that way it would function more similar to how you
>> handled pagevec_lru_move_fn() as all this function is really doing is
>> moving the pages out of the unevictable list into one of the other LRU
>> lists anyway since the Mlocked flag was cleared.
>>
> 
> Without the lru bit guard, the page may be moved between memcgs; luckily,
> lock_page() would stop mem_cgroup_move_account() at the cost of a BUSY state.
> The whole new change would look like the following; I will test and resend it again.
> 

Hi Johannes,

It looks like lock_page_memcg() could be used to replace lock_page(), which
would turn the retry into a spinlock wait. Would you like to give some comments?
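
For example, the Phase 1 loop above would change roughly like this (only to
illustrate the substitution, not tested):

 		/* block memcg change in mem_cgroup_move_account */
-		lock_page(page);
+		lock_page_memcg(page);
 		lruvec = relock_page_lruvec_irq(page, lruvec);
 		...
-		unlock_page(page);
+		unlock_page_memcg(page);

(and the same for the unlock_page() in the isolate path).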

Thanks
Alex
> Thanks!
> Alex
> 
> @@ -182,7 +179,7 @@ static void __munlock_isolation_failed(struct page *page)
>  unsigned int munlock_vma_page(struct page *page)
>  {
>         int nr_pages;
> -       pg_data_t *pgdat = page_pgdat(page);
> +       struct lruvec *lruvec;
> 
>         /* For try_to_munlock() and to serialize with page migration */
>         BUG_ON(!PageLocked(page));
> @@ -190,11 +187,11 @@ unsigned int munlock_vma_page(struct page *page)
>         VM_BUG_ON_PAGE(PageTail(page), page);
> 
>         /*
> -        * Serialize with any parallel __split_huge_page_refcount() which
> +        * Serialize split tail pages in __split_huge_page_tail() which
>          * might otherwise copy PageMlocked to part of the tail pages before
>          * we clear it in the head page. It also stabilizes hpage_nr_pages().
>          */
> -       spin_lock_irq(&pgdat->lru_lock);
> +       lruvec = lock_page_lruvec_irq(page);
> 
>         if (!TestClearPageMlocked(page)) {
>                 /* Potentially, PTE-mapped THP: do not skip the rest PTEs */
> @@ -205,15 +202,15 @@ unsigned int munlock_vma_page(struct page *page)
>         nr_pages = hpage_nr_pages(page);
>         __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
> 
> -       if (__munlock_isolate_lru_page(page, true)) {
> -               spin_unlock_irq(&pgdat->lru_lock);
> +       if (__munlock_isolate_lru_page(page, lruvec, true)) {
> +               unlock_page_lruvec_irq(lruvec);
>                 __munlock_isolated_page(page);
>                 goto out;
>         }
>         __munlock_isolation_failed(page);
> 
>  unlock_out:
> -       spin_unlock_irq(&pgdat->lru_lock);
> +       unlock_page_lruvec_irq(lruvec);
> 
>  out:
>         return nr_pages - 1;
> @@ -293,23 +290,27 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
>         int nr = pagevec_count(pvec);
>         int delta_munlocked = -nr;
>         struct pagevec pvec_putback;
> +       struct lruvec *lruvec = NULL;
>         int pgrescued = 0;
> 
>         pagevec_init(&pvec_putback);
> 
>         /* Phase 1: page isolation */
> -       spin_lock_irq(&zone->zone_pgdat->lru_lock);
>         for (i = 0; i < nr; i++) {
>                 struct page *page = pvec->pages[i];
> 
> +               /* block memcg change in mem_cgroup_move_account */
> +               lock_page(page);
> +               lruvec = relock_page_lruvec_irq(page, lruvec);
>                 if (TestClearPageMlocked(page)) {
>                         /*
>                          * We already have pin from follow_page_mask()
>                          * so we can spare the get_page() here.
>                          */
> -                       if (__munlock_isolate_lru_page(page, false))
> +                       if (__munlock_isolate_lru_page(page, lruvec, false)) {
> +                               unlock_page(page);
>                                 continue;
> -                       else
> +                       } else
>                                 __munlock_isolation_failed(page);
>                 } else {
>                         delta_munlocked++;
> @@ -321,11 +322,14 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
>                  * pin. We cannot do it under lru_lock however. If it's
>                  * the last pin, __page_cache_release() would deadlock.
>                  */
> +               unlock_page(page);
>                 pagevec_add(&pvec_putback, pvec->pages[i]);
>                 pvec->pages[i] = NULL;
>         }
> -       __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
> -       spin_unlock_irq(&zone->zone_pgdat->lru_lock);
> +       if (lruvec) {
> +               __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
> +               unlock_page_lruvec_irq(lruvec);
> +       }
> 
>         /* Now we can release pins of pages that we are not munlocking */
>         pagevec_release(&pvec_putback);
>

Patch

diff --git a/mm/mlock.c b/mm/mlock.c
index 228ba5a8e0a5..0bdde88b4438 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -103,25 +103,6 @@  void mlock_vma_page(struct page *page)
 }
 
 /*
- * Isolate a page from LRU with optional get_page() pin.
- * Assumes lru_lock already held and page already pinned.
- */
-static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
-{
-	if (TestClearPageLRU(page)) {
-		struct lruvec *lruvec;
-
-		lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
-		if (getpage)
-			get_page(page);
-		del_page_from_lru_list(page, lruvec, page_lru(page));
-		return true;
-	}
-
-	return false;
-}
-
-/*
  * Finish munlock after successful page isolation
  *
  * Page must be locked. This is a wrapper for try_to_munlock()
@@ -181,6 +162,7 @@  static void __munlock_isolation_failed(struct page *page)
 unsigned int munlock_vma_page(struct page *page)
 {
 	int nr_pages;
+	bool clearlru = false;
 	pg_data_t *pgdat = page_pgdat(page);
 
 	/* For try_to_munlock() and to serialize with page migration */
@@ -189,32 +171,42 @@  unsigned int munlock_vma_page(struct page *page)
 	VM_BUG_ON_PAGE(PageTail(page), page);
 
 	/*
-	 * Serialize with any parallel __split_huge_page_refcount() which
+	 * Serialize split tail pages in __split_huge_page_tail() which
 	 * might otherwise copy PageMlocked to part of the tail pages before
 	 * we clear it in the head page. It also stabilizes hpage_nr_pages().
 	 */
+	get_page(page);
+	clearlru = TestClearPageLRU(page);
 	spin_lock_irq(&pgdat->lru_lock);
 
 	if (!TestClearPageMlocked(page)) {
-		/* Potentially, PTE-mapped THP: do not skip the rest PTEs */
-		nr_pages = 1;
-		goto unlock_out;
+		if (clearlru)
+			SetPageLRU(page);
+		/*
+		 * Potentially, PTE-mapped THP: do not skip the rest PTEs
+		 * Reuse lock as memory barrier for release_pages racing.
+		 */
+		spin_unlock_irq(&pgdat->lru_lock);
+		put_page(page);
+		return 0;
 	}
 
 	nr_pages = hpage_nr_pages(page);
 	__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
 
-	if (__munlock_isolate_lru_page(page, true)) {
+	if (clearlru) {
+		struct lruvec *lruvec;
+
+		lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+		del_page_from_lru_list(page, lruvec, page_lru(page));
 		spin_unlock_irq(&pgdat->lru_lock);
 		__munlock_isolated_page(page);
-		goto out;
+	} else {
+		spin_unlock_irq(&pgdat->lru_lock);
+		put_page(page);
+		__munlock_isolation_failed(page);
 	}
-	__munlock_isolation_failed(page);
-
-unlock_out:
-	spin_unlock_irq(&pgdat->lru_lock);
 
-out:
 	return nr_pages - 1;
 }
 
@@ -297,34 +289,51 @@  static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
 	pagevec_init(&pvec_putback);
 
 	/* Phase 1: page isolation */
-	spin_lock_irq(&zone->zone_pgdat->lru_lock);
 	for (i = 0; i < nr; i++) {
 		struct page *page = pvec->pages[i];
+		struct lruvec *lruvec;
+		bool clearlru;
 
-		if (TestClearPageMlocked(page)) {
-			/*
-			 * We already have pin from follow_page_mask()
-			 * so we can spare the get_page() here.
-			 */
-			if (__munlock_isolate_lru_page(page, false))
-				continue;
-			else
-				__munlock_isolation_failed(page);
-		} else {
+		clearlru = TestClearPageLRU(page);
+		spin_lock_irq(&zone->zone_pgdat->lru_lock);
+
+		if (!TestClearPageMlocked(page)) {
 			delta_munlocked++;
+			if (clearlru)
+				SetPageLRU(page);
+			goto putback;
+		}
+
+		if (!clearlru) {
+			__munlock_isolation_failed(page);
+			goto putback;
 		}
 
 		/*
+		 * Isolate this page.
+		 * We already have pin from follow_page_mask()
+		 * so we can spare the get_page() here.
+		 */
+		lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+		del_page_from_lru_list(page, lruvec, page_lru(page));
+		spin_unlock_irq(&zone->zone_pgdat->lru_lock);
+		continue;
+
+		/*
 		 * We won't be munlocking this page in the next phase
 		 * but we still need to release the follow_page_mask()
 		 * pin. We cannot do it under lru_lock however. If it's
 		 * the last pin, __page_cache_release() would deadlock.
 		 */
+putback:
+		spin_unlock_irq(&zone->zone_pgdat->lru_lock);
 		pagevec_add(&pvec_putback, pvec->pages[i]);
 		pvec->pages[i] = NULL;
 	}
+	/* temporarily disable irq, will remove later */
+	local_irq_disable();
 	__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
-	spin_unlock_irq(&zone->zone_pgdat->lru_lock);
+	local_irq_enable();
 
 	/* Now we can release pins of pages that we are not munlocking */
 	pagevec_release(&pvec_putback);