
[RFC,v2,2/3] mm: handle large folio when large folio in VM_LOCKED VMA range

Message ID 20230712060144.3006358-3-fengwei.yin@intel.com (mailing list archive)
State New
Series: support large folio for mlock

Commit Message

Yin, Fengwei July 12, 2023, 6:01 a.m. UTC
If a large folio is within the range of a VM_LOCKED VMA, it should be
mlocked to avoid being picked by page reclaim, which may split the
large folio and then mlock each page again.

Mlock this kind of large folio to prevent it from being picked by
page reclaim.

For a large folio which crosses the boundary of a VM_LOCKED VMA, it is
better not to mlock it. Then, if the system is under memory pressure,
this kind of large folio can be split and the pages outside the
VM_LOCKED VMA can be reclaimed.

Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
---
 mm/internal.h | 11 ++++++++---
 mm/rmap.c     | 34 +++++++++++++++++++++++++++-------
 2 files changed, 35 insertions(+), 10 deletions(-)

Comments

Yu Zhao July 12, 2023, 6:23 a.m. UTC | #1
On Wed, Jul 12, 2023 at 12:02 AM Yin Fengwei <fengwei.yin@intel.com> wrote:
>
> If large folio is in the range of VM_LOCKED VMA, it should be
> mlocked to avoid being picked by page reclaim. Which may split
> the large folio and then mlock each pages again.
>
> Mlock this kind of large folio to prevent them being picked by
> page reclaim.
>
> For the large folio which cross the boundary of VM_LOCKED VMA,
> we'd better not to mlock it. So if the system is under memory
> pressure, this kind of large folio will be split and the pages
> ouf of VM_LOCKED VMA can be reclaimed.
>
> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
> ---
>  mm/internal.h | 11 ++++++++---
>  mm/rmap.c     | 34 +++++++++++++++++++++++++++-------
>  2 files changed, 35 insertions(+), 10 deletions(-)
>
> diff --git a/mm/internal.h b/mm/internal.h
> index c7dd15d8de3ef..776141de2797a 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -643,7 +643,8 @@ static inline void mlock_vma_folio(struct folio *folio,
>          *    still be set while VM_SPECIAL bits are added: so ignore it then.
>          */
>         if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
> -           (compound || !folio_test_large(folio)))
> +           (compound || !folio_test_large(folio) ||
> +           folio_in_range(folio, vma, vma->vm_start, vma->vm_end)))
>                 mlock_folio(folio);
>  }

This can be simplified:
1. remove the compound parameter
2. make the if
        if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
            folio_within_vma())
                mlock_folio(folio);

> @@ -651,8 +652,12 @@ void munlock_folio(struct folio *folio);
>  static inline void munlock_vma_folio(struct folio *folio,
>                         struct vm_area_struct *vma, bool compound)

Remove the compound parameter here too.

>  {
> -       if (unlikely(vma->vm_flags & VM_LOCKED) &&
> -           (compound || !folio_test_large(folio)))
> +       /*
> +        * To handle the case that a mlocked large folio is unmapped from VMA
> +        * piece by piece, allow munlock the large folio which is partially
> +        * mapped to VMA.
> +        */
> +       if (unlikely(vma->vm_flags & VM_LOCKED))
>                 munlock_folio(folio);
>  }
>
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 2668f5ea35342..455f415d8d9ca 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -803,6 +803,14 @@ struct folio_referenced_arg {
>         unsigned long vm_flags;
>         struct mem_cgroup *memcg;
>  };
> +
> +static inline bool should_restore_mlock(struct folio *folio,
> +               struct vm_area_struct *vma, bool pmd_mapped)
> +{
> +       return !folio_test_large(folio) ||
> +                       pmd_mapped || folio_within_vma(folio, vma);
> +}

This is just folio_within_vma() :)

>  /*
>   * arg: folio_referenced_arg will be passed
>   */
> @@ -816,13 +824,25 @@ static bool folio_referenced_one(struct folio *folio,
>         while (page_vma_mapped_walk(&pvmw)) {
>                 address = pvmw.address;
>
> -               if ((vma->vm_flags & VM_LOCKED) &&
> -                   (!folio_test_large(folio) || !pvmw.pte)) {
> -                       /* Restore the mlock which got missed */
> -                       mlock_vma_folio(folio, vma, !pvmw.pte);
> -                       page_vma_mapped_walk_done(&pvmw);
> -                       pra->vm_flags |= VM_LOCKED;
> -                       return false; /* To break the loop */
> +               if (vma->vm_flags & VM_LOCKED) {
> +                       if (should_restore_mlock(folio, vma, !pvmw.pte)) {
> +                               /* Restore the mlock which got missed */
> +                               mlock_vma_folio(folio, vma, !pvmw.pte);
> +                               page_vma_mapped_walk_done(&pvmw);
> +                               pra->vm_flags |= VM_LOCKED;
> +                               return false; /* To break the loop */
> +                       } else {

There is no need for "else", or just

  if (!folio_within_vma())
    goto dec_pra_mapcount;

> +                               /*
> +                                * For large folio cross VMA boundaries, it's
> +                                * expected to be picked  by page reclaim. But
> +                                * should skip reference of pages which are in
> +                                * the range of VM_LOCKED vma. As page reclaim
> +                                * should just count the reference of pages out
> +                                * the range of VM_LOCKED vma.
> +                                */
> +                               pra->mapcount--;
> +                               continue;
> +                       }
>                 }
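
Taken together, the two suggestions above would reduce the helper to
roughly the shape below. This is only a sketch: it assumes the
folio_within_vma(folio, vma) helper used elsewhere in this patch, and
that all callers drop the compound argument as well.

static inline void mlock_vma_folio(struct folio *folio,
                struct vm_area_struct *vma)
{
        /*
         * Only mlock a folio that is fully mapped inside this VM_LOCKED
         * VMA; as before, VM_LOCKED may still be set while VM_SPECIAL
         * bits are being added, so ignore that case.
         */
        if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
            folio_within_vma(folio, vma))
                mlock_folio(folio);
}
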
Yin, Fengwei July 12, 2023, 6:43 a.m. UTC | #2
On 7/12/23 14:23, Yu Zhao wrote:
> On Wed, Jul 12, 2023 at 12:02 AM Yin Fengwei <fengwei.yin@intel.com> wrote:
>>
>> If large folio is in the range of VM_LOCKED VMA, it should be
>> mlocked to avoid being picked by page reclaim. Which may split
>> the large folio and then mlock each pages again.
>>
>> Mlock this kind of large folio to prevent them being picked by
>> page reclaim.
>>
>> For the large folio which cross the boundary of VM_LOCKED VMA,
>> we'd better not to mlock it. So if the system is under memory
>> pressure, this kind of large folio will be split and the pages
>> ouf of VM_LOCKED VMA can be reclaimed.
>>
>> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
>> ---
>>  mm/internal.h | 11 ++++++++---
>>  mm/rmap.c     | 34 +++++++++++++++++++++++++++-------
>>  2 files changed, 35 insertions(+), 10 deletions(-)
>>
>> diff --git a/mm/internal.h b/mm/internal.h
>> index c7dd15d8de3ef..776141de2797a 100644
>> --- a/mm/internal.h
>> +++ b/mm/internal.h
>> @@ -643,7 +643,8 @@ static inline void mlock_vma_folio(struct folio *folio,
>>          *    still be set while VM_SPECIAL bits are added: so ignore it then.
>>          */
>>         if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
>> -           (compound || !folio_test_large(folio)))
>> +           (compound || !folio_test_large(folio) ||
>> +           folio_in_range(folio, vma, vma->vm_start, vma->vm_end)))
>>                 mlock_folio(folio);
>>  }
> 
> This can be simplified:
> 1. remove the compound parameter
Yes. There is no difference here between pmd mappings of THPs and pte mappings of THPs
if the only condition that needs checking is whether the folio is within the VMA range or not.

But let me add Hugh for confirmation.


> 2. make the if
>         if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
>             folio_within_vma())
>                 mlock_folio(folio);
!folio_test_large(folio) was kept here on purpose. For a normal 4K page, there is no need
to call folio_within_vma(), which is heavy for a normal 4K page.


> 
>> @@ -651,8 +652,12 @@ void munlock_folio(struct folio *folio);
>>  static inline void munlock_vma_folio(struct folio *folio,
>>                         struct vm_area_struct *vma, bool compound)
> 
> Remove the compound parameter here too.
> 
>>  {
>> -       if (unlikely(vma->vm_flags & VM_LOCKED) &&
>> -           (compound || !folio_test_large(folio)))
>> +       /*
>> +        * To handle the case that a mlocked large folio is unmapped from VMA
>> +        * piece by piece, allow munlock the large folio which is partially
>> +        * mapped to VMA.
>> +        */
>> +       if (unlikely(vma->vm_flags & VM_LOCKED))
>>                 munlock_folio(folio);
>>  }
>>
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index 2668f5ea35342..455f415d8d9ca 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -803,6 +803,14 @@ struct folio_referenced_arg {
>>         unsigned long vm_flags;
>>         struct mem_cgroup *memcg;
>>  };
>> +
>> +static inline bool should_restore_mlock(struct folio *folio,
>> +               struct vm_area_struct *vma, bool pmd_mapped)
>> +{
>> +       return !folio_test_large(folio) ||
>> +                       pmd_mapped || folio_within_vma(folio, vma);
>> +}
> 
> This is just folio_within_vma() :)
> 
>>  /*
>>   * arg: folio_referenced_arg will be passed
>>   */
>> @@ -816,13 +824,25 @@ static bool folio_referenced_one(struct folio *folio,
>>         while (page_vma_mapped_walk(&pvmw)) {
>>                 address = pvmw.address;
>>
>> -               if ((vma->vm_flags & VM_LOCKED) &&
>> -                   (!folio_test_large(folio) || !pvmw.pte)) {
>> -                       /* Restore the mlock which got missed */
>> -                       mlock_vma_folio(folio, vma, !pvmw.pte);
>> -                       page_vma_mapped_walk_done(&pvmw);
>> -                       pra->vm_flags |= VM_LOCKED;
>> -                       return false; /* To break the loop */
>> +               if (vma->vm_flags & VM_LOCKED) {
>> +                       if (should_restore_mlock(folio, vma, !pvmw.pte)) {
>> +                               /* Restore the mlock which got missed */
>> +                               mlock_vma_folio(folio, vma, !pvmw.pte);
>> +                               page_vma_mapped_walk_done(&pvmw);
>> +                               pra->vm_flags |= VM_LOCKED;
>> +                               return false; /* To break the loop */
>> +                       } else {
> 
> There is no need for "else", or just
> 
>   if (!folio_within_vma())
>     goto dec_pra_mapcount;
I try to avoid using goto as much as possible. I suppose you mean:

    if (!should_restore_mlock())
        goto dec_pra_mapcount; (I may use continue here. :)).

    mlock_vma_folio();
    page_vma_mapped_walk_done()
   ...

Right?


Regards
Yin, Fengwei

> 
>> +                               /*
>> +                                * For large folio cross VMA boundaries, it's
>> +                                * expected to be picked  by page reclaim. But
>> +                                * should skip reference of pages which are in
>> +                                * the range of VM_LOCKED vma. As page reclaim
>> +                                * should just count the reference of pages out
>> +                                * the range of VM_LOCKED vma.
>> +                                */
>> +                               pra->mapcount--;
>> +                               continue;
>> +                       }
>>                 }
Yu Zhao July 12, 2023, 5:03 p.m. UTC | #3
On Wed, Jul 12, 2023 at 12:44 AM Yin Fengwei <fengwei.yin@intel.com> wrote:
>
>
> On 7/12/23 14:23, Yu Zhao wrote:
> > On Wed, Jul 12, 2023 at 12:02 AM Yin Fengwei <fengwei.yin@intel.com> wrote:
> >>
> >> If large folio is in the range of VM_LOCKED VMA, it should be
> >> mlocked to avoid being picked by page reclaim. Which may split
> >> the large folio and then mlock each pages again.
> >>
> >> Mlock this kind of large folio to prevent them being picked by
> >> page reclaim.
> >>
> >> For the large folio which cross the boundary of VM_LOCKED VMA,
> >> we'd better not to mlock it. So if the system is under memory
> >> pressure, this kind of large folio will be split and the pages
> >> ouf of VM_LOCKED VMA can be reclaimed.
> >>
> >> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
> >> ---
> >>  mm/internal.h | 11 ++++++++---
> >>  mm/rmap.c     | 34 +++++++++++++++++++++++++++-------
> >>  2 files changed, 35 insertions(+), 10 deletions(-)
> >>
> >> diff --git a/mm/internal.h b/mm/internal.h
> >> index c7dd15d8de3ef..776141de2797a 100644
> >> --- a/mm/internal.h
> >> +++ b/mm/internal.h
> >> @@ -643,7 +643,8 @@ static inline void mlock_vma_folio(struct folio *folio,
> >>          *    still be set while VM_SPECIAL bits are added: so ignore it then.
> >>          */
> >>         if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
> >> -           (compound || !folio_test_large(folio)))
> >> +           (compound || !folio_test_large(folio) ||
> >> +           folio_in_range(folio, vma, vma->vm_start, vma->vm_end)))
> >>                 mlock_folio(folio);
> >>  }
> >
> > This can be simplified:
> > 1. remove the compound parameter
> Yes. There is not difference here for pmd mapping of THPs and pte mappings of THPs
> if the only condition need check is whether the folio is within VMA range or not.
>
> But let me add Huge for confirmation.
>
>
> > 2. make the if
> >         if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
> >             folio_within_vma())
> >                 mlock_folio(folio);
> !folio_test_large(folio) was kept here by purpose. For normal 4K page, don't need
> to call folio_within_vma() which is heavy for normal 4K page.

I suspected you would think so -- I don't think it would make any
measurable difference (for systems with mostly large folios, it would
actually be extra work). Since we have many places like this one, we
could probably wrap folio_test_large() into folio_within_vma() and call
it large_folio_within_vma(), if you feel it's necessary.

> >> @@ -651,8 +652,12 @@ void munlock_folio(struct folio *folio);
> >>  static inline void munlock_vma_folio(struct folio *folio,
> >>                         struct vm_area_struct *vma, bool compound)
> >
> > Remove the compound parameter here too.
> >
> >>  {
> >> -       if (unlikely(vma->vm_flags & VM_LOCKED) &&
> >> -           (compound || !folio_test_large(folio)))
> >> +       /*
> >> +        * To handle the case that a mlocked large folio is unmapped from VMA
> >> +        * piece by piece, allow munlock the large folio which is partially
> >> +        * mapped to VMA.
> >> +        */
> >> +       if (unlikely(vma->vm_flags & VM_LOCKED))
> >>                 munlock_folio(folio);
> >>  }
> >>
> >> diff --git a/mm/rmap.c b/mm/rmap.c
> >> index 2668f5ea35342..455f415d8d9ca 100644
> >> --- a/mm/rmap.c
> >> +++ b/mm/rmap.c
> >> @@ -803,6 +803,14 @@ struct folio_referenced_arg {
> >>         unsigned long vm_flags;
> >>         struct mem_cgroup *memcg;
> >>  };
> >> +
> >> +static inline bool should_restore_mlock(struct folio *folio,
> >> +               struct vm_area_struct *vma, bool pmd_mapped)
> >> +{
> >> +       return !folio_test_large(folio) ||
> >> +                       pmd_mapped || folio_within_vma(folio, vma);
> >> +}
> >
> > This is just folio_within_vma() :)
> >
> >>  /*
> >>   * arg: folio_referenced_arg will be passed
> >>   */
> >> @@ -816,13 +824,25 @@ static bool folio_referenced_one(struct folio *folio,
> >>         while (page_vma_mapped_walk(&pvmw)) {
> >>                 address = pvmw.address;
> >>
> >> -               if ((vma->vm_flags & VM_LOCKED) &&
> >> -                   (!folio_test_large(folio) || !pvmw.pte)) {
> >> -                       /* Restore the mlock which got missed */
> >> -                       mlock_vma_folio(folio, vma, !pvmw.pte);
> >> -                       page_vma_mapped_walk_done(&pvmw);
> >> -                       pra->vm_flags |= VM_LOCKED;
> >> -                       return false; /* To break the loop */
> >> +               if (vma->vm_flags & VM_LOCKED) {
> >> +                       if (should_restore_mlock(folio, vma, !pvmw.pte)) {
> >> +                               /* Restore the mlock which got missed */
> >> +                               mlock_vma_folio(folio, vma, !pvmw.pte);
> >> +                               page_vma_mapped_walk_done(&pvmw);
> >> +                               pra->vm_flags |= VM_LOCKED;
> >> +                               return false; /* To break the loop */
> >> +                       } else {
> >
> > There is no need for "else", or just
> >
> >   if (!folio_within_vma())
> >     goto dec_pra_mapcount;
> I tried not to use goto as much as possible. I suppose you mean:
>
>     if (!should_restore_lock())
>         goto dec_pra_mapcount; (I may use continue here. :)).

should_restore_mlock() is just folio_within_vma() -- see the comment
above. "continue" looks good to me too (I prefer not to add more indents
to the functions below).

>     mlock_vma_folio();
>     page_vma_mapped_walk_done()
>    ...
>
> Right?

Right.
Yin, Fengwei July 13, 2023, 1:55 a.m. UTC | #4
On 7/13/23 01:03, Yu Zhao wrote:
> On Wed, Jul 12, 2023 at 12:44 AM Yin Fengwei <fengwei.yin@intel.com> wrote:
>>
>>
>> On 7/12/23 14:23, Yu Zhao wrote:
>>> On Wed, Jul 12, 2023 at 12:02 AM Yin Fengwei <fengwei.yin@intel.com> wrote:
>>>>
>>>> If large folio is in the range of VM_LOCKED VMA, it should be
>>>> mlocked to avoid being picked by page reclaim. Which may split
>>>> the large folio and then mlock each pages again.
>>>>
>>>> Mlock this kind of large folio to prevent them being picked by
>>>> page reclaim.
>>>>
>>>> For the large folio which cross the boundary of VM_LOCKED VMA,
>>>> we'd better not to mlock it. So if the system is under memory
>>>> pressure, this kind of large folio will be split and the pages
>>>> ouf of VM_LOCKED VMA can be reclaimed.
>>>>
>>>> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
>>>> ---
>>>>  mm/internal.h | 11 ++++++++---
>>>>  mm/rmap.c     | 34 +++++++++++++++++++++++++++-------
>>>>  2 files changed, 35 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/mm/internal.h b/mm/internal.h
>>>> index c7dd15d8de3ef..776141de2797a 100644
>>>> --- a/mm/internal.h
>>>> +++ b/mm/internal.h
>>>> @@ -643,7 +643,8 @@ static inline void mlock_vma_folio(struct folio *folio,
>>>>          *    still be set while VM_SPECIAL bits are added: so ignore it then.
>>>>          */
>>>>         if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
>>>> -           (compound || !folio_test_large(folio)))
>>>> +           (compound || !folio_test_large(folio) ||
>>>> +           folio_in_range(folio, vma, vma->vm_start, vma->vm_end)))
>>>>                 mlock_folio(folio);
>>>>  }
>>>
>>> This can be simplified:
>>> 1. remove the compound parameter
>> Yes. There is not difference here for pmd mapping of THPs and pte mappings of THPs
>> if the only condition need check is whether the folio is within VMA range or not.
>>
>> But let me add Huge for confirmation.
>>
>>
>>> 2. make the if
>>>         if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
>>>             folio_within_vma())
>>>                 mlock_folio(folio);
>> !folio_test_large(folio) was kept here by purpose. For normal 4K page, don't need
>> to call folio_within_vma() which is heavy for normal 4K page.
> 
> I suspected you would think so -- I don't think it would make any
> measurable (for systems with mostly large folios, it would actually be
> an extra work). Since we have many places like this once, probably we
> could wrap folio_test_large() into folio_within_vma() and call it
> large_folio_within_vma(), if you feel it's necessary.
I thought about moving folio_test_large() into folio_in_range(), but gave
it up because of the folio address check against the VMA range.

But with the new folio_within_vma(), we could do that. I will move folio_test_large()
into folio_within_vma() (keeping the current naming) and make it like:

     return !folio_test_large() || folio_in_range();

> 
>>>> @@ -651,8 +652,12 @@ void munlock_folio(struct folio *folio);
>>>>  static inline void munlock_vma_folio(struct folio *folio,
>>>>                         struct vm_area_struct *vma, bool compound)
>>>
>>> Remove the compound parameter here too.
>>>
>>>>  {
>>>> -       if (unlikely(vma->vm_flags & VM_LOCKED) &&
>>>> -           (compound || !folio_test_large(folio)))
>>>> +       /*
>>>> +        * To handle the case that a mlocked large folio is unmapped from VMA
>>>> +        * piece by piece, allow munlock the large folio which is partially
>>>> +        * mapped to VMA.
>>>> +        */
>>>> +       if (unlikely(vma->vm_flags & VM_LOCKED))
>>>>                 munlock_folio(folio);
>>>>  }
>>>>
>>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>>> index 2668f5ea35342..455f415d8d9ca 100644
>>>> --- a/mm/rmap.c
>>>> +++ b/mm/rmap.c
>>>> @@ -803,6 +803,14 @@ struct folio_referenced_arg {
>>>>         unsigned long vm_flags;
>>>>         struct mem_cgroup *memcg;
>>>>  };
>>>> +
>>>> +static inline bool should_restore_mlock(struct folio *folio,
>>>> +               struct vm_area_struct *vma, bool pmd_mapped)
>>>> +{
>>>> +       return !folio_test_large(folio) ||
>>>> +                       pmd_mapped || folio_within_vma(folio, vma);
>>>> +}
>>>
>>> This is just folio_within_vma() :)
>>>
>>>>  /*
>>>>   * arg: folio_referenced_arg will be passed
>>>>   */
>>>> @@ -816,13 +824,25 @@ static bool folio_referenced_one(struct folio *folio,
>>>>         while (page_vma_mapped_walk(&pvmw)) {
>>>>                 address = pvmw.address;
>>>>
>>>> -               if ((vma->vm_flags & VM_LOCKED) &&
>>>> -                   (!folio_test_large(folio) || !pvmw.pte)) {
>>>> -                       /* Restore the mlock which got missed */
>>>> -                       mlock_vma_folio(folio, vma, !pvmw.pte);
>>>> -                       page_vma_mapped_walk_done(&pvmw);
>>>> -                       pra->vm_flags |= VM_LOCKED;
>>>> -                       return false; /* To break the loop */
>>>> +               if (vma->vm_flags & VM_LOCKED) {
>>>> +                       if (should_restore_mlock(folio, vma, !pvmw.pte)) {
>>>> +                               /* Restore the mlock which got missed */
>>>> +                               mlock_vma_folio(folio, vma, !pvmw.pte);
>>>> +                               page_vma_mapped_walk_done(&pvmw);
>>>> +                               pra->vm_flags |= VM_LOCKED;
>>>> +                               return false; /* To break the loop */
>>>> +                       } else {
>>>
>>> There is no need for "else", or just
>>>
>>>   if (!folio_within_vma())
>>>     goto dec_pra_mapcount;
>> I tried not to use goto as much as possible. I suppose you mean:
>>
>>     if (!should_restore_lock())
>>         goto dec_pra_mapcount; (I may use continue here. :)).
> 
> should_restore_lock() is just folio_within_vma() -- see the comment
> above. "continue" looks good to me too (prefer not to add more indents
> to the functions below).
Yes.

> 
>>     mlock_vma_folio();
>>     page_vma_mapped_walk_done()
>>    ...
>>
>> Right?
> 
> Right.
This is a very good suggestion. I will update v3 accordingly after
waiting a while in case there are other comments. Thanks.


Regards
Yin, Fengwei
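
The plan above -- keeping the folio_within_vma() name but folding the
folio_test_large() fast path into it -- might end up looking like the
sketch below (the folio_in_range() signature is taken from the hunk
quoted earlier; the exact form is still to be settled in v3).

static inline bool folio_within_vma(struct folio *folio,
                struct vm_area_struct *vma)
{
        /*
         * An order-0 folio mapped through this VMA is trivially within
         * it; only large folios need the heavier range check.
         */
        return !folio_test_large(folio) ||
                folio_in_range(folio, vma, vma->vm_start, vma->vm_end);
}
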
Hugh Dickins July 14, 2023, 2:21 a.m. UTC | #5
On Wed, 12 Jul 2023, Yin Fengwei wrote:
> On 7/12/23 14:23, Yu Zhao wrote:
> > On Wed, Jul 12, 2023 at 12:02 AM Yin Fengwei <fengwei.yin@intel.com> wrote:
> >> --- a/mm/internal.h
> >> +++ b/mm/internal.h
> >> @@ -643,7 +643,8 @@ static inline void mlock_vma_folio(struct folio *folio,
> >>          *    still be set while VM_SPECIAL bits are added: so ignore it then.
> >>          */
> >>         if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
> >> -           (compound || !folio_test_large(folio)))
> >> +           (compound || !folio_test_large(folio) ||
> >> +           folio_in_range(folio, vma, vma->vm_start, vma->vm_end)))
> >>                 mlock_folio(folio);
> >>  }
> > 
> > This can be simplified:
> > 1. remove the compound parameter
> Yes. There is not difference here for pmd mapping of THPs and pte mappings of THPs
> if the only condition need check is whether the folio is within VMA range or not.
> 
> But let me add Huge for confirmation.

I'm not sure what it is that you need me to confirm: if the folio fits
within the vma, then the folio fits within the vma, pmd-mapped or not.

(And I agree with Yu that it's better to drop the folio_test_large()
check too.)

This idea, of counting the folio as mlocked according to whether the
whole folio fits within the vma, does seem a good idea to me: worth
pursuing.  But whether the implementation adds up and works out, I
have not checked.  It was always difficult to arrive at a satisfactory
compromise in mlocking compound pages: I hope this way does work out.

Hugh
Yin, Fengwei July 14, 2023, 2:49 a.m. UTC | #6
On 7/14/2023 10:21 AM, Hugh Dickins wrote:
> On Wed, 12 Jul 2023, Yin Fengwei wrote:
>> On 7/12/23 14:23, Yu Zhao wrote:
>>> On Wed, Jul 12, 2023 at 12:02 AM Yin Fengwei <fengwei.yin@intel.com> wrote:
>>>> --- a/
>>>> +++ b/mm/internal.h
>>>> @@ -643,7 +643,8 @@ static inline void mlock_vma_folio(struct folio *folio,
>>>>          *    still be set while VM_SPECIAL bits are added: so ignore it then.
>>>>          */
>>>>         if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
>>>> -           (compound || !folio_test_large(folio)))
>>>> +           (compound || !folio_test_large(folio) ||
>>>> +           folio_in_range(folio, vma, vma->vm_start, vma->vm_end)))
>>>>                 mlock_folio(folio);
>>>>  }
>>>
>>> This can be simplified:
>>> 1. remove the compound parameter
>> Yes. There is not difference here for pmd mapping of THPs and pte mappings of THPs
>> if the only condition need check is whether the folio is within VMA range or not.
>>
>> But let me add Huge for confirmation.
> 
> I'm not sure what it is that you need me to confirm: if the folio fits
> within the vma, then the folio fits within the vma, pmd-mapped or not.
Sorry, my bad. I should have spelled out what I want your confirmation on:
  whether we can remove the compound parameter and instead check whether
  the folio is within the VMA.

I suppose your answer is yes.

> 
> (And I agree with Yu that it's better to drop the folio_test_large()
> check too.)
My argument was that folio_test_large() can filter normal 4K pages out so
they don't need to call folio_in_range(), which looks to me a little bit
heavy for a normal 4K page. And the deal was to move folio_test_large()
into a folio_in_range()-like function to simplify the code on the caller side.

> 
> This idea, of counting the folio as mlocked according to whether the
> whole folio fits within the vma, does seem a good idea to me: worth
> pursuing.  But whether the implementation adds up and works out, I
> have not checked.  It was always difficult to arrive at a satisfactory
> compromise in mlocking compound pages: I hope this way does work out.
This is the purpose of this patch. :). Thanks.


Regards
Yin, Fengwei

> 
> Hugh
Hugh Dickins July 14, 2023, 3:41 a.m. UTC | #7
On Fri, 14 Jul 2023, Yin, Fengwei wrote:
> On 7/14/2023 10:21 AM, Hugh Dickins wrote:
> > On Wed, 12 Jul 2023, Yin Fengwei wrote:
> >> On 7/12/23 14:23, Yu Zhao wrote:
> >>> On Wed, Jul 12, 2023 at 12:02 AM Yin Fengwei <fengwei.yin@intel.com> wrote:
> >>>> --- a/
> >>>> +++ b/mm/internal.h
> >>>> @@ -643,7 +643,8 @@ static inline void mlock_vma_folio(struct folio *folio,
> >>>>          *    still be set while VM_SPECIAL bits are added: so ignore it then.
> >>>>          */
> >>>>         if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
> >>>> -           (compound || !folio_test_large(folio)))
> >>>> +           (compound || !folio_test_large(folio) ||
> >>>> +           folio_in_range(folio, vma, vma->vm_start, vma->vm_end)))
> >>>>                 mlock_folio(folio);
> >>>>  }
> >>>
> >>> This can be simplified:
> >>> 1. remove the compound parameter
> >> Yes. There is not difference here for pmd mapping of THPs and pte mappings of THPs
> >> if the only condition need check is whether the folio is within VMA range or not.
> >>
> >> But let me add Huge for confirmation.
> > 
> > I'm not sure what it is that you need me to confirm: if the folio fits
> > within the vma, then the folio fits within the vma, pmd-mapped or not.
> Sorry. My bad. I should speak it out for what I want your confirmation:
>   Whether we can remove the compound and use whether folio is within
>   VMA instead.
> 
> I suppose you answer is Yes.

Yes (if it all works out going that way).

> 
> > 
> > (And I agree with Yu that it's better to drop the folio_test_large()
> > check too.)
> My argument was folio_test_large() can filter the normal 4K page out so
> it doesn't need to call folio_in_range() which looks to me a little bit
> heavy for normal 4K page. And the deal was move folio_test_large()
> to folio_in_range() like function so simplify the code in caller side.

I realized that, but agree with Yu.

It looked a little over-engineered to me, but I didn't spend long enough
looking to understand why there's folio_within_vma() distinct from
folio_in_range(), when everyone(?) calls folio_in_range() with the same
arguments vma->vm_start, vma->vm_end.

> 
> > 
> > This idea, of counting the folio as mlocked according to whether the
> > whole folio fits within the vma, does seem a good idea to me: worth
> > pursuing.  But whether the implementation adds up and works out, I
> > have not checked.  It was always difficult to arrive at a satisfactory
> > compromise in mlocking compound pages: I hope this way does work out.
> This is the purpose of this patch. :). Thanks.
> 
> 
> Regards
> Yin, Fengwei
> 
> > 
> > Hugh
Yin, Fengwei July 14, 2023, 5:45 a.m. UTC | #8
On 7/14/2023 11:41 AM, Hugh Dickins wrote:
> On Fri, 14 Jul 2023, Yin, Fengwei wrote:
>> On 7/14/2023 10:21 AM, Hugh Dickins wrote:
>>> On Wed, 12 Jul 2023, Yin Fengwei wrote:
>>>> On 7/12/23 14:23, Yu Zhao wrote:
>>>>> On Wed, Jul 12, 2023 at 12:02 AM Yin Fengwei <fengwei.yin@intel.com> wrote:
>>>>>> --- a/
>>>>>> +++ b/mm/internal.h
>>>>>> @@ -643,7 +643,8 @@ static inline void mlock_vma_folio(struct folio *folio,
>>>>>>          *    still be set while VM_SPECIAL bits are added: so ignore it then.
>>>>>>          */
>>>>>>         if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
>>>>>> -           (compound || !folio_test_large(folio)))
>>>>>> +           (compound || !folio_test_large(folio) ||
>>>>>> +           folio_in_range(folio, vma, vma->vm_start, vma->vm_end)))
>>>>>>                 mlock_folio(folio);
>>>>>>  }
>>>>>
>>>>> This can be simplified:
>>>>> 1. remove the compound parameter
>>>> Yes. There is not difference here for pmd mapping of THPs and pte mappings of THPs
>>>> if the only condition need check is whether the folio is within VMA range or not.
>>>>
>>>> But let me add Huge for confirmation.
>>>
>>> I'm not sure what it is that you need me to confirm: if the folio fits
>>> within the vma, then the folio fits within the vma, pmd-mapped or not.
>> Sorry. My bad. I should speak it out for what I want your confirmation:
>>   Whether we can remove the compound and use whether folio is within
>>   VMA instead.
>>
>> I suppose you answer is Yes.
> 
> Yes (if it all works out going that way).
> 
>>
>>>
>>> (And I agree with Yu that it's better to drop the folio_test_large()
>>> check too.)
>> My argument was folio_test_large() can filter the normal 4K page out so
>> it doesn't need to call folio_in_range() which looks to me a little bit
>> heavy for normal 4K page. And the deal was move folio_test_large()
>> to folio_in_range() like function so simplify the code in caller side.
> 
> I realized that, but agree with Yu.
OK. I will rethink this as both you and Yu suggested the same thing.

> 
> It looked a little over-engineered to me, but I didn't spend long enough
> looking to understand why there's folio_within_vma() distinct from
> folio_in_range(), when everyone(?) calls folio_in_range() with the same
> arguments vma->vm_start, vma->vm_end.
madvise could call folio_in_range() with start/end taken from user space
instead of using the VMA range.


Regards
Yin, Fengwei

> 
>>
>>>
>>> This idea, of counting the folio as mlocked according to whether the
>>> whole folio fits within the vma, does seem a good idea to me: worth
>>> pursuing.  But whether the implementation adds up and works out, I
>>> have not checked.  It was always difficult to arrive at a satisfactory
>>> compromise in mlocking compound pages: I hope this way does work out.
>> This is the purpose of this patch. :). Thanks.
>>
>>
>> Regards
>> Yin, Fengwei
>>
>>>
>>> Hugh
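
To make that distinction concrete, the two helpers would differ only in
the range they check against. A minimal sketch, where start and end are
illustrative names for a user-supplied range assumed to be already
clamped to the VMA:

        /* Whole-VMA check, as used by the mlock paths in this series. */
        bool covered_by_vma = folio_within_vma(folio, vma);

        /* madvise-style check against a user-supplied [start, end) range. */
        bool covered_by_request = folio_in_range(folio, vma, start, end);
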

Patch

diff --git a/mm/internal.h b/mm/internal.h
index c7dd15d8de3ef..776141de2797a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -643,7 +643,8 @@  static inline void mlock_vma_folio(struct folio *folio,
 	 *    still be set while VM_SPECIAL bits are added: so ignore it then.
 	 */
 	if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
-	    (compound || !folio_test_large(folio)))
+	    (compound || !folio_test_large(folio) ||
+	    folio_in_range(folio, vma, vma->vm_start, vma->vm_end)))
 		mlock_folio(folio);
 }
 
@@ -651,8 +652,12 @@  void munlock_folio(struct folio *folio);
 static inline void munlock_vma_folio(struct folio *folio,
 			struct vm_area_struct *vma, bool compound)
 {
-	if (unlikely(vma->vm_flags & VM_LOCKED) &&
-	    (compound || !folio_test_large(folio)))
+	/*
+	 * To handle the case that a mlocked large folio is unmapped from VMA
+	 * piece by piece, allow munlock the large folio which is partially
+	 * mapped to VMA.
+	 */
+	if (unlikely(vma->vm_flags & VM_LOCKED))
 		munlock_folio(folio);
 }
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 2668f5ea35342..455f415d8d9ca 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -803,6 +803,14 @@  struct folio_referenced_arg {
 	unsigned long vm_flags;
 	struct mem_cgroup *memcg;
 };
+
+static inline bool should_restore_mlock(struct folio *folio,
+		struct vm_area_struct *vma, bool pmd_mapped)
+{
+	return !folio_test_large(folio) ||
+			pmd_mapped || folio_within_vma(folio, vma);
+}
+
 /*
  * arg: folio_referenced_arg will be passed
  */
@@ -816,13 +824,25 @@  static bool folio_referenced_one(struct folio *folio,
 	while (page_vma_mapped_walk(&pvmw)) {
 		address = pvmw.address;
 
-		if ((vma->vm_flags & VM_LOCKED) &&
-		    (!folio_test_large(folio) || !pvmw.pte)) {
-			/* Restore the mlock which got missed */
-			mlock_vma_folio(folio, vma, !pvmw.pte);
-			page_vma_mapped_walk_done(&pvmw);
-			pra->vm_flags |= VM_LOCKED;
-			return false; /* To break the loop */
+		if (vma->vm_flags & VM_LOCKED) {
+			if (should_restore_mlock(folio, vma, !pvmw.pte)) {
+				/* Restore the mlock which got missed */
+				mlock_vma_folio(folio, vma, !pvmw.pte);
+				page_vma_mapped_walk_done(&pvmw);
+				pra->vm_flags |= VM_LOCKED;
+				return false; /* To break the loop */
+			} else {
+				/*
+				 * For large folio cross VMA boundaries, it's
+				 * expected to be picked  by page reclaim. But
+				 * should skip reference of pages which are in
+				 * the range of VM_LOCKED vma. As page reclaim
+				 * should just count the reference of pages out
+				 * the range of VM_LOCKED vma.
+				 */
+				pra->mapcount--;
+				continue;
+			}
 		}
 
 		if (pvmw.pte) {