diff mbox series

[v9,4/4] mm/madvise: optimize lazyfreeing with mTHP in madvise_free

Message ID 20240418105750.98866-5-ioworker0@gmail.com (mailing list archive)
State New
Headers show
Series mm/madvise: enhance lazyfreeing with mTHP in madvise_free | expand

Commit Message

Lance Yang April 18, 2024, 10:57 a.m. UTC
This patch optimizes lazyfreeing with PTE-mapped mTHP[1]
(Inspired by David Hildenbrand[2]). We aim to avoid unnecessary folio
splitting if the large folio is fully mapped within the target range.

If a large folio is locked or shared, or if we fail to split it, we just
leave it in place and advance to the next PTE in the range. But note that
the behavior is changed; previously, any failure of this sort would cause
the entire operation to give up. As large folios become more common,
sticking to the old way could result in wasted opportunities.

On an Intel I5 CPU, lazyfreeing a 1GiB VMA backed by PTE-mapped folios of
the same size results in the following runtimes for madvise(MADV_FREE) in
seconds (shorter is better):

Folio Size |   Old    |   New    | Change
------------------------------------------
      4KiB | 0.590251 | 0.590259 |    0%
     16KiB | 2.990447 | 0.185655 |  -94%
     32KiB | 2.547831 | 0.104870 |  -95%
     64KiB | 2.457796 | 0.052812 |  -97%
    128KiB | 2.281034 | 0.032777 |  -99%
    256KiB | 2.230387 | 0.017496 |  -99%
    512KiB | 2.189106 | 0.010781 |  -99%
   1024KiB | 2.183949 | 0.007753 |  -99%
   2048KiB | 0.002799 | 0.002804 |    0%

[1] https://lkml.kernel.org/r/20231207161211.2374093-5-ryan.roberts@arm.com
[2] https://lore.kernel.org/linux-mm/20240214204435.167852-1-david@redhat.com

Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Lance Yang <ioworker0@gmail.com>
---
 mm/madvise.c | 85 +++++++++++++++++++++++++++-------------------------
 1 file changed, 44 insertions(+), 41 deletions(-)

Comments

David Hildenbrand April 18, 2024, 12:03 p.m. UTC | #1
On 18.04.24 12:57, Lance Yang wrote:
> This patch optimizes lazyfreeing with PTE-mapped mTHP[1]
> (Inspired by David Hildenbrand[2]). We aim to avoid unnecessary folio
> splitting if the large folio is fully mapped within the target range.
> 
> If a large folio is locked or shared, or if we fail to split it, we just
> leave it in place and advance to the next PTE in the range. But note that
> the behavior is changed; previously, any failure of this sort would cause
> the entire operation to give up. As large folios become more common,
> sticking to the old way could result in wasted opportunities.
> 
> On an Intel I5 CPU, lazyfreeing a 1GiB VMA backed by PTE-mapped folios of
> the same size results in the following runtimes for madvise(MADV_FREE) in
> seconds (shorter is better):
> 
> Folio Size |   Old    |   New    | Change
> ------------------------------------------
>        4KiB | 0.590251 | 0.590259 |    0%
>       16KiB | 2.990447 | 0.185655 |  -94%
>       32KiB | 2.547831 | 0.104870 |  -95%
>       64KiB | 2.457796 | 0.052812 |  -97%
>      128KiB | 2.281034 | 0.032777 |  -99%
>      256KiB | 2.230387 | 0.017496 |  -99%
>      512KiB | 2.189106 | 0.010781 |  -99%
>     1024KiB | 2.183949 | 0.007753 |  -99%
>     2048KiB | 0.002799 | 0.002804 |    0%
> 
> [1] https://lkml.kernel.org/r/20231207161211.2374093-5-ryan.roberts@arm.com
> [2] https://lore.kernel.org/linux-mm/20240214204435.167852-1-david@redhat.com
> 
> Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
> Signed-off-by: Lance Yang <ioworker0@gmail.com>
> ---
>   mm/madvise.c | 85 +++++++++++++++++++++++++++-------------------------
>   1 file changed, 44 insertions(+), 41 deletions(-)
> 
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 4597a3568e7e..375ab3234603 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -643,6 +643,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>   				unsigned long end, struct mm_walk *walk)
>   
>   {
> +	const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
>   	struct mmu_gather *tlb = walk->private;
>   	struct mm_struct *mm = tlb->mm;
>   	struct vm_area_struct *vma = walk->vma;
> @@ -697,44 +698,57 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>   			continue;
>   
>   		/*
> -		 * If pmd isn't transhuge but the folio is large and
> -		 * is owned by only this process, split it and
> -		 * deactivate all pages.
> +		 * If we encounter a large folio, only split it if it is not
> +		 * fully mapped within the range we are operating on. Otherwise
> +		 * leave it as is so that it can be marked as lazyfree. If we
> +		 * fail to split a folio, leave it in place and advance to the
> +		 * next pte in the range.
>   		 */
>   		if (folio_test_large(folio)) {
> -			int err;
> +			bool any_young, any_dirty;
>   
> -			if (folio_likely_mapped_shared(folio))
> -				break;
> -			if (!folio_trylock(folio))
> -				break;
> -			folio_get(folio);
> -			arch_leave_lazy_mmu_mode();
> -			pte_unmap_unlock(start_pte, ptl);
> -			start_pte = NULL;
> -			err = split_folio(folio);
> -			folio_unlock(folio);
> -			folio_put(folio);
> -			if (err)
> -				break;
> -			start_pte = pte =
> -				pte_offset_map_lock(mm, pmd, addr, &ptl);
> -			if (!start_pte)
> -				break;
> -			arch_enter_lazy_mmu_mode();
> -			pte--;
> -			addr -= PAGE_SIZE;
> -			continue;
> +			nr = madvise_folio_pte_batch(addr, end, folio, pte,
> +						     ptent, &any_young, NULL);
> +
> +			if (nr < folio_nr_pages(folio)) {
> +				int err;
> +
> +				if (folio_likely_mapped_shared(folio))
> +					continue;
> +				if (!folio_trylock(folio))
> +					continue;
> +				folio_get(folio);
> +				arch_leave_lazy_mmu_mode();
> +				pte_unmap_unlock(start_pte, ptl);
> +				start_pte = NULL;
> +				err = split_folio(folio);
> +				folio_unlock(folio);
> +				folio_put(folio);
> +				start_pte = pte =
> +					pte_offset_map_lock(mm, pmd, addr, &ptl);

I'd just put it on a single line.

> +				if (!start_pte)
> +					break;
> +				arch_enter_lazy_mmu_mode();
> +				if (!err)
> +					nr = 0;
> +				continue;
> +			}
> +
> +			if (any_young)
> +				ptent = pte_mkyoung(ptent);
> +			if (any_dirty)

any_dirty is never set, likely missed to pass it to 
madvise_folio_pte_batch().

Apart from that LGTM and this patch is much easier to review now!


With above:

Acked-by: David Hildenbrand <david@redhat.com>
Lance Yang April 18, 2024, 12:16 p.m. UTC | #2
Hey David,

Thanks a lot for taking time to review!

On Thu, Apr 18, 2024 at 8:03 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 18.04.24 12:57, Lance Yang wrote:
> > This patch optimizes lazyfreeing with PTE-mapped mTHP[1]
> > (Inspired by David Hildenbrand[2]). We aim to avoid unnecessary folio
> > splitting if the large folio is fully mapped within the target range.
> >
> > If a large folio is locked or shared, or if we fail to split it, we just
> > leave it in place and advance to the next PTE in the range. But note that
> > the behavior is changed; previously, any failure of this sort would cause
> > the entire operation to give up. As large folios become more common,
> > sticking to the old way could result in wasted opportunities.
> >
> > On an Intel I5 CPU, lazyfreeing a 1GiB VMA backed by PTE-mapped folios of
> > the same size results in the following runtimes for madvise(MADV_FREE) in
> > seconds (shorter is better):
> >
> > Folio Size |   Old    |   New    | Change
> > ------------------------------------------
> >        4KiB | 0.590251 | 0.590259 |    0%
> >       16KiB | 2.990447 | 0.185655 |  -94%
> >       32KiB | 2.547831 | 0.104870 |  -95%
> >       64KiB | 2.457796 | 0.052812 |  -97%
> >      128KiB | 2.281034 | 0.032777 |  -99%
> >      256KiB | 2.230387 | 0.017496 |  -99%
> >      512KiB | 2.189106 | 0.010781 |  -99%
> >     1024KiB | 2.183949 | 0.007753 |  -99%
> >     2048KiB | 0.002799 | 0.002804 |    0%
> >
> > [1] https://lkml.kernel.org/r/20231207161211.2374093-5-ryan.roberts@arm.com
> > [2] https://lore.kernel.org/linux-mm/20240214204435.167852-1-david@redhat.com
> >
> > Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
> > Signed-off-by: Lance Yang <ioworker0@gmail.com>
> > ---
> >   mm/madvise.c | 85 +++++++++++++++++++++++++++-------------------------
> >   1 file changed, 44 insertions(+), 41 deletions(-)
> >
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index 4597a3568e7e..375ab3234603 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -643,6 +643,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
> >                               unsigned long end, struct mm_walk *walk)
> >
> >   {
> > +     const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
> >       struct mmu_gather *tlb = walk->private;
> >       struct mm_struct *mm = tlb->mm;
> >       struct vm_area_struct *vma = walk->vma;
> > @@ -697,44 +698,57 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
> >                       continue;
> >
> >               /*
> > -              * If pmd isn't transhuge but the folio is large and
> > -              * is owned by only this process, split it and
> > -              * deactivate all pages.
> > +              * If we encounter a large folio, only split it if it is not
> > +              * fully mapped within the range we are operating on. Otherwise
> > +              * leave it as is so that it can be marked as lazyfree. If we
> > +              * fail to split a folio, leave it in place and advance to the
> > +              * next pte in the range.
> >                */
> >               if (folio_test_large(folio)) {
> > -                     int err;
> > +                     bool any_young, any_dirty;
> >
> > -                     if (folio_likely_mapped_shared(folio))
> > -                             break;
> > -                     if (!folio_trylock(folio))
> > -                             break;
> > -                     folio_get(folio);
> > -                     arch_leave_lazy_mmu_mode();
> > -                     pte_unmap_unlock(start_pte, ptl);
> > -                     start_pte = NULL;
> > -                     err = split_folio(folio);
> > -                     folio_unlock(folio);
> > -                     folio_put(folio);
> > -                     if (err)
> > -                             break;
> > -                     start_pte = pte =
> > -                             pte_offset_map_lock(mm, pmd, addr, &ptl);
> > -                     if (!start_pte)
> > -                             break;
> > -                     arch_enter_lazy_mmu_mode();
> > -                     pte--;
> > -                     addr -= PAGE_SIZE;
> > -                     continue;
> > +                     nr = madvise_folio_pte_batch(addr, end, folio, pte,
> > +                                                  ptent, &any_young, NULL);
> > +
> > +                     if (nr < folio_nr_pages(folio)) {
> > +                             int err;
> > +
> > +                             if (folio_likely_mapped_shared(folio))
> > +                                     continue;
> > +                             if (!folio_trylock(folio))
> > +                                     continue;
> > +                             folio_get(folio);
> > +                             arch_leave_lazy_mmu_mode();
> > +                             pte_unmap_unlock(start_pte, ptl);
> > +                             start_pte = NULL;
> > +                             err = split_folio(folio);
> > +                             folio_unlock(folio);
> > +                             folio_put(folio);
> > +                             start_pte = pte =
> > +                                     pte_offset_map_lock(mm, pmd, addr, &ptl);
>
> I'd just put it on a single line.

Got it.

>
> > +                             if (!start_pte)
> > +                                     break;
> > +                             arch_enter_lazy_mmu_mode();
> > +                             if (!err)
> > +                                     nr = 0;
> > +                             continue;
> > +                     }
> > +
> > +                     if (any_young)
> > +                             ptent = pte_mkyoung(ptent);
> > +                     if (any_dirty)
>
> any_dirty is never set, likely missed to pass it to
> madvise_folio_pte_batch().

Good spot! My bad for the mistake - sorry :(

>
> Apart from that LGTM and this patch is much easier to review now!
>
>
> With above:
>
> Acked-by: David Hildenbrand <david@redhat.com>

Thanks again for the review!
Lance

>
> --
> Cheers,
>
> David / dhildenb
>
Lance Yang April 18, 2024, 12:33 p.m. UTC | #3
On Thu, Apr 18, 2024 at 8:03 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 18.04.24 12:57, Lance Yang wrote:
> > This patch optimizes lazyfreeing with PTE-mapped mTHP[1]
> > (Inspired by David Hildenbrand[2]). We aim to avoid unnecessary folio
> > splitting if the large folio is fully mapped within the target range.
> >
> > If a large folio is locked or shared, or if we fail to split it, we just
> > leave it in place and advance to the next PTE in the range. But note that
> > the behavior is changed; previously, any failure of this sort would cause
> > the entire operation to give up. As large folios become more common,
> > sticking to the old way could result in wasted opportunities.
> >
> > On an Intel I5 CPU, lazyfreeing a 1GiB VMA backed by PTE-mapped folios of
> > the same size results in the following runtimes for madvise(MADV_FREE) in
> > seconds (shorter is better):
> >
> > Folio Size |   Old    |   New    | Change
> > ------------------------------------------
> >        4KiB | 0.590251 | 0.590259 |    0%
> >       16KiB | 2.990447 | 0.185655 |  -94%
> >       32KiB | 2.547831 | 0.104870 |  -95%
> >       64KiB | 2.457796 | 0.052812 |  -97%
> >      128KiB | 2.281034 | 0.032777 |  -99%
> >      256KiB | 2.230387 | 0.017496 |  -99%
> >      512KiB | 2.189106 | 0.010781 |  -99%
> >     1024KiB | 2.183949 | 0.007753 |  -99%
> >     2048KiB | 0.002799 | 0.002804 |    0%
> >
> > [1] https://lkml.kernel.org/r/20231207161211.2374093-5-ryan.roberts@arm.com
> > [2] https://lore.kernel.org/linux-mm/20240214204435.167852-1-david@redhat.com
> >
> > Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
> > Signed-off-by: Lance Yang <ioworker0@gmail.com>
> > ---
> >   mm/madvise.c | 85 +++++++++++++++++++++++++++-------------------------
> >   1 file changed, 44 insertions(+), 41 deletions(-)
> >
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index 4597a3568e7e..375ab3234603 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -643,6 +643,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
> >                               unsigned long end, struct mm_walk *walk)
> >
> >   {
> > +     const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
> >       struct mmu_gather *tlb = walk->private;
> >       struct mm_struct *mm = tlb->mm;
> >       struct vm_area_struct *vma = walk->vma;
> > @@ -697,44 +698,57 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
> >                       continue;
> >
> >               /*
> > -              * If pmd isn't transhuge but the folio is large and
> > -              * is owned by only this process, split it and
> > -              * deactivate all pages.
> > +              * If we encounter a large folio, only split it if it is not
> > +              * fully mapped within the range we are operating on. Otherwise
> > +              * leave it as is so that it can be marked as lazyfree. If we
> > +              * fail to split a folio, leave it in place and advance to the
> > +              * next pte in the range.
> >                */
> >               if (folio_test_large(folio)) {
> > -                     int err;
> > +                     bool any_young, any_dirty;
> >
> > -                     if (folio_likely_mapped_shared(folio))
> > -                             break;
> > -                     if (!folio_trylock(folio))
> > -                             break;
> > -                     folio_get(folio);
> > -                     arch_leave_lazy_mmu_mode();
> > -                     pte_unmap_unlock(start_pte, ptl);
> > -                     start_pte = NULL;
> > -                     err = split_folio(folio);
> > -                     folio_unlock(folio);
> > -                     folio_put(folio);
> > -                     if (err)
> > -                             break;
> > -                     start_pte = pte =
> > -                             pte_offset_map_lock(mm, pmd, addr, &ptl);
> > -                     if (!start_pte)
> > -                             break;
> > -                     arch_enter_lazy_mmu_mode();
> > -                     pte--;
> > -                     addr -= PAGE_SIZE;
> > -                     continue;
> > +                     nr = madvise_folio_pte_batch(addr, end, folio, pte,
> > +                                                  ptent, &any_young, NULL);
> > +
> > +                     if (nr < folio_nr_pages(folio)) {
> > +                             int err;
> > +
> > +                             if (folio_likely_mapped_shared(folio))
> > +                                     continue;
> > +                             if (!folio_trylock(folio))
> > +                                     continue;
> > +                             folio_get(folio);
> > +                             arch_leave_lazy_mmu_mode();
> > +                             pte_unmap_unlock(start_pte, ptl);
> > +                             start_pte = NULL;
> > +                             err = split_folio(folio);
> > +                             folio_unlock(folio);
> > +                             folio_put(folio);
> > +                             start_pte = pte =
> > +                                     pte_offset_map_lock(mm, pmd, addr, &ptl);
>
> I'd just put it on a single line.

start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);

I suddenly realized that putting it on a single line would exceed the
80-char limit.

Or:

start_pte = pte = pte_offset_map_lock(
        mm, pmd, addr, &ptl);

Thanks,
Lance
>
> > +                             if (!start_pte)
> > +                                     break;
> > +                             arch_enter_lazy_mmu_mode();
> > +                             if (!err)
> > +                                     nr = 0;
> > +                             continue;
> > +                     }
> > +
> > +                     if (any_young)
> > +                             ptent = pte_mkyoung(ptent);
> > +                     if (any_dirty)
>
> any_dirty is never set, likely missed to pass it to
> madvise_folio_pte_batch().
>
> Apart from that LGTM and this patch is much easier to review now!
>
>
> With above:
>
> Acked-by: David Hildenbrand <david@redhat.com>
>
> --
> Cheers,
>
> David / dhildenb
>
David Hildenbrand April 18, 2024, 12:44 p.m. UTC | #4
On 18.04.24 14:33, Lance Yang wrote:
> On Thu, Apr 18, 2024 at 8:03 PM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 18.04.24 12:57, Lance Yang wrote:
>>> This patch optimizes lazyfreeing with PTE-mapped mTHP[1]
>>> (Inspired by David Hildenbrand[2]). We aim to avoid unnecessary folio
>>> splitting if the large folio is fully mapped within the target range.
>>>
>>> If a large folio is locked or shared, or if we fail to split it, we just
>>> leave it in place and advance to the next PTE in the range. But note that
>>> the behavior is changed; previously, any failure of this sort would cause
>>> the entire operation to give up. As large folios become more common,
>>> sticking to the old way could result in wasted opportunities.
>>>
>>> On an Intel I5 CPU, lazyfreeing a 1GiB VMA backed by PTE-mapped folios of
>>> the same size results in the following runtimes for madvise(MADV_FREE) in
>>> seconds (shorter is better):
>>>
>>> Folio Size |   Old    |   New    | Change
>>> ------------------------------------------
>>>         4KiB | 0.590251 | 0.590259 |    0%
>>>        16KiB | 2.990447 | 0.185655 |  -94%
>>>        32KiB | 2.547831 | 0.104870 |  -95%
>>>        64KiB | 2.457796 | 0.052812 |  -97%
>>>       128KiB | 2.281034 | 0.032777 |  -99%
>>>       256KiB | 2.230387 | 0.017496 |  -99%
>>>       512KiB | 2.189106 | 0.010781 |  -99%
>>>      1024KiB | 2.183949 | 0.007753 |  -99%
>>>      2048KiB | 0.002799 | 0.002804 |    0%
>>>
>>> [1] https://lkml.kernel.org/r/20231207161211.2374093-5-ryan.roberts@arm.com
>>> [2] https://lore.kernel.org/linux-mm/20240214204435.167852-1-david@redhat.com
>>>
>>> Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
>>> Signed-off-by: Lance Yang <ioworker0@gmail.com>
>>> ---
>>>    mm/madvise.c | 85 +++++++++++++++++++++++++++-------------------------
>>>    1 file changed, 44 insertions(+), 41 deletions(-)
>>>
>>> diff --git a/mm/madvise.c b/mm/madvise.c
>>> index 4597a3568e7e..375ab3234603 100644
>>> --- a/mm/madvise.c
>>> +++ b/mm/madvise.c
>>> @@ -643,6 +643,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>>>                                unsigned long end, struct mm_walk *walk)
>>>
>>>    {
>>> +     const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
>>>        struct mmu_gather *tlb = walk->private;
>>>        struct mm_struct *mm = tlb->mm;
>>>        struct vm_area_struct *vma = walk->vma;
>>> @@ -697,44 +698,57 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>>>                        continue;
>>>
>>>                /*
>>> -              * If pmd isn't transhuge but the folio is large and
>>> -              * is owned by only this process, split it and
>>> -              * deactivate all pages.
>>> +              * If we encounter a large folio, only split it if it is not
>>> +              * fully mapped within the range we are operating on. Otherwise
>>> +              * leave it as is so that it can be marked as lazyfree. If we
>>> +              * fail to split a folio, leave it in place and advance to the
>>> +              * next pte in the range.
>>>                 */
>>>                if (folio_test_large(folio)) {
>>> -                     int err;
>>> +                     bool any_young, any_dirty;
>>>
>>> -                     if (folio_likely_mapped_shared(folio))
>>> -                             break;
>>> -                     if (!folio_trylock(folio))
>>> -                             break;
>>> -                     folio_get(folio);
>>> -                     arch_leave_lazy_mmu_mode();
>>> -                     pte_unmap_unlock(start_pte, ptl);
>>> -                     start_pte = NULL;
>>> -                     err = split_folio(folio);
>>> -                     folio_unlock(folio);
>>> -                     folio_put(folio);
>>> -                     if (err)
>>> -                             break;
>>> -                     start_pte = pte =
>>> -                             pte_offset_map_lock(mm, pmd, addr, &ptl);
>>> -                     if (!start_pte)
>>> -                             break;
>>> -                     arch_enter_lazy_mmu_mode();
>>> -                     pte--;
>>> -                     addr -= PAGE_SIZE;
>>> -                     continue;
>>> +                     nr = madvise_folio_pte_batch(addr, end, folio, pte,
>>> +                                                  ptent, &any_young, NULL);
>>> +
>>> +                     if (nr < folio_nr_pages(folio)) {
>>> +                             int err;
>>> +
>>> +                             if (folio_likely_mapped_shared(folio))
>>> +                                     continue;
>>> +                             if (!folio_trylock(folio))
>>> +                                     continue;
>>> +                             folio_get(folio);
>>> +                             arch_leave_lazy_mmu_mode();
>>> +                             pte_unmap_unlock(start_pte, ptl);
>>> +                             start_pte = NULL;
>>> +                             err = split_folio(folio);
>>> +                             folio_unlock(folio);
>>> +                             folio_put(folio);
>>> +                             start_pte = pte =
>>> +                                     pte_offset_map_lock(mm, pmd, addr, &ptl);
>>
>> I'd just put it on a single line.
> 
> start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
> 
> I suddenly realized that putting it on a single line would exceed the
> 80-char limit.

Which is fine according to Documentation/process/coding-style.rst

... as long as it aids readability.

Alternatively, the following might do:

pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
start_pte = pte;
Lance Yang April 18, 2024, 12:48 p.m. UTC | #5
On Thu, Apr 18, 2024 at 8:44 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 18.04.24 14:33, Lance Yang wrote:
> > On Thu, Apr 18, 2024 at 8:03 PM David Hildenbrand <david@redhat.com> wrote:
> >>
> >> On 18.04.24 12:57, Lance Yang wrote:
> >>> This patch optimizes lazyfreeing with PTE-mapped mTHP[1]
> >>> (Inspired by David Hildenbrand[2]). We aim to avoid unnecessary folio
> >>> splitting if the large folio is fully mapped within the target range.
> >>>
> >>> If a large folio is locked or shared, or if we fail to split it, we just
> >>> leave it in place and advance to the next PTE in the range. But note that
> >>> the behavior is changed; previously, any failure of this sort would cause
> >>> the entire operation to give up. As large folios become more common,
> >>> sticking to the old way could result in wasted opportunities.
> >>>
> >>> On an Intel I5 CPU, lazyfreeing a 1GiB VMA backed by PTE-mapped folios of
> >>> the same size results in the following runtimes for madvise(MADV_FREE) in
> >>> seconds (shorter is better):
> >>>
> >>> Folio Size |   Old    |   New    | Change
> >>> ------------------------------------------
> >>>         4KiB | 0.590251 | 0.590259 |    0%
> >>>        16KiB | 2.990447 | 0.185655 |  -94%
> >>>        32KiB | 2.547831 | 0.104870 |  -95%
> >>>        64KiB | 2.457796 | 0.052812 |  -97%
> >>>       128KiB | 2.281034 | 0.032777 |  -99%
> >>>       256KiB | 2.230387 | 0.017496 |  -99%
> >>>       512KiB | 2.189106 | 0.010781 |  -99%
> >>>      1024KiB | 2.183949 | 0.007753 |  -99%
> >>>      2048KiB | 0.002799 | 0.002804 |    0%
> >>>
> >>> [1] https://lkml.kernel.org/r/20231207161211.2374093-5-ryan.roberts@arm.com
> >>> [2] https://lore.kernel.org/linux-mm/20240214204435.167852-1-david@redhat.com
> >>>
> >>> Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
> >>> Signed-off-by: Lance Yang <ioworker0@gmail.com>
> >>> ---
> >>>    mm/madvise.c | 85 +++++++++++++++++++++++++++-------------------------
> >>>    1 file changed, 44 insertions(+), 41 deletions(-)
> >>>
> >>> diff --git a/mm/madvise.c b/mm/madvise.c
> >>> index 4597a3568e7e..375ab3234603 100644
> >>> --- a/mm/madvise.c
> >>> +++ b/mm/madvise.c
> >>> @@ -643,6 +643,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
> >>>                                unsigned long end, struct mm_walk *walk)
> >>>
> >>>    {
> >>> +     const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
> >>>        struct mmu_gather *tlb = walk->private;
> >>>        struct mm_struct *mm = tlb->mm;
> >>>        struct vm_area_struct *vma = walk->vma;
> >>> @@ -697,44 +698,57 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
> >>>                        continue;
> >>>
> >>>                /*
> >>> -              * If pmd isn't transhuge but the folio is large and
> >>> -              * is owned by only this process, split it and
> >>> -              * deactivate all pages.
> >>> +              * If we encounter a large folio, only split it if it is not
> >>> +              * fully mapped within the range we are operating on. Otherwise
> >>> +              * leave it as is so that it can be marked as lazyfree. If we
> >>> +              * fail to split a folio, leave it in place and advance to the
> >>> +              * next pte in the range.
> >>>                 */
> >>>                if (folio_test_large(folio)) {
> >>> -                     int err;
> >>> +                     bool any_young, any_dirty;
> >>>
> >>> -                     if (folio_likely_mapped_shared(folio))
> >>> -                             break;
> >>> -                     if (!folio_trylock(folio))
> >>> -                             break;
> >>> -                     folio_get(folio);
> >>> -                     arch_leave_lazy_mmu_mode();
> >>> -                     pte_unmap_unlock(start_pte, ptl);
> >>> -                     start_pte = NULL;
> >>> -                     err = split_folio(folio);
> >>> -                     folio_unlock(folio);
> >>> -                     folio_put(folio);
> >>> -                     if (err)
> >>> -                             break;
> >>> -                     start_pte = pte =
> >>> -                             pte_offset_map_lock(mm, pmd, addr, &ptl);
> >>> -                     if (!start_pte)
> >>> -                             break;
> >>> -                     arch_enter_lazy_mmu_mode();
> >>> -                     pte--;
> >>> -                     addr -= PAGE_SIZE;
> >>> -                     continue;
> >>> +                     nr = madvise_folio_pte_batch(addr, end, folio, pte,
> >>> +                                                  ptent, &any_young, NULL);
> >>> +
> >>> +                     if (nr < folio_nr_pages(folio)) {
> >>> +                             int err;
> >>> +
> >>> +                             if (folio_likely_mapped_shared(folio))
> >>> +                                     continue;
> >>> +                             if (!folio_trylock(folio))
> >>> +                                     continue;
> >>> +                             folio_get(folio);
> >>> +                             arch_leave_lazy_mmu_mode();
> >>> +                             pte_unmap_unlock(start_pte, ptl);
> >>> +                             start_pte = NULL;
> >>> +                             err = split_folio(folio);
> >>> +                             folio_unlock(folio);
> >>> +                             folio_put(folio);
> >>> +                             start_pte = pte =
> >>> +                                     pte_offset_map_lock(mm, pmd, addr, &ptl);
> >>
> >> I'd just put it on a single line.
> >
> > start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
> >
> > I suddenly realized that putting it on a single line would exceed the
> > 80-char limit.
>
> Which is fine according to Documentation/process/coding-style.rst
>
> ... as long as it aids readability.
>
> Alternatively, the following might do:
>
> pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
> start_pte = pte;

Yep, I understood.

Thanks,
Lance

>
> --
> Cheers,
>
> David / dhildenb
>
diff mbox series

Patch

diff --git a/mm/madvise.c b/mm/madvise.c
index 4597a3568e7e..375ab3234603 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -643,6 +643,7 @@  static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
 
 {
+	const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
 	struct mmu_gather *tlb = walk->private;
 	struct mm_struct *mm = tlb->mm;
 	struct vm_area_struct *vma = walk->vma;
@@ -697,44 +698,57 @@  static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 			continue;
 
 		/*
-		 * If pmd isn't transhuge but the folio is large and
-		 * is owned by only this process, split it and
-		 * deactivate all pages.
+		 * If we encounter a large folio, only split it if it is not
+		 * fully mapped within the range we are operating on. Otherwise
+		 * leave it as is so that it can be marked as lazyfree. If we
+		 * fail to split a folio, leave it in place and advance to the
+		 * next pte in the range.
 		 */
 		if (folio_test_large(folio)) {
-			int err;
+			bool any_young, any_dirty;
 
-			if (folio_likely_mapped_shared(folio))
-				break;
-			if (!folio_trylock(folio))
-				break;
-			folio_get(folio);
-			arch_leave_lazy_mmu_mode();
-			pte_unmap_unlock(start_pte, ptl);
-			start_pte = NULL;
-			err = split_folio(folio);
-			folio_unlock(folio);
-			folio_put(folio);
-			if (err)
-				break;
-			start_pte = pte =
-				pte_offset_map_lock(mm, pmd, addr, &ptl);
-			if (!start_pte)
-				break;
-			arch_enter_lazy_mmu_mode();
-			pte--;
-			addr -= PAGE_SIZE;
-			continue;
+			nr = madvise_folio_pte_batch(addr, end, folio, pte,
+						     ptent, &any_young, NULL);
+
+			if (nr < folio_nr_pages(folio)) {
+				int err;
+
+				if (folio_likely_mapped_shared(folio))
+					continue;
+				if (!folio_trylock(folio))
+					continue;
+				folio_get(folio);
+				arch_leave_lazy_mmu_mode();
+				pte_unmap_unlock(start_pte, ptl);
+				start_pte = NULL;
+				err = split_folio(folio);
+				folio_unlock(folio);
+				folio_put(folio);
+				start_pte = pte =
+					pte_offset_map_lock(mm, pmd, addr, &ptl);
+				if (!start_pte)
+					break;
+				arch_enter_lazy_mmu_mode();
+				if (!err)
+					nr = 0;
+				continue;
+			}
+
+			if (any_young)
+				ptent = pte_mkyoung(ptent);
+			if (any_dirty)
+				ptent = pte_mkdirty(ptent);
 		}
 
 		if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
 			if (!folio_trylock(folio))
 				continue;
 			/*
-			 * If folio is shared with others, we mustn't clear
-			 * the folio's dirty flag.
+			 * If we have a large folio at this point, we know it is
+			 * fully mapped so if its mapcount is the same as its
+			 * number of pages, it must be exclusive.
 			 */
-			if (folio_mapcount(folio) != 1) {
+			if (folio_mapcount(folio) != folio_nr_pages(folio)) {
 				folio_unlock(folio);
 				continue;
 			}
@@ -750,19 +764,8 @@  static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 		}
 
 		if (pte_young(ptent) || pte_dirty(ptent)) {
-			/*
-			 * Some of architecture(ex, PPC) don't update TLB
-			 * with set_pte_at and tlb_remove_tlb_entry so for
-			 * the portability, remap the pte with old|clean
-			 * after pte clearing.
-			 */
-			ptent = ptep_get_and_clear_full(mm, addr, pte,
-							tlb->fullmm);
-
-			ptent = pte_mkold(ptent);
-			ptent = pte_mkclean(ptent);
-			set_pte_at(mm, addr, pte, ptent);
-			tlb_remove_tlb_entry(tlb, pte, addr);
+			clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags);
+			tlb_remove_tlb_entries(tlb, pte, nr, addr);
 		}
 		folio_mark_lazyfree(folio);
 	}