
[v8,04/10] mm: thp: Support allocation of anonymous multi-size THP

Message ID 20231204102027.57185-5-ryan.roberts@arm.com (mailing list archive)
State New, archived
Series Multi-size THP for anonymous memory

Commit Message

Ryan Roberts Dec. 4, 2023, 10:20 a.m. UTC
Introduce the logic to allow THP to be configured (through the new sysfs
interface we just added) to allocate large folios to back anonymous
memory, which are larger than the base page size but smaller than
PMD-size. We call this new THP extension "multi-size THP" (mTHP).

mTHP continues to be PTE-mapped, but in many cases can still provide
similar benefits to traditional PMD-sized THP: Page faults are
significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on
the configured order), but latency spikes are much less prominent
because the size of each page isn't as huge as the PMD-sized variant and
there is less memory to clear in each page fault. The number of per-page
operations (e.g. ref counting, rmap management, lru list management) are
also significantly reduced since those ops now become per-folio.

Some architectures also employ TLB compression mechanisms to squeeze
more entries in when a set of PTEs are virtually and physically
contiguous and appropriately aligned. In this case, TLB misses will
occur less often.

The new behaviour is disabled by default, but can be enabled at runtime
by writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled
(see documentation in previous commit). The long term aim is to change
the default to include suitable lower orders, but there are some risks
around internal fragmentation that need to be better understood first.

Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
 include/linux/huge_mm.h |   6 ++-
 mm/memory.c             | 106 ++++++++++++++++++++++++++++++++++++----
 2 files changed, 101 insertions(+), 11 deletions(-)

Comments

Barry Song Dec. 5, 2023, 1:15 a.m. UTC | #1
On Mon, Dec 4, 2023 at 6:21 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> Introduce the logic to allow THP to be configured (through the new sysfs
> interface we just added) to allocate large folios to back anonymous
> memory, which are larger than the base page size but smaller than
> PMD-size. We call this new THP extension "multi-size THP" (mTHP).
>
> mTHP continues to be PTE-mapped, but in many cases can still provide
> similar benefits to traditional PMD-sized THP: Page faults are
> significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on
> the configured order), but latency spikes are much less prominent
> because the size of each page isn't as huge as the PMD-sized variant and
> there is less memory to clear in each page fault. The number of per-page
> operations (e.g. ref counting, rmap management, lru list management) are
> also significantly reduced since those ops now become per-folio.
>
> Some architectures also employ TLB compression mechanisms to squeeze
> more entries in when a set of PTEs are virtually and physically
> contiguous and appropriately aligned. In this case, TLB misses will
> occur less often.
>
> The new behaviour is disabled by default, but can be enabled at runtime
> by writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled
> (see documentation in previous commit). The long term aim is to change
> the default to include suitable lower orders, but there are some risks
> around internal fragmentation that need to be better understood first.
>
> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> ---
>  include/linux/huge_mm.h |   6 ++-
>  mm/memory.c             | 106 ++++++++++++++++++++++++++++++++++++----
>  2 files changed, 101 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index bd0eadd3befb..91a53b9835a4 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr;
>  #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
>
>  /*
> - * Mask of all large folio orders supported for anonymous THP.
> + * Mask of all large folio orders supported for anonymous THP; all orders up to
> + * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
> + * (which is a limitation of the THP implementation).
>   */
> -#define THP_ORDERS_ALL_ANON    BIT(PMD_ORDER)
> +#define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
>
>  /*
>   * Mask of all large folio orders supported for file THP.
> diff --git a/mm/memory.c b/mm/memory.c
> index 3ceeb0f45bf5..bf7e93813018 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4125,6 +4125,84 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>         return ret;
>  }
>
> +static bool pte_range_none(pte_t *pte, int nr_pages)
> +{
> +       int i;
> +
> +       for (i = 0; i < nr_pages; i++) {
> +               if (!pte_none(ptep_get_lockless(pte + i)))
> +                       return false;
> +       }
> +
> +       return true;
> +}
> +
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +static struct folio *alloc_anon_folio(struct vm_fault *vmf)
> +{
> +       gfp_t gfp;
> +       pte_t *pte;
> +       unsigned long addr;
> +       struct folio *folio;
> +       struct vm_area_struct *vma = vmf->vma;
> +       unsigned long orders;
> +       int order;
> +
> +       /*
> +        * If uffd is active for the vma we need per-page fault fidelity to
> +        * maintain the uffd semantics.
> +        */
> +       if (userfaultfd_armed(vma))
> +               goto fallback;
> +
> +       /*
> +        * Get a list of all the (large) orders below PMD_ORDER that are enabled
> +        * for this vma. Then filter out the orders that can't be allocated over
> +        * the faulting address and still be fully contained in the vma.
> +        */
> +       orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
> +                                         BIT(PMD_ORDER) - 1);
> +       orders = thp_vma_suitable_orders(vma, vmf->address, orders);
> +
> +       if (!orders)
> +               goto fallback;
> +
> +       pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
> +       if (!pte)
> +               return ERR_PTR(-EAGAIN);
> +
> +       order = first_order(orders);
> +       while (orders) {
> +               addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> +               vmf->pte = pte + pte_index(addr);
> +               if (pte_range_none(vmf->pte, 1 << order))
> +                       break;
> +               order = next_order(&orders, order);
> +       }
> +
> +       vmf->pte = NULL;
> +       pte_unmap(pte);
> +
> +       gfp = vma_thp_gfp_mask(vma);
> +
> +       while (orders) {
> +               addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> +               folio = vma_alloc_folio(gfp, order, vma, addr, true);
> +               if (folio) {
> +                       clear_huge_page(&folio->page, addr, 1 << order);

Minor.

Do we have to constantly clear a huge page? Is it possible to let
post_alloc_hook()
finish this job by using __GFP_ZERO/__GFP_ZEROTAGS as
vma_alloc_zeroed_movable_folio() is doing?

struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
                                                unsigned long vaddr)
{
        gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO;

        /*
         * If the page is mapped with PROT_MTE, initialise the tags at the
         * point of allocation and page zeroing as this is usually faster than
         * separate DC ZVA and STGM.
         */
        if (vma->vm_flags & VM_MTE)
                flags |= __GFP_ZEROTAGS;

        return vma_alloc_folio(flags, 0, vma, vaddr, false);
}
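
For illustration only (untested sketch, reusing the helpers already used in
this patch), the allocation loop could hand the zeroing to the page allocator:

	gfp = vma_thp_gfp_mask(vma) | __GFP_ZERO;
	if (vma->vm_flags & VM_MTE)
		gfp |= __GFP_ZEROTAGS;

	while (orders) {
		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
		folio = vma_alloc_folio(gfp, order, vma, addr, true);
		if (folio)
			return folio;	/* already zeroed by post_alloc_hook() */
		order = next_order(&orders, order);
	}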

> +                       return folio;
> +               }
> +               order = next_order(&orders, order);
> +       }
> +
> +fallback:
> +       return vma_alloc_zeroed_movable_folio(vma, vmf->address);
> +}
> +#else
> +#define alloc_anon_folio(vmf) \
> +               vma_alloc_zeroed_movable_folio((vmf)->vma, (vmf)->address)
> +#endif
> +
>  /*
>   * We enter with non-exclusive mmap_lock (to exclude vma changes,
>   * but allow concurrent faults), and pte mapped but not yet locked.
> @@ -4132,6 +4210,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>   */
>  static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>  {
> +       int i;
> +       int nr_pages = 1;
> +       unsigned long addr = vmf->address;
>         bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
>         struct vm_area_struct *vma = vmf->vma;
>         struct folio *folio;
> @@ -4176,10 +4257,15 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>         /* Allocate our own private page. */
>         if (unlikely(anon_vma_prepare(vma)))
>                 goto oom;
> -       folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
> +       folio = alloc_anon_folio(vmf);
> +       if (IS_ERR(folio))
> +               return 0;
>         if (!folio)
>                 goto oom;
>
> +       nr_pages = folio_nr_pages(folio);
> +       addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
> +
>         if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
>                 goto oom_free_page;
>         folio_throttle_swaprate(folio, GFP_KERNEL);
> @@ -4196,12 +4282,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>         if (vma->vm_flags & VM_WRITE)
>                 entry = pte_mkwrite(pte_mkdirty(entry), vma);
>
> -       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
> -                       &vmf->ptl);
> +       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
>         if (!vmf->pte)
>                 goto release;
> -       if (vmf_pte_changed(vmf)) {
> -               update_mmu_tlb(vma, vmf->address, vmf->pte);
> +       if ((nr_pages == 1 && vmf_pte_changed(vmf)) ||
> +           (nr_pages  > 1 && !pte_range_none(vmf->pte, nr_pages))) {
> +               for (i = 0; i < nr_pages; i++)
> +                       update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
>                 goto release;
>         }
>
> @@ -4216,16 +4303,17 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>                 return handle_userfault(vmf, VM_UFFD_MISSING);
>         }
>
> -       inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
> -       folio_add_new_anon_rmap(folio, vma, vmf->address);
> +       folio_ref_add(folio, nr_pages - 1);
> +       add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
> +       folio_add_new_anon_rmap(folio, vma, addr);
>         folio_add_lru_vma(folio, vma);
>  setpte:
>         if (uffd_wp)
>                 entry = pte_mkuffd_wp(entry);
> -       set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
> +       set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);
>
>         /* No need to invalidate - it was non-present before */
> -       update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
> +       update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages);
>  unlock:
>         if (vmf->pte)
>                 pte_unmap_unlock(vmf->pte, vmf->ptl);
> --
> 2.25.1
>
Barry Song Dec. 5, 2023, 1:24 a.m. UTC | #2
On Tue, Dec 5, 2023 at 9:15 AM Barry Song <21cnbao@gmail.com> wrote:
>
> On Mon, Dec 4, 2023 at 6:21 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >
> > Introduce the logic to allow THP to be configured (through the new sysfs
> > interface we just added) to allocate large folios to back anonymous
> > memory, which are larger than the base page size but smaller than
> > PMD-size. We call this new THP extension "multi-size THP" (mTHP).
> >
> > mTHP continues to be PTE-mapped, but in many cases can still provide
> > similar benefits to traditional PMD-sized THP: Page faults are
> > significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on
> > the configured order), but latency spikes are much less prominent
> > because the size of each page isn't as huge as the PMD-sized variant and
> > there is less memory to clear in each page fault. The number of per-page
> > operations (e.g. ref counting, rmap management, lru list management) are
> > also significantly reduced since those ops now become per-folio.
> >
> > Some architectures also employ TLB compression mechanisms to squeeze
> > more entries in when a set of PTEs are virtually and physically
> > contiguous and appropriately aligned. In this case, TLB misses will
> > occur less often.
> >
> > The new behaviour is disabled by default, but can be enabled at runtime
> > by writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled
> > (see documentation in previous commit). The long term aim is to change
> > the default to include suitable lower orders, but there are some risks
> > around internal fragmentation that need to be better understood first.
> >
> > Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> > ---
> >  include/linux/huge_mm.h |   6 ++-
> >  mm/memory.c             | 106 ++++++++++++++++++++++++++++++++++++----
> >  2 files changed, 101 insertions(+), 11 deletions(-)
> >
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index bd0eadd3befb..91a53b9835a4 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr;
> >  #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
> >
> >  /*
> > - * Mask of all large folio orders supported for anonymous THP.
> > + * Mask of all large folio orders supported for anonymous THP; all orders up to
> > + * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
> > + * (which is a limitation of the THP implementation).
> >   */
> > -#define THP_ORDERS_ALL_ANON    BIT(PMD_ORDER)
> > +#define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
> >
> >  /*
> >   * Mask of all large folio orders supported for file THP.
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 3ceeb0f45bf5..bf7e93813018 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -4125,6 +4125,84 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >         return ret;
> >  }
> >
> > +static bool pte_range_none(pte_t *pte, int nr_pages)
> > +{
> > +       int i;
> > +
> > +       for (i = 0; i < nr_pages; i++) {
> > +               if (!pte_none(ptep_get_lockless(pte + i)))
> > +                       return false;
> > +       }
> > +
> > +       return true;
> > +}
> > +
> > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > +static struct folio *alloc_anon_folio(struct vm_fault *vmf)
> > +{
> > +       gfp_t gfp;
> > +       pte_t *pte;
> > +       unsigned long addr;
> > +       struct folio *folio;
> > +       struct vm_area_struct *vma = vmf->vma;
> > +       unsigned long orders;
> > +       int order;
> > +
> > +       /*
> > +        * If uffd is active for the vma we need per-page fault fidelity to
> > +        * maintain the uffd semantics.
> > +        */
> > +       if (userfaultfd_armed(vma))
> > +               goto fallback;
> > +
> > +       /*
> > +        * Get a list of all the (large) orders below PMD_ORDER that are enabled
> > +        * for this vma. Then filter out the orders that can't be allocated over
> > +        * the faulting address and still be fully contained in the vma.
> > +        */
> > +       orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
> > +                                         BIT(PMD_ORDER) - 1);
> > +       orders = thp_vma_suitable_orders(vma, vmf->address, orders);
> > +
> > +       if (!orders)
> > +               goto fallback;
> > +
> > +       pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
> > +       if (!pte)
> > +               return ERR_PTR(-EAGAIN);
> > +
> > +       order = first_order(orders);
> > +       while (orders) {
> > +               addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> > +               vmf->pte = pte + pte_index(addr);
> > +               if (pte_range_none(vmf->pte, 1 << order))
> > +                       break;
> > +               order = next_order(&orders, order);
> > +       }
> > +
> > +       vmf->pte = NULL;
> > +       pte_unmap(pte);
> > +
> > +       gfp = vma_thp_gfp_mask(vma);
> > +
> > +       while (orders) {
> > +               addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> > +               folio = vma_alloc_folio(gfp, order, vma, addr, true);
> > +               if (folio) {
> > +                       clear_huge_page(&folio->page, addr, 1 << order);
>
> Minor.
>
> Do we have to constantly clear a huge page? Is it possible to let
> post_alloc_hook()
> finish this job by using __GFP_ZERO/__GFP_ZEROTAGS as
> vma_alloc_zeroed_movable_folio() is doing?
>
> struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
>                                                 unsigned long vaddr)
> {
>         gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO;
>
>         /*
>          * If the page is mapped with PROT_MTE, initialise the tags at the
>          * point of allocation and page zeroing as this is usually faster than
>          * separate DC ZVA and STGM.
>          */
>         if (vma->vm_flags & VM_MTE)
>                 flags |= __GFP_ZEROTAGS;
>
>         return vma_alloc_folio(flags, 0, vma, vaddr, false);
> }

I am asking this because Android and some other kernels might always set
CONFIG_INIT_ON_ALLOC_DEFAULT_ON, which means the extra explicit
clear_huge_page() is doing a duplicated job.

When the below is true, post_alloc_hook() has already cleared the huge
page before vma_alloc_folio() returns the folio:

static inline bool want_init_on_alloc(gfp_t flags)
{
        if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
                                &init_on_alloc))
                return true;
        return flags & __GFP_ZERO;
}


>
> > +                       return folio;
> > +               }
> > +               order = next_order(&orders, order);
> > +       }
> > +
> > +fallback:
> > +       return vma_alloc_zeroed_movable_folio(vma, vmf->address);
> > +}
> > +#else
> > +#define alloc_anon_folio(vmf) \
> > +               vma_alloc_zeroed_movable_folio((vmf)->vma, (vmf)->address)
> > +#endif
> > +
> >  /*
> >   * We enter with non-exclusive mmap_lock (to exclude vma changes,
> >   * but allow concurrent faults), and pte mapped but not yet locked.
> > @@ -4132,6 +4210,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >   */
> >  static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
> >  {
> > +       int i;
> > +       int nr_pages = 1;
> > +       unsigned long addr = vmf->address;
> >         bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
> >         struct vm_area_struct *vma = vmf->vma;
> >         struct folio *folio;
> > @@ -4176,10 +4257,15 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
> >         /* Allocate our own private page. */
> >         if (unlikely(anon_vma_prepare(vma)))
> >                 goto oom;
> > -       folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
> > +       folio = alloc_anon_folio(vmf);
> > +       if (IS_ERR(folio))
> > +               return 0;
> >         if (!folio)
> >                 goto oom;
> >
> > +       nr_pages = folio_nr_pages(folio);
> > +       addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
> > +
> >         if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
> >                 goto oom_free_page;
> >         folio_throttle_swaprate(folio, GFP_KERNEL);
> > @@ -4196,12 +4282,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
> >         if (vma->vm_flags & VM_WRITE)
> >                 entry = pte_mkwrite(pte_mkdirty(entry), vma);
> >
> > -       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
> > -                       &vmf->ptl);
> > +       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
> >         if (!vmf->pte)
> >                 goto release;
> > -       if (vmf_pte_changed(vmf)) {
> > -               update_mmu_tlb(vma, vmf->address, vmf->pte);
> > +       if ((nr_pages == 1 && vmf_pte_changed(vmf)) ||
> > +           (nr_pages  > 1 && !pte_range_none(vmf->pte, nr_pages))) {
> > +               for (i = 0; i < nr_pages; i++)
> > +                       update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
> >                 goto release;
> >         }
> >
> > @@ -4216,16 +4303,17 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
> >                 return handle_userfault(vmf, VM_UFFD_MISSING);
> >         }
> >
> > -       inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
> > -       folio_add_new_anon_rmap(folio, vma, vmf->address);
> > +       folio_ref_add(folio, nr_pages - 1);
> > +       add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
> > +       folio_add_new_anon_rmap(folio, vma, addr);
> >         folio_add_lru_vma(folio, vma);
> >  setpte:
> >         if (uffd_wp)
> >                 entry = pte_mkuffd_wp(entry);
> > -       set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
> > +       set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);
> >
> >         /* No need to invalidate - it was non-present before */
> > -       update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
> > +       update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages);
> >  unlock:
> >         if (vmf->pte)
> >                 pte_unmap_unlock(vmf->pte, vmf->ptl);
> > --
> > 2.25.1
> >
Ryan Roberts Dec. 5, 2023, 10:48 a.m. UTC | #3
On 05/12/2023 01:24, Barry Song wrote:
> On Tue, Dec 5, 2023 at 9:15 AM Barry Song <21cnbao@gmail.com> wrote:
>>
>> On Mon, Dec 4, 2023 at 6:21 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>
>>> Introduce the logic to allow THP to be configured (through the new sysfs
>>> interface we just added) to allocate large folios to back anonymous
>>> memory, which are larger than the base page size but smaller than
>>> PMD-size. We call this new THP extension "multi-size THP" (mTHP).
>>>
>>> mTHP continues to be PTE-mapped, but in many cases can still provide
>>> similar benefits to traditional PMD-sized THP: Page faults are
>>> significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on
>>> the configured order), but latency spikes are much less prominent
>>> because the size of each page isn't as huge as the PMD-sized variant and
>>> there is less memory to clear in each page fault. The number of per-page
>>> operations (e.g. ref counting, rmap management, lru list management) are
>>> also significantly reduced since those ops now become per-folio.
>>>
>>> Some architectures also employ TLB compression mechanisms to squeeze
>>> more entries in when a set of PTEs are virtually and physically
>>> contiguous and appropriately aligned. In this case, TLB misses will
>>> occur less often.
>>>
>>> The new behaviour is disabled by default, but can be enabled at runtime
>>> by writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled
>>> (see documentation in previous commit). The long term aim is to change
>>> the default to include suitable lower orders, but there are some risks
>>> around internal fragmentation that need to be better understood first.
>>>
>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>> ---
>>>  include/linux/huge_mm.h |   6 ++-
>>>  mm/memory.c             | 106 ++++++++++++++++++++++++++++++++++++----
>>>  2 files changed, 101 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index bd0eadd3befb..91a53b9835a4 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr;
>>>  #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
>>>
>>>  /*
>>> - * Mask of all large folio orders supported for anonymous THP.
>>> + * Mask of all large folio orders supported for anonymous THP; all orders up to
>>> + * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
>>> + * (which is a limitation of the THP implementation).
>>>   */
>>> -#define THP_ORDERS_ALL_ANON    BIT(PMD_ORDER)
>>> +#define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
>>>
>>>  /*
>>>   * Mask of all large folio orders supported for file THP.
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index 3ceeb0f45bf5..bf7e93813018 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -4125,6 +4125,84 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>         return ret;
>>>  }
>>>
>>> +static bool pte_range_none(pte_t *pte, int nr_pages)
>>> +{
>>> +       int i;
>>> +
>>> +       for (i = 0; i < nr_pages; i++) {
>>> +               if (!pte_none(ptep_get_lockless(pte + i)))
>>> +                       return false;
>>> +       }
>>> +
>>> +       return true;
>>> +}
>>> +
>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> +static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>>> +{
>>> +       gfp_t gfp;
>>> +       pte_t *pte;
>>> +       unsigned long addr;
>>> +       struct folio *folio;
>>> +       struct vm_area_struct *vma = vmf->vma;
>>> +       unsigned long orders;
>>> +       int order;
>>> +
>>> +       /*
>>> +        * If uffd is active for the vma we need per-page fault fidelity to
>>> +        * maintain the uffd semantics.
>>> +        */
>>> +       if (userfaultfd_armed(vma))
>>> +               goto fallback;
>>> +
>>> +       /*
>>> +        * Get a list of all the (large) orders below PMD_ORDER that are enabled
>>> +        * for this vma. Then filter out the orders that can't be allocated over
>>> +        * the faulting address and still be fully contained in the vma.
>>> +        */
>>> +       orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
>>> +                                         BIT(PMD_ORDER) - 1);
>>> +       orders = thp_vma_suitable_orders(vma, vmf->address, orders);
>>> +
>>> +       if (!orders)
>>> +               goto fallback;
>>> +
>>> +       pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
>>> +       if (!pte)
>>> +               return ERR_PTR(-EAGAIN);
>>> +
>>> +       order = first_order(orders);
>>> +       while (orders) {
>>> +               addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>> +               vmf->pte = pte + pte_index(addr);
>>> +               if (pte_range_none(vmf->pte, 1 << order))
>>> +                       break;
>>> +               order = next_order(&orders, order);
>>> +       }
>>> +
>>> +       vmf->pte = NULL;
>>> +       pte_unmap(pte);
>>> +
>>> +       gfp = vma_thp_gfp_mask(vma);
>>> +
>>> +       while (orders) {
>>> +               addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>> +               folio = vma_alloc_folio(gfp, order, vma, addr, true);
>>> +               if (folio) {
>>> +                       clear_huge_page(&folio->page, addr, 1 << order);
>>
>> Minor.
>>
>> Do we have to constantly clear a huge page? Is it possible to let
>> post_alloc_hook()
>> finish this job by using __GFP_ZERO/__GFP_ZEROTAGS as
>> vma_alloc_zeroed_movable_folio() is doing?

I'm currently following the same allocation pattern as is done for PMD-sized
THP. In earlier versions of this patch I was trying to be smarter and use the
__GFP_ZERO/__GFP_ZEROTAGS as you suggest, but I was advised to keep it simple
and follow the existing pattern.

I have a vague recollection __GFP_ZERO is not preferred for large folios because
of some issue with virtually indexed caches? (Matthew: did I see you mention
that in some other context?)

That said, I wasn't aware that Android ships with
CONFIG_INIT_ON_ALLOC_DEFAULT_ON (I thought it was only used as a debug option),
so I can see the potential for some overhead reduction here.

Options:

 1) leave it as is and accept the duplicated clearing
 2) Pass __GFP_ZERO and remove clear_huge_page()
 3) define __GFP_SKIP_ZERO even when kasan is not enabled and pass it down so
    clear_huge_page() is the only clear
 4) make clear_huge_page() conditional on !want_init_on_alloc()

I prefer option 4. What do you think?

As an aside, I've also noticed that clear_huge_page() should take vmf->address
so that it clears the faulting page last to keep the cache hot. If we decide on
an option that keeps clear_huge_page(), I'll also make that change.
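
Putting both together, option 4 would look something like this (untested
sketch):

	while (orders) {
		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
		folio = vma_alloc_folio(gfp, order, vma, addr, true);
		if (folio) {
			/* Skip the explicit clear if the allocator already
			 * zeroed the pages (init_on_alloc or __GFP_ZERO). */
			if (!want_init_on_alloc(gfp))
				clear_huge_page(&folio->page, vmf->address,
						1 << order);
			return folio;
		}
		order = next_order(&orders, order);
	}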

Thanks,
Ryan

>>
>> struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
>>                                                 unsigned long vaddr)
>> {
>>         gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO;
>>
>>         /*
>>          * If the page is mapped with PROT_MTE, initialise the tags at the
>>          * point of allocation and page zeroing as this is usually faster than
>>          * separate DC ZVA and STGM.
>>          */
>>         if (vma->vm_flags & VM_MTE)
>>                 flags |= __GFP_ZEROTAGS;
>>
>>         return vma_alloc_folio(flags, 0, vma, vaddr, false);
>> }
> 
> I am asking this because Android and some other kernels might always set
> CONFIG_INIT_ON_ALLOC_DEFAULT_ON, which means the extra explicit
> clear_huge_page() is doing a duplicated job.
> 
> When the below is true, post_alloc_hook() has already cleared the huge
> page before vma_alloc_folio() returns the folio:
> 
> static inline bool want_init_on_alloc(gfp_t flags)
> {
>         if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
>                                 &init_on_alloc))
>                 return true;
>         return flags & __GFP_ZERO;
> }
> 
> 
>>
>>> +                       return folio;
>>> +               }
>>> +               order = next_order(&orders, order);
>>> +       }
>>> +
>>> +fallback:
>>> +       return vma_alloc_zeroed_movable_folio(vma, vmf->address);
>>> +}
>>> +#else
>>> +#define alloc_anon_folio(vmf) \
>>> +               vma_alloc_zeroed_movable_folio((vmf)->vma, (vmf)->address)
>>> +#endif
>>> +
>>>  /*
>>>   * We enter with non-exclusive mmap_lock (to exclude vma changes,
>>>   * but allow concurrent faults), and pte mapped but not yet locked.
>>> @@ -4132,6 +4210,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>   */
>>>  static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>>  {
>>> +       int i;
>>> +       int nr_pages = 1;
>>> +       unsigned long addr = vmf->address;
>>>         bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
>>>         struct vm_area_struct *vma = vmf->vma;
>>>         struct folio *folio;
>>> @@ -4176,10 +4257,15 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>>         /* Allocate our own private page. */
>>>         if (unlikely(anon_vma_prepare(vma)))
>>>                 goto oom;
>>> -       folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
>>> +       folio = alloc_anon_folio(vmf);
>>> +       if (IS_ERR(folio))
>>> +               return 0;
>>>         if (!folio)
>>>                 goto oom;
>>>
>>> +       nr_pages = folio_nr_pages(folio);
>>> +       addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
>>> +
>>>         if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
>>>                 goto oom_free_page;
>>>         folio_throttle_swaprate(folio, GFP_KERNEL);
>>> @@ -4196,12 +4282,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>>         if (vma->vm_flags & VM_WRITE)
>>>                 entry = pte_mkwrite(pte_mkdirty(entry), vma);
>>>
>>> -       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
>>> -                       &vmf->ptl);
>>> +       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
>>>         if (!vmf->pte)
>>>                 goto release;
>>> -       if (vmf_pte_changed(vmf)) {
>>> -               update_mmu_tlb(vma, vmf->address, vmf->pte);
>>> +       if ((nr_pages == 1 && vmf_pte_changed(vmf)) ||
>>> +           (nr_pages  > 1 && !pte_range_none(vmf->pte, nr_pages))) {
>>> +               for (i = 0; i < nr_pages; i++)
>>> +                       update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
>>>                 goto release;
>>>         }
>>>
>>> @@ -4216,16 +4303,17 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>>                 return handle_userfault(vmf, VM_UFFD_MISSING);
>>>         }
>>>
>>> -       inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
>>> -       folio_add_new_anon_rmap(folio, vma, vmf->address);
>>> +       folio_ref_add(folio, nr_pages - 1);
>>> +       add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
>>> +       folio_add_new_anon_rmap(folio, vma, addr);
>>>         folio_add_lru_vma(folio, vma);
>>>  setpte:
>>>         if (uffd_wp)
>>>                 entry = pte_mkuffd_wp(entry);
>>> -       set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
>>> +       set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);
>>>
>>>         /* No need to invalidate - it was non-present before */
>>> -       update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
>>> +       update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages);
>>>  unlock:
>>>         if (vmf->pte)
>>>                 pte_unmap_unlock(vmf->pte, vmf->ptl);
>>> --
>>> 2.25.1
>>>
David Hildenbrand Dec. 5, 2023, 11:16 a.m. UTC | #4
On 05.12.23 11:48, Ryan Roberts wrote:
> On 05/12/2023 01:24, Barry Song wrote:
>> On Tue, Dec 5, 2023 at 9:15 AM Barry Song <21cnbao@gmail.com> wrote:
>>>
>>> On Mon, Dec 4, 2023 at 6:21 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>
>>>> Introduce the logic to allow THP to be configured (through the new sysfs
>>>> interface we just added) to allocate large folios to back anonymous
>>>> memory, which are larger than the base page size but smaller than
>>>> PMD-size. We call this new THP extension "multi-size THP" (mTHP).
>>>>
>>>> mTHP continues to be PTE-mapped, but in many cases can still provide
>>>> similar benefits to traditional PMD-sized THP: Page faults are
>>>> significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on
>>>> the configured order), but latency spikes are much less prominent
>>>> because the size of each page isn't as huge as the PMD-sized variant and
>>>> there is less memory to clear in each page fault. The number of per-page
>>>> operations (e.g. ref counting, rmap management, lru list management) are
>>>> also significantly reduced since those ops now become per-folio.
>>>>
>>>> Some architectures also employ TLB compression mechanisms to squeeze
>>>> more entries in when a set of PTEs are virtually and physically
>>>> contiguous and approporiately aligned. In this case, TLB misses will
>>>> occur less often.
>>>>
>>>> The new behaviour is disabled by default, but can be enabled at runtime
>>>> by writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled
>>>> (see documentation in previous commit). The long term aim is to change
>>>> the default to include suitable lower orders, but there are some risks
>>>> around internal fragmentation that need to be better understood first.
>>>>
>>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>>> ---
>>>>   include/linux/huge_mm.h |   6 ++-
>>>>   mm/memory.c             | 106 ++++++++++++++++++++++++++++++++++++----
>>>>   2 files changed, 101 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>> index bd0eadd3befb..91a53b9835a4 100644
>>>> --- a/include/linux/huge_mm.h
>>>> +++ b/include/linux/huge_mm.h
>>>> @@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr;
>>>>   #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
>>>>
>>>>   /*
>>>> - * Mask of all large folio orders supported for anonymous THP.
>>>> + * Mask of all large folio orders supported for anonymous THP; all orders up to
>>>> + * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
>>>> + * (which is a limitation of the THP implementation).
>>>>    */
>>>> -#define THP_ORDERS_ALL_ANON    BIT(PMD_ORDER)
>>>> +#define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
>>>>
>>>>   /*
>>>>    * Mask of all large folio orders supported for file THP.
>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>> index 3ceeb0f45bf5..bf7e93813018 100644
>>>> --- a/mm/memory.c
>>>> +++ b/mm/memory.c
>>>> @@ -4125,6 +4125,84 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>>          return ret;
>>>>   }
>>>>
>>>> +static bool pte_range_none(pte_t *pte, int nr_pages)
>>>> +{
>>>> +       int i;
>>>> +
>>>> +       for (i = 0; i < nr_pages; i++) {
>>>> +               if (!pte_none(ptep_get_lockless(pte + i)))
>>>> +                       return false;
>>>> +       }
>>>> +
>>>> +       return true;
>>>> +}
>>>> +
>>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>> +static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>>>> +{
>>>> +       gfp_t gfp;
>>>> +       pte_t *pte;
>>>> +       unsigned long addr;
>>>> +       struct folio *folio;
>>>> +       struct vm_area_struct *vma = vmf->vma;
>>>> +       unsigned long orders;
>>>> +       int order;
>>>> +
>>>> +       /*
>>>> +        * If uffd is active for the vma we need per-page fault fidelity to
>>>> +        * maintain the uffd semantics.
>>>> +        */
>>>> +       if (userfaultfd_armed(vma))
>>>> +               goto fallback;
>>>> +
>>>> +       /*
>>>> +        * Get a list of all the (large) orders below PMD_ORDER that are enabled
>>>> +        * for this vma. Then filter out the orders that can't be allocated over
>>>> +        * the faulting address and still be fully contained in the vma.
>>>> +        */
>>>> +       orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
>>>> +                                         BIT(PMD_ORDER) - 1);
>>>> +       orders = thp_vma_suitable_orders(vma, vmf->address, orders);
>>>> +
>>>> +       if (!orders)
>>>> +               goto fallback;
>>>> +
>>>> +       pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
>>>> +       if (!pte)
>>>> +               return ERR_PTR(-EAGAIN);
>>>> +
>>>> +       order = first_order(orders);
>>>> +       while (orders) {
>>>> +               addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>>> +               vmf->pte = pte + pte_index(addr);
>>>> +               if (pte_range_none(vmf->pte, 1 << order))
>>>> +                       break;
>>>> +               order = next_order(&orders, order);
>>>> +       }
>>>> +
>>>> +       vmf->pte = NULL;
>>>> +       pte_unmap(pte);
>>>> +
>>>> +       gfp = vma_thp_gfp_mask(vma);
>>>> +
>>>> +       while (orders) {
>>>> +               addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>>> +               folio = vma_alloc_folio(gfp, order, vma, addr, true);
>>>> +               if (folio) {
>>>> +                       clear_huge_page(&folio->page, addr, 1 << order);
>>>
>>> Minor.
>>>
>>> Do we have to constantly clear a huge page? Is it possible to let
>>> post_alloc_hook()
>>> finish this job by using __GFP_ZERO/__GFP_ZEROTAGS as
>>> vma_alloc_zeroed_movable_folio() is doing?
> 
> I'm currently following the same allocation pattern as is done for PMD-sized
> THP. In earlier versions of this patch I was trying to be smarter and use the
> __GFP_ZERO/__GFP_ZEROTAGS as you suggest, but I was advised to keep it simple
> and follow the existing pattern.

Yes, this should be optimized on top IMHO.
David Hildenbrand Dec. 5, 2023, 4:32 p.m. UTC | #5
On 04.12.23 11:20, Ryan Roberts wrote:
> Introduce the logic to allow THP to be configured (through the new sysfs
> interface we just added) to allocate large folios to back anonymous
> memory, which are larger than the base page size but smaller than
> PMD-size. We call this new THP extension "multi-size THP" (mTHP).
> 
> mTHP continues to be PTE-mapped, but in many cases can still provide
> similar benefits to traditional PMD-sized THP: Page faults are
> significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on
> the configured order), but latency spikes are much less prominent
> because the size of each page isn't as huge as the PMD-sized variant and
> there is less memory to clear in each page fault. The number of per-page
> operations (e.g. ref counting, rmap management, lru list management) are
> also significantly reduced since those ops now become per-folio.
> 
> Some architectures also employ TLB compression mechanisms to squeeze
> more entries in when a set of PTEs are virtually and physically
> contiguous and appropriately aligned. In this case, TLB misses will
> occur less often.
> 
> The new behaviour is disabled by default, but can be enabled at runtime
> by writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled
> (see documentation in previous commit). The long term aim is to change
> the default to include suitable lower orders, but there are some risks
> around internal fragmentation that need to be better understood first.
> 
> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>

In general, looks good to me, some comments/nits. And the usual "let's 
make sure we don't degrade order-0 and keep that as fast as possible" 
comment.

> ---
>   include/linux/huge_mm.h |   6 ++-
>   mm/memory.c             | 106 ++++++++++++++++++++++++++++++++++++----
>   2 files changed, 101 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index bd0eadd3befb..91a53b9835a4 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr;
>   #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
>   
>   /*
> - * Mask of all large folio orders supported for anonymous THP.
> + * Mask of all large folio orders supported for anonymous THP; all orders up to
> + * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
> + * (which is a limitation of the THP implementation).
>    */
> -#define THP_ORDERS_ALL_ANON	BIT(PMD_ORDER)
> +#define THP_ORDERS_ALL_ANON	((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
>   
>   /*
>    * Mask of all large folio orders supported for file THP.
> diff --git a/mm/memory.c b/mm/memory.c
> index 3ceeb0f45bf5..bf7e93813018 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4125,6 +4125,84 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>   	return ret;
>   }
>   
> +static bool pte_range_none(pte_t *pte, int nr_pages)
> +{
> +	int i;
> +
> +	for (i = 0; i < nr_pages; i++) {
> +		if (!pte_none(ptep_get_lockless(pte + i)))
> +			return false;
> +	}
> +
> +	return true;
> +}
> +
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +static struct folio *alloc_anon_folio(struct vm_fault *vmf)
> +{
> +	gfp_t gfp;
> +	pte_t *pte;
> +	unsigned long addr;
> +	struct folio *folio;
> +	struct vm_area_struct *vma = vmf->vma;
> +	unsigned long orders;
> +	int order;

Nit: reverse christmas tree encouraged ;)
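
i.e. longest declaration lines first, something like:

	struct vm_area_struct *vma = vmf->vma;
	unsigned long orders;
	struct folio *folio;
	unsigned long addr;
	pte_t *pte;
	gfp_t gfp;
	int order;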

> +
> +	/*
> +	 * If uffd is active for the vma we need per-page fault fidelity to
> +	 * maintain the uffd semantics.
> +	 */
> +	if (userfaultfd_armed(vma))

Nit: unlikely()

> +		goto fallback;
> +
> +	/*
> +	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
> +	 * for this vma. Then filter out the orders that can't be allocated over
> +	 * the faulting address and still be fully contained in the vma.
> +	 */
> +	orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
> +					  BIT(PMD_ORDER) - 1);
> +	orders = thp_vma_suitable_orders(vma, vmf->address, orders);

Comment: Both will eventually loop over all orders, correct? Could 
eventually be sped up in the future.

Nit: the orders = ... order = ... looks like this might deserve a helper 
function that makes this easier to read.

Nit: Why call thp_vma_suitable_orders if the orders are already 0? 
Again, some helper might be reasonable where that is handled internally.

Comment: For order-0 we'll always perform a function call to both 
thp_vma_allowable_orders() / thp_vma_suitable_orders(). We should 
perform some fast and efficient check if any <PMD THP are even enabled 
in the system / for this VMA, and in that case just fallback before 
doing more expensive checks.
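
E.g. a small helper along these lines (made-up name, untested) would cover
both nits above and give one obvious place to hang such a fast check:

static inline unsigned long anon_fault_orders(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long orders;

	orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
					  BIT(PMD_ORDER) - 1);
	if (!orders)
		return 0;

	return thp_vma_suitable_orders(vma, vmf->address, orders);
}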

> +
> +	if (!orders)
> +		goto fallback;
> +
> +	pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
> +	if (!pte)
> +		return ERR_PTR(-EAGAIN);
> +
> +	order = first_order(orders);
> +	while (orders) {
> +		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> +		vmf->pte = pte + pte_index(addr);
> +		if (pte_range_none(vmf->pte, 1 << order))
> +			break;

Comment: Likely it would make sense to scan only once and determine the 
"largest none range" around that address, having the largest suitable 
order in mind.
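
Untested sketch of that idea, reusing first_order()/next_order(): one scan
finds the contiguous pte_none() run around the faulting entry (which is
itself none here), then the largest enabled order whose naturally aligned
range fits inside that run is chosen:

	int max_order = first_order(orders);
	unsigned long base = ALIGN_DOWN(vmf->address, PAGE_SIZE << max_order);
	pte_t *first = pte + pte_index(base);
	int idx = (vmf->address - base) >> PAGE_SHIFT;
	int start = idx, end = idx + 1;

	while (start > 0 && pte_none(ptep_get_lockless(first + start - 1)))
		start--;
	while (end < (1 << max_order) && pte_none(ptep_get_lockless(first + end)))
		end++;

	order = max_order;
	while (orders) {
		int lo = idx & ~((1 << order) - 1);

		if (lo >= start && lo + (1 << order) <= end)
			break;
		order = next_order(&orders, order);
	}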

> +		order = next_order(&orders, order);
> +	}
> +
> +	vmf->pte = NULL;

Nit: Can you elaborate why you are messing with vmf->pte here? A simple 
helper variable will make this code look less magical. Unless I am 
missing something important :)

> +	pte_unmap(pte);
> +
> +	gfp = vma_thp_gfp_mask(vma);
> +
> +	while (orders) {
> +		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> +		folio = vma_alloc_folio(gfp, order, vma, addr, true);
> +		if (folio) {
> +			clear_huge_page(&folio->page, addr, 1 << order);
> +			return folio;
> +		}
> +		order = next_order(&orders, order);
> +	}
> +

Question: would it make sense to combine both loops? I suspect memory
allocations with pte_offset_map()/kmap are problematic.

> +fallback:
> +	return vma_alloc_zeroed_movable_folio(vma, vmf->address);
> +}
> +#else
> +#define alloc_anon_folio(vmf) \
> +		vma_alloc_zeroed_movable_folio((vmf)->vma, (vmf)->address)
> +#endif
> +
>   /*
>    * We enter with non-exclusive mmap_lock (to exclude vma changes,
>    * but allow concurrent faults), and pte mapped but not yet locked.
> @@ -4132,6 +4210,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>    */
>   static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>   {
> +	int i;
> +	int nr_pages = 1;
> +	unsigned long addr = vmf->address;
>   	bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
>   	struct vm_area_struct *vma = vmf->vma;
>   	struct folio *folio;

Nit: reverse christmas tree :)

> @@ -4176,10 +4257,15 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>   	/* Allocate our own private page. */
>   	if (unlikely(anon_vma_prepare(vma)))
>   		goto oom;
> -	folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
> +	folio = alloc_anon_folio(vmf);
> +	if (IS_ERR(folio))
> +		return 0;
>   	if (!folio)
>   		goto oom;
>   
> +	nr_pages = folio_nr_pages(folio);
> +	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
> +
>   	if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
>   		goto oom_free_page;
>   	folio_throttle_swaprate(folio, GFP_KERNEL);
> @@ -4196,12 +4282,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>   	if (vma->vm_flags & VM_WRITE)
>   		entry = pte_mkwrite(pte_mkdirty(entry), vma);
>   
> -	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
> -			&vmf->ptl);
> +	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
>   	if (!vmf->pte)
>   		goto release;
> -	if (vmf_pte_changed(vmf)) {
> -		update_mmu_tlb(vma, vmf->address, vmf->pte);
> +	if ((nr_pages == 1 && vmf_pte_changed(vmf)) ||
> +	    (nr_pages  > 1 && !pte_range_none(vmf->pte, nr_pages))) {
> +		for (i = 0; i < nr_pages; i++)
> +			update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);

Comment: separating the order-0 case from the other case might make this 
easier to read.
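
E.g. (untested):

	if (nr_pages == 1) {
		if (vmf_pte_changed(vmf)) {
			update_mmu_tlb(vma, vmf->address, vmf->pte);
			goto release;
		}
	} else if (!pte_range_none(vmf->pte, nr_pages)) {
		for (i = 0; i < nr_pages; i++)
			update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
		goto release;
	}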

>   		goto release;
>   	}
>   
> @@ -4216,16 +4303,17 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>   		return handle_userfault(vmf, VM_UFFD_MISSING);
>   	}
>   
> -	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
> -	folio_add_new_anon_rmap(folio, vma, vmf->address);
> +	folio_ref_add(folio, nr_pages - 1);
> +	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
> +	folio_add_new_anon_rmap(folio, vma, addr);
>   	folio_add_lru_vma(folio, vma);
>   setpte:
>   	if (uffd_wp)
>   		entry = pte_mkuffd_wp(entry);
> -	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
> +	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);
>   
>   	/* No need to invalidate - it was non-present before */
> -	update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
> +	update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages);
>   unlock:
>   	if (vmf->pte)
>   		pte_unmap_unlock(vmf->pte, vmf->ptl);

Benchmarking order-0 allocations might be interesting. There will be 
some added checks + multiple loops/conditionals for order-0 that could 
be avoided by having two separate code paths. If we can't measure a 
difference, all good.
David Hildenbrand Dec. 5, 2023, 4:35 p.m. UTC | #6
> Comment: Both will eventually loop over all orders, correct? Could
> eventually be sped up in the future.
> 
> Nit: the orders = ... order = ... looks like this might deserve a helper
> function that makes this easier to read.
> 
> Nit: Why call thp_vma_suitable_orders if the orders are already 0?
> Again, some helper might be reasonable where that is handled internally.
> 
> Comment: For order-0 we'll always perform a function call to both
> thp_vma_allowable_orders() / thp_vma_suitable_orders(). We should
> perform some fast and efficient check if any <PMD THP are even enabled
> in the system / for this VMA, and in that case just fallback before
> doing more expensive checks.

Correction: only a call to thp_vma_allowable_orders(). I wonder if we 
can move some of thp_vma_allowable_orders() into the header file where 
we'd just check as fast and as efficiently for "no THP < PMD_THP 
enabled" on this system.
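
Completely made-up shape (the __ helper and the global mask are hypothetical,
and the parameter names are guessed from the call site), just to illustrate
putting the cheap gate in the header:

static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
		unsigned long vm_flags, bool smaps, bool in_pf,
		bool enforce_sysfs, unsigned long orders)
{
	/* hypothetical mask of orders currently enabled via sysfs */
	if (enforce_sysfs && !(orders & READ_ONCE(huge_anon_enabled_orders)))
		return 0;

	return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf,
					  enforce_sysfs, orders);
}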
Barry Song Dec. 5, 2023, 8:16 p.m. UTC | #7
On Tue, Dec 5, 2023 at 11:48 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 05/12/2023 01:24, Barry Song wrote:
> > On Tue, Dec 5, 2023 at 9:15 AM Barry Song <21cnbao@gmail.com> wrote:
> >>
> >> On Mon, Dec 4, 2023 at 6:21 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>>
> >>> Introduce the logic to allow THP to be configured (through the new sysfs
> >>> interface we just added) to allocate large folios to back anonymous
> >>> memory, which are larger than the base page size but smaller than
> >>> PMD-size. We call this new THP extension "multi-size THP" (mTHP).
> >>>
> >>> mTHP continues to be PTE-mapped, but in many cases can still provide
> >>> similar benefits to traditional PMD-sized THP: Page faults are
> >>> significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on
> >>> the configured order), but latency spikes are much less prominent
> >>> because the size of each page isn't as huge as the PMD-sized variant and
> >>> there is less memory to clear in each page fault. The number of per-page
> >>> operations (e.g. ref counting, rmap management, lru list management) are
> >>> also significantly reduced since those ops now become per-folio.
> >>>
> >>> Some architectures also employ TLB compression mechanisms to squeeze
> >>> more entries in when a set of PTEs are virtually and physically
> >>> contiguous and appropriately aligned. In this case, TLB misses will
> >>> occur less often.
> >>>
> >>> The new behaviour is disabled by default, but can be enabled at runtime
> >>> by writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled
> >>> (see documentation in previous commit). The long term aim is to change
> >>> the default to include suitable lower orders, but there are some risks
> >>> around internal fragmentation that need to be better understood first.
> >>>
> >>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> >>> ---
> >>>  include/linux/huge_mm.h |   6 ++-
> >>>  mm/memory.c             | 106 ++++++++++++++++++++++++++++++++++++----
> >>>  2 files changed, 101 insertions(+), 11 deletions(-)
> >>>
> >>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>> index bd0eadd3befb..91a53b9835a4 100644
> >>> --- a/include/linux/huge_mm.h
> >>> +++ b/include/linux/huge_mm.h
> >>> @@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr;
> >>>  #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
> >>>
> >>>  /*
> >>> - * Mask of all large folio orders supported for anonymous THP.
> >>> + * Mask of all large folio orders supported for anonymous THP; all orders up to
> >>> + * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
> >>> + * (which is a limitation of the THP implementation).
> >>>   */
> >>> -#define THP_ORDERS_ALL_ANON    BIT(PMD_ORDER)
> >>> +#define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
> >>>
> >>>  /*
> >>>   * Mask of all large folio orders supported for file THP.
> >>> diff --git a/mm/memory.c b/mm/memory.c
> >>> index 3ceeb0f45bf5..bf7e93813018 100644
> >>> --- a/mm/memory.c
> >>> +++ b/mm/memory.c
> >>> @@ -4125,6 +4125,84 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >>>         return ret;
> >>>  }
> >>>
> >>> +static bool pte_range_none(pte_t *pte, int nr_pages)
> >>> +{
> >>> +       int i;
> >>> +
> >>> +       for (i = 0; i < nr_pages; i++) {
> >>> +               if (!pte_none(ptep_get_lockless(pte + i)))
> >>> +                       return false;
> >>> +       }
> >>> +
> >>> +       return true;
> >>> +}
> >>> +
> >>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> >>> +static struct folio *alloc_anon_folio(struct vm_fault *vmf)
> >>> +{
> >>> +       gfp_t gfp;
> >>> +       pte_t *pte;
> >>> +       unsigned long addr;
> >>> +       struct folio *folio;
> >>> +       struct vm_area_struct *vma = vmf->vma;
> >>> +       unsigned long orders;
> >>> +       int order;
> >>> +
> >>> +       /*
> >>> +        * If uffd is active for the vma we need per-page fault fidelity to
> >>> +        * maintain the uffd semantics.
> >>> +        */
> >>> +       if (userfaultfd_armed(vma))
> >>> +               goto fallback;
> >>> +
> >>> +       /*
> >>> +        * Get a list of all the (large) orders below PMD_ORDER that are enabled
> >>> +        * for this vma. Then filter out the orders that can't be allocated over
> >>> +        * the faulting address and still be fully contained in the vma.
> >>> +        */
> >>> +       orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
> >>> +                                         BIT(PMD_ORDER) - 1);
> >>> +       orders = thp_vma_suitable_orders(vma, vmf->address, orders);
> >>> +
> >>> +       if (!orders)
> >>> +               goto fallback;
> >>> +
> >>> +       pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
> >>> +       if (!pte)
> >>> +               return ERR_PTR(-EAGAIN);
> >>> +
> >>> +       order = first_order(orders);
> >>> +       while (orders) {
> >>> +               addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> >>> +               vmf->pte = pte + pte_index(addr);
> >>> +               if (pte_range_none(vmf->pte, 1 << order))
> >>> +                       break;
> >>> +               order = next_order(&orders, order);
> >>> +       }
> >>> +
> >>> +       vmf->pte = NULL;
> >>> +       pte_unmap(pte);
> >>> +
> >>> +       gfp = vma_thp_gfp_mask(vma);
> >>> +
> >>> +       while (orders) {
> >>> +               addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> >>> +               folio = vma_alloc_folio(gfp, order, vma, addr, true);
> >>> +               if (folio) {
> >>> +                       clear_huge_page(&folio->page, addr, 1 << order);
> >>
> >> Minor.
> >>
> >> Do we have to constantly clear a huge page? Is it possible to let
> >> post_alloc_hook()
> >> finish this job by using __GFP_ZERO/__GFP_ZEROTAGS as
> >> vma_alloc_zeroed_movable_folio() is doing?
>
> I'm currently following the same allocation pattern as is done for PMD-sized
> THP. In earlier versions of this patch I was trying to be smarter and use the
> __GFP_ZERO/__GFP_ZEROTAGS as you suggest, but I was advised to keep it simple
> and follow the existing pattern.
>
> I have a vague recollection __GFP_ZERO is not preferred for large folios because
> of some issue with virtually indexed caches? (Matthew: did I see you mention
> that in some other context?)
>
> That said, I wasn't aware that Android ships with
> CONFIG_INIT_ON_ALLOC_DEFAULT_ON (I thought it was only used as a debug option),
> so I can see the potential for some overhead reduction here.
>
> Options:
>
>  1) leave it as is and accept the duplicated clearing
>  2) Pass __GFP_ZERO and remove clear_huge_page()
>  3) define __GFP_SKIP_ZERO even when kasan is not enabled and pass it down so
>     clear_huge_page() is the only clear
>  4) make clear_huge_page() conditional on !want_init_on_alloc()
>
> I prefer option 4. What do you think?

Either 1 or 4 is OK to me if we eventually remove this duplicated
clear_huge_page() on top; 4 is even better as it at least resolves the
problem for now.

in Android gki_defconfig,
https://android.googlesource.com/kernel/common/+/refs/heads/android14-6.1-lts/arch/arm64/configs/gki_defconfig

Android always has the below,
CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y

here is some explanation for the reason,
https://source.android.com/docs/security/test/memory-safety/zero-initialized-memory

>
> As an aside, I've also noticed that clear_huge_page() should take vmf->address
> so that it clears the faulting page last to keep the cache hot. If we decide on
> an option that keeps clear_huge_page(), I'll also make that change.
>
> Thanks,
> Ryan
>
> >>

Thanks
Barry
Ryan Roberts Dec. 6, 2023, 10:15 a.m. UTC | #8
On 05/12/2023 20:16, Barry Song wrote:
> On Tue, Dec 5, 2023 at 11:48 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> On 05/12/2023 01:24, Barry Song wrote:
>>> On Tue, Dec 5, 2023 at 9:15 AM Barry Song <21cnbao@gmail.com> wrote:
>>>>
>>>> On Mon, Dec 4, 2023 at 6:21 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>>
>>>>> Introduce the logic to allow THP to be configured (through the new sysfs
>>>>> interface we just added) to allocate large folios to back anonymous
>>>>> memory, which are larger than the base page size but smaller than
>>>>> PMD-size. We call this new THP extension "multi-size THP" (mTHP).
>>>>>
>>>>> mTHP continues to be PTE-mapped, but in many cases can still provide
>>>>> similar benefits to traditional PMD-sized THP: Page faults are
>>>>> significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on
>>>>> the configured order), but latency spikes are much less prominent
>>>>> because the size of each page isn't as huge as the PMD-sized variant and
>>>>> there is less memory to clear in each page fault. The number of per-page
>>>>> operations (e.g. ref counting, rmap management, lru list management) are
>>>>> also significantly reduced since those ops now become per-folio.
>>>>>
>>>>> Some architectures also employ TLB compression mechanisms to squeeze
>>>>> more entries in when a set of PTEs are virtually and physically
>>>>> contiguous and approporiately aligned. In this case, TLB misses will
>>>>> occur less often.
>>>>>
>>>>> The new behaviour is disabled by default, but can be enabled at runtime
>>>>> by writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled
>>>>> (see documentation in previous commit). The long term aim is to change
>>>>> the default to include suitable lower orders, but there are some risks
>>>>> around internal fragmentation that need to be better understood first.
>>>>>
>>>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>>>> ---
>>>>>  include/linux/huge_mm.h |   6 ++-
>>>>>  mm/memory.c             | 106 ++++++++++++++++++++++++++++++++++++----
>>>>>  2 files changed, 101 insertions(+), 11 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>> index bd0eadd3befb..91a53b9835a4 100644
>>>>> --- a/include/linux/huge_mm.h
>>>>> +++ b/include/linux/huge_mm.h
>>>>> @@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr;
>>>>>  #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
>>>>>
>>>>>  /*
>>>>> - * Mask of all large folio orders supported for anonymous THP.
>>>>> + * Mask of all large folio orders supported for anonymous THP; all orders up to
>>>>> + * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
>>>>> + * (which is a limitation of the THP implementation).
>>>>>   */
>>>>> -#define THP_ORDERS_ALL_ANON    BIT(PMD_ORDER)
>>>>> +#define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
>>>>>
>>>>>  /*
>>>>>   * Mask of all large folio orders supported for file THP.
>>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>>> index 3ceeb0f45bf5..bf7e93813018 100644
>>>>> --- a/mm/memory.c
>>>>> +++ b/mm/memory.c
>>>>> @@ -4125,6 +4125,84 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>>>         return ret;
>>>>>  }
>>>>>
>>>>> +static bool pte_range_none(pte_t *pte, int nr_pages)
>>>>> +{
>>>>> +       int i;
>>>>> +
>>>>> +       for (i = 0; i < nr_pages; i++) {
>>>>> +               if (!pte_none(ptep_get_lockless(pte + i)))
>>>>> +                       return false;
>>>>> +       }
>>>>> +
>>>>> +       return true;
>>>>> +}
>>>>> +
>>>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>>> +static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>>>>> +{
>>>>> +       gfp_t gfp;
>>>>> +       pte_t *pte;
>>>>> +       unsigned long addr;
>>>>> +       struct folio *folio;
>>>>> +       struct vm_area_struct *vma = vmf->vma;
>>>>> +       unsigned long orders;
>>>>> +       int order;
>>>>> +
>>>>> +       /*
>>>>> +        * If uffd is active for the vma we need per-page fault fidelity to
>>>>> +        * maintain the uffd semantics.
>>>>> +        */
>>>>> +       if (userfaultfd_armed(vma))
>>>>> +               goto fallback;
>>>>> +
>>>>> +       /*
>>>>> +        * Get a list of all the (large) orders below PMD_ORDER that are enabled
>>>>> +        * for this vma. Then filter out the orders that can't be allocated over
>>>>> +        * the faulting address and still be fully contained in the vma.
>>>>> +        */
>>>>> +       orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
>>>>> +                                         BIT(PMD_ORDER) - 1);
>>>>> +       orders = thp_vma_suitable_orders(vma, vmf->address, orders);
>>>>> +
>>>>> +       if (!orders)
>>>>> +               goto fallback;
>>>>> +
>>>>> +       pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
>>>>> +       if (!pte)
>>>>> +               return ERR_PTR(-EAGAIN);
>>>>> +
>>>>> +       order = first_order(orders);
>>>>> +       while (orders) {
>>>>> +               addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>>>> +               vmf->pte = pte + pte_index(addr);
>>>>> +               if (pte_range_none(vmf->pte, 1 << order))
>>>>> +                       break;
>>>>> +               order = next_order(&orders, order);
>>>>> +       }
>>>>> +
>>>>> +       vmf->pte = NULL;
>>>>> +       pte_unmap(pte);
>>>>> +
>>>>> +       gfp = vma_thp_gfp_mask(vma);
>>>>> +
>>>>> +       while (orders) {
>>>>> +               addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>>>> +               folio = vma_alloc_folio(gfp, order, vma, addr, true);
>>>>> +               if (folio) {
>>>>> +                       clear_huge_page(&folio->page, addr, 1 << order);
>>>>
>>>> Minor.
>>>>
>>>> Do we have to constantly clear a huge page? Is it possible to let
>>>> post_alloc_hook()
>>>> finish this job by using __GFP_ZERO/__GFP_ZEROTAGS as
>>>> vma_alloc_zeroed_movable_folio() is doing?
>>
>> I'm currently following the same allocation pattern as is done for PMD-sized
>> THP. In earlier versions of this patch I was trying to be smarter and use the
>> __GFP_ZERO/__GFP_ZEROTAGS as you suggest, but I was advised to keep it simple
>> and follow the existing pattern.
>>
>> I have a vague recollection __GFP_ZERO is not preferred for large folios because
>> of some issue with virtually indexed caches? (Matthew: did I see you mention
>> that in some other context?)
>>
>> That said, I wasn't aware that Android ships with
>> CONFIG_INIT_ON_ALLOC_DEFAULT_ON (I thought it was only used as a debug option),
>> so I can see the potential for some overhead reduction here.
>>
>> Options:
>>
>>  1) leave it as is and accept the duplicated clearing
>>  2) Pass __GFP_ZERO and remove clear_huge_page()
>>  3) define __GFP_SKIP_ZERO even when kasan is not enabled and pass it down so
>>     clear_huge_page() is the only clear
>>  4) make clear_huge_page() conditional on !want_init_on_alloc()
>>
>> I prefer option 4. What do you think?
> 
> either 1 and 4 is ok to me if we will finally remove this duplicated
> clear_huge_page on top.
> 4 is even better as it can at least temporarily resolve the problem.

I'm going to stick with option 1 for this series. Then we can fix it uniformly
here and for PMD-sized THP in a separate patch (possibly with the approach
suggested in 4).
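
For reference, a minimal sketch of what option 4 could look like (untested, and
assuming want_init_on_alloc() is the right predicate to key off here):

	while (orders) {
		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
		folio = vma_alloc_folio(gfp, order, vma, addr, true);
		if (folio) {
			/*
			 * init_on_alloc has already zeroed the folio in the
			 * allocator, so only clear it explicitly when that
			 * didn't happen.
			 */
			if (!want_init_on_alloc(gfp))
				clear_huge_page(&folio->page, addr, 1 << order);
			return folio;
		}
		order = next_order(&orders, order);
	}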

> 
> in Android gki_defconfig,
> https://android.googlesource.com/kernel/common/+/refs/heads/android14-6.1-lts/arch/arm64/configs/gki_defconfig
> 
> Android always has the below,
> CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y
> 
> here is some explanation for the reason,
> https://source.android.com/docs/security/test/memory-safety/zero-initialized-memory
> 
>>
>> As an aside, I've also noticed that clear_huge_page() should take vmf->address
>> so that it clears the faulting page last to keep the cache hot. If we decide on
>> an option that keeps clear_huge_page(), I'll also make that change.

I'll make this change for the next version.
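
Concretely that should just be a case of passing the faulting address as the
hint (a sketch; everything else stays as in the patch above):

	/*
	 * Passing vmf->address as the hint lets clear_huge_page() clear the
	 * faulting subpage last, so it is still hot in the cache on return.
	 */
	clear_huge_page(&folio->page, vmf->address, 1 << order);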

>>
>> Thanks,
>> Ryan
>>
>>>>
> 
> Thanks
> Barry
Barry Song Dec. 6, 2023, 10:25 a.m. UTC | #9
On Wed, Dec 6, 2023 at 11:16 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 05/12/2023 20:16, Barry Song wrote:
> > On Tue, Dec 5, 2023 at 11:48 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>
> >> On 05/12/2023 01:24, Barry Song wrote:
> >>> On Tue, Dec 5, 2023 at 9:15 AM Barry Song <21cnbao@gmail.com> wrote:
> >>>>
> >>>> On Mon, Dec 4, 2023 at 6:21 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>>>>
> >>>>> Introduce the logic to allow THP to be configured (through the new sysfs
> >>>>> interface we just added) to allocate large folios to back anonymous
> >>>>> memory, which are larger than the base page size but smaller than
> >>>>> PMD-size. We call this new THP extension "multi-size THP" (mTHP).
> >>>>>
> >>>>> mTHP continues to be PTE-mapped, but in many cases can still provide
> >>>>> similar benefits to traditional PMD-sized THP: Page faults are
> >>>>> significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on
> >>>>> the configured order), but latency spikes are much less prominent
> >>>>> because the size of each page isn't as huge as the PMD-sized variant and
> >>>>> there is less memory to clear in each page fault. The number of per-page
> >>>>> operations (e.g. ref counting, rmap management, lru list management) are
> >>>>> also significantly reduced since those ops now become per-folio.
> >>>>>
> >>>>> Some architectures also employ TLB compression mechanisms to squeeze
> >>>>> more entries in when a set of PTEs are virtually and physically
> >>>>> contiguous and approporiately aligned. In this case, TLB misses will
> >>>>> occur less often.
> >>>>>
> >>>>> The new behaviour is disabled by default, but can be enabled at runtime
> >>>>> by writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled
> >>>>> (see documentation in previous commit). The long term aim is to change
> >>>>> the default to include suitable lower orders, but there are some risks
> >>>>> around internal fragmentation that need to be better understood first.
> >>>>>
> >>>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> >>>>> ---
> >>>>>  include/linux/huge_mm.h |   6 ++-
> >>>>>  mm/memory.c             | 106 ++++++++++++++++++++++++++++++++++++----
> >>>>>  2 files changed, 101 insertions(+), 11 deletions(-)
> >>>>>
> >>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>>>> index bd0eadd3befb..91a53b9835a4 100644
> >>>>> --- a/include/linux/huge_mm.h
> >>>>> +++ b/include/linux/huge_mm.h
> >>>>> @@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr;
> >>>>>  #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
> >>>>>
> >>>>>  /*
> >>>>> - * Mask of all large folio orders supported for anonymous THP.
> >>>>> + * Mask of all large folio orders supported for anonymous THP; all orders up to
> >>>>> + * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
> >>>>> + * (which is a limitation of the THP implementation).
> >>>>>   */
> >>>>> -#define THP_ORDERS_ALL_ANON    BIT(PMD_ORDER)
> >>>>> +#define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
> >>>>>
> >>>>>  /*
> >>>>>   * Mask of all large folio orders supported for file THP.
> >>>>> diff --git a/mm/memory.c b/mm/memory.c
> >>>>> index 3ceeb0f45bf5..bf7e93813018 100644
> >>>>> --- a/mm/memory.c
> >>>>> +++ b/mm/memory.c
> >>>>> @@ -4125,6 +4125,84 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >>>>>         return ret;
> >>>>>  }
> >>>>>
> >>>>> +static bool pte_range_none(pte_t *pte, int nr_pages)
> >>>>> +{
> >>>>> +       int i;
> >>>>> +
> >>>>> +       for (i = 0; i < nr_pages; i++) {
> >>>>> +               if (!pte_none(ptep_get_lockless(pte + i)))
> >>>>> +                       return false;
> >>>>> +       }
> >>>>> +
> >>>>> +       return true;
> >>>>> +}
> >>>>> +
> >>>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> >>>>> +static struct folio *alloc_anon_folio(struct vm_fault *vmf)
> >>>>> +{
> >>>>> +       gfp_t gfp;
> >>>>> +       pte_t *pte;
> >>>>> +       unsigned long addr;
> >>>>> +       struct folio *folio;
> >>>>> +       struct vm_area_struct *vma = vmf->vma;
> >>>>> +       unsigned long orders;
> >>>>> +       int order;
> >>>>> +
> >>>>> +       /*
> >>>>> +        * If uffd is active for the vma we need per-page fault fidelity to
> >>>>> +        * maintain the uffd semantics.
> >>>>> +        */
> >>>>> +       if (userfaultfd_armed(vma))
> >>>>> +               goto fallback;
> >>>>> +
> >>>>> +       /*
> >>>>> +        * Get a list of all the (large) orders below PMD_ORDER that are enabled
> >>>>> +        * for this vma. Then filter out the orders that can't be allocated over
> >>>>> +        * the faulting address and still be fully contained in the vma.
> >>>>> +        */
> >>>>> +       orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
> >>>>> +                                         BIT(PMD_ORDER) - 1);
> >>>>> +       orders = thp_vma_suitable_orders(vma, vmf->address, orders);
> >>>>> +
> >>>>> +       if (!orders)
> >>>>> +               goto fallback;
> >>>>> +
> >>>>> +       pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
> >>>>> +       if (!pte)
> >>>>> +               return ERR_PTR(-EAGAIN);
> >>>>> +
> >>>>> +       order = first_order(orders);
> >>>>> +       while (orders) {
> >>>>> +               addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> >>>>> +               vmf->pte = pte + pte_index(addr);
> >>>>> +               if (pte_range_none(vmf->pte, 1 << order))
> >>>>> +                       break;
> >>>>> +               order = next_order(&orders, order);
> >>>>> +       }
> >>>>> +
> >>>>> +       vmf->pte = NULL;
> >>>>> +       pte_unmap(pte);
> >>>>> +
> >>>>> +       gfp = vma_thp_gfp_mask(vma);
> >>>>> +
> >>>>> +       while (orders) {
> >>>>> +               addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> >>>>> +               folio = vma_alloc_folio(gfp, order, vma, addr, true);
> >>>>> +               if (folio) {
> >>>>> +                       clear_huge_page(&folio->page, addr, 1 << order);
> >>>>
> >>>> Minor.
> >>>>
> >>>> Do we have to constantly clear a huge page? Is it possible to let
> >>>> post_alloc_hook()
> >>>> finish this job by using __GFP_ZERO/__GFP_ZEROTAGS as
> >>>> vma_alloc_zeroed_movable_folio() is doing?
> >>
> >> I'm currently following the same allocation pattern as is done for PMD-sized
> >> THP. In earlier versions of this patch I was trying to be smarter and use the
> >> __GFP_ZERO/__GFP_ZEROTAGS as you suggest, but I was advised to keep it simple
> >> and follow the existing pattern.
> >>
> >> I have a vague recollection __GFP_ZERO is not preferred for large folios because
> >> of some issue with virtually indexed caches? (Matthew: did I see you mention
> >> that in some other context?)
> >>
> >> That said, I wasn't aware that Android ships with
> >> CONFIG_INIT_ON_ALLOC_DEFAULT_ON (I thought it was only used as a debug option),
> >> so I can see the potential for some overhead reduction here.
> >>
> >> Options:
> >>
> >>  1) leave it as is and accept the duplicated clearing
> >>  2) Pass __GFP_ZERO and remove clear_huge_page()
> >>  3) define __GFP_SKIP_ZERO even when kasan is not enabled and pass it down so
> >>     clear_huge_page() is the only clear
> >>  4) make clear_huge_page() conditional on !want_init_on_alloc()
> >>
> >> I prefer option 4. What do you think?
> >
> > either 1 and 4 is ok to me if we will finally remove this duplicated
> > clear_huge_page on top.
> > 4 is even better as it can at least temporarily resolve the problem.
>
> I'm going to stick with option 1 for this series. Then we can fix it uniformly
> here and for PMD-sized THP in a separate patch (possibly with the approach
> suggested in 4).

Ok. Thanks. There is no one fixing PMD-sized THP, probably because PMD-sized
THP is shut down immediately after Android boots :-)

>
> >
> > in Android gki_defconfig,
> > https://android.googlesource.com/kernel/common/+/refs/heads/android14-6.1-lts/arch/arm64/configs/gki_defconfig
> >
> > Android always has the below,
> > CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y
> >
> > here is some explanation for the reason,
> > https://source.android.com/docs/security/test/memory-safety/zero-initialized-memory
> >
> >>
> >> As an aside, I've also noticed that clear_huge_page() should take vmf->address
> >> so that it clears the faulting page last to keep the cache hot. If we decide on
> >> an option that keeps clear_huge_page(), I'll also make that change.
>
> I'll make this change for the next version.
>
> >>
> >> Thanks,
> >> Ryan
> >>

Thanks
Barry
Ryan Roberts Dec. 6, 2023, 2:19 p.m. UTC | #10
On 05/12/2023 16:32, David Hildenbrand wrote:
> On 04.12.23 11:20, Ryan Roberts wrote:
>> Introduce the logic to allow THP to be configured (through the new sysfs
>> interface we just added) to allocate large folios to back anonymous
>> memory, which are larger than the base page size but smaller than
>> PMD-size. We call this new THP extension "multi-size THP" (mTHP).
>>
>> mTHP continues to be PTE-mapped, but in many cases can still provide
>> similar benefits to traditional PMD-sized THP: Page faults are
>> significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on
>> the configured order), but latency spikes are much less prominent
>> because the size of each page isn't as huge as the PMD-sized variant and
>> there is less memory to clear in each page fault. The number of per-page
>> operations (e.g. ref counting, rmap management, lru list management) are
>> also significantly reduced since those ops now become per-folio.
>>
>> Some architectures also employ TLB compression mechanisms to squeeze
>> more entries in when a set of PTEs are virtually and physically
>> contiguous and approporiately aligned. In this case, TLB misses will
>> occur less often.
>>
>> The new behaviour is disabled by default, but can be enabled at runtime
>> by writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled
>> (see documentation in previous commit). The long term aim is to change
>> the default to include suitable lower orders, but there are some risks
>> around internal fragmentation that need to be better understood first.
>>
>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> 
> In general, looks good to me, some comments/nits. And the usual "let's make sure
> we don't degrade order-0 and keep that as fast as possible" comment.
> 
>> ---
>>   include/linux/huge_mm.h |   6 ++-
>>   mm/memory.c             | 106 ++++++++++++++++++++++++++++++++++++----
>>   2 files changed, 101 insertions(+), 11 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index bd0eadd3befb..91a53b9835a4 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr;
>>   #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
>>     /*
>> - * Mask of all large folio orders supported for anonymous THP.
>> + * Mask of all large folio orders supported for anonymous THP; all orders up to
>> + * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
>> + * (which is a limitation of the THP implementation).
>>    */
>> -#define THP_ORDERS_ALL_ANON    BIT(PMD_ORDER)
>> +#define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
>>     /*
>>    * Mask of all large folio orders supported for file THP.
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 3ceeb0f45bf5..bf7e93813018 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4125,6 +4125,84 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>       return ret;
>>   }
>>   +static bool pte_range_none(pte_t *pte, int nr_pages)
>> +{
>> +    int i;
>> +
>> +    for (i = 0; i < nr_pages; i++) {
>> +        if (!pte_none(ptep_get_lockless(pte + i)))
>> +            return false;
>> +    }
>> +
>> +    return true;
>> +}
>> +
>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> +static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>> +{
>> +    gfp_t gfp;
>> +    pte_t *pte;
>> +    unsigned long addr;
>> +    struct folio *folio;
>> +    struct vm_area_struct *vma = vmf->vma;
>> +    unsigned long orders;
>> +    int order;
> 
> Nit: reverse christmas tree encouraged ;)

ACK will fix.

> 
>> +
>> +    /*
>> +     * If uffd is active for the vma we need per-page fault fidelity to
>> +     * maintain the uffd semantics.
>> +     */
>> +    if (userfaultfd_armed(vma))
> 
> Nit: unlikely()

ACK will fix.

> 
>> +        goto fallback;
>> +
>> +    /*
>> +     * Get a list of all the (large) orders below PMD_ORDER that are enabled
>> +     * for this vma. Then filter out the orders that can't be allocated over
>> +     * the faulting address and still be fully contained in the vma.
>> +     */
>> +    orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
>> +                      BIT(PMD_ORDER) - 1);
>> +    orders = thp_vma_suitable_orders(vma, vmf->address, orders);
> 
> Comment: Both will eventually loop over all orders, correct? Could eventually be
> sped up in the future.

No, only thp_vma_suitable_orders() will loop. thp_vma_allowable_orders() only
loops if in_pf=false (it's true here).

> 
> Nit: the orders = ... order = ... looks like this might deserve a helper
> function that makes this easier to read.

To be honest, the existing function that I've modified is a bit of a mess.
thp_vma_allowable_orders() calls thp_vma_suitable_orders() if we are not in a
page fault, because the page fault handlers already do that check themselves. It
would be nice to refactor the whole thing so that thp_vma_allowable_orders() is
a strict superset of thp_vma_suitable_orders(). Then this can just call
thp_vma_allowable_orders(). But that's going to start touching the PMD and PUD
handlers, so I'd prefer to leave that for a separate patch set.

> 
> Nit: Why call thp_vma_suitable_orders if the orders are already 0? Again, some
> helper might be reasonable where that is handled internally.

Because thp_vma_suitable_orders() will handle it safely and is inline, so it
should be just as efficient? This would go away with the refactoring described above.

> 
> Comment: For order-0 we'll always perform a function call to both
> thp_vma_allowable_orders() / thp_vma_suitable_orders(). We should perform some
> fast and efficient check if any <PMD THP are even enabled in the system / for
> this VMA, and in that case just fallback before doing more expensive checks.

thp_vma_allowable_orders() is inline as you mentioned.

I was deliberately trying to keep all the decision logic in one place
(thp_vma_suitable_orders) because it's already pretty complicated. But if you
insist, how about this in the header:

static inline
unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
				       unsigned long vm_flags, bool smaps,
				       bool in_pf, bool enforce_sysfs,
				       unsigned long orders)
{
	/* Optimization to check if required orders are enabled early. */
	if (enforce_sysfs && vma_is_anonymous(vma)) {
		unsigned long mask = READ_ONCE(huge_anon_orders_always);

		if (vm_flags & VM_HUGEPAGE)
			mask |= READ_ONCE(huge_anon_orders_madvise);
		if (hugepage_global_always() ||
			((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled()))
			mask |= READ_ONCE(huge_anon_orders_inherit);

		orders &= mask;
		if (!orders)
			return 0;
		
		enforce_sysfs = false;
	}

	return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf,
					  enforce_sysfs, orders);
}

Then the above check can be removed from __thp_vma_allowable_orders() - it will
still retain the `if (enforce_sysfs && !vma_is_anonymous(vma))` part.
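
From the caller's side the code stays exactly as in the patch; the point is
just that with the wrapper above the order-0 case bails out after a handful of
inline checks, e.g.:

	orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
					  BIT(PMD_ORDER) - 1);
	/*
	 * With the inline wrapper, orders is already 0 here when no relevant
	 * sysfs control is enabled, so we fall back without ever calling
	 * __thp_vma_allowable_orders().
	 */
	orders = thp_vma_suitable_orders(vma, vmf->address, orders);

	if (!orders)
		goto fallback;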


> 
>> +
>> +    if (!orders)
>> +        goto fallback;
>> +
>> +    pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
>> +    if (!pte)
>> +        return ERR_PTR(-EAGAIN);
>> +
>> +    order = first_order(orders);
>> +    while (orders) {
>> +        addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>> +        vmf->pte = pte + pte_index(addr);
>> +        if (pte_range_none(vmf->pte, 1 << order))
>> +            break;
> 
> Comment: Likely it would make sense to scan only once and determine the "largest
> none range" around that address, having the largest suitable order in mind.

Yes, that's how I used to do it, but Yu Zhou requested simplifying to this,
IIRC. Perhaps this is an optimization opportunity for later?

> 
>> +        order = next_order(&orders, order);
>> +    }
>> +
>> +    vmf->pte = NULL;
> 
> Nit: Can you elaborate why you are messing with vmf->pte here? A simple helper
> variable will make this code look less magical. Unless I am missing something
> important :)

Gahh, I used to pass the vmf to what pte_range_none() was refactored into (an
approach that was suggested by Yu Zhou IIRC). But since I did some refactoring
based on some comments from JohnH, I see I don't need that anymore. Agreed; it
will be much clearer just to use a local variable. Will fix.
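
Something like this is what I have in mind (just a sketch of that cleanup, not
the final v9 code):

	pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
	if (!pte)
		return ERR_PTR(-EAGAIN);

	order = first_order(orders);
	while (orders) {
		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
		/* Use the local mapping directly; nothing needs stashing in vmf->pte. */
		if (pte_range_none(pte + pte_index(addr), 1 << order))
			break;
		order = next_order(&orders, order);
	}

	pte_unmap(pte);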

> 
>> +    pte_unmap(pte);
>> +
>> +    gfp = vma_thp_gfp_mask(vma);
>> +
>> +    while (orders) {
>> +        addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>> +        folio = vma_alloc_folio(gfp, order, vma, addr, true);
>> +        if (folio) {
>> +            clear_huge_page(&folio->page, addr, 1 << order);
>> +            return folio;
>> +        }
>> +        order = next_order(&orders, order);
>> +    }
>> +
> 
> Question: would it make sense to combine both loops? I suspect memory
> allocations with pte_offset_map()/kmap are problematic.

They are both operating on separate orders; next_order() is "consuming" an order
by removing the current one from the orders bitfield and returning the next one.

So the first loop starts at the highest order and keeps checking lower orders
until one fully fits in the VMA. And the second loop starts at the first order
that was found to fully fit and loops to lower orders until an allocation is
successful.

So I don't see a need to combine the loops.
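
To make the "consuming" part concrete, first_order()/next_order() behave
roughly like this (a sketch matching the description above, not the exact
definitions from the earlier patch in the series):

	static inline int first_order(unsigned long orders)
	{
		/* Highest remaining order, i.e. the highest set bit. */
		return fls_long(orders) - 1;
	}

	static inline int next_order(unsigned long *orders, int prev)
	{
		/* Drop the order we just tried and return the next highest. */
		*orders &= ~BIT(prev);
		return first_order(*orders);
	}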

> 
>> +fallback:
>> +    return vma_alloc_zeroed_movable_folio(vma, vmf->address);
>> +}
>> +#else
>> +#define alloc_anon_folio(vmf) \
>> +        vma_alloc_zeroed_movable_folio((vmf)->vma, (vmf)->address)
>> +#endif
>> +
>>   /*
>>    * We enter with non-exclusive mmap_lock (to exclude vma changes,
>>    * but allow concurrent faults), and pte mapped but not yet locked.
>> @@ -4132,6 +4210,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>    */
>>   static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>   {
>> +    int i;
>> +    int nr_pages = 1;
>> +    unsigned long addr = vmf->address;
>>       bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
>>       struct vm_area_struct *vma = vmf->vma;
>>       struct folio *folio;
> 
> Nit: reverse christmas tree :)

ACK

> 
>> @@ -4176,10 +4257,15 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>       /* Allocate our own private page. */
>>       if (unlikely(anon_vma_prepare(vma)))
>>           goto oom;
>> -    folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
>> +    folio = alloc_anon_folio(vmf);
>> +    if (IS_ERR(folio))
>> +        return 0;
>>       if (!folio)
>>           goto oom;
>>   +    nr_pages = folio_nr_pages(folio);
>> +    addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
>> +
>>       if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
>>           goto oom_free_page;
>>       folio_throttle_swaprate(folio, GFP_KERNEL);
>> @@ -4196,12 +4282,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>       if (vma->vm_flags & VM_WRITE)
>>           entry = pte_mkwrite(pte_mkdirty(entry), vma);
>>   -    vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
>> -            &vmf->ptl);
>> +    vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
>>       if (!vmf->pte)
>>           goto release;
>> -    if (vmf_pte_changed(vmf)) {
>> -        update_mmu_tlb(vma, vmf->address, vmf->pte);
>> +    if ((nr_pages == 1 && vmf_pte_changed(vmf)) ||
>> +        (nr_pages  > 1 && !pte_range_none(vmf->pte, nr_pages))) {
>> +        for (i = 0; i < nr_pages; i++)
>> +            update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
> 
> Comment: separating the order-0 case from the other case might make this easier
> to read.

Yeah fair enough. Will fix.

> 
>>           goto release;
>>       }
>>   @@ -4216,16 +4303,17 @@ static vm_fault_t do_anonymous_page(struct vm_fault
>> *vmf)
>>           return handle_userfault(vmf, VM_UFFD_MISSING);
>>       }
>>   -    inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
>> -    folio_add_new_anon_rmap(folio, vma, vmf->address);
>> +    folio_ref_add(folio, nr_pages - 1);
>> +    add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
>> +    folio_add_new_anon_rmap(folio, vma, addr);
>>       folio_add_lru_vma(folio, vma);
>>   setpte:
>>       if (uffd_wp)
>>           entry = pte_mkuffd_wp(entry);
>> -    set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
>> +    set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);
>>         /* No need to invalidate - it was non-present before */
>> -    update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
>> +    update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages);
>>   unlock:
>>       if (vmf->pte)
>>           pte_unmap_unlock(vmf->pte, vmf->ptl);
> 
> Benchmarking order-0 allocations might be interesting. There will be some added
> checks + multiple loops/conditionals for order-0 that could be avoided by having
> two separate code paths. If we can't measure a difference, all good.

Yep will do - will post numbers once I have them. I've been assuming that the
major cost is clearing the page, but perhaps I'm wrong.

>
Ryan Roberts Dec. 6, 2023, 3:44 p.m. UTC | #11
On 06/12/2023 14:19, Ryan Roberts wrote:
> On 05/12/2023 16:32, David Hildenbrand wrote:
>> On 04.12.23 11:20, Ryan Roberts wrote:
>>> Introduce the logic to allow THP to be configured (through the new sysfs
>>> interface we just added) to allocate large folios to back anonymous
>>> memory, which are larger than the base page size but smaller than
>>> PMD-size. We call this new THP extension "multi-size THP" (mTHP).
>>>
>>> mTHP continues to be PTE-mapped, but in many cases can still provide
>>> similar benefits to traditional PMD-sized THP: Page faults are
>>> significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on
>>> the configured order), but latency spikes are much less prominent
>>> because the size of each page isn't as huge as the PMD-sized variant and
>>> there is less memory to clear in each page fault. The number of per-page
>>> operations (e.g. ref counting, rmap management, lru list management) are
>>> also significantly reduced since those ops now become per-folio.
>>>
>>> Some architectures also employ TLB compression mechanisms to squeeze
>>> more entries in when a set of PTEs are virtually and physically
>>> contiguous and approporiately aligned. In this case, TLB misses will
>>> occur less often.
>>>
>>> The new behaviour is disabled by default, but can be enabled at runtime
>>> by writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled
>>> (see documentation in previous commit). The long term aim is to change
>>> the default to include suitable lower orders, but there are some risks
>>> around internal fragmentation that need to be better understood first.
>>>
>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>
>> In general, looks good to me, some comments/nits. And the usual "let's make sure
>> we don't degrade order-0 and keep that as fast as possible" comment.
>>
>>> ---
>>>   include/linux/huge_mm.h |   6 ++-
>>>   mm/memory.c             | 106 ++++++++++++++++++++++++++++++++++++----
>>>   2 files changed, 101 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index bd0eadd3befb..91a53b9835a4 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr;
>>>   #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
>>>     /*
>>> - * Mask of all large folio orders supported for anonymous THP.
>>> + * Mask of all large folio orders supported for anonymous THP; all orders up to
>>> + * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
>>> + * (which is a limitation of the THP implementation).
>>>    */
>>> -#define THP_ORDERS_ALL_ANON    BIT(PMD_ORDER)
>>> +#define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
>>>     /*
>>>    * Mask of all large folio orders supported for file THP.
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index 3ceeb0f45bf5..bf7e93813018 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -4125,6 +4125,84 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>       return ret;
>>>   }
>>>   +static bool pte_range_none(pte_t *pte, int nr_pages)
>>> +{
>>> +    int i;
>>> +
>>> +    for (i = 0; i < nr_pages; i++) {
>>> +        if (!pte_none(ptep_get_lockless(pte + i)))
>>> +            return false;
>>> +    }
>>> +
>>> +    return true;
>>> +}
>>> +
>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> +static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>>> +{
>>> +    gfp_t gfp;
>>> +    pte_t *pte;
>>> +    unsigned long addr;
>>> +    struct folio *folio;
>>> +    struct vm_area_struct *vma = vmf->vma;
>>> +    unsigned long orders;
>>> +    int order;
>>
>> Nit: reverse christmas tree encouraged ;)
> 
> ACK will fix.
> 
>>
>>> +
>>> +    /*
>>> +     * If uffd is active for the vma we need per-page fault fidelity to
>>> +     * maintain the uffd semantics.
>>> +     */
>>> +    if (userfaultfd_armed(vma))
>>
>> Nit: unlikely()
> 
> ACK will fix.
> 
>>
>>> +        goto fallback;
>>> +
>>> +    /*
>>> +     * Get a list of all the (large) orders below PMD_ORDER that are enabled
>>> +     * for this vma. Then filter out the orders that can't be allocated over
>>> +     * the faulting address and still be fully contained in the vma.
>>> +     */
>>> +    orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
>>> +                      BIT(PMD_ORDER) - 1);
>>> +    orders = thp_vma_suitable_orders(vma, vmf->address, orders);
>>
>> Comment: Both will eventually loop over all orders, correct? Could eventually be
>> sped up in the future.
> 
> No only thp_vma_suitable_orders() will loop. thp_vma_allowable_orders() only
> loops if in_pf=false (it's true here).
> 
>>
>> Nit: the orders = ... order = ... looks like this might deserve a helper
>> function that makes this easier to read.
> 
> To be honest, the existing function that I've modified is a bit of a mess.
> thp_vma_allowable_orders() calls thp_vma_suitable_orders() if we are not in a
> page fault, because the page fault handlers already do that check themselves. It
> would be nice to refactor the whole thing so that thp_vma_allowable_orders() is
> a strict superset of thp_vma_suitable_orders(). Then this can just call
> thp_vma_allowable_orders(). But that's going to start touching the PMD and PUD
> handlers, so prefer if we leave that for a separate patch set.
> 
>>
>> Nit: Why call thp_vma_suitable_orders if the orders are already 0? Again, some
>> helper might be reasonable where that is handled internally.
> 
> Because thp_vma_suitable_orders() will handle it safely and is inline, so it
> should just as efficient? This would go away with the refactoring described above.
> 
>>
>> Comment: For order-0 we'll always perform a function call to both
>> thp_vma_allowable_orders() / thp_vma_suitable_orders(). We should perform some
>> fast and efficient check if any <PMD THP are even enabled in the system / for
>> this VMA, and in that case just fallback before doing more expensive checks.
> 

I just noticed I got these functions round the wrong way in my previous response:

> thp_vma_allowable_orders() is inline as you mentioned.

^ Meant thp_vma_suitable_orders() here.

> 
> I was deliberately trying to keep all the decision logic in one place
> (thp_vma_suitable_orders) because it's already pretty complicated. But if you

^ Meant thp_vma_allowable_orders() here.

Sorry for the confusion.

> insist, how about this in the header:
> 
> static inline
> unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
> 				       unsigned long vm_flags, bool smaps,
> 				       bool in_pf, bool enforce_sysfs,
> 				       unsigned long orders)
> {
> 	/* Optimization to check if required orders are enabled early. */
> 	if (enforce_sysfs && vma_is_anonymous(vma)) {
> 		unsigned long mask = READ_ONCE(huge_anon_orders_always);
> 
> 		if (vm_flags & VM_HUGEPAGE)
> 			mask |= READ_ONCE(huge_anon_orders_madvise);
> 		if (hugepage_global_always() ||
> 			((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled()))
> 			mask |= READ_ONCE(huge_anon_orders_inherit);
> 
> 		orders &= mask;
> 		if (!orders)
> 			return 0;
> 		
> 		enforce_sysfs = false;
> 	}
> 
> 	return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf,
> 					  enforce_sysfs, orders);
> }
> 
> Then the above check can be removed from __thp_vma_allowable_orders() - it will
> still retain the `if (enforce_sysfs && !vma_is_anonymous(vma))` part.
> 
> 
>>
>>> +
>>> +    if (!orders)
>>> +        goto fallback;
>>> +
>>> +    pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
>>> +    if (!pte)
>>> +        return ERR_PTR(-EAGAIN);
>>> +
>>> +    order = first_order(orders);
>>> +    while (orders) {
>>> +        addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>> +        vmf->pte = pte + pte_index(addr);
>>> +        if (pte_range_none(vmf->pte, 1 << order))
>>> +            break;
>>
>> Comment: Likely it would make sense to scan only once and determine the "largest
>> none range" around that address, having the largest suitable order in mind.
> 
> Yes, that's how I used to do it, but Yu Zhou requested simplifying to this,
> IIRC. Perhaps this an optimization opportunity for later?
> 
>>
>>> +        order = next_order(&orders, order);
>>> +    }
>>> +
>>> +    vmf->pte = NULL;
>>
>> Nit: Can you elaborate why you are messing with vmf->pte here? A simple helper
>> variable will make this code look less magical. Unless I am missing something
>> important :)
> 
> Gahh, I used to pass the vmf to what pte_range_none() was refactored into (an
> approach that was suggested by Yu Zhou IIRC). But since I did some refactoring
> based on some comments from JohnH, I see I don't need that anymore. Agreed; it
> will be much clearer just to use a local variable. Will fix.
> 
>>
>>> +    pte_unmap(pte);
>>> +
>>> +    gfp = vma_thp_gfp_mask(vma);
>>> +
>>> +    while (orders) {
>>> +        addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>> +        folio = vma_alloc_folio(gfp, order, vma, addr, true);
>>> +        if (folio) {
>>> +            clear_huge_page(&folio->page, addr, 1 << order);
>>> +            return folio;
>>> +        }
>>> +        order = next_order(&orders, order);
>>> +    }
>>> +
>>
>> Question: would it make sense to combine both loops? I suspect memory
>> allocations with pte_offset_map()/kmap are problematic.
> 
> They are both operating on separate orders; next_order() is "consuming" an order
> by removing the current one from the orders bitfield and returning the next one.
> 
> So the first loop starts at the highest order and keeps checking lower orders
> until one fully fits in the VMA. And the second loop starts at the first order
> that was found to fully fits and loops to lower orders until an allocation is
> successful.
> 
> So I don't see a need to combine the loops.
> 
>>
>>> +fallback:
>>> +    return vma_alloc_zeroed_movable_folio(vma, vmf->address);
>>> +}
>>> +#else
>>> +#define alloc_anon_folio(vmf) \
>>> +        vma_alloc_zeroed_movable_folio((vmf)->vma, (vmf)->address)
>>> +#endif
>>> +
>>>   /*
>>>    * We enter with non-exclusive mmap_lock (to exclude vma changes,
>>>    * but allow concurrent faults), and pte mapped but not yet locked.
>>> @@ -4132,6 +4210,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>    */
>>>   static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>>   {
>>> +    int i;
>>> +    int nr_pages = 1;
>>> +    unsigned long addr = vmf->address;
>>>       bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
>>>       struct vm_area_struct *vma = vmf->vma;
>>>       struct folio *folio;
>>
>> Nit: reverse christmas tree :)
> 
> ACK
> 
>>
>>> @@ -4176,10 +4257,15 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>>       /* Allocate our own private page. */
>>>       if (unlikely(anon_vma_prepare(vma)))
>>>           goto oom;
>>> -    folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
>>> +    folio = alloc_anon_folio(vmf);
>>> +    if (IS_ERR(folio))
>>> +        return 0;
>>>       if (!folio)
>>>           goto oom;
>>>   +    nr_pages = folio_nr_pages(folio);
>>> +    addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
>>> +
>>>       if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
>>>           goto oom_free_page;
>>>       folio_throttle_swaprate(folio, GFP_KERNEL);
>>> @@ -4196,12 +4282,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>>       if (vma->vm_flags & VM_WRITE)
>>>           entry = pte_mkwrite(pte_mkdirty(entry), vma);
>>>   -    vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
>>> -            &vmf->ptl);
>>> +    vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
>>>       if (!vmf->pte)
>>>           goto release;
>>> -    if (vmf_pte_changed(vmf)) {
>>> -        update_mmu_tlb(vma, vmf->address, vmf->pte);
>>> +    if ((nr_pages == 1 && vmf_pte_changed(vmf)) ||
>>> +        (nr_pages  > 1 && !pte_range_none(vmf->pte, nr_pages))) {
>>> +        for (i = 0; i < nr_pages; i++)
>>> +            update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
>>
>> Comment: separating the order-0 case from the other case might make this easier
>> to read.
> 
> Yeah fair enough. Will fix.
> 
>>
>>>           goto release;
>>>       }
>>>   @@ -4216,16 +4303,17 @@ static vm_fault_t do_anonymous_page(struct vm_fault
>>> *vmf)
>>>           return handle_userfault(vmf, VM_UFFD_MISSING);
>>>       }
>>>   -    inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
>>> -    folio_add_new_anon_rmap(folio, vma, vmf->address);
>>> +    folio_ref_add(folio, nr_pages - 1);
>>> +    add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
>>> +    folio_add_new_anon_rmap(folio, vma, addr);
>>>       folio_add_lru_vma(folio, vma);
>>>   setpte:
>>>       if (uffd_wp)
>>>           entry = pte_mkuffd_wp(entry);
>>> -    set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
>>> +    set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);
>>>         /* No need to invalidate - it was non-present before */
>>> -    update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
>>> +    update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages);
>>>   unlock:
>>>       if (vmf->pte)
>>>           pte_unmap_unlock(vmf->pte, vmf->ptl);
>>
>> Benchmarking order-0 allocations might be interesting. There will be some added
>> checks + multiple loops/conditionals for order-0 that could be avoided by having
>> two separate code paths. If we can't measure a difference, all good.
> 
> Yep will do - will post numbers once I have them. I've been assuming that the
> major cost is clearing the page, but perhaps I'm wrong.
> 
>>
>
Ryan Roberts Dec. 7, 2023, 10:37 a.m. UTC | #12
On 06/12/2023 15:44, Ryan Roberts wrote:
> On 06/12/2023 14:19, Ryan Roberts wrote:
>> On 05/12/2023 16:32, David Hildenbrand wrote:
>>> On 04.12.23 11:20, Ryan Roberts wrote:
>>>> Introduce the logic to allow THP to be configured (through the new sysfs
>>>> interface we just added) to allocate large folios to back anonymous
>>>> memory, which are larger than the base page size but smaller than
>>>> PMD-size. We call this new THP extension "multi-size THP" (mTHP).
>>>>
>>>> mTHP continues to be PTE-mapped, but in many cases can still provide
>>>> similar benefits to traditional PMD-sized THP: Page faults are
>>>> significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on
>>>> the configured order), but latency spikes are much less prominent
>>>> because the size of each page isn't as huge as the PMD-sized variant and
>>>> there is less memory to clear in each page fault. The number of per-page
>>>> operations (e.g. ref counting, rmap management, lru list management) are
>>>> also significantly reduced since those ops now become per-folio.
>>>>
>>>> Some architectures also employ TLB compression mechanisms to squeeze
>>>> more entries in when a set of PTEs are virtually and physically
>>>> contiguous and approporiately aligned. In this case, TLB misses will
>>>> occur less often.
>>>>
>>>> The new behaviour is disabled by default, but can be enabled at runtime
>>>> by writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled
>>>> (see documentation in previous commit). The long term aim is to change
>>>> the default to include suitable lower orders, but there are some risks
>>>> around internal fragmentation that need to be better understood first.
>>>>
>>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>>
>>> In general, looks good to me, some comments/nits. And the usual "let's make sure
>>> we don't degrade order-0 and keep that as fast as possible" comment.
>>>
>>>> ---
>>>>   include/linux/huge_mm.h |   6 ++-
>>>>   mm/memory.c             | 106 ++++++++++++++++++++++++++++++++++++----
>>>>   2 files changed, 101 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>> index bd0eadd3befb..91a53b9835a4 100644
>>>> --- a/include/linux/huge_mm.h
>>>> +++ b/include/linux/huge_mm.h
>>>> @@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr;
>>>>   #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
>>>>     /*
>>>> - * Mask of all large folio orders supported for anonymous THP.
>>>> + * Mask of all large folio orders supported for anonymous THP; all orders up to
>>>> + * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
>>>> + * (which is a limitation of the THP implementation).
>>>>    */
>>>> -#define THP_ORDERS_ALL_ANON    BIT(PMD_ORDER)
>>>> +#define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
>>>>     /*
>>>>    * Mask of all large folio orders supported for file THP.
>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>> index 3ceeb0f45bf5..bf7e93813018 100644
>>>> --- a/mm/memory.c
>>>> +++ b/mm/memory.c
>>>> @@ -4125,6 +4125,84 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>>       return ret;
>>>>   }
>>>>   +static bool pte_range_none(pte_t *pte, int nr_pages)
>>>> +{
>>>> +    int i;
>>>> +
>>>> +    for (i = 0; i < nr_pages; i++) {
>>>> +        if (!pte_none(ptep_get_lockless(pte + i)))
>>>> +            return false;
>>>> +    }
>>>> +
>>>> +    return true;
>>>> +}
>>>> +
>>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>> +static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>>>> +{
>>>> +    gfp_t gfp;
>>>> +    pte_t *pte;
>>>> +    unsigned long addr;
>>>> +    struct folio *folio;
>>>> +    struct vm_area_struct *vma = vmf->vma;
>>>> +    unsigned long orders;
>>>> +    int order;
>>>
>>> Nit: reverse christmas tree encouraged ;)
>>
>> ACK will fix.
>>
>>>
>>>> +
>>>> +    /*
>>>> +     * If uffd is active for the vma we need per-page fault fidelity to
>>>> +     * maintain the uffd semantics.
>>>> +     */
>>>> +    if (userfaultfd_armed(vma))
>>>
>>> Nit: unlikely()
>>
>> ACK will fix.
>>
>>>
>>>> +        goto fallback;
>>>> +
>>>> +    /*
>>>> +     * Get a list of all the (large) orders below PMD_ORDER that are enabled
>>>> +     * for this vma. Then filter out the orders that can't be allocated over
>>>> +     * the faulting address and still be fully contained in the vma.
>>>> +     */
>>>> +    orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
>>>> +                      BIT(PMD_ORDER) - 1);
>>>> +    orders = thp_vma_suitable_orders(vma, vmf->address, orders);
>>>
>>> Comment: Both will eventually loop over all orders, correct? Could eventually be
>>> sped up in the future.
>>
>> No only thp_vma_suitable_orders() will loop. thp_vma_allowable_orders() only
>> loops if in_pf=false (it's true here).
>>
>>>
>>> Nit: the orders = ... order = ... looks like this might deserve a helper
>>> function that makes this easier to read.
>>
>> To be honest, the existing function that I've modified is a bit of a mess.
>> thp_vma_allowable_orders() calls thp_vma_suitable_orders() if we are not in a
>> page fault, because the page fault handlers already do that check themselves. It
>> would be nice to refactor the whole thing so that thp_vma_allowable_orders() is
>> a strict superset of thp_vma_suitable_orders(). Then this can just call
>> thp_vma_allowable_orders(). But that's going to start touching the PMD and PUD
>> handlers, so prefer if we leave that for a separate patch set.
>>
>>>
>>> Nit: Why call thp_vma_suitable_orders if the orders are already 0? Again, some
>>> helper might be reasonable where that is handled internally.
>>
>> Because thp_vma_suitable_orders() will handle it safely and is inline, so it
>> should just as efficient? This would go away with the refactoring described above.
>>
>>>
>>> Comment: For order-0 we'll always perform a function call to both
>>> thp_vma_allowable_orders() / thp_vma_suitable_orders(). We should perform some
>>> fast and efficient check if any <PMD THP are even enabled in the system / for
>>> this VMA, and in that case just fallback before doing more expensive checks.
>>
> 
> I just noticed I got these functions round the wrong way in my previous response:
> 
>> thp_vma_allowable_orders() is inline as you mentioned.
> 
> ^ Meant thp_vma_suitable_orders() here.
> 
>>
>> I was deliberately trying to keep all the decision logic in one place
>> (thp_vma_suitable_orders) because it's already pretty complicated. But if you
> 
> ^ Meant thp_vma_allowable_orders() here.
> 
> Sorry for the confusion.
> 
>> insist, how about this in the header:
>>
>> static inline
>> unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
>> 				       unsigned long vm_flags, bool smaps,
>> 				       bool in_pf, bool enforce_sysfs,
>> 				       unsigned long orders)
>> {
>> 	/* Optimization to check if required orders are enabled early. */
>> 	if (enforce_sysfs && vma_is_anonymous(vma)) {
>> 		unsigned long mask = READ_ONCE(huge_anon_orders_always);
>>
>> 		if (vm_flags & VM_HUGEPAGE)
>> 			mask |= READ_ONCE(huge_anon_orders_madvise);
>> 		if (hugepage_global_always() ||
>> 			((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled()))
>> 			mask |= READ_ONCE(huge_anon_orders_inherit);
>>
>> 		orders &= mask;
>> 		if (!orders)
>> 			return 0;
>> 		
>> 		enforce_sysfs = false;
>> 	}
>>
>> 	return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf,
>> 					  enforce_sysfs, orders);
>> }
>>
>> Then the above check can be removed from __thp_vma_allowable_orders() - it will
>> still retain the `if (enforce_sysfs && !vma_is_anonymous(vma))` part.
>>
>>
>>>
>>>> +
>>>> +    if (!orders)
>>>> +        goto fallback;
>>>> +
>>>> +    pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
>>>> +    if (!pte)
>>>> +        return ERR_PTR(-EAGAIN);
>>>> +
>>>> +    order = first_order(orders);
>>>> +    while (orders) {
>>>> +        addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>>> +        vmf->pte = pte + pte_index(addr);
>>>> +        if (pte_range_none(vmf->pte, 1 << order))
>>>> +            break;
>>>
>>> Comment: Likely it would make sense to scan only once and determine the "largest
>>> none range" around that address, having the largest suitable order in mind.
>>
>> Yes, that's how I used to do it, but Yu Zhou requested simplifying to this,
>> IIRC. Perhaps this an optimization opportunity for later?
>>
>>>
>>>> +        order = next_order(&orders, order);
>>>> +    }
>>>> +
>>>> +    vmf->pte = NULL;
>>>
>>> Nit: Can you elaborate why you are messing with vmf->pte here? A simple helper
>>> variable will make this code look less magical. Unless I am missing something
>>> important :)
>>
>> Gahh, I used to pass the vmf to what pte_range_none() was refactored into (an
>> approach that was suggested by Yu Zhou IIRC). But since I did some refactoring
>> based on some comments from JohnH, I see I don't need that anymore. Agreed; it
>> will be much clearer just to use a local variable. Will fix.
>>
>>>
>>>> +    pte_unmap(pte);
>>>> +
>>>> +    gfp = vma_thp_gfp_mask(vma);
>>>> +
>>>> +    while (orders) {
>>>> +        addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>>> +        folio = vma_alloc_folio(gfp, order, vma, addr, true);
>>>> +        if (folio) {
>>>> +            clear_huge_page(&folio->page, addr, 1 << order);
>>>> +            return folio;
>>>> +        }
>>>> +        order = next_order(&orders, order);
>>>> +    }
>>>> +
>>>
>>> Question: would it make sense to combine both loops? I suspect memory
>>> allocations with pte_offset_map()/kmap are problematic.
>>
>> They are both operating on separate orders; next_order() is "consuming" an order
>> by removing the current one from the orders bitfield and returning the next one.
>>
>> So the first loop starts at the highest order and keeps checking lower orders
>> until one fully fits in the VMA. And the second loop starts at the first order
>> that was found to fully fits and loops to lower orders until an allocation is
>> successful.
>>
>> So I don't see a need to combine the loops.
>>
>>>
>>>> +fallback:
>>>> +    return vma_alloc_zeroed_movable_folio(vma, vmf->address);
>>>> +}
>>>> +#else
>>>> +#define alloc_anon_folio(vmf) \
>>>> +        vma_alloc_zeroed_movable_folio((vmf)->vma, (vmf)->address)
>>>> +#endif
>>>> +
>>>>   /*
>>>>    * We enter with non-exclusive mmap_lock (to exclude vma changes,
>>>>    * but allow concurrent faults), and pte mapped but not yet locked.
>>>> @@ -4132,6 +4210,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>>    */
>>>>   static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>>>   {
>>>> +    int i;
>>>> +    int nr_pages = 1;
>>>> +    unsigned long addr = vmf->address;
>>>>       bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
>>>>       struct vm_area_struct *vma = vmf->vma;
>>>>       struct folio *folio;
>>>
>>> Nit: reverse christmas tree :)
>>
>> ACK
>>
>>>
>>>> @@ -4176,10 +4257,15 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>>>       /* Allocate our own private page. */
>>>>       if (unlikely(anon_vma_prepare(vma)))
>>>>           goto oom;
>>>> -    folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
>>>> +    folio = alloc_anon_folio(vmf);
>>>> +    if (IS_ERR(folio))
>>>> +        return 0;
>>>>       if (!folio)
>>>>           goto oom;
>>>>   +    nr_pages = folio_nr_pages(folio);
>>>> +    addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
>>>> +
>>>>       if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
>>>>           goto oom_free_page;
>>>>       folio_throttle_swaprate(folio, GFP_KERNEL);
>>>> @@ -4196,12 +4282,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>>>       if (vma->vm_flags & VM_WRITE)
>>>>           entry = pte_mkwrite(pte_mkdirty(entry), vma);
>>>>   -    vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
>>>> -            &vmf->ptl);
>>>> +    vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
>>>>       if (!vmf->pte)
>>>>           goto release;
>>>> -    if (vmf_pte_changed(vmf)) {
>>>> -        update_mmu_tlb(vma, vmf->address, vmf->pte);
>>>> +    if ((nr_pages == 1 && vmf_pte_changed(vmf)) ||
>>>> +        (nr_pages  > 1 && !pte_range_none(vmf->pte, nr_pages))) {
>>>> +        for (i = 0; i < nr_pages; i++)
>>>> +            update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
>>>
>>> Comment: separating the order-0 case from the other case might make this easier
>>> to read.
>>
>> Yeah fair enough. Will fix.
>>
>>>
>>>>           goto release;
>>>>       }
>>>>   @@ -4216,16 +4303,17 @@ static vm_fault_t do_anonymous_page(struct vm_fault
>>>> *vmf)
>>>>           return handle_userfault(vmf, VM_UFFD_MISSING);
>>>>       }
>>>>   -    inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
>>>> -    folio_add_new_anon_rmap(folio, vma, vmf->address);
>>>> +    folio_ref_add(folio, nr_pages - 1);
>>>> +    add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
>>>> +    folio_add_new_anon_rmap(folio, vma, addr);
>>>>       folio_add_lru_vma(folio, vma);
>>>>   setpte:
>>>>       if (uffd_wp)
>>>>           entry = pte_mkuffd_wp(entry);
>>>> -    set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
>>>> +    set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);
>>>>         /* No need to invalidate - it was non-present before */
>>>> -    update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
>>>> +    update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages);
>>>>   unlock:
>>>>       if (vmf->pte)
>>>>           pte_unmap_unlock(vmf->pte, vmf->ptl);
>>>
>>> Benchmarking order-0 allocations might be interesting. There will be some added
>>> checks + multiple loops/conditionals for order-0 that could be avoided by having
>>> two separate code paths. If we can't measure a difference, all good.
>>
>> Yep will do - will post numbers once I have them. I've been assuming that the
>> major cost is clearing the page, but perhaps I'm wrong.
>>

I added a "write-fault-byte" benchmark to the microbenchmark tool you gave me.
This elides the normal memset page population routine, and instead writes the
first byte of every page while the timer is running.
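
(For reference, a minimal user-space sketch of the kind of loop being timed -
this is illustrative only, not the actual microbenchmark tool, and the 1G
buffer size is an arbitrary choice:)

#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	size_t size = 1UL << 30;	/* 1G of anonymous memory */
	long page = sysconf(_SC_PAGESIZE);
	struct timespec t0, t1;
	size_t off;
	char *buf;

	buf = mmap(NULL, size, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	/* Touch the first byte of every page; each touch may fault. */
	for (off = 0; off < size; off += page)
		buf[off] = 1;
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("%.3f ms\n", (t1.tv_sec - t0.tv_sec) * 1e3 +
			    (t1.tv_nsec - t0.tv_nsec) / 1e6);
	munmap(buf, size);
	return 0;
}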

I ran with 100 iterations per run, then ran the whole thing 16 times. I ran it
for a baseline kernel, as well as v8 (this series) and v9 (with changes from
your review). I repeated on Ampere Altra (bare metal) and Apple M2 (VM):

|              |        m2 vm        |        altra        |
|--------------|---------------------|--------------------:|
| kernel       |     mean |  std_rel |     mean |  std_rel |
|--------------|----------|----------|----------|---------:|
| baseline     |   0.000% |   0.341% |   0.000% |   3.581% |
| anonfolio-v8 |   0.005% |   0.272% |   5.068% |   1.128% |
| anonfolio-v9 |  -0.013% |   0.442% |   0.107% |   1.788% |

No measurable difference on M2, but Altra has a slowdown in v8 which is fixed
in v9. Looking at the changes, this is either down to the new unlikely() for the
uffd or due to moving the THP order check to be inline within
thp_vma_allowable_orders().

So I have all the changes done and perf numbers to show no regression for
order-0. I'm gonna do a final check and post v9 later today.

Thanks,
Ryan
David Hildenbrand Dec. 7, 2023, 10:40 a.m. UTC | #13
On 07.12.23 11:37, Ryan Roberts wrote:
> On 06/12/2023 15:44, Ryan Roberts wrote:
>> On 06/12/2023 14:19, Ryan Roberts wrote:
>>> On 05/12/2023 16:32, David Hildenbrand wrote:
>>>> On 04.12.23 11:20, Ryan Roberts wrote:
>>>>> Introduce the logic to allow THP to be configured (through the new sysfs
>>>>> interface we just added) to allocate large folios to back anonymous
>>>>> memory, which are larger than the base page size but smaller than
>>>>> PMD-size. We call this new THP extension "multi-size THP" (mTHP).
>>>>>
>>>>> mTHP continues to be PTE-mapped, but in many cases can still provide
>>>>> similar benefits to traditional PMD-sized THP: Page faults are
>>>>> significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on
>>>>> the configured order), but latency spikes are much less prominent
>>>>> because the size of each page isn't as huge as the PMD-sized variant and
>>>>> there is less memory to clear in each page fault. The number of per-page
>>>>> operations (e.g. ref counting, rmap management, lru list management) are
>>>>> also significantly reduced since those ops now become per-folio.
>>>>>
>>>>> Some architectures also employ TLB compression mechanisms to squeeze
>>>>> more entries in when a set of PTEs are virtually and physically
>>>>> contiguous and appropriately aligned. In this case, TLB misses will
>>>>> occur less often.
>>>>>
>>>>> The new behaviour is disabled by default, but can be enabled at runtime
>>>>> by writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled
>>>>> (see documentation in previous commit). The long term aim is to change
>>>>> the default to include suitable lower orders, but there are some risks
>>>>> around internal fragmentation that need to be better understood first.
>>>>>
>>>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>>>
>>>> In general, looks good to me, some comments/nits. And the usual "let's make sure
>>>> we don't degrade order-0 and keep that as fast as possible" comment.
>>>>
>>>>> ---
>>>>>    include/linux/huge_mm.h |   6 ++-
>>>>>    mm/memory.c             | 106 ++++++++++++++++++++++++++++++++++++----
>>>>>    2 files changed, 101 insertions(+), 11 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>> index bd0eadd3befb..91a53b9835a4 100644
>>>>> --- a/include/linux/huge_mm.h
>>>>> +++ b/include/linux/huge_mm.h
>>>>> @@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr;
>>>>>    #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
>>>>>      /*
>>>>> - * Mask of all large folio orders supported for anonymous THP.
>>>>> + * Mask of all large folio orders supported for anonymous THP; all orders up to
>>>>> + * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
>>>>> + * (which is a limitation of the THP implementation).
>>>>>     */
>>>>> -#define THP_ORDERS_ALL_ANON    BIT(PMD_ORDER)
>>>>> +#define THP_ORDERS_ALL_ANON    ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
>>>>>      /*
>>>>>     * Mask of all large folio orders supported for file THP.
>>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>>> index 3ceeb0f45bf5..bf7e93813018 100644
>>>>> --- a/mm/memory.c
>>>>> +++ b/mm/memory.c
>>>>> @@ -4125,6 +4125,84 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>>>        return ret;
>>>>>    }
>>>>>    +static bool pte_range_none(pte_t *pte, int nr_pages)
>>>>> +{
>>>>> +    int i;
>>>>> +
>>>>> +    for (i = 0; i < nr_pages; i++) {
>>>>> +        if (!pte_none(ptep_get_lockless(pte + i)))
>>>>> +            return false;
>>>>> +    }
>>>>> +
>>>>> +    return true;
>>>>> +}
>>>>> +
>>>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>>> +static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>>>>> +{
>>>>> +    gfp_t gfp;
>>>>> +    pte_t *pte;
>>>>> +    unsigned long addr;
>>>>> +    struct folio *folio;
>>>>> +    struct vm_area_struct *vma = vmf->vma;
>>>>> +    unsigned long orders;
>>>>> +    int order;
>>>>
>>>> Nit: reverse christmas tree encouraged ;)
>>>
>>> ACK will fix.
>>>
>>>>
>>>>> +
>>>>> +    /*
>>>>> +     * If uffd is active for the vma we need per-page fault fidelity to
>>>>> +     * maintain the uffd semantics.
>>>>> +     */
>>>>> +    if (userfaultfd_armed(vma))
>>>>
>>>> Nit: unlikely()
>>>
>>> ACK will fix.
>>>
>>>>
>>>>> +        goto fallback;
>>>>> +
>>>>> +    /*
>>>>> +     * Get a list of all the (large) orders below PMD_ORDER that are enabled
>>>>> +     * for this vma. Then filter out the orders that can't be allocated over
>>>>> +     * the faulting address and still be fully contained in the vma.
>>>>> +     */
>>>>> +    orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
>>>>> +                      BIT(PMD_ORDER) - 1);
>>>>> +    orders = thp_vma_suitable_orders(vma, vmf->address, orders);
>>>>
>>>> Comment: Both will eventually loop over all orders, correct? Could eventually be
>>>> sped up in the future.
>>>
>>> No, only thp_vma_suitable_orders() will loop. thp_vma_allowable_orders() only
>>> loops if in_pf=false (it's true here).
>>>
>>>>
>>>> Nit: the orders = ... order = ... looks like this might deserve a helper
>>>> function that makes this easier to read.
>>>
>>> To be honest, the existing function that I've modified is a bit of a mess.
>>> thp_vma_allowable_orders() calls thp_vma_suitable_orders() if we are not in a
>>> page fault, because the page fault handlers already do that check themselves. It
>>> would be nice to refactor the whole thing so that thp_vma_allowable_orders() is
>>> a strict superset of thp_vma_suitable_orders(). Then this can just call
>>> thp_vma_allowable_orders(). But that's going to start touching the PMD and PUD
>>> handlers, so prefer if we leave that for a separate patch set.
>>>
>>>>
>>>> Nit: Why call thp_vma_suitable_orders if the orders are already 0? Again, some
>>>> helper might be reasonable where that is handled internally.
>>>
>>> Because thp_vma_suitable_orders() will handle it safely and is inline, so it
>>> should be just as efficient? This would go away with the refactoring described above.
>>>
>>>>
>>>> Comment: For order-0 we'll always perform a function call to both
>>>> thp_vma_allowable_orders() / thp_vma_suitable_orders(). We should perform some
>>>> fast and efficient check if any <PMD THP are even enabled in the system / for
>>>> this VMA, and in that case just fallback before doing more expensive checks.
>>>
>>
>> I just noticed I got these functions round the wrong way in my previous response:
>>
>>> thp_vma_allowable_orders() is inline as you mentioned.
>>
>> ^ Meant thp_vma_suitable_orders() here.
>>
>>>
>>> I was deliberately trying to keep all the decision logic in one place
>>> (thp_vma_suitable_orders) because it's already pretty complicated. But if you
>>
>> ^ Meant thp_vma_allowable_orders() here.
>>
>> Sorry for the confusion.
>>
>>> insist, how about this in the header:
>>>
>>> static inline
>>> unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
>>> 				       unsigned long vm_flags, bool smaps,
>>> 				       bool in_pf, bool enforce_sysfs,
>>> 				       unsigned long orders)
>>> {
>>> 	/* Optimization to check if required orders are enabled early. */
>>> 	if (enforce_sysfs && vma_is_anonymous(vma)) {
>>> 		unsigned long mask = READ_ONCE(huge_anon_orders_always);
>>>
>>> 		if (vm_flags & VM_HUGEPAGE)
>>> 			mask |= READ_ONCE(huge_anon_orders_madvise);
>>> 		if (hugepage_global_always() ||
>>> 			((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled()))
>>> 			mask |= READ_ONCE(huge_anon_orders_inherit);
>>>
>>> 		orders &= mask;
>>> 		if (!orders)
>>> 			return 0;
>>> 		
>>> 		enforce_sysfs = false;
>>> 	}
>>>
>>> 	return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf,
>>> 					  enforce_sysfs, orders);
>>> }
>>>
>>> Then the above check can be removed from __thp_vma_allowable_orders() - it will
>>> still retain the `if (enforce_sysfs && !vma_is_anonymous(vma))` part.
>>>
>>>
>>>>
>>>>> +
>>>>> +    if (!orders)
>>>>> +        goto fallback;
>>>>> +
>>>>> +    pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
>>>>> +    if (!pte)
>>>>> +        return ERR_PTR(-EAGAIN);
>>>>> +
>>>>> +    order = first_order(orders);
>>>>> +    while (orders) {
>>>>> +        addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>>>> +        vmf->pte = pte + pte_index(addr);
>>>>> +        if (pte_range_none(vmf->pte, 1 << order))
>>>>> +            break;
>>>>
>>>> Comment: Likely it would make sense to scan only once and determine the "largest
>>>> none range" around that address, having the largest suitable order in mind.
>>>
>>> Yes, that's how I used to do it, but Yu Zhou requested simplifying to this,
>>> IIRC. Perhaps this is an optimization opportunity for later?
>>>
>>>>
>>>>> +        order = next_order(&orders, order);
>>>>> +    }
>>>>> +
>>>>> +    vmf->pte = NULL;
>>>>
>>>> Nit: Can you elaborate why you are messing with vmf->pte here? A simple helper
>>>> variable will make this code look less magical. Unless I am missing something
>>>> important :)
>>>
>>> Gahh, I used to pass the vmf to what pte_range_none() was refactored into (an
>>> approach that was suggested by Yu Zhou IIRC). But since I did some refactoring
>>> based on some comments from JohnH, I see I don't need that anymore. Agreed; it
>>> will be much clearer just to use a local variable. Will fix.
>>>
>>>>
>>>>> +    pte_unmap(pte);
>>>>> +
>>>>> +    gfp = vma_thp_gfp_mask(vma);
>>>>> +
>>>>> +    while (orders) {
>>>>> +        addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>>>> +        folio = vma_alloc_folio(gfp, order, vma, addr, true);
>>>>> +        if (folio) {
>>>>> +            clear_huge_page(&folio->page, addr, 1 << order);
>>>>> +            return folio;
>>>>> +        }
>>>>> +        order = next_order(&orders, order);
>>>>> +    }
>>>>> +
>>>>
>>>> Question: would it make sense to combine both loops? I suspect memory
>>>> allocations with pte_offset_map()/kmap are problematic.
>>>
>>> They are both operating on separate orders; next_order() is "consuming" an order
>>> by removing the current one from the orders bitfield and returning the next one.
>>>
>>> So the first loop starts at the highest order and keeps checking lower orders
>>> until one fully fits in the VMA. And the second loop starts at the first order
>>> that was found to fully fit and loops to lower orders until an allocation is
>>> successful.
>>>
>>> So I don't see a need to combine the loops.
>>>
>>>>
>>>>> +fallback:
>>>>> +    return vma_alloc_zeroed_movable_folio(vma, vmf->address);
>>>>> +}
>>>>> +#else
>>>>> +#define alloc_anon_folio(vmf) \
>>>>> +        vma_alloc_zeroed_movable_folio((vmf)->vma, (vmf)->address)
>>>>> +#endif
>>>>> +
>>>>>    /*
>>>>>     * We enter with non-exclusive mmap_lock (to exclude vma changes,
>>>>>     * but allow concurrent faults), and pte mapped but not yet locked.
>>>>> @@ -4132,6 +4210,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>>>     */
>>>>>    static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>>>>    {
>>>>> +    int i;
>>>>> +    int nr_pages = 1;
>>>>> +    unsigned long addr = vmf->address;
>>>>>        bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
>>>>>        struct vm_area_struct *vma = vmf->vma;
>>>>>        struct folio *folio;
>>>>
>>>> Nit: reverse christmas tree :)
>>>
>>> ACK
>>>
>>>>
>>>>> @@ -4176,10 +4257,15 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>>>>        /* Allocate our own private page. */
>>>>>        if (unlikely(anon_vma_prepare(vma)))
>>>>>            goto oom;
>>>>> -    folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
>>>>> +    folio = alloc_anon_folio(vmf);
>>>>> +    if (IS_ERR(folio))
>>>>> +        return 0;
>>>>>        if (!folio)
>>>>>            goto oom;
>>>>>    +    nr_pages = folio_nr_pages(folio);
>>>>> +    addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
>>>>> +
>>>>>        if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
>>>>>            goto oom_free_page;
>>>>>        folio_throttle_swaprate(folio, GFP_KERNEL);
>>>>> @@ -4196,12 +4282,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>>>>        if (vma->vm_flags & VM_WRITE)
>>>>>            entry = pte_mkwrite(pte_mkdirty(entry), vma);
>>>>>    -    vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
>>>>> -            &vmf->ptl);
>>>>> +    vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
>>>>>        if (!vmf->pte)
>>>>>            goto release;
>>>>> -    if (vmf_pte_changed(vmf)) {
>>>>> -        update_mmu_tlb(vma, vmf->address, vmf->pte);
>>>>> +    if ((nr_pages == 1 && vmf_pte_changed(vmf)) ||
>>>>> +        (nr_pages  > 1 && !pte_range_none(vmf->pte, nr_pages))) {
>>>>> +        for (i = 0; i < nr_pages; i++)
>>>>> +            update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
>>>>
>>>> Comment: separating the order-0 case from the other case might make this easier
>>>> to read.
>>>
>>> Yeah fair enough. Will fix.
>>>
>>>>
>>>>>            goto release;
>>>>>        }
>>>>>    @@ -4216,16 +4303,17 @@ static vm_fault_t do_anonymous_page(struct vm_fault
>>>>> *vmf)
>>>>>            return handle_userfault(vmf, VM_UFFD_MISSING);
>>>>>        }
>>>>>    -    inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
>>>>> -    folio_add_new_anon_rmap(folio, vma, vmf->address);
>>>>> +    folio_ref_add(folio, nr_pages - 1);
>>>>> +    add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
>>>>> +    folio_add_new_anon_rmap(folio, vma, addr);
>>>>>        folio_add_lru_vma(folio, vma);
>>>>>    setpte:
>>>>>        if (uffd_wp)
>>>>>            entry = pte_mkuffd_wp(entry);
>>>>> -    set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
>>>>> +    set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);
>>>>>          /* No need to invalidate - it was non-present before */
>>>>> -    update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
>>>>> +    update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages);
>>>>>    unlock:
>>>>>        if (vmf->pte)
>>>>>            pte_unmap_unlock(vmf->pte, vmf->ptl);
>>>>
>>>> Benchmarking order-0 allocations might be interesting. There will be some added
>>>> checks + multiple loops/conditionals for order-0 that could be avoided by having
>>>> two separate code paths. If we can't measure a difference, all good.
>>>
>>> Yep will do - will post numbers once I have them. I've been assuming that the
>>> major cost is clearing the page, but perhaps I'm wrong.
>>>
> 
> I added a "write-fault-byte" benchmark to the microbenchmark tool you gave me.
> This elides the normal memset page population routine, and instead writes the
> first byte of every page while the timer is running.
> 
> I ran with 100 iterations per run, then ran the whole thing 16 times. I ran it
> for a baseline kernel, as well as v8 (this series) and v9 (with changes from
> your review). I repeated on Ampere Altra (bare metal) and Apple M2 (VM):
> 
> |              |        m2 vm        |        altra        |
> |--------------|---------------------|--------------------:|
> | kernel       |     mean |  std_rel |     mean |  std_rel |
> |--------------|----------|----------|----------|---------:|
> | baseline     |   0.000% |   0.341% |   0.000% |   3.581% |
> | anonfolio-v8 |   0.005% |   0.272% |   5.068% |   1.128% |
> | anonfolio-v9 |  -0.013% |   0.442% |   0.107% |   1.788% |
> 
> No measurable difference on M2, but Altra has a slowdown in v8 which is fixed
> in v9. Looking at the changes, this is either down to the new unlikely() for the
> uffd or due to moving the THP order check to be inline within
> thp_vma_allowable_orders().

I suspect the last one.

> 
> So I have all the changes done and perf numbers to show no regression for
> order-0. I'm gonna do a final check and post v9 later today.

Good!

Let me catch up on your comments real quick.
David Hildenbrand Dec. 7, 2023, 11:08 a.m. UTC | #14
[...]

>>
>> Nit: the orders = ... order = ... looks like this might deserve a helper
>> function that makes this easier to read.
> 
> To be honest, the existing function that I've modified is a bit of a mess.

It's all an ugly mess and I hate it.

It would be cleanest if we'd just have "thp_vma_configured_orders()" 
that gives us all configured orders for the given VMA+flags combination. 
No passing in of orders, try handling the masking in the caller.

Then, we move that nasty "transhuge_vma_suitable" handling for !in_pf 
out of there and handle that in the callers. The comment "Huge fault 
does the check in fault handlers. And this check is not suitable for 
huge PUD fault handlers." already makes me angry, what a mess.


Then, we'd have a thp_vma_fitting_orders() / thp_vma_is_fitting_order() 
function that does the filtering only based on the given address + vma 
size/alignment. That's roughly "thp_vma_suitable_orders()".


Finding a good name to combine both could be something like
"thp_vma_possible_orders()".


Would make more sense to me (but again, German guy, so it's probably all 
wrong).
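
To make the shape of that split concrete, a rough sketch (the names and
signatures below are hypothetical, lifted from the suggestion above; nothing
like this exists in the series as posted):

/* Orders enabled for this VMA via sysfs / VM flags; no fitting check. */
unsigned long thp_vma_configured_orders(struct vm_area_struct *vma,
					unsigned long vm_flags,
					bool enforce_sysfs);

/* Of the given orders, those that fit the VMA around addr. */
unsigned long thp_vma_fitting_orders(struct vm_area_struct *vma,
				     unsigned long addr,
				     unsigned long orders);

/* Combination of the two, for callers that want the full filtering. */
static inline
unsigned long thp_vma_possible_orders(struct vm_area_struct *vma,
				      unsigned long vm_flags,
				      bool enforce_sysfs, unsigned long addr)
{
	unsigned long orders;

	orders = thp_vma_configured_orders(vma, vm_flags, enforce_sysfs);
	return orders ? thp_vma_fitting_orders(vma, addr, orders) : 0;
}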


> thp_vma_allowable_orders() calls thp_vma_suitable_orders() if we are not in a
> page fault, because the page fault handlers already do that check themselves. It
> would be nice to refactor the whole thing so that thp_vma_allowable_orders() is
> a strict superset of thp_vma_suitable_orders(). Then this can just call
> thp_vma_allowable_orders(). But that's going to start touching the PMD and PUD
> handlers, so prefer if we leave that for a separate patch set.
> 
>>
>> Nit: Why call thp_vma_suitable_orders if the orders are already 0? Again, some
>> helper might be reasonable where that is handled internally.
> 
> Because thp_vma_suitable_orders() will handle it safely and is inline, so it
> should be just as efficient? This would go away with the refactoring described above.

Right. Won't win in a beauty contest. Some simple helper might make this 
much easier to digest.

> 
>>
>> Comment: For order-0 we'll always perform a function call to both
>> thp_vma_allowable_orders() / thp_vma_suitable_orders(). We should perform some
>> fast and efficient check if any <PMD THP are even enabled in the system / for
>> this VMA, and in that case just fallback before doing more expensive checks.
> 
> thp_vma_allowable_orders() is inline as you mentioned.
> 
> I was deliberately trying to keep all the decision logic in one place
> (thp_vma_suitable_orders) because it's already pretty complicated. But if you
> insist, how about this in the header:
> 
> static inline
> unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
> 				       unsigned long vm_flags, bool smaps,
> 				       bool in_pf, bool enforce_sysfs,
> 				       unsigned long orders)
> {
> 	/* Optimization to check if required orders are enabled early. */
> 	if (enforce_sysfs && vma_is_anonymous(vma)) {
> 		unsigned long mask = READ_ONCE(huge_anon_orders_always);
> 
> 		if (vm_flags & VM_HUGEPAGE)
> 			mask |= READ_ONCE(huge_anon_orders_madvise);
> 		if (hugepage_global_always() ||
> 			((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled()))
> 			mask |= READ_ONCE(huge_anon_orders_inherit);
> 
> 		orders &= mask;
> 		if (!orders)
> 			return 0;
> 		
> 		enforce_sysfs = false;
> 	}
> 
> 	return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf,
> 					  enforce_sysfs, orders);
> }
> 
> Then the above check can be removed from __thp_vma_allowable_orders() - it will
> still retain the `if (enforce_sysfs && !vma_is_anonymous(vma))` part.
> 

Better. I still kind-of hate having to pass in orders here. Such masking 
is better done in the caller (see above how it might be done when moving 
the transhuge_vma_suitable() check out).

> 
>>
>>> +
>>> +    if (!orders)
>>> +        goto fallback;
>>> +
>>> +    pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
>>> +    if (!pte)
>>> +        return ERR_PTR(-EAGAIN);
>>> +
>>> +    order = first_order(orders);
>>> +    while (orders) {
>>> +        addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>> +        vmf->pte = pte + pte_index(addr);
>>> +        if (pte_range_none(vmf->pte, 1 << order))
>>> +            break;
>>
>> Comment: Likely it would make sense to scan only once and determine the "largest
>> none range" around that address, having the largest suitable order in mind.
> 
> Yes, that's how I used to do it, but Yu Zhou requested simplifying to this,
> IIRC. Perhaps this is an optimization opportunity for later?

Yes, definitely.

> 
>>
>>> +        order = next_order(&orders, order);
>>> +    }
>>> +
>>> +    vmf->pte = NULL;
>>
>> Nit: Can you elaborate why you are messing with vmf->pte here? A simple helper
>> variable will make this code look less magical. Unless I am missing something
>> important :)
> 
> Gahh, I used to pass the vmf to what pte_range_none() was refactored into (an
> approach that was suggested by Yu Zhou IIRC). But since I did some refactoring
> based on some comments from JohnH, I see I don't need that anymore. Agreed; it
> will be much clearer just to use a local variable. Will fix.
> 
>>
>>> +    pte_unmap(pte);
>>> +
>>> +    gfp = vma_thp_gfp_mask(vma);
>>> +
>>> +    while (orders) {
>>> +        addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>> +        folio = vma_alloc_folio(gfp, order, vma, addr, true);
>>> +        if (folio) {
>>> +            clear_huge_page(&folio->page, addr, 1 << order);
>>> +            return folio;
>>> +        }
>>> +        order = next_order(&orders, order);
>>> +    }
>>> +
>>
>> Question: would it make sense to combine both loops? I suspect memory
>> allocations with pte_offset_map()/kmap are problematic.
> 
> They are both operating on separate orders; next_order() is "consuming" an order
> by removing the current one from the orders bitfield and returning the next one.
> 
> So the first loop starts at the highest order and keeps checking lower orders
> until one fully fits in the VMA. And the second loop starts at the first order
> that was found to fully fit and loops to lower orders until an allocation is
> successful.

Right, but you know from the first loop which order is applicable (and 
will be fed to the second loop) and could just pte_unmap(pte) + 
tryalloc. If that fails, remap and try with the next orders.

That would make the code certainly easier to understand. That "orders" 
magic of constructing, filtering, walking is confusing :)
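
(As an aside, for anyone following along: the orders-walking helpers boil down
to bit operations on the orders bitfield, where bit N stands for order N. An
illustrative sketch, which may differ in detail from the definitions in the
earlier patch of this series:)

static inline int highest_order(unsigned long orders)
{
	return fls_long(orders) - 1;
}

static inline int next_order(unsigned long *orders, int prev)
{
	*orders &= ~BIT(prev);		/* "consume" the order just tried */
	return highest_order(*orders);	/* -1 once no orders remain */
}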


I might find some time today to see if there is an easy way to cleanup 
all what I spelled out above. It really is a mess. But likely that 
cleanup could be deferred (but you're touching it, so ... :) ).
Ryan Roberts Dec. 7, 2023, 12:08 p.m. UTC | #15
On 07/12/2023 11:08, David Hildenbrand wrote:
> [...]
> 
>>>
>>> Nit: the orders = ... order = ... looks like this might deserve a helper
>>> function that makes this easier to read.
>>
>> To be honest, the existing function that I've modified is a bit of a mess.
> 
> It's all an ugly mess and I hate it.
> 
> It would be cleanest if we'd just have "thp_vma_configured_orders()" that gives
> us all configured orders for the given VMA+flags combination. No passing in of
> orders, try handling the masking in the caller.
> 
> Then, we move that nasty "transhuge_vma_suitable" handling for !in_pf out of
> there and handle that in the callers. The comment "Huge fault does the check in
> fault handlers. And this check is not suitable for huge PUD fault handlers."
> already makes me angry, what a mess.

My thp_vma_suitable_order[s]() does now at least work correctly for PUD.

> 
> 
> Then, we'd have a thp_vma_fitting_orders() / thp_vma_is_fitting_order() function
> that does the filtering only based on the given address + vma size/alignment.
> That's roughly "thp_vma_suitable_orders()".
> 
> 
> Finding a good name to combine both could be something like
> "thp_vma_possible_orders()".
> 
> 
> Would make more sense to me (but again, German guy, so it's probably all wrong).
> 
> 
>> thp_vma_allowable_orders() calls thp_vma_suitable_orders() if we are not in a
>> page fault, because the page fault handlers already do that check themselves. It
>> would be nice to refactor the whole thing so that thp_vma_allowable_orders() is
>> a strict superset of thp_vma_suitable_orders(). Then this can just call
>> thp_vma_allowable_orders(). But that's going to start touching the PMD and PUD
>> handlers, so prefer if we leave that for a separate patch set.
>>
>>>
>>> Nit: Why call thp_vma_suitable_orders if the orders are already 0? Again, some
>>> helper might be reasonable where that is handled internally.
>>
>> Because thp_vma_suitable_orders() will handle it safely and is inline, so it
>> should be just as efficient? This would go away with the refactoring described
>> above.
> 
> Right. Won't win in a beauty contest. Some simple helper might make this much
> easier to digest.
> 
>>
>>>
>>> Comment: For order-0 we'll always perform a function call to both
>>> thp_vma_allowable_orders() / thp_vma_suitable_orders(). We should perform some
>>> fast and efficient check if any <PMD THP are even enabled in the system / for
>>> this VMA, and in that case just fallback before doing more expensive checks.
>>
>> thp_vma_allowable_orders() is inline as you mentioned.
>>
>> I was deliberately trying to keep all the decision logic in one place
>> (thp_vma_suitable_orders) because it's already pretty complicated. But if you
>> insist, how about this in the header:
>>
>> static inline
>> unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
>>                        unsigned long vm_flags, bool smaps,
>>                        bool in_pf, bool enforce_sysfs,
>>                        unsigned long orders)
>> {
>>     /* Optimization to check if required orders are enabled early. */
>>     if (enforce_sysfs && vma_is_anonymous(vma)) {
>>         unsigned long mask = READ_ONCE(huge_anon_orders_always);
>>
>>         if (vm_flags & VM_HUGEPAGE)
>>             mask |= READ_ONCE(huge_anon_orders_madvise);
>>         if (hugepage_global_always() ||
>>             ((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled()))
>>             mask |= READ_ONCE(huge_anon_orders_inherit);
>>
>>         orders &= mask;
>>         if (!orders)
>>             return 0;
>>        
>>         enforce_sysfs = false;
>>     }
>>
>>     return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf,
>>                       enforce_sysfs, orders);
>> }
>>
>> Then the above check can be removed from __thp_vma_allowable_orders() - it will
>> still retain the `if (enforce_sysfs && !vma_is_anonymous(vma))` part.
>>
> 
> Better. I still kind-of hate having to pass in orders here. Such masking is
> better done in the caller (see above how it might be done when moving the
> transhuge_vma_suitable() check out).
> 
>>
>>>
>>>> +
>>>> +    if (!orders)
>>>> +        goto fallback;
>>>> +
>>>> +    pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
>>>> +    if (!pte)
>>>> +        return ERR_PTR(-EAGAIN);
>>>> +
>>>> +    order = first_order(orders);
>>>> +    while (orders) {
>>>> +        addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>>> +        vmf->pte = pte + pte_index(addr);
>>>> +        if (pte_range_none(vmf->pte, 1 << order))
>>>> +            break;
>>>
>>> Comment: Likely it would make sense to scan only once and determine the "largest
>>> none range" around that address, having the largest suitable order in mind.
>>
>> Yes, that's how I used to do it, but Yu Zhou requested simplifying to this,
>> IIRC. Perhaps this is an optimization opportunity for later?
> 
> Yes, definitely.
> 
>>
>>>
>>>> +        order = next_order(&orders, order);
>>>> +    }
>>>> +
>>>> +    vmf->pte = NULL;
>>>
>>> Nit: Can you elaborate why you are messing with vmf->pte here? A simple helper
>>> variable will make this code look less magical. Unless I am missing something
>>> important :)
>>
>> Gahh, I used to pass the vmf to what pte_range_none() was refactored into (an
>> approach that was suggested by Yu Zhou IIRC). But since I did some refactoring
>> based on some comments from JohnH, I see I don't need that anymore. Agreed; it
>> will be much clearer just to use a local variable. Will fix.
>>
>>>
>>>> +    pte_unmap(pte);
>>>> +
>>>> +    gfp = vma_thp_gfp_mask(vma);
>>>> +
>>>> +    while (orders) {
>>>> +        addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>>> +        folio = vma_alloc_folio(gfp, order, vma, addr, true);
>>>> +        if (folio) {
>>>> +            clear_huge_page(&folio->page, addr, 1 << order);
>>>> +            return folio;
>>>> +        }
>>>> +        order = next_order(&orders, order);
>>>> +    }
>>>> +
>>>
>>> Question: would it make sense to combine both loops? I suspect memory
>>> allocations with pte_offset_map()/kmap are problematic.
>>
>> They are both operating on separate orders; next_order() is "consuming" an order
>> by removing the current one from the orders bitfield and returning the next one.
>>
>> So the first loop starts at the highest order and keeps checking lower orders
>> until one fully fits in the VMA. And the second loop starts at the first order
>> that was found to fully fit and loops to lower orders until an allocation is
>> successful.
> 
> Right, but you know from the first loop which order is applicable (and will be
> fed to the second loop) and could just pte_unmap(pte) + tryalloc. If that fails,
> remap and try with the next orders.

You mean something like this?

	pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
	if (!pte)
		return ERR_PTR(-EAGAIN);

	order = highest_order(orders);
	while (orders) {
		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
		if (!pte_range_none(pte + pte_index(addr), 1 << order)) {
			order = next_order(&orders, order);
			continue;
		}

		pte_unmap(pte);
		
		folio = vma_alloc_folio(gfp, order, vma, addr, true);
		if (folio) {
			clear_huge_page(&folio->page, vmf->address, 1 << order);
			return folio;
		}

		pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
		if (!pte)
			return ERR_PTR(-EAGAIN);

		order = next_order(&orders, order);
	}

	pte_unmap(pte);

I don't really like that because if high order folio allocations fail, then you
are calling pte_range_none() again for the next lower order; once that check has
succeeded for an order it shouldn't be required for any lower orders. In this
case you also have lots of pte map/unmap.

The original version feels more efficient to me.

> 
> That would make the code certainly easier to understand. That "orders" magic of
> constructing, filtering, walking is confusing :)
> 
> 
> I might find some time today to see if there is an easy way to cleanup all what
> I spelled out above. It really is a mess. But likely that cleanup could be
> deferred (but you're touching it, so ... :) ).

I'm going to ignore the last 5 words. I heard the "that cleanup could be
deferred" part loud and clear though :)


>
David Hildenbrand Dec. 7, 2023, 1:28 p.m. UTC | #16
>>
>> Right, but you know from the first loop which order is applicable (and will be
>> fed to the second loop) and could just pte_unmap(pte) + tryalloc. If that fails,
>> remap and try with the next orders.
> 
> You mean something like this?
> 
> 	pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
> 	if (!pte)
> 		return ERR_PTR(-EAGAIN);
> 
> 	order = highest_order(orders);
> 	while (orders) {
> 		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> 		if (!pte_range_none(pte + pte_index(addr), 1 << order)) {
> 			order = next_order(&orders, order);
> 			continue;
> 		}
> 
> 		pte_unmap(pte);
> 		
> 		folio = vma_alloc_folio(gfp, order, vma, addr, true);
> 		if (folio) {
> 			clear_huge_page(&folio->page, vmf->address, 1 << order);
> 			return folio;
> 		}
> 
> 		pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
> 		if (!pte)
> 			return ERR_PTR(-EAGAIN);
> 
> 		order = next_order(&orders, order);
> 	}
> 
> 	pte_unmap(pte);
> 
> I don't really like that because if high order folio allocations fail, then you
> are calling pte_range_none() again for the next lower order; once that check has
> succeeded for an order it shouldn't be required for any lower orders. In this
> case you also have lots of pte map/unmap.

I see what you mean.

> 
> The original version feels more efficient to me.
Yes it is. Adding in some comments might help, like

/*
  * Find the largest order where the aligned range is completely prot_none(). Note
  * that all remaining orders will be completely prot_none().
  */
...

/* Try allocating the largest of the remaining orders. */

> 
>>
>> That would make the code certainly easier to understand. That "orders" magic of
>> constructing, filtering, walking is confusing :)
>>
>>
>> I might find some time today to see if there is an easy way to cleanup all what
>> I spelled out above. It really is a mess. But likely that cleanup could be
>> deferred (but you're touching it, so ... :) ).
> 
> I'm going to ignore the last 5 words. I heard the "that cleanup could be
> deferred" part loud and clear though :)

:)

If we could stop passing orders into thp_vma_allowable_orders(), that would probably
be the biggest win. It's just all a confusing mess.
Ryan Roberts Dec. 7, 2023, 2:45 p.m. UTC | #17
On 07/12/2023 13:28, David Hildenbrand wrote:
>>>
>>> Right, but you know from the first loop which order is applicable (and will be
>>> fed to the second loop) and could just pte_unmap(pte) + tryalloc. If that fails,
>>> remap and try with the next orders.
>>
>> You mean something like this?
>>
>>     pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
>>     if (!pte)
>>         return ERR_PTR(-EAGAIN);
>>
>>     order = highest_order(orders);
>>     while (orders) {
>>         addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>         if (!pte_range_none(pte + pte_index(addr), 1 << order)) {
>>             order = next_order(&orders, order);
>>             continue;
>>         }
>>
>>         pte_unmap(pte);
>>        
>>         folio = vma_alloc_folio(gfp, order, vma, addr, true);
>>         if (folio) {
>>             clear_huge_page(&folio->page, vmf->address, 1 << order);
>>             return folio;
>>         }
>>
>>         pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
>>         if (!pte)
>>             return ERR_PTR(-EAGAIN);
>>
>>         order = next_order(&orders, order);
>>     }
>>
>>     pte_unmap(pte);
>>
>> I don't really like that because if high order folio allocations fail, then you
>> are calling pte_range_none() again for the next lower order; once that check has
>> succeeded for an order it shouldn't be required for any lower orders. In this
>> case you also have lots of pte map/unmap.
> 
> I see what you mean.
> 
>>
>> The original version feels more efficient to me.
> Yes it is. Adding in some comments might help, like
> 
> /*
>  * Find the largest order where the aligned range is completely prot_none(). Note
>  * that all remaining orders will be completely prot_none().
>  */
> ...
> 
> /* Try allocating the largest of the remaining orders. */

OK added.

> 
>>
>>>
>>> That would make the code certainly easier to understand. That "orders" magic of
>>> constructing, filtering, walking is confusing :)
>>>
>>>
>>> I might find some time today to see if there is an easy way to cleanup all what
>>> I spelled out above. It really is a mess. But likely that cleanup could be
>>> deferred (but you're touching it, so ... :) ).
>>
>> I'm going to ignore the last 5 words. I heard the "that cleanup could be
>> deferred" part loud and clear though :)
> 
> :)
> 
> If we could stop passing orders into thp_vma_allowable_orders(), that would
> probably
> be the biggest win. It's just all a confusing mess.



I tried an approach like you suggested in the other thread originally, but I
struggled to define exactly what "thp_vma_configured_orders()" should mean;
Ideally, I just want "all the THP orders that are currently enabled for this
VMA+flags". But some callers want to enforce_sysfs and others don't, so you
probably have to at least pass that flag. Then you have DAX which explicitly
ignores enforce_sysfs, but only in a page fault. And shmem, which ignores
enforce_sysfs, but only outside of a page fault. So it quickly becomes pretty
complex. It is basically thp_vma_allowable_orders() as currently defined.

If this could be a simple function then it could be inline and as you say, we
can do the masking in the caller and exit early for the order-0 case. But it is
very complex (at least if you want to retain the equivalent logic to what
thp_vma_allowable_orders() has) so I'm not sure how to do the order-0 early exit
without passing in the orders bitfield. And we are unlikely to exit early
because PMD-sized THP is likely enabled and, since we didn't pass in an orders
bitfield, it wasn't filtered out.

In short, I can't see a solution that's better than the one I have. But if you
have something in mind, if you can spell it out, then I'll have a go at tidying
it up and integrating it into the series. Otherwise I really would prefer to
leave it for a separate series.
David Hildenbrand Dec. 7, 2023, 3:01 p.m. UTC | #18
On 07.12.23 15:45, Ryan Roberts wrote:
> On 07/12/2023 13:28, David Hildenbrand wrote:
>>>>
>>>> Right, but you know from the first loop which order is applicable (and will be
>>>> fed to the second loop) and could just pte_unmap(pte) + tryalloc. If that fails,
>>>> remap and try with the next orders.
>>>
>>> You mean something like this?
>>>
>>>      pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
>>>      if (!pte)
>>>          return ERR_PTR(-EAGAIN);
>>>
>>>      order = highest_order(orders);
>>>      while (orders) {
>>>          addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>>          if (!pte_range_none(pte + pte_index(addr), 1 << order)) {
>>>              order = next_order(&orders, order);
>>>              continue;
>>>          }
>>>
>>>          pte_unmap(pte);
>>>         
>>>          folio = vma_alloc_folio(gfp, order, vma, addr, true);
>>>          if (folio) {
>>>              clear_huge_page(&folio->page, vmf->address, 1 << order);
>>>              return folio;
>>>          }
>>>
>>>          pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
>>>          if (!pte)
>>>              return ERR_PTR(-EAGAIN);
>>>
>>>          order = next_order(&orders, order);
>>>      }
>>>
>>>      pte_unmap(pte);
>>>
>>> I don't really like that because if high order folio allocations fail, then you
>>> are calling pte_range_none() again for the next lower order; once that check has
>>> succeeded for an order it shouldn't be required for any lower orders. In this
>>> case you also have lots of pte map/unmap.
>>
>> I see what you mean.
>>
>>>
>>> The original version feels more efficient to me.
>> Yes it is. Adding in some comments might help, like
>>
>> /*
>>   * Find the largest order where the aligned range is completely prot_none(). Note
>>   * that all remaining orders will be completely prot_none().
>>   */
>> ...
>>
>> /* Try allocating the largest of the remaining orders. */
> 
> OK added.
> 
>>
>>>
>>>>
>>>> That would make the code certainly easier to understand. That "orders" magic of
>>>> constructing, filtering, walking is confusing :)
>>>>
>>>>
>>>> I might find some time today to see if there is an easy way to cleanup all what
>>>> I spelled out above. It really is a mess. But likely that cleanup could be
>>>> deferred (but you're touching it, so ... :) ).
>>>
>>> I'm going to ignore the last 5 words. I heard the "that cleanup could be
>>> deferred" part loud and clear though :)
>>
>> :)
>>
>> If we could stop passing orders into thp_vma_allowable_orders(), that would
>> probably
>> be the biggest win. It's just all a confusing mess.
> 
> 
> 
> I tried an approach like you suggested in the other thread originally, but I
> struggled to define exactly what "thp_vma_configured_orders()" should mean;
> Ideally, I just want "all the THP orders that are currently enabled for this
> VMA+flags". But some callers want to enforce_sysfs and others don't, so you
> probably have to at least pass that flag. Then you have DAX which explicitly

Yes, the flags would still be passed. It's kind of the "context".

> ignores enforce_sysfs, but only in a page fault. And shmem, which ignores
> enforce_sysfs, but only outside of a page fault. So it quickly becomes pretty
> complex. It is basically thp_vma_allowable_orders() as currently defined.

Yeah, but moving the "can we actually fit a THP in there" check out of 
the picture.

> 
> If this could be a simple function then it could be inline and as you say, we
> can do the masking in the caller and exit early for the order-0 case. But it is
> very complex (at least if you want to retain the equivalent logic to what
> thp_vma_allowable_orders() has) so I'm not sure how to do the order-0 early exit
> without passing in the orders bitfield. And we are unlikely to exit early
> because PMD-sized THP is likely enabled and because we didn't pass in a orders
> bitfield, that wasn't filtered out.
> 
> In short, I can't see a solution that's better than the one I have. But if you
> have something in mind, if you can spell it out, then I'll have a go at tidying
> it up and integrating it into the series. Otherwise I really would prefer to
> leave it for a separate series.

I'm playing with some cleanups, but they can all be built on top if they 
materialize.
Ryan Roberts Dec. 7, 2023, 3:12 p.m. UTC | #19
On 07/12/2023 15:01, David Hildenbrand wrote:
> On 07.12.23 15:45, Ryan Roberts wrote:
>> On 07/12/2023 13:28, David Hildenbrand wrote:
>>>>>
>>>>> Right, but you know from the first loop which order is applicable (and will be
>>>>> fed to the second loop) and could just pte_unmap(pte) + tryalloc. If that
>>>>> fails,
>>>>> remap and try with the next orders.
>>>>
>>>> You mean something like this?
>>>>
>>>>      pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
>>>>      if (!pte)
>>>>          return ERR_PTR(-EAGAIN);
>>>>
>>>>      order = highest_order(orders);
>>>>      while (orders) {
>>>>          addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>>>          if (!pte_range_none(pte + pte_index(addr), 1 << order)) {
>>>>              order = next_order(&orders, order);
>>>>              continue;
>>>>          }
>>>>
>>>>          pte_unmap(pte);
>>>>
>>>>          folio = vma_alloc_folio(gfp, order, vma, addr, true);
>>>>          if (folio) {
>>>>              clear_huge_page(&folio->page, vmf->address, 1 << order);
>>>>              return folio;
>>>>          }
>>>>
>>>>          pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
>>>>          if (!pte)
>>>>              return ERR_PTR(-EAGAIN);
>>>>
>>>>          order = next_order(&orders, order);
>>>>      }
>>>>
>>>>      pte_unmap(pte);
>>>>
>>>> I don't really like that because if high order folio allocations fail, then you
>>>> are calling pte_range_none() again for the next lower order; once that check
>>>> has
>>>> succeeded for an order it shouldn't be required for any lower orders. In this
>>>> case you also have lots of pte map/unmap.
>>>
>>> I see what you mean.
>>>
>>>>
>>>> The original version feels more efficient to me.
>>> Yes it is. Adding in some comments might help, like
>>>
>>> /*
>>>   * Find the largest order where the aligned range is completely prot_none().
>>> Note
>>>   * that all remaining orders will be completely prot_none().
>>>   */
>>> ...
>>>
>>> /* Try allocating the largest of the remaining orders. */
>>
>> OK added.
>>
>>>
>>>>
>>>>>
>>>>> That would make the code certainly easier to understand. That "orders"
>>>>> magic of
>>>>> constructing, filtering, walking is confusing :)
>>>>>
>>>>>
>>>>> I might find some time today to see if there is an easy way to cleanup all
>>>>> what
>>>>> I spelled out above. It really is a mess. But likely that cleanup could be
>>>>> deferred (but you're touching it, so ... :) ).
>>>>
>>>> I'm going to ignore the last 5 words. I heard the "that cleanup could be
>>>> deferred" part loud and clear though :)
>>>
>>> :)
>>>
>>> If we could stop passing orders into thp_vma_allowable_orders(), that would
>>> probably
>>> be the biggest win. It's just all a confusing mess.
>>
>>
>>
>> I tried an approach like you suggested in the other thread originally, but I
>> struggled to define exactly what "thp_vma_configured_orders()" should mean;
>> Ideally, I just want "all the THP orders that are currently enabled for this
>> VMA+flags". But some callers want to enforce_sysfs and others don't, so you
>> probably have to at least pass that flag. Then you have DAX which explicitly
> 
> Yes, the flags would still be passed. It's kind of the "context".
> 
>> ignores enforce_sysfs, but only in a page fault. And shmem, which ignores
>> enforce_sysfs, but only outside of a page fault. So it quickly becomes pretty
>> complex. It is basically thp_vma_allowable_orders() as currently defined.
> 
> Yeah, but moving the "can we actually fit a THP in there" check out of the picture.
> 
>>
>> If this could be a simple function then it could be inline and as you say, we
>> can do the masking in the caller and exit early for the order-0 case. But it is
>> very complex (at least if you want to retain the equivalent logic to what
>> thp_vma_allowable_orders() has) so I'm not sure how to do the order-0 early exit
>> without passing in the orders bitfield. And we are unlikely to exit early
>> because PMD-sized THP is likely enabled and because we didn't pass in a orders
>> bitfield, that wasn't filtered out.
>>
>> In short, I can't see a solution that's better than the one I have. But if you
>> have something in mind, if you can spell it out, then I'll have a go at tidying
>> it up and integrating it into the series. Otherwise I really would prefer to
>> leave it for a separate series.
> 
> I'm playing with some cleanups, but they can all be built on top if they
> materialize.

OK, I'm going to post a v9 then. And cross my fingers and hope that's the final
version.
diff mbox series

Patch

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index bd0eadd3befb..91a53b9835a4 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -68,9 +68,11 @@  extern struct kobj_attribute shmem_enabled_attr;
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
 
 /*
- * Mask of all large folio orders supported for anonymous THP.
+ * Mask of all large folio orders supported for anonymous THP; all orders up to
+ * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
+ * (which is a limitation of the THP implementation).
  */
-#define THP_ORDERS_ALL_ANON	BIT(PMD_ORDER)
+#define THP_ORDERS_ALL_ANON	((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
 
 /*
  * Mask of all large folio orders supported for file THP.
diff --git a/mm/memory.c b/mm/memory.c
index 3ceeb0f45bf5..bf7e93813018 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4125,6 +4125,84 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 	return ret;
 }
 
+static bool pte_range_none(pte_t *pte, int nr_pages)
+{
+	int i;
+
+	for (i = 0; i < nr_pages; i++) {
+		if (!pte_none(ptep_get_lockless(pte + i)))
+			return false;
+	}
+
+	return true;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static struct folio *alloc_anon_folio(struct vm_fault *vmf)
+{
+	gfp_t gfp;
+	pte_t *pte;
+	unsigned long addr;
+	struct folio *folio;
+	struct vm_area_struct *vma = vmf->vma;
+	unsigned long orders;
+	int order;
+
+	/*
+	 * If uffd is active for the vma we need per-page fault fidelity to
+	 * maintain the uffd semantics.
+	 */
+	if (userfaultfd_armed(vma))
+		goto fallback;
+
+	/*
+	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
+	 * for this vma. Then filter out the orders that can't be allocated over
+	 * the faulting address and still be fully contained in the vma.
+	 */
+	orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
+					  BIT(PMD_ORDER) - 1);
+	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
+
+	if (!orders)
+		goto fallback;
+
+	pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
+	if (!pte)
+		return ERR_PTR(-EAGAIN);
+
+	order = first_order(orders);
+	while (orders) {
+		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
+		vmf->pte = pte + pte_index(addr);
+		if (pte_range_none(vmf->pte, 1 << order))
+			break;
+		order = next_order(&orders, order);
+	}
+
+	vmf->pte = NULL;
+	pte_unmap(pte);
+
+	gfp = vma_thp_gfp_mask(vma);
+
+	while (orders) {
+		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
+		folio = vma_alloc_folio(gfp, order, vma, addr, true);
+		if (folio) {
+			clear_huge_page(&folio->page, addr, 1 << order);
+			return folio;
+		}
+		order = next_order(&orders, order);
+	}
+
+fallback:
+	return vma_alloc_zeroed_movable_folio(vma, vmf->address);
+}
+#else
+#define alloc_anon_folio(vmf) \
+		vma_alloc_zeroed_movable_folio((vmf)->vma, (vmf)->address)
+#endif
+
 /*
  * We enter with non-exclusive mmap_lock (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -4132,6 +4210,9 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
  */
 static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 {
+	int i;
+	int nr_pages = 1;
+	unsigned long addr = vmf->address;
 	bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
 	struct vm_area_struct *vma = vmf->vma;
 	struct folio *folio;
@@ -4176,10 +4257,15 @@  static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	/* Allocate our own private page. */
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
-	folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
+	folio = alloc_anon_folio(vmf);
+	if (IS_ERR(folio))
+		return 0;
 	if (!folio)
 		goto oom;
 
+	nr_pages = folio_nr_pages(folio);
+	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
+
 	if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
 		goto oom_free_page;
 	folio_throttle_swaprate(folio, GFP_KERNEL);
@@ -4196,12 +4282,13 @@  static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	if (vma->vm_flags & VM_WRITE)
 		entry = pte_mkwrite(pte_mkdirty(entry), vma);
 
-	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
-			&vmf->ptl);
+	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
 	if (!vmf->pte)
 		goto release;
-	if (vmf_pte_changed(vmf)) {
-		update_mmu_tlb(vma, vmf->address, vmf->pte);
+	if ((nr_pages == 1 && vmf_pte_changed(vmf)) ||
+	    (nr_pages  > 1 && !pte_range_none(vmf->pte, nr_pages))) {
+		for (i = 0; i < nr_pages; i++)
+			update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
 		goto release;
 	}
 
@@ -4216,16 +4303,17 @@  static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 		return handle_userfault(vmf, VM_UFFD_MISSING);
 	}
 
-	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
-	folio_add_new_anon_rmap(folio, vma, vmf->address);
+	folio_ref_add(folio, nr_pages - 1);
+	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
+	folio_add_new_anon_rmap(folio, vma, addr);
 	folio_add_lru_vma(folio, vma);
 setpte:
 	if (uffd_wp)
 		entry = pte_mkuffd_wp(entry);
-	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
+	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);
 
 	/* No need to invalidate - it was non-present before */
-	update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
+	update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages);
 unlock:
 	if (vmf->pte)
 		pte_unmap_unlock(vmf->pte, vmf->ptl);