[v5,4/5] mm/sparse-vmemmap: improve memory savings for compound devmaps

Message ID 20220210193345.23628-5-joao.m.martins@oracle.com (mailing list archive)
State New
Series sparse-vmemmap: memory savings for compound devmaps (device-dax)

Commit Message

Joao Martins Feb. 10, 2022, 7:33 p.m. UTC
A compound devmap is a dev_pagemap with @vmemmap_shift > 0, meaning that
pages are mapped at a given huge page alignment and use compound pages,
as opposed to order-0 pages.
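
As an aside, a hypothetical, minimal sketch (not something this patch
adds; other required dev_pagemap fields omitted) of how a driver ends up
with a compound devmap is to set @vmemmap_shift before calling
memremap_pages():

	struct dev_pagemap pgmap = {
		.type = MEMORY_DEVICE_GENERIC,
		/* 2M compound pages: PMD_SHIFT - PAGE_SHIFT == 9 on x86_64 */
		.vmemmap_shift = PMD_SHIFT - PAGE_SHIFT,
	};

	/* pgmap_vmemmap_nr(&pgmap) == 512 base pages per compound page */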

Take advantage of the fact that most tail pages look the same (except
the first two) to minimize struct page overhead. Allocate a separate
page for the vmemmap area which contains the head page and separate for
the next 64 pages. The rest of the subsections then reuse this tail
vmemmap page to initialize the rest of the tail pages.

Sections are arch-dependent (e.g. on x86 it's 64M, 128M or 512M) and
when initializing a compound devmap with a big enough @vmemmap_shift (e.g.
1G PUD) it may cross multiple sections. The vmemmap code needs to
consult @pgmap so that multiple sections that all map the same tail
data can refer back to the first copy of that data for a given
gigantic page.

On compound devmaps with 2M align, this mechanism saves 6 out of the 8
vmemmap pages needed to map the subsection's 512 struct pages. On a 1G
compound devmap it saves 4094 pages.
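
For reference, assuming 64-byte struct pages and 4K base pages (the
x86_64 defaults), the arithmetic behind those numbers is:

	2M compound page:     512 struct pages * 64B = 32K  =    8 vmemmap pages
	                      keep 1 head + 1 tail vmemmap page  ->    6 saved
	1G compound page:  262144 struct pages * 64B = 16M  = 4096 vmemmap pages
	                      keep 1 head + 1 tail vmemmap page  -> 4094 saved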

Altmap isn't supported yet, given various restrictions in the altmap pfn
allocator, so fall back to the already-in-use vmemmap_populate(). It is
worth noting that altmap for devmap mappings was there to relieve the
pressure of inordinate amounts of memmap space needed to map terabytes of
pmem. With compound pages the motivation for altmaps for pmem is reduced.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 Documentation/vm/vmemmap_dedup.rst |  56 ++++++++++-
 include/linux/mm.h                 |   2 +-
 mm/memremap.c                      |   1 +
 mm/sparse-vmemmap.c                | 150 +++++++++++++++++++++++++++--
 4 files changed, 197 insertions(+), 12 deletions(-)

Comments

Muchun Song Feb. 11, 2022, 7:54 a.m. UTC | #1
On Fri, Feb 11, 2022 at 3:34 AM Joao Martins <joao.m.martins@oracle.com> wrote:
[...]
>  pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
> -                                      struct vmem_altmap *altmap)
> +                                      struct vmem_altmap *altmap,
> +                                      struct page *block)

Why not use the name of "reuse" instead of "block"?
Seems like "reuse" is more clear.

>  {
>         pte_t *pte = pte_offset_kernel(pmd, addr);
>         if (pte_none(*pte)) {
>                 pte_t entry;
>                 void *p;
>
> -               p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
> -               if (!p)
> -                       return NULL;
> +               if (!block) {
> +                       p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
> +                       if (!p)
> +                               return NULL;
> +               } else {
> +                       /*
> +                        * When a PTE/PMD entry is freed from the init_mm
> +                        * there's a a free_pages() call to this page allocated
> +                        * above. Thus this get_page() is paired with the
> +                        * put_page_testzero() on the freeing path.
> +                        * This can only called by certain ZONE_DEVICE path,
> +                        * and through vmemmap_populate_compound_pages() when
> +                        * slab is available.
> +                        */
> +                       get_page(block);
> +                       p = page_to_virt(block);
> +               }
>                 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
>                 set_pte_at(&init_mm, addr, pte, entry);
>         }
> @@ -609,7 +624,8 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
>  }
>
>  static int __meminit vmemmap_populate_address(unsigned long addr, int node,
> -                                             struct vmem_altmap *altmap)
> +                                             struct vmem_altmap *altmap,
> +                                             struct page *reuse, struct page **page)

We can remove the last argument (struct page **page) if we change
the return type to "pte_t *".  More simple, don't you think?

>  {
>         pgd_t *pgd;
>         p4d_t *p4d;
> @@ -629,11 +645,13 @@ static int __meminit vmemmap_populate_address(unsigned long addr, int node,
>         pmd = vmemmap_pmd_populate(pud, addr, node);
>         if (!pmd)
>                 return -ENOMEM;
> -       pte = vmemmap_pte_populate(pmd, addr, node, altmap);
> +       pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
>         if (!pte)
>                 return -ENOMEM;
>         vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
>
> +       if (page)
> +               *page = pte_page(*pte);
>         return 0;
>  }
>
> @@ -644,10 +662,120 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
>         int rc;
>
>         for (; addr < end; addr += PAGE_SIZE) {
> -               rc = vmemmap_populate_address(addr, node, altmap);
> +               rc = vmemmap_populate_address(addr, node, altmap, NULL, NULL);
>                 if (rc)
>                         return rc;
> +       }
> +
> +       return 0;
> +}
> +
> +static int __meminit vmemmap_populate_range(unsigned long start,
> +                                           unsigned long end,
> +                                           int node, struct page *page)
> +{
> +       unsigned long addr = start;
> +       int rc;
>
> +       for (; addr < end; addr += PAGE_SIZE) {
> +               rc = vmemmap_populate_address(addr, node, NULL, page, NULL);
> +               if (rc)
> +                       return rc;
> +       }
> +
> +       return 0;
> +}
> +
> +static inline int __meminit vmemmap_populate_page(unsigned long addr, int node,
> +                                                 struct page **page)
> +{
> +       return vmemmap_populate_address(addr, node, NULL, NULL, page);
> +}
> +
> +/*
> + * For compound pages bigger than section size (e.g. x86 1G compound
> + * pages with 2M subsection size) fill the rest of sections as tail
> + * pages.
> + *
> + * Note that memremap_pages() resets @nr_range value and will increment
> + * it after each range successful onlining. Thus the value or @nr_range
> + * at section memmap populate corresponds to the in-progress range
> + * being onlined here.
> + */
> +static bool __meminit reuse_compound_section(unsigned long start_pfn,
> +                                            struct dev_pagemap *pgmap)
> +{
> +       unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
> +       unsigned long offset = start_pfn -
> +               PHYS_PFN(pgmap->ranges[pgmap->nr_range].start);
> +
> +       return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION;
> +}
> +
> +static struct page * __meminit compound_section_tail_page(unsigned long addr)
> +{
> +       pte_t *ptep;
> +
> +       addr -= PAGE_SIZE;
> +
> +       /*
> +        * Assuming sections are populated sequentially, the previous section's
> +        * page data can be reused.
> +        */
> +       ptep = pte_offset_kernel(pmd_off_k(addr), addr);
> +       if (!ptep)
> +               return NULL;
> +
> +       return pte_page(*ptep);
> +}
> +
> +static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
> +                                                    unsigned long start,
> +                                                    unsigned long end, int node,
> +                                                    struct dev_pagemap *pgmap)
> +{
> +       unsigned long size, addr;
> +
> +       if (reuse_compound_section(start_pfn, pgmap)) {
> +               struct page *page;
> +
> +               page = compound_section_tail_page(start);
> +               if (!page)
> +                       return -ENOMEM;
> +
> +               /*
> +                * Reuse the page that was populated in the prior iteration
> +                * with just tail struct pages.
> +                */
> +               return vmemmap_populate_range(start, end, node, page);
> +       }
> +
> +       size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
> +       for (addr = start; addr < end; addr += size) {
> +               unsigned long next = addr, last = addr + size;
> +               struct page *block;
> +               int rc;
> +
> +               /* Populate the head page vmemmap page */
> +               rc = vmemmap_populate_page(addr, node, NULL);
> +               if (rc)
> +                       return rc;
> +
> +               /* Populate the tail pages vmemmap page */
> +               block = NULL;
> +               next = addr + PAGE_SIZE;
> +               rc = vmemmap_populate_page(next, node, &block);
> +               if (rc)
> +                       return rc;
> +
> +               /*
> +                * Reuse the previous page for the rest of tail pages
> +                * See layout diagram in Documentation/vm/vmemmap_dedup.rst
> +                */
> +               next += PAGE_SIZE;
> +               rc = vmemmap_populate_range(next, last, node, block);
> +               if (rc)
> +                       return rc;
>         }
>
>         return 0;
> @@ -659,12 +787,18 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
>  {
>         unsigned long start = (unsigned long) pfn_to_page(pfn);
>         unsigned long end = start + nr_pages * sizeof(struct page);
> +       int r;
>
>         if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
>                 !IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
>                 return NULL;
>
> -       if (vmemmap_populate(start, end, nid, altmap))
> +       if (pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)

Should we add a judgment like "is_power_of_2(sizeof(struct page))" since
this optimization is only applied when the size of the struct page does not
cross page boundaries?

Thanks.
Joao Martins Feb. 11, 2022, 12:37 p.m. UTC | #2
On 2/11/22 07:54, Muchun Song wrote:
> On Fri, Feb 11, 2022 at 3:34 AM Joao Martins <joao.m.martins@oracle.com> wrote:
> [...]
>>  pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
>> -                                      struct vmem_altmap *altmap)
>> +                                      struct vmem_altmap *altmap,
>> +                                      struct page *block)
> 
> Why not use the name of "reuse" instead of "block"?
> Seems like "reuse" is more clear.
> 
Good idea, let me rename that to @reuse.

>>  {
>>         pte_t *pte = pte_offset_kernel(pmd, addr);
>>         if (pte_none(*pte)) {
>>                 pte_t entry;
>>                 void *p;
>>
>> -               p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
>> -               if (!p)
>> -                       return NULL;
>> +               if (!block) {
>> +                       p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
>> +                       if (!p)
>> +                               return NULL;
>> +               } else {
>> +                       /*
>> +                        * When a PTE/PMD entry is freed from the init_mm
>> +                        * there's a a free_pages() call to this page allocated
>> +                        * above. Thus this get_page() is paired with the
>> +                        * put_page_testzero() on the freeing path.
>> +                        * This can only called by certain ZONE_DEVICE path,
>> +                        * and through vmemmap_populate_compound_pages() when
>> +                        * slab is available.
>> +                        */
>> +                       get_page(block);
>> +                       p = page_to_virt(block);
>> +               }
>>                 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
>>                 set_pte_at(&init_mm, addr, pte, entry);
>>         }
>> @@ -609,7 +624,8 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
>>  }
>>
>>  static int __meminit vmemmap_populate_address(unsigned long addr, int node,
>> -                                             struct vmem_altmap *altmap)
>> +                                             struct vmem_altmap *altmap,
>> +                                             struct page *reuse, struct page **page)
> 
> We can remove the last argument (struct page **page) if we change
> the return type to "pte_t *".  More simple, don't you think?
> 

Hmmm, perhaps it is simpler, specially provided the only error code is ENOMEM.

Albeit perhaps what we want is a `struct page *` rather than a pte.

>>  {
>>         pgd_t *pgd;
>>         p4d_t *p4d;
>> @@ -629,11 +645,13 @@ static int __meminit vmemmap_populate_address(unsigned long addr, int node,
>>         pmd = vmemmap_pmd_populate(pud, addr, node);
>>         if (!pmd)
>>                 return -ENOMEM;
>> -       pte = vmemmap_pte_populate(pmd, addr, node, altmap);
>> +       pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
>>         if (!pte)
>>                 return -ENOMEM;
>>         vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
>>
>> +       if (page)
>> +               *page = pte_page(*pte);
>>         return 0;
>>  }
>>
>> @@ -644,10 +662,120 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
>>         int rc;
>>
>>         for (; addr < end; addr += PAGE_SIZE) {
>> -               rc = vmemmap_populate_address(addr, node, altmap);
>> +               rc = vmemmap_populate_address(addr, node, altmap, NULL, NULL);
>>                 if (rc)
>>                         return rc;
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>> +static int __meminit vmemmap_populate_range(unsigned long start,
>> +                                           unsigned long end,
>> +                                           int node, struct page *page)
>> +{
>> +       unsigned long addr = start;
>> +       int rc;
>>
>> +       for (; addr < end; addr += PAGE_SIZE) {
>> +               rc = vmemmap_populate_address(addr, node, NULL, page, NULL);
>> +               if (rc)
>> +                       return rc;
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>> +static inline int __meminit vmemmap_populate_page(unsigned long addr, int node,
>> +                                                 struct page **page)
>> +{
>> +       return vmemmap_populate_address(addr, node, NULL, NULL, page);
>> +}
>> +
>> +/*
>> + * For compound pages bigger than section size (e.g. x86 1G compound
>> + * pages with 2M subsection size) fill the rest of sections as tail
>> + * pages.
>> + *
>> + * Note that memremap_pages() resets @nr_range value and will increment
>> + * it after each range successful onlining. Thus the value or @nr_range
>> + * at section memmap populate corresponds to the in-progress range
>> + * being onlined here.
>> + */
>> +static bool __meminit reuse_compound_section(unsigned long start_pfn,
>> +                                            struct dev_pagemap *pgmap)
>> +{
>> +       unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
>> +       unsigned long offset = start_pfn -
>> +               PHYS_PFN(pgmap->ranges[pgmap->nr_range].start);
>> +
>> +       return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION;
>> +}
>> +
>> +static struct page * __meminit compound_section_tail_page(unsigned long addr)
>> +{
>> +       pte_t *ptep;
>> +
>> +       addr -= PAGE_SIZE;
>> +
>> +       /*
>> +        * Assuming sections are populated sequentially, the previous section's
>> +        * page data can be reused.
>> +        */
>> +       ptep = pte_offset_kernel(pmd_off_k(addr), addr);
>> +       if (!ptep)
>> +               return NULL;
>> +
>> +       return pte_page(*ptep);
>> +}
>> +
>> +static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
>> +                                                    unsigned long start,
>> +                                                    unsigned long end, int node,
>> +                                                    struct dev_pagemap *pgmap)
>> +{
>> +       unsigned long size, addr;
>> +
>> +       if (reuse_compound_section(start_pfn, pgmap)) {
>> +               struct page *page;
>> +
>> +               page = compound_section_tail_page(start);
>> +               if (!page)
>> +                       return -ENOMEM;
>> +
>> +               /*
>> +                * Reuse the page that was populated in the prior iteration
>> +                * with just tail struct pages.
>> +                */
>> +               return vmemmap_populate_range(start, end, node, page);
>> +       }
>> +
>> +       size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
>> +       for (addr = start; addr < end; addr += size) {
>> +               unsigned long next = addr, last = addr + size;
>> +               struct page *block;
>> +               int rc;
>> +
>> +               /* Populate the head page vmemmap page */
>> +               rc = vmemmap_populate_page(addr, node, NULL);
>> +               if (rc)
>> +                       return rc;
>> +
>> +               /* Populate the tail pages vmemmap page */
>> +               block = NULL;
>> +               next = addr + PAGE_SIZE;
>> +               rc = vmemmap_populate_page(next, node, &block);
>> +               if (rc)
>> +                       return rc;
>> +
>> +               /*
>> +                * Reuse the previous page for the rest of tail pages
>> +                * See layout diagram in Documentation/vm/vmemmap_dedup.rst
>> +                */
>> +               next += PAGE_SIZE;
>> +               rc = vmemmap_populate_range(next, last, node, block);
>> +               if (rc)
>> +                       return rc;
>>         }
>>
>>         return 0;
>> @@ -659,12 +787,18 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
>>  {
>>         unsigned long start = (unsigned long) pfn_to_page(pfn);
>>         unsigned long end = start + nr_pages * sizeof(struct page);
>> +       int r;
>>
>>         if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
>>                 !IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
>>                 return NULL;
>>
>> -       if (vmemmap_populate(start, end, nid, altmap))
>> +       if (pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
> 
> Should we add a judgment like "is_power_of_2(sizeof(struct page))" since
> this optimization is only applied when the size of the struct page does not
> cross page boundaries?

Totally miss that -- let me make that adjustment.

Can I ask which architectures/conditions this happens?
Muchun Song Feb. 12, 2022, 10:08 a.m. UTC | #3
On Fri, Feb 11, 2022 at 8:37 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>
> On 2/11/22 07:54, Muchun Song wrote:
> > On Fri, Feb 11, 2022 at 3:34 AM Joao Martins <joao.m.martins@oracle.com> wrote:
> > [...]
> >>  pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
> >> -                                      struct vmem_altmap *altmap)
> >> +                                      struct vmem_altmap *altmap,
> >> +                                      struct page *block)
> >
> > Why not use the name of "reuse" instead of "block"?
> > Seems like "reuse" is more clear.
> >
> Good idea, let me rename that to @reuse.
>
> >>  {
> >>         pte_t *pte = pte_offset_kernel(pmd, addr);
> >>         if (pte_none(*pte)) {
> >>                 pte_t entry;
> >>                 void *p;
> >>
> >> -               p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
> >> -               if (!p)
> >> -                       return NULL;
> >> +               if (!block) {
> >> +                       p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
> >> +                       if (!p)
> >> +                               return NULL;
> >> +               } else {
> >> +                       /*
> >> +                        * When a PTE/PMD entry is freed from the init_mm
> >> +                        * there's a a free_pages() call to this page allocated
> >> +                        * above. Thus this get_page() is paired with the
> >> +                        * put_page_testzero() on the freeing path.
> >> +                        * This can only called by certain ZONE_DEVICE path,
> >> +                        * and through vmemmap_populate_compound_pages() when
> >> +                        * slab is available.
> >> +                        */
> >> +                       get_page(block);
> >> +                       p = page_to_virt(block);
> >> +               }
> >>                 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
> >>                 set_pte_at(&init_mm, addr, pte, entry);
> >>         }
> >> @@ -609,7 +624,8 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
> >>  }
> >>
> >>  static int __meminit vmemmap_populate_address(unsigned long addr, int node,
> >> -                                             struct vmem_altmap *altmap)
> >> +                                             struct vmem_altmap *altmap,
> >> +                                             struct page *reuse, struct page **page)
> >
> > We can remove the last argument (struct page **page) if we change
> > the return type to "pte_t *".  More simple, don't you think?
> >
>
> Hmmm, perhaps it is simpler, specially provided the only error code is ENOMEM.
>
> Albeit perhaps what we want is a `struct page *` rather than a pte.

The caller can extract `struct page` from a pte.

[...]

> >> -       if (vmemmap_populate(start, end, nid, altmap))
> >> +       if (pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
> >
> > Should we add a judgment like "is_power_of_2(sizeof(struct page))" since
> > this optimization is only applied when the size of the struct page does not
> > cross page boundaries?
>
> Totally miss that -- let me make that adjustment.
>
> Can I ask which architectures/conditions this happens?

E.g. arm64 when !CONFIG_MEMCG.

Thanks.
Muchun Song Feb. 12, 2022, 2:49 p.m. UTC | #4
On Sat, Feb 12, 2022 at 6:08 PM Muchun Song <songmuchun@bytedance.com> wrote:
>
> On Fri, Feb 11, 2022 at 8:37 PM Joao Martins <joao.m.martins@oracle.com> wrote:
> >
> > On 2/11/22 07:54, Muchun Song wrote:
> > > On Fri, Feb 11, 2022 at 3:34 AM Joao Martins <joao.m.martins@oracle.com> wrote:
> > > [...]
> > >>  pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
> > >> -                                      struct vmem_altmap *altmap)
> > >> +                                      struct vmem_altmap *altmap,
> > >> +                                      struct page *block)
> > >
> > > Why not use the name of "reuse" instead of "block"?
> > > Seems like "reuse" is more clear.
> > >
> > Good idea, let me rename that to @reuse.
> >
> > >>  {
> > >>         pte_t *pte = pte_offset_kernel(pmd, addr);
> > >>         if (pte_none(*pte)) {
> > >>                 pte_t entry;
> > >>                 void *p;
> > >>
> > >> -               p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
> > >> -               if (!p)
> > >> -                       return NULL;
> > >> +               if (!block) {
> > >> +                       p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
> > >> +                       if (!p)
> > >> +                               return NULL;
> > >> +               } else {
> > >> +                       /*
> > >> +                        * When a PTE/PMD entry is freed from the init_mm
> > >> +                        * there's a a free_pages() call to this page allocated
> > >> +                        * above. Thus this get_page() is paired with the
> > >> +                        * put_page_testzero() on the freeing path.
> > >> +                        * This can only called by certain ZONE_DEVICE path,
> > >> +                        * and through vmemmap_populate_compound_pages() when
> > >> +                        * slab is available.
> > >> +                        */
> > >> +                       get_page(block);
> > >> +                       p = page_to_virt(block);
> > >> +               }
> > >>                 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
> > >>                 set_pte_at(&init_mm, addr, pte, entry);
> > >>         }
> > >> @@ -609,7 +624,8 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
> > >>  }
> > >>
> > >>  static int __meminit vmemmap_populate_address(unsigned long addr, int node,
> > >> -                                             struct vmem_altmap *altmap)
> > >> +                                             struct vmem_altmap *altmap,
> > >> +                                             struct page *reuse, struct page **page)
> > >
> > > We can remove the last argument (struct page **page) if we change
> > > the return type to "pte_t *".  More simple, don't you think?
> > >
> >
> > Hmmm, perhaps it is simpler, specially provided the only error code is ENOMEM.
> >
> > Albeit perhaps what we want is a `struct page *` rather than a pte.
>
> The caller can extract `struct page` from a pte.
>
> [...]
>
> > >> -       if (vmemmap_populate(start, end, nid, altmap))
> > >> +       if (pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
> > >
> > > Should we add a judgment like "is_power_of_2(sizeof(struct page))" since
> > > this optimization is only applied when the size of the struct page does not
> > > cross page boundaries?
> >
> > Totally miss that -- let me make that adjustment.
> >
> > Can I ask which architectures/conditions this happens?
>
> E.g. arm64 when !CONFIG_MEMCG.

Plus !CONFIG_SLUB even on x86_64.

>
> Thanks.
Joao Martins Feb. 14, 2022, 10:55 a.m. UTC | #5
On 2/12/22 10:08, Muchun Song wrote:
> On Fri, Feb 11, 2022 at 8:37 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>> On 2/11/22 07:54, Muchun Song wrote:
>>> On Fri, Feb 11, 2022 at 3:34 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>>>> @@ -609,7 +624,8 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
>>>>  }
>>>>
>>>>  static int __meminit vmemmap_populate_address(unsigned long addr, int node,
>>>> -                                             struct vmem_altmap *altmap)
>>>> +                                             struct vmem_altmap *altmap,
>>>> +                                             struct page *reuse, struct page **page)
>>>
>>> We can remove the last argument (struct page **page) if we change
>>> the return type to "pte_t *".  More simple, don't you think?
>>
>> Hmmm, perhaps it is simpler, specially provided the only error code is ENOMEM.
>>
>> Albeit perhaps what we want is a `struct page *` rather than a pte.
> 
> The caller can extract `struct page` from a pte.
> 

Yeap, we do that here already. Anyway, I can try switching to the style you suggest
and see how it looks.
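
Roughly, an untested sketch of that switch (same logic as above, just
returning the populated pte instead of filling in **page) would be:

	static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
							  struct vmem_altmap *altmap,
							  struct page *reuse)
	{
		pgd_t *pgd;
		p4d_t *p4d;
		pud_t *pud;
		pmd_t *pmd;
		pte_t *pte;

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return NULL;
		p4d = vmemmap_p4d_populate(pgd, addr, node);
		if (!p4d)
			return NULL;
		pud = vmemmap_pud_populate(p4d, addr, node);
		if (!pud)
			return NULL;
		pmd = vmemmap_pmd_populate(pud, addr, node);
		if (!pmd)
			return NULL;
		pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
		if (!pte)
			return NULL;
		vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

		return pte;
	}

with a caller that needs the struct page doing e.g.:

	pte = vmemmap_populate_address(next, node, NULL, NULL);
	if (!pte)
		return -ENOMEM;
	block = pte_page(*pte);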
Joao Martins Feb. 14, 2022, 10:57 a.m. UTC | #6
On 2/12/22 14:49, Muchun Song wrote:
> On Sat, Feb 12, 2022 at 6:08 PM Muchun Song <songmuchun@bytedance.com> wrote:
>> On Fri, Feb 11, 2022 at 8:37 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>>> On 2/11/22 07:54, Muchun Song wrote:
>>>> On Fri, Feb 11, 2022 at 3:34 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>> -       if (vmemmap_populate(start, end, nid, altmap))
>>>>> +       if (pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
>>>>
>>>> Should we add a judgment like "is_power_of_2(sizeof(struct page))" since
>>>> this optimization is only applied when the size of the struct page does not
>>>> cross page boundaries?
>>>
>>> Totally miss that -- let me make that adjustment.
>>>
>>> Can I ask which architectures/conditions this happens?
>>
>> E.g. arm64 when !CONFIG_MEMCG.
> 
> Plus !CONFIG_SLUB even on x86_64.

Oh, thanks for the ref -- hadn't realized that this was
possible on arm64/x86.
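
Something along these lines then, for the next respin (untested sketch;
is_power_of_2() comes from linux/log2.h):

	if (pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap &&
	    is_power_of_2(sizeof(struct page)))
		r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
	else
		r = vmemmap_populate(start, end, nid, altmap);

	if (r < 0)
		return NULL;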

Patch

diff --git a/Documentation/vm/vmemmap_dedup.rst b/Documentation/vm/vmemmap_dedup.rst
index 8143b2ce414d..de958bbbf78c 100644
--- a/Documentation/vm/vmemmap_dedup.rst
+++ b/Documentation/vm/vmemmap_dedup.rst
@@ -2,9 +2,12 @@ 
 
 .. _vmemmap_dedup:
 
-==================================
-Free some vmemmap pages of HugeTLB
-==================================
+=========================================
+A vmemmap diet for HugeTLB and Device DAX
+=========================================
+
+HugeTLB
+=======
 
 The struct page structures (page structs) are used to describe a physical
 page frame. By default, there is a one-to-one mapping from a page frame to
@@ -173,3 +176,50 @@  tail vmemmap pages are mapped to the head vmemmap page frame. So we can see
 more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page)
 associated with each HugeTLB page. The compound_head() can handle this
 correctly (more details refer to the comment above compound_head()).
+
+Device DAX
+==========
+
+The device-dax interface uses the same tail deduplication technique explained
+in the previous chapter, except when used with the vmemmap in
+the device (altmap).
+
+The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64),
+PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64).
+
+The differences with HugeTLB are relatively minor.
+
+It only uses 3 page structs for storing all information as opposed
+to 4 on HugeTLB pages.
+
+There's no remapping of vmemmap given that device-dax memory is not part of
+System RAM ranges initialized at boot. Thus the tail page deduplication
+happens at a later stage when we populate the sections. HugeTLB reuses the
+head vmemmap page, whereas device-dax reuses the tail
+vmemmap page. This results in only half of the savings compared to HugeTLB.
+
+Deduplicated tail pages are not mapped read-only.
+
+Here's how things look like on device-dax after the sections are populated:
+
+ +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
+ |           |                     |     0     | -------------> |     0     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     1     | -------------> |     1     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
+ |           |                     +-----------+                   | | | | |
+ |           |                     |     3     | ------------------+ | | | |
+ |           |                     +-----------+                     | | | |
+ |           |                     |     4     | --------------------+ | | |
+ |    PMD    |                     +-----------+                       | | |
+ |   level   |                     |     5     | ----------------------+ | |
+ |  mapping  |                     +-----------+                         | |
+ |           |                     |     6     | ------------------------+ |
+ |           |                     +-----------+                           |
+ |           |                     |     7     | --------------------------+
+ |           |                     +-----------+
+ |           |
+ |           |
+ |           |
+ +-----------+
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f6a439582f63..0b7028b9ff2f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3172,7 +3172,7 @@  p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
 pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
 pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
 pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
-			    struct vmem_altmap *altmap);
+			    struct vmem_altmap *altmap, struct page *block);
 void *vmemmap_alloc_block(unsigned long size, int node);
 struct vmem_altmap;
 void *vmemmap_alloc_block_buf(unsigned long size, int node,
diff --git a/mm/memremap.c b/mm/memremap.c
index 71b8d42d820c..a0ef95f09397 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -323,6 +323,7 @@  void *memremap_pages(struct dev_pagemap *pgmap, int nid)
 {
 	struct mhp_params params = {
 		.altmap = pgmap_altmap(pgmap),
+		.pgmap = pgmap,
 		.pgprot = PAGE_KERNEL,
 	};
 	const int nr_range = pgmap->nr_range;
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index e7be2ef4454b..2e2b063ed285 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -533,16 +533,31 @@  void __meminit vmemmap_verify(pte_t *pte, int node,
 }
 
 pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
-				       struct vmem_altmap *altmap)
+				       struct vmem_altmap *altmap,
+				       struct page *block)
 {
 	pte_t *pte = pte_offset_kernel(pmd, addr);
 	if (pte_none(*pte)) {
 		pte_t entry;
 		void *p;
 
-		p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
-		if (!p)
-			return NULL;
+		if (!block) {
+			p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
+			if (!p)
+				return NULL;
+		} else {
+			/*
+			 * When a PTE/PMD entry is freed from the init_mm
+			 * there's a free_pages() call to this page allocated
+			 * above. Thus this get_page() is paired with the
+			 * put_page_testzero() on the freeing path.
+			 * This can only be called by certain ZONE_DEVICE paths,
+			 * and through vmemmap_populate_compound_pages() when
+			 * slab is available.
+			 */
+			get_page(block);
+			p = page_to_virt(block);
+		}
 		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
 		set_pte_at(&init_mm, addr, pte, entry);
 	}
@@ -609,7 +624,8 @@  pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
 }
 
 static int __meminit vmemmap_populate_address(unsigned long addr, int node,
-					      struct vmem_altmap *altmap)
+					      struct vmem_altmap *altmap,
+					      struct page *reuse, struct page **page)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
@@ -629,11 +645,13 @@  static int __meminit vmemmap_populate_address(unsigned long addr, int node,
 	pmd = vmemmap_pmd_populate(pud, addr, node);
 	if (!pmd)
 		return -ENOMEM;
-	pte = vmemmap_pte_populate(pmd, addr, node, altmap);
+	pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
 	if (!pte)
 		return -ENOMEM;
 	vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
 
+	if (page)
+		*page = pte_page(*pte);
 	return 0;
 }
 
@@ -644,10 +662,120 @@  int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
 	int rc;
 
 	for (; addr < end; addr += PAGE_SIZE) {
-		rc = vmemmap_populate_address(addr, node, altmap);
+		rc = vmemmap_populate_address(addr, node, altmap, NULL, NULL);
 		if (rc)
 			return rc;
+	}
+
+	return 0;
+}
+
+static int __meminit vmemmap_populate_range(unsigned long start,
+					    unsigned long end,
+					    int node, struct page *page)
+{
+	unsigned long addr = start;
+	int rc;
 
+	for (; addr < end; addr += PAGE_SIZE) {
+		rc = vmemmap_populate_address(addr, node, NULL, page, NULL);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+static inline int __meminit vmemmap_populate_page(unsigned long addr, int node,
+						  struct page **page)
+{
+	return vmemmap_populate_address(addr, node, NULL, NULL, page);
+}
+
+/*
+ * For compound pages bigger than section size (e.g. x86 1G compound
+ * pages with 2M subsection size) fill the rest of sections as tail
+ * pages.
+ *
+ * Note that memremap_pages() resets the @nr_range value and will increment
+ * it after each range's successful onlining. Thus the value of @nr_range
+ * at section memmap populate corresponds to the in-progress range
+ * being onlined here.
+ */
+static bool __meminit reuse_compound_section(unsigned long start_pfn,
+					     struct dev_pagemap *pgmap)
+{
+	unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
+	unsigned long offset = start_pfn -
+		PHYS_PFN(pgmap->ranges[pgmap->nr_range].start);
+
+	return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION;
+}
+
+static struct page * __meminit compound_section_tail_page(unsigned long addr)
+{
+	pte_t *ptep;
+
+	addr -= PAGE_SIZE;
+
+	/*
+	 * Assuming sections are populated sequentially, the previous section's
+	 * page data can be reused.
+	 */
+	ptep = pte_offset_kernel(pmd_off_k(addr), addr);
+	if (!ptep)
+		return NULL;
+
+	return pte_page(*ptep);
+}
+
+static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
+						     unsigned long start,
+						     unsigned long end, int node,
+						     struct dev_pagemap *pgmap)
+{
+	unsigned long size, addr;
+
+	if (reuse_compound_section(start_pfn, pgmap)) {
+		struct page *page;
+
+		page = compound_section_tail_page(start);
+		if (!page)
+			return -ENOMEM;
+
+		/*
+		 * Reuse the page that was populated in the prior iteration
+		 * with just tail struct pages.
+		 */
+		return vmemmap_populate_range(start, end, node, page);
+	}
+
+	size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
+	for (addr = start; addr < end; addr += size) {
+		unsigned long next = addr, last = addr + size;
+		struct page *block;
+		int rc;
+
+		/* Populate the head page vmemmap page */
+		rc = vmemmap_populate_page(addr, node, NULL);
+		if (rc)
+			return rc;
+
+		/* Populate the tail pages vmemmap page */
+		block = NULL;
+		next = addr + PAGE_SIZE;
+		rc = vmemmap_populate_page(next, node, &block);
+		if (rc)
+			return rc;
+
+		/*
+		 * Reuse the previous page for the rest of tail pages
+		 * See layout diagram in Documentation/vm/vmemmap_dedup.rst
+		 */
+		next += PAGE_SIZE;
+		rc = vmemmap_populate_range(next, last, node, block);
+		if (rc)
+			return rc;
 	}
 
 	return 0;
@@ -659,12 +787,18 @@  struct page * __meminit __populate_section_memmap(unsigned long pfn,
 {
 	unsigned long start = (unsigned long) pfn_to_page(pfn);
 	unsigned long end = start + nr_pages * sizeof(struct page);
+	int r;
 
 	if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
 		!IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
 		return NULL;
 
-	if (vmemmap_populate(start, end, nid, altmap))
+	if (pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
+		r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
+	else
+		r = vmemmap_populate(start, end, nid, altmap);
+
+	if (r < 0)
 		return NULL;
 
 	return pfn_to_page(pfn);