
[RFC,3/9] sparse-vmemmap: Reuse vmemmap areas for a given page size

Message ID 20201208172901.17384-5-joao.m.martins@oracle.com (mailing list archive)
State New, archived

Commit Message

Joao Martins Dec. 8, 2020, 5:28 p.m. UTC
Introduce a new flag, MEMHP_REUSE_VMEMMAP, which signals that
struct pages are onlined with a given alignment, and should reuse the
tail pages vmemmap areas. On that circunstamce we reuse the PFN backing
only the tail pages subsections, while letting the head page PFN remain
different. This presumes that the backing page structs are compound
pages, such as the case for compound pagemaps (i.e. ZONE_DEVICE with
PGMAP_COMPOUND set)

On 2M compound pagemaps, it lets us save 6 pages out of the 8 necessary
PFNs necessary to describe the subsection's 32K struct pages we are
onlining. On a 1G compound pagemap it let us save 4096 pages.

Sections are 128M (or bigger/smaller), and such when initializing a
compound memory map where we are initializing compound struct pages, we
need to preserve the tail page to be reused across the rest of the areas
for pagesizes which bigger than a section.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
I wonder, rather than separating vmem_context and mhp_params, whether
one should just pick the latter. Albeit semantically the ctx isn't
really parameters, but rather context passed across multiple sections
being onlined (i.e. multiple calls to populate_section_memmap). Also,
this is internal state which isn't passed to external modules, except
for @align and @flags for page size and requesting whether to reuse
tail page areas.
---
 include/linux/memory_hotplug.h | 10 ++++
 include/linux/mm.h             |  2 +-
 mm/memory_hotplug.c            | 12 ++++-
 mm/memremap.c                  |  3 ++
 mm/sparse-vmemmap.c            | 93 ++++++++++++++++++++++++++++------
 5 files changed, 103 insertions(+), 17 deletions(-)
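
As a quick sanity check of the figures above, a userspace back-of-the-envelope
calculation (illustrative only; it assumes 4K base pages and a 64-byte struct
page, which is what the 2M and 1G numbers in the commit message imply):

 #include <stdio.h>

 #define BASE_PAGE   4096UL  /* assumed base page size */
 #define STRUCT_PAGE 64UL    /* assumed sizeof(struct page) */

 static void report(const char *name, unsigned long compound_size)
 {
         unsigned long nr_struct_pages = compound_size / BASE_PAGE;
         unsigned long vmemmap_pages = nr_struct_pages * STRUCT_PAGE / BASE_PAGE;

         /* The head page plus one reused tail page keep unique backing PFNs. */
         printf("%s: %lu vmemmap pages, %lu reusable\n",
                name, vmemmap_pages, vmemmap_pages - 2);
 }

 int main(void)
 {
         report("2M", 2UL << 20);  /* 8 vmemmap pages, 6 reusable */
         report("1G", 1UL << 30);  /* 4096 vmemmap pages, nearly all reusable */
         return 0;
 }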

Comments

Dan Williams Feb. 20, 2021, 3:34 a.m. UTC | #1
On Tue, Dec 8, 2020 at 9:32 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>
> Introduce a new flag, MEMHP_REUSE_VMEMMAP, which signals that
> struct pages are onlined with a given alignment, and should reuse the
> tail pages vmemmap areas. On that circunstamce we reuse the PFN backing

s/On that circunstamce we reuse/Reuse/

Kills a "we" and switches to imperative tense. I noticed a couple
other "we"s in the previous patches, but this crossed my threshold to
make a comment.

> only the tail pages subsections, while letting the head page PFN remain
> different. This presumes that the backing page structs are compound
> pages, such as the case for compound pagemaps (i.e. ZONE_DEVICE with
> PGMAP_COMPOUND set)
>
> On 2M compound pagemaps, it lets us save 6 pages out of the 8 necessary

s/lets us save/saves/

> PFNs necessary

s/8 necessary PFNs necessary/8 PFNs necessary/

> to describe the subsection's 32K struct pages we are
> onlining.

s/we are onlining/being mapped/

...because ZONE_DEVICE pages are never "onlined".

> On a 1G compound pagemap it let us save 4096 pages.

s/lets us save/saves/

>
> Sections are 128M (or bigger/smaller),

Huh?

> and such when initializing a
> compound memory map where we are initializing compound struct pages, we
> need to preserve the tail page to be reused across the rest of the areas
> for pagesizes which bigger than a section.
>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> ---
> I wonder, rather than separating vmem_context and mhp_params, whether
> one should just pick the latter. Albeit semantically the ctx isn't
> really parameters, but rather context passed across multiple sections
> being onlined (i.e. multiple calls to populate_section_memmap). Also,
> this is internal state which isn't passed to external modules, except
> for @align and @flags for page size and requesting whether to reuse
> tail page areas.
> ---
>  include/linux/memory_hotplug.h | 10 ++++
>  include/linux/mm.h             |  2 +-
>  mm/memory_hotplug.c            | 12 ++++-
>  mm/memremap.c                  |  3 ++
>  mm/sparse-vmemmap.c            | 93 ++++++++++++++++++++++++++++------
>  5 files changed, 103 insertions(+), 17 deletions(-)
>
> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> index 73f8bcbb58a4..e15bb82805a3 100644
> --- a/include/linux/memory_hotplug.h
> +++ b/include/linux/memory_hotplug.h
> @@ -70,6 +70,10 @@ typedef int __bitwise mhp_t;
>   */
>  #define MEMHP_MERGE_RESOURCE   ((__force mhp_t)BIT(0))
>
> +/* Reuse the vmemmap areas backing the tail struct pages when the memmap
> + * is added with a given alignment (mhp_params->align). */
> +#define MEMHP_REUSE_VMEMMAP    ((__force mhp_t)BIT(1))
> +
>  /*
>   * Extended parameters for memory hotplug:
>   * altmap: alternative allocator for memmap array (optional)
> @@ -79,10 +83,16 @@ typedef int __bitwise mhp_t;
>  struct mhp_params {
>         struct vmem_altmap *altmap;
>         pgprot_t pgprot;
> +       unsigned int align;
> +       mhp_t flags;
>  };
>
>  struct vmem_context {
>         struct vmem_altmap *altmap;
> +       mhp_t flags;
> +       unsigned int align;
> +       void *block;
> +       unsigned long block_page;
>  };
>
>  /*
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 2eb44318bb2d..8b0155441835 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3006,7 +3006,7 @@ p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
>  pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
>  pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
>  pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
> -                           struct vmem_altmap *altmap);
> +                           struct vmem_altmap *altmap, void *block);
>  void *vmemmap_alloc_block(unsigned long size, int node);
>  struct vmem_altmap;
>  void *vmemmap_alloc_block_buf(unsigned long size, int node,
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index f8870c53fe5e..56121dfcc44b 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -300,6 +300,14 @@ static int check_hotplug_memory_addressable(unsigned long pfn,
>         return 0;
>  }
>
> +static void vmem_context_init(struct vmem_context *ctx, struct mhp_params *params)
> +{
> +       memset(ctx, 0, sizeof(*ctx));
> +       ctx->align = params->align;
> +       ctx->altmap = params->altmap;
> +       ctx->flags = params->flags;
> +}
> +
>  /*
>   * Reasonably generic function for adding memory.  It is
>   * expected that archs that support memory hotplug will
> @@ -313,7 +321,7 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
>         unsigned long cur_nr_pages;
>         int err;
>         struct vmem_altmap *altmap = params->altmap;
> -       struct vmem_context ctx = { .altmap = params->altmap };
> +       struct vmem_context ctx;
>
>         if (WARN_ON_ONCE(!params->pgprot.pgprot))
>                 return -EINVAL;
> @@ -338,6 +346,8 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
>         if (err)
>                 return err;
>
> +       vmem_context_init(&ctx, params);
> +
>         for (; pfn < end_pfn; pfn += cur_nr_pages) {
>                 /* Select all remaining pages up to the next section boundary */
>                 cur_nr_pages = min(end_pfn - pfn,
> diff --git a/mm/memremap.c b/mm/memremap.c
> index 287a24b7a65a..ecfa74848ac6 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -253,6 +253,9 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
>                         goto err_kasan;
>                 }
>
> +               if (pgmap->flags & PGMAP_COMPOUND)
> +                       params->align = pgmap->align;
> +
>                 error = arch_add_memory(nid, range->start, range_len(range),
>                                         params);
>         }
> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> index bcda68ba1381..364c071350e8 100644
> --- a/mm/sparse-vmemmap.c
> +++ b/mm/sparse-vmemmap.c
> @@ -141,16 +141,20 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
>  }
>
>  pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
> -                                      struct vmem_altmap *altmap)
> +                                      struct vmem_altmap *altmap, void *block)
>  {
>         pte_t *pte = pte_offset_kernel(pmd, addr);
>         if (pte_none(*pte)) {
>                 pte_t entry;
> -               void *p;
> -
> -               p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
> -               if (!p)
> -                       return NULL;
> +               void *p = block;
> +
> +               if (!block) {
> +                       p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
> +                       if (!p)
> +                               return NULL;
> +               } else {
> +                       get_page(virt_to_page(block));
> +               }
>                 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
>                 set_pte_at(&init_mm, addr, pte, entry);
>         }
> @@ -216,8 +220,10 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
>         return pgd;
>  }
>
> -int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
> -                                        int node, struct vmem_altmap *altmap)
> +static void *__meminit __vmemmap_populate_basepages(unsigned long start,
> +                                          unsigned long end, int node,
> +                                          struct vmem_altmap *altmap,
> +                                          void *block)
>  {
>         unsigned long addr = start;
>         pgd_t *pgd;
> @@ -229,38 +235,95 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
>         for (; addr < end; addr += PAGE_SIZE) {
>                 pgd = vmemmap_pgd_populate(addr, node);
>                 if (!pgd)
> -                       return -ENOMEM;
> +                       return NULL;
>                 p4d = vmemmap_p4d_populate(pgd, addr, node);
>                 if (!p4d)
> -                       return -ENOMEM;
> +                       return NULL;
>                 pud = vmemmap_pud_populate(p4d, addr, node);
>                 if (!pud)
> -                       return -ENOMEM;
> +                       return NULL;
>                 pmd = vmemmap_pmd_populate(pud, addr, node);
>                 if (!pmd)
> -                       return -ENOMEM;
> -               pte = vmemmap_pte_populate(pmd, addr, node, altmap);
> +                       return NULL;
> +               pte = vmemmap_pte_populate(pmd, addr, node, altmap, block);
>                 if (!pte)
> -                       return -ENOMEM;
> +                       return NULL;
>                 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
>         }
>
> +       return __va(__pfn_to_phys(pte_pfn(*pte)));
> +}
> +
> +int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
> +                                        int node, struct vmem_altmap *altmap)
> +{
> +       if (!__vmemmap_populate_basepages(start, end, node, altmap, NULL))
> +               return -ENOMEM;
>         return 0;
>  }
>
> +static struct page * __meminit vmemmap_populate_reuse(unsigned long start,
> +                                       unsigned long end, int node,
> +                                       struct vmem_context *ctx)
> +{
> +       unsigned long size, addr = start;
> +       unsigned long psize = PHYS_PFN(ctx->align) * sizeof(struct page);
> +
> +       size = min(psize, end - start);
> +
> +       for (; addr < end; addr += size) {
> +               unsigned long head = addr + PAGE_SIZE;
> +               unsigned long tail = addr;
> +               unsigned long last = addr + size;
> +               void *area;
> +
> +               if (ctx->block_page &&
> +                   IS_ALIGNED((addr - ctx->block_page), psize))
> +                       ctx->block = NULL;
> +
> +               area  = ctx->block;
> +               if (!area) {
> +                       if (!__vmemmap_populate_basepages(addr, head, node,
> +                                                         ctx->altmap, NULL))
> +                               return NULL;
> +
> +                       tail = head + PAGE_SIZE;
> +                       area = __vmemmap_populate_basepages(head, tail, node,
> +                                                           ctx->altmap, NULL);
> +                       if (!area)
> +                               return NULL;
> +
> +                       ctx->block = area;
> +                       ctx->block_page = addr;
> +               }
> +
> +               if (!__vmemmap_populate_basepages(tail, last, node,
> +                                                 ctx->altmap, area))
> +                       return NULL;
> +       }

I think that compound page accounting and combined altmap accounting
make this difficult to read, and I think the compound page case
deserves its own first-class loop rather than reusing
vmemmap_populate_basepages(). With the suggestion to drop altmap
support I'd expect a vmemmap_populate_compound that takes a compound
page size and does the right thing with respect to mapping all the
tail pages to the same pfn.
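
As a rough illustration of that suggestion, here is a sketch (untested, with
altmap support dropped as suggested, and reusing the
__vmemmap_populate_basepages() helper introduced by this patch) of what a
dedicated vmemmap_populate_compound() loop could look like. Note that it keeps
no state across calls, which is exactly the cross-section question discussed
below:

 static int __meminit vmemmap_populate_compound(unsigned long start,
                                                unsigned long end, int node,
                                                unsigned long compound_align)
 {
         /* memmap bytes covered by one compound page */
         unsigned long msize = PHYS_PFN(compound_align) * sizeof(struct page);
         unsigned long addr;

         for (addr = start; addr < end; addr += msize) {
                 unsigned long head = addr + PAGE_SIZE;
                 unsigned long tail = head + PAGE_SIZE;
                 void *block;

                 /* The head struct page keeps its own backing PFN. */
                 if (!__vmemmap_populate_basepages(addr, head, node, NULL, NULL))
                         return -ENOMEM;

                 /* Allocate the single tail page that will be reused. */
                 block = __vmemmap_populate_basepages(head, tail, node, NULL, NULL);
                 if (!block)
                         return -ENOMEM;

                 /* Map every remaining tail vmemmap page to that same PFN. */
                 if (!__vmemmap_populate_basepages(tail, min(addr + msize, end),
                                                   node, NULL, block))
                         return -ENOMEM;
         }

         return 0;
 }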
Joao Martins Feb. 22, 2021, 11:42 a.m. UTC | #2
On 2/20/21 3:34 AM, Dan Williams wrote:
> On Tue, Dec 8, 2020 at 9:32 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>>
>> Introduce a new flag, MEMHP_REUSE_VMEMMAP, which signals that
>> struct pages are onlined with a given alignment, and should reuse the
>> tail pages vmemmap areas. On that circunstamce we reuse the PFN backing
> 
> s/On that circunstamce we reuse/Reuse/
> 
> Kills a "we" and switches to imperative tense. I noticed a couple
> other "we"s in the previous patches, but this crossed my threshold to
> make a comment.
> 
/me nods. Will fix.

>> only the tail pages subsections, while letting the head page PFN remain
>> different. This presumes that the backing page structs are compound
>> pages, such as the case for compound pagemaps (i.e. ZONE_DEVICE with
>> PGMAP_COMPOUND set)
>>
>> On 2M compound pagemaps, it lets us save 6 pages out of the 8 necessary
> 
> s/lets us save/saves/
> 
Will fix.

>> PFNs necessary
> 
> s/8 necessary PFNs necessary/8 PFNs necessary/

Will fix.

> 
>> to describe the subsection's 32K struct pages we are
>> onlining.
> 
> s/we are onlining/being mapped/
> 
> ...because ZONE_DEVICE pages are never "onlined".
> 
>> On a 1G compound pagemap it let us save 4096 pages.
> 
> s/lets us save/saves/
> 

Will fix both.

>>
>> Sections are 128M (or bigger/smaller),
> 
> Huh?
> 

Section size is arch-dependent if we are being holistic.
On x86 it's 64M, 128M or 512M right?

 #ifdef CONFIG_X86_32
 # ifdef CONFIG_X86_PAE
 #  define SECTION_SIZE_BITS     29
 #  define MAX_PHYSMEM_BITS      36
 # else
 #  define SECTION_SIZE_BITS     26
 #  define MAX_PHYSMEM_BITS      32
 # endif
 #else /* CONFIG_X86_32 */
 # define SECTION_SIZE_BITS      27 /* matt - 128 is convenient right now */
 # define MAX_PHYSMEM_BITS       (pgtable_l5_enabled() ? 52 : 46)
 #endif

Also, my point about section sizes is that a 1GB+ page vmemmap population will
cross sections in how sparsemem populates the vmemmap. And in that case we have to
reuse the PTE/PMD pages across multiple invocations of vmemmap_populate_basepages().
Either that, or looking at the previous page PTE, but that might be inefficient.

>> @@ -229,38 +235,95 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
>>         for (; addr < end; addr += PAGE_SIZE) {
>>                 pgd = vmemmap_pgd_populate(addr, node);
>>                 if (!pgd)
>> -                       return -ENOMEM;
>> +                       return NULL;
>>                 p4d = vmemmap_p4d_populate(pgd, addr, node);
>>                 if (!p4d)
>> -                       return -ENOMEM;
>> +                       return NULL;
>>                 pud = vmemmap_pud_populate(p4d, addr, node);
>>                 if (!pud)
>> -                       return -ENOMEM;
>> +                       return NULL;
>>                 pmd = vmemmap_pmd_populate(pud, addr, node);
>>                 if (!pmd)
>> -                       return -ENOMEM;
>> -               pte = vmemmap_pte_populate(pmd, addr, node, altmap);
>> +                       return NULL;
>> +               pte = vmemmap_pte_populate(pmd, addr, node, altmap, block);
>>                 if (!pte)
>> -                       return -ENOMEM;
>> +                       return NULL;
>>                 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
>>         }
>>
>> +       return __va(__pfn_to_phys(pte_pfn(*pte)));
>> +}
>> +
>> +int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
>> +                                        int node, struct vmem_altmap *altmap)
>> +{
>> +       if (!__vmemmap_populate_basepages(start, end, node, altmap, NULL))
>> +               return -ENOMEM;
>>         return 0;
>>  }
>>
>> +static struct page * __meminit vmemmap_populate_reuse(unsigned long start,
>> +                                       unsigned long end, int node,
>> +                                       struct vmem_context *ctx)
>> +{
>> +       unsigned long size, addr = start;
>> +       unsigned long psize = PHYS_PFN(ctx->align) * sizeof(struct page);
>> +
>> +       size = min(psize, end - start);
>> +
>> +       for (; addr < end; addr += size) {
>> +               unsigned long head = addr + PAGE_SIZE;
>> +               unsigned long tail = addr;
>> +               unsigned long last = addr + size;
>> +               void *area;
>> +
>> +               if (ctx->block_page &&
>> +                   IS_ALIGNED((addr - ctx->block_page), psize))
>> +                       ctx->block = NULL;
>> +
>> +               area  = ctx->block;
>> +               if (!area) {
>> +                       if (!__vmemmap_populate_basepages(addr, head, node,
>> +                                                         ctx->altmap, NULL))
>> +                               return NULL;
>> +
>> +                       tail = head + PAGE_SIZE;
>> +                       area = __vmemmap_populate_basepages(head, tail, node,
>> +                                                           ctx->altmap, NULL);
>> +                       if (!area)
>> +                               return NULL;
>> +
>> +                       ctx->block = area;
>> +                       ctx->block_page = addr;
>> +               }
>> +
>> +               if (!__vmemmap_populate_basepages(tail, last, node,
>> +                                                 ctx->altmap, area))
>> +                       return NULL;
>> +       }
> 
> I think that compound page accounting and combined altmap accounting
> make this difficult to read, and I think the compound page case
> deserves its own first-class loop rather than reusing
> vmemmap_populate_basepages(). With the suggestion to drop altmap
> support I'd expect a vmemmap_populate_compound that takes a compound
> page size and does the right thing with respect to mapping all the
> tail pages to the same pfn.
> 
I can move this to a separate loop as suggested.

But to be able to map all tail pages in one call of vmemmap_populate_compound()
this might require changes in sparsemem generic code that I am not sure
warrant the added complexity. Otherwise I'll probably have to keep
this logic of @ctx to be able to pass the page to be reused (i.e. @block and
@block_page). That's actually the main reason that made me introduce
a struct vmem_context.
Dan Williams Feb. 22, 2021, 10:40 p.m. UTC | #3
On Mon, Feb 22, 2021 at 3:42 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>
>
>
> On 2/20/21 3:34 AM, Dan Williams wrote:
> > On Tue, Dec 8, 2020 at 9:32 AM Joao Martins <joao.m.martins@oracle.com> wrote:
> >>
> >> Introduce a new flag, MEMHP_REUSE_VMEMMAP, which signals that
> >> struct pages are onlined with a given alignment, and should reuse the
> >> tail pages vmemmap areas. On that circunstamce we reuse the PFN backing
> >
> > s/On that circunstamce we reuse/Reuse/
> >
> > Kills a "we" and switches to imperative tense. I noticed a couple
> > other "we"s in the previous patches, but this crossed my threshold to
> > make a comment.
> >
> /me nods. Will fix.
>
> >> only the tail pages subsections, while letting the head page PFN remain
> >> different. This presumes that the backing page structs are compound
> >> pages, such as the case for compound pagemaps (i.e. ZONE_DEVICE with
> >> PGMAP_COMPOUND set)
> >>
> >> On 2M compound pagemaps, it lets us save 6 pages out of the 8 necessary
> >
> > s/lets us save/saves/
> >
> Will fix.
>
> >> PFNs necessary
> >
> > s/8 necessary PFNs necessary/8 PFNs necessary/
>
> Will fix.
>
> >
> >> to describe the subsection's 32K struct pages we are
> >> onlining.
> >
> > s/we are onlining/being mapped/
> >
> > ...because ZONE_DEVICE pages are never "onlined".
> >
> >> On a 1G compound pagemap it let us save 4096 pages.
> >
> > s/lets us save/saves/
> >
>
> Will fix both.
>
> >>
> >> Sections are 128M (or bigger/smaller),
> >
> > Huh?
> >
>
> Section size is arch-dependent if we are being holistic.
> On x86 it's 64M, 128M or 512M right?
>
>  #ifdef CONFIG_X86_32
>  # ifdef CONFIG_X86_PAE
>  #  define SECTION_SIZE_BITS     29
>  #  define MAX_PHYSMEM_BITS      36
>  # else
>  #  define SECTION_SIZE_BITS     26
>  #  define MAX_PHYSMEM_BITS      32
>  # endif
>  #else /* CONFIG_X86_32 */
>  # define SECTION_SIZE_BITS      27 /* matt - 128 is convenient right now */
>  # define MAX_PHYSMEM_BITS       (pgtable_l5_enabled() ? 52 : 46)
>  #endif
>
> Also, my point about section sizes is that a 1GB+ page vmemmap population will
> cross sections in how sparsemem populates the vmemmap. And in that case we have to
> reuse the PTE/PMD pages across multiple invocations of vmemmap_populate_basepages().
> Either that, or looking at the previous page PTE, but that might be inefficient.

Ok, makes sense. I think this description of needing to handle
section crossing is clearer than mentioning one of the section sizes.

>
> >> @@ -229,38 +235,95 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
> >>         for (; addr < end; addr += PAGE_SIZE) {
> >>                 pgd = vmemmap_pgd_populate(addr, node);
> >>                 if (!pgd)
> >> -                       return -ENOMEM;
> >> +                       return NULL;
> >>                 p4d = vmemmap_p4d_populate(pgd, addr, node);
> >>                 if (!p4d)
> >> -                       return -ENOMEM;
> >> +                       return NULL;
> >>                 pud = vmemmap_pud_populate(p4d, addr, node);
> >>                 if (!pud)
> >> -                       return -ENOMEM;
> >> +                       return NULL;
> >>                 pmd = vmemmap_pmd_populate(pud, addr, node);
> >>                 if (!pmd)
> >> -                       return -ENOMEM;
> >> -               pte = vmemmap_pte_populate(pmd, addr, node, altmap);
> >> +                       return NULL;
> >> +               pte = vmemmap_pte_populate(pmd, addr, node, altmap, block);
> >>                 if (!pte)
> >> -                       return -ENOMEM;
> >> +                       return NULL;
> >>                 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
> >>         }
> >>
> >> +       return __va(__pfn_to_phys(pte_pfn(*pte)));
> >> +}
> >> +
> >> +int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
> >> +                                        int node, struct vmem_altmap *altmap)
> >> +{
> >> +       if (!__vmemmap_populate_basepages(start, end, node, altmap, NULL))
> >> +               return -ENOMEM;
> >>         return 0;
> >>  }
> >>
> >> +static struct page * __meminit vmemmap_populate_reuse(unsigned long start,
> >> +                                       unsigned long end, int node,
> >> +                                       struct vmem_context *ctx)
> >> +{
> >> +       unsigned long size, addr = start;
> >> +       unsigned long psize = PHYS_PFN(ctx->align) * sizeof(struct page);
> >> +
> >> +       size = min(psize, end - start);
> >> +
> >> +       for (; addr < end; addr += size) {
> >> +               unsigned long head = addr + PAGE_SIZE;
> >> +               unsigned long tail = addr;
> >> +               unsigned long last = addr + size;
> >> +               void *area;
> >> +
> >> +               if (ctx->block_page &&
> >> +                   IS_ALIGNED((addr - ctx->block_page), psize))
> >> +                       ctx->block = NULL;
> >> +
> >> +               area  = ctx->block;
> >> +               if (!area) {
> >> +                       if (!__vmemmap_populate_basepages(addr, head, node,
> >> +                                                         ctx->altmap, NULL))
> >> +                               return NULL;
> >> +
> >> +                       tail = head + PAGE_SIZE;
> >> +                       area = __vmemmap_populate_basepages(head, tail, node,
> >> +                                                           ctx->altmap, NULL);
> >> +                       if (!area)
> >> +                               return NULL;
> >> +
> >> +                       ctx->block = area;
> >> +                       ctx->block_page = addr;
> >> +               }
> >> +
> >> +               if (!__vmemmap_populate_basepages(tail, last, node,
> >> +                                                 ctx->altmap, area))
> >> +                       return NULL;
> >> +       }
> >
> > I think that compound page accounting and combined altmap accounting
> > make this difficult to read, and I think the compound page case
> > deserves its own first-class loop rather than reusing
> > vmemmap_populate_basepages(). With the suggestion to drop altmap
> > support I'd expect a vmemmap_populate_compound that takes a compound
> > page size and does the right thing with respect to mapping all the
> > tail pages to the same pfn.
> >
> I can move this to a separate loop as suggested.
>
> But to be able to map all tail pages in one call of vmemmap_populate_compound()
> this might require changes in sparsemem generic code that I am not sure
> warrant the added complexity. Otherwise I'll probably have to keep
> this logic of @ctx to be able to pass the page to be reused (i.e. @block and
> @block_page). That's actually the main reason that made me introduce
> a struct vmem_context.

Do you need to pass in a vmem_context, isn't that context local to
vmemmap_populate_compound_pages()?
Joao Martins Feb. 23, 2021, 3:46 p.m. UTC | #4
On 2/22/21 10:40 PM, Dan Williams wrote:
> On Mon, Feb 22, 2021 at 3:42 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>> On 2/20/21 3:34 AM, Dan Williams wrote:
>>> On Tue, Dec 8, 2020 at 9:32 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>>>> Sections are 128M (or bigger/smaller),
>>>
>>> Huh?
>>>
>>
>> Section size is arch-dependent if we are being holistic.
>> On x86 it's 64M, 128M or 512M right?
>>
>>  #ifdef CONFIG_X86_32
>>  # ifdef CONFIG_X86_PAE
>>  #  define SECTION_SIZE_BITS     29
>>  #  define MAX_PHYSMEM_BITS      36
>>  # else
>>  #  define SECTION_SIZE_BITS     26
>>  #  define MAX_PHYSMEM_BITS      32
>>  # endif
>>  #else /* CONFIG_X86_32 */
>>  # define SECTION_SIZE_BITS      27 /* matt - 128 is convenient right now */
>>  # define MAX_PHYSMEM_BITS       (pgtable_l5_enabled() ? 52 : 46)
>>  #endif
>>
>> Also, my point about section sizes is that a 1GB+ page vmemmap population will
>> cross sections in how sparsemem populates the vmemmap. And in that case we have to
>> reuse the PTE/PMD pages across multiple invocations of vmemmap_populate_basepages().
>> Either that, or looking at the previous page PTE, but that might be inefficient.
> 
> Ok, makes sense. I think this description of needing to handle
> section crossing is clearer than mentioning one of the section sizes.
> 
I'll amend the commit message to have this.

>>
>>>> @@ -229,38 +235,95 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
>>>>         for (; addr < end; addr += PAGE_SIZE) {
>>>>                 pgd = vmemmap_pgd_populate(addr, node);
>>>>                 if (!pgd)
>>>> -                       return -ENOMEM;
>>>> +                       return NULL;
>>>>                 p4d = vmemmap_p4d_populate(pgd, addr, node);
>>>>                 if (!p4d)
>>>> -                       return -ENOMEM;
>>>> +                       return NULL;
>>>>                 pud = vmemmap_pud_populate(p4d, addr, node);
>>>>                 if (!pud)
>>>> -                       return -ENOMEM;
>>>> +                       return NULL;
>>>>                 pmd = vmemmap_pmd_populate(pud, addr, node);
>>>>                 if (!pmd)
>>>> -                       return -ENOMEM;
>>>> -               pte = vmemmap_pte_populate(pmd, addr, node, altmap);
>>>> +                       return NULL;
>>>> +               pte = vmemmap_pte_populate(pmd, addr, node, altmap, block);
>>>>                 if (!pte)
>>>> -                       return -ENOMEM;
>>>> +                       return NULL;
>>>>                 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
>>>>         }
>>>>
>>>> +       return __va(__pfn_to_phys(pte_pfn(*pte)));
>>>> +}
>>>> +
>>>> +int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
>>>> +                                        int node, struct vmem_altmap *altmap)
>>>> +{
>>>> +       if (!__vmemmap_populate_basepages(start, end, node, altmap, NULL))
>>>> +               return -ENOMEM;
>>>>         return 0;
>>>>  }
>>>>
>>>> +static struct page * __meminit vmemmap_populate_reuse(unsigned long start,
>>>> +                                       unsigned long end, int node,
>>>> +                                       struct vmem_context *ctx)
>>>> +{
>>>> +       unsigned long size, addr = start;
>>>> +       unsigned long psize = PHYS_PFN(ctx->align) * sizeof(struct page);
>>>> +
>>>> +       size = min(psize, end - start);
>>>> +
>>>> +       for (; addr < end; addr += size) {
>>>> +               unsigned long head = addr + PAGE_SIZE;
>>>> +               unsigned long tail = addr;
>>>> +               unsigned long last = addr + size;
>>>> +               void *area;
>>>> +
>>>> +               if (ctx->block_page &&
>>>> +                   IS_ALIGNED((addr - ctx->block_page), psize))
>>>> +                       ctx->block = NULL;
>>>> +
>>>> +               area  = ctx->block;
>>>> +               if (!area) {
>>>> +                       if (!__vmemmap_populate_basepages(addr, head, node,
>>>> +                                                         ctx->altmap, NULL))
>>>> +                               return NULL;
>>>> +
>>>> +                       tail = head + PAGE_SIZE;
>>>> +                       area = __vmemmap_populate_basepages(head, tail, node,
>>>> +                                                           ctx->altmap, NULL);
>>>> +                       if (!area)
>>>> +                               return NULL;
>>>> +
>>>> +                       ctx->block = area;
>>>> +                       ctx->block_page = addr;
>>>> +               }
>>>> +
>>>> +               if (!__vmemmap_populate_basepages(tail, last, node,
>>>> +                                                 ctx->altmap, area))
>>>> +                       return NULL;
>>>> +       }
>>>
>>> I think that compound page accounting and combined altmap accounting
>>> make this difficult to read, and I think the compound page case
>>> deserves its own first-class loop rather than reusing
>>> vmemmap_populate_basepages(). With the suggestion to drop altmap
>>> support I'd expect a vmemmap_populate_compound that takes a compound
>>> page size and does the right thing with respect to mapping all the
>>> tail pages to the same pfn.
>>>
>> I can move this to a separate loop as suggested.
>>
>> But to be able to map all tail pages in one call of vmemmap_populate_compound()
>> this might require changes in sparsemem generic code that I am not sure
>> warrant the added complexity. Otherwise I'll probably have to keep
>> this logic of @ctx to be able to pass the page to be reused (i.e. @block and
>> @block_page). That's actually the main reason that made me introduce
>> a struct vmem_context.
> 
> Do you need to pass in a vmem_context, isn't that context local to
> vmemmap_populate_compound_pages()?
> 

Hmm, so we allocate a vmem_context (initialized to zeroes) in __add_pages(), and then we
use the same vmem_context across all sections we are onlining from the pfn range passed in
__add_pages(). So all sections use the same vmem_context. Then we take care in
vmemmap_populate_compound_pages() to check whether there was a @block allocated that needs
to be reused.

So while the content itself is private/local to vmemmap_populate_compound_pages(), we still
rely on the fact that vmemmap_populate_compound_pages() always gets the same
vmem_context location passed in for all sections being onlined in the whole pfn range.
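
To make that concrete, a condensed view (hypothetical; error handling and the
real call chain through sparse_add_section()/section_activate() are omitted) of
how the same context travels across sections in this patch:

 int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
                       struct mhp_params *params)
 {
         unsigned long end_pfn = pfn + nr_pages;
         unsigned long cur_nr_pages;
         struct vmem_context ctx;

         /* One context for the whole range: block/block_page start zeroed. */
         vmem_context_init(&ctx, params);

         for (; pfn < end_pfn; pfn += cur_nr_pages) {
                 /* Select all remaining pages up to the next section boundary */
                 cur_nr_pages = min(end_pfn - pfn,
                                    SECTION_ALIGN_UP(pfn + 1) - pfn);
                 /*
                  * Every section sees the same &ctx, so a tail page allocated
                  * while populating one section (ctx->block, ctx->block_page)
                  * keeps being reused by the following sections of a compound
                  * page larger than a section.
                  */
                 __populate_section_memmap(pfn, cur_nr_pages, nid, &ctx);
         }

         return 0;
 }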

Patch

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 73f8bcbb58a4..e15bb82805a3 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -70,6 +70,10 @@  typedef int __bitwise mhp_t;
  */
 #define MEMHP_MERGE_RESOURCE	((__force mhp_t)BIT(0))
 
+/* Reuse the vmemmap areas backing the tail struct pages when the memmap
+ * is added with a given alignment (mhp_params->align). */
+#define MEMHP_REUSE_VMEMMAP	((__force mhp_t)BIT(1))
+
 /*
  * Extended parameters for memory hotplug:
  * altmap: alternative allocator for memmap array (optional)
@@ -79,10 +83,16 @@  typedef int __bitwise mhp_t;
 struct mhp_params {
 	struct vmem_altmap *altmap;
 	pgprot_t pgprot;
+	unsigned int align;
+	mhp_t flags;
 };
 
 struct vmem_context {
 	struct vmem_altmap *altmap;
+	mhp_t flags;
+	unsigned int align;
+	void *block;
+	unsigned long block_page;
 };
 
 /*
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2eb44318bb2d..8b0155441835 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3006,7 +3006,7 @@  p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
 pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
 pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
 pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
-			    struct vmem_altmap *altmap);
+			    struct vmem_altmap *altmap, void *block);
 void *vmemmap_alloc_block(unsigned long size, int node);
 struct vmem_altmap;
 void *vmemmap_alloc_block_buf(unsigned long size, int node,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f8870c53fe5e..56121dfcc44b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -300,6 +300,14 @@  static int check_hotplug_memory_addressable(unsigned long pfn,
 	return 0;
 }
 
+static void vmem_context_init(struct vmem_context *ctx, struct mhp_params *params)
+{
+	memset(ctx, 0, sizeof(*ctx));
+	ctx->align = params->align;
+	ctx->altmap = params->altmap;
+	ctx->flags = params->flags;
+}
+
 /*
  * Reasonably generic function for adding memory.  It is
  * expected that archs that support memory hotplug will
@@ -313,7 +321,7 @@  int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
 	unsigned long cur_nr_pages;
 	int err;
 	struct vmem_altmap *altmap = params->altmap;
-	struct vmem_context ctx = { .altmap = params->altmap };
+	struct vmem_context ctx;
 
 	if (WARN_ON_ONCE(!params->pgprot.pgprot))
 		return -EINVAL;
@@ -338,6 +346,8 @@  int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
 	if (err)
 		return err;
 
+	vmem_context_init(&ctx, params);
+
 	for (; pfn < end_pfn; pfn += cur_nr_pages) {
 		/* Select all remaining pages up to the next section boundary */
 		cur_nr_pages = min(end_pfn - pfn,
diff --git a/mm/memremap.c b/mm/memremap.c
index 287a24b7a65a..ecfa74848ac6 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -253,6 +253,9 @@  static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
 			goto err_kasan;
 		}
 
+		if (pgmap->flags & PGMAP_COMPOUND)
+			params->align = pgmap->align;
+
 		error = arch_add_memory(nid, range->start, range_len(range),
 					params);
 	}
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index bcda68ba1381..364c071350e8 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -141,16 +141,20 @@  void __meminit vmemmap_verify(pte_t *pte, int node,
 }
 
 pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
-				       struct vmem_altmap *altmap)
+				       struct vmem_altmap *altmap, void *block)
 {
 	pte_t *pte = pte_offset_kernel(pmd, addr);
 	if (pte_none(*pte)) {
 		pte_t entry;
-		void *p;
-
-		p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
-		if (!p)
-			return NULL;
+		void *p = block;
+
+		if (!block) {
+			p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
+			if (!p)
+				return NULL;
+		} else {
+			get_page(virt_to_page(block));
+		}
 		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
 		set_pte_at(&init_mm, addr, pte, entry);
 	}
@@ -216,8 +220,10 @@  pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
 	return pgd;
 }
 
-int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
-					 int node, struct vmem_altmap *altmap)
+static void *__meminit __vmemmap_populate_basepages(unsigned long start,
+					   unsigned long end, int node,
+					   struct vmem_altmap *altmap,
+					   void *block)
 {
 	unsigned long addr = start;
 	pgd_t *pgd;
@@ -229,38 +235,95 @@  int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
 	for (; addr < end; addr += PAGE_SIZE) {
 		pgd = vmemmap_pgd_populate(addr, node);
 		if (!pgd)
-			return -ENOMEM;
+			return NULL;
 		p4d = vmemmap_p4d_populate(pgd, addr, node);
 		if (!p4d)
-			return -ENOMEM;
+			return NULL;
 		pud = vmemmap_pud_populate(p4d, addr, node);
 		if (!pud)
-			return -ENOMEM;
+			return NULL;
 		pmd = vmemmap_pmd_populate(pud, addr, node);
 		if (!pmd)
-			return -ENOMEM;
-		pte = vmemmap_pte_populate(pmd, addr, node, altmap);
+			return NULL;
+		pte = vmemmap_pte_populate(pmd, addr, node, altmap, block);
 		if (!pte)
-			return -ENOMEM;
+			return NULL;
 		vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
 	}
 
+	return __va(__pfn_to_phys(pte_pfn(*pte)));
+}
+
+int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
+					 int node, struct vmem_altmap *altmap)
+{
+	if (!__vmemmap_populate_basepages(start, end, node, altmap, NULL))
+		return -ENOMEM;
 	return 0;
 }
 
+static struct page * __meminit vmemmap_populate_reuse(unsigned long start,
+					unsigned long end, int node,
+					struct vmem_context *ctx)
+{
+	unsigned long size, addr = start;
+	unsigned long psize = PHYS_PFN(ctx->align) * sizeof(struct page);
+
+	size = min(psize, end - start);
+
+	for (; addr < end; addr += size) {
+		unsigned long head = addr + PAGE_SIZE;
+		unsigned long tail = addr;
+		unsigned long last = addr + size;
+		void *area;
+
+		if (ctx->block_page &&
+		    IS_ALIGNED((addr - ctx->block_page), psize))
+			ctx->block = NULL;
+
+		area  = ctx->block;
+		if (!area) {
+			if (!__vmemmap_populate_basepages(addr, head, node,
+							  ctx->altmap, NULL))
+				return NULL;
+
+			tail = head + PAGE_SIZE;
+			area = __vmemmap_populate_basepages(head, tail, node,
+							    ctx->altmap, NULL);
+			if (!area)
+				return NULL;
+
+			ctx->block = area;
+			ctx->block_page = addr;
+		}
+
+		if (!__vmemmap_populate_basepages(tail, last, node,
+						  ctx->altmap, area))
+			return NULL;
+	}
+
+	return (struct page *) start;
+}
+
 struct page * __meminit __populate_section_memmap(unsigned long pfn,
 		unsigned long nr_pages, int nid, struct vmem_context *ctx)
 {
 	unsigned long start = (unsigned long) pfn_to_page(pfn);
 	unsigned long end = start + nr_pages * sizeof(struct page);
 	struct vmem_altmap *altmap = NULL;
+	int flags = 0;
 
 	if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
 		!IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
 		return NULL;
 
-	if (ctx)
+	if (ctx) {
 		altmap = ctx->altmap;
+		flags = ctx->flags;
+	}
+
+	if (flags & MEMHP_REUSE_VMEMMAP)
+		return vmemmap_populate_reuse(start, end, nid, ctx);
 
 	if (vmemmap_populate(start, end, nid, altmap))
 		return NULL;