diff mbox series

[RFC,4/9] mm/page_alloc: Reuse tail struct pages for compound pagemaps

Message ID 20201208172901.17384-6-joao.m.martins@oracle.com (mailing list archive)
State New
Headers show
Series mm, sparse-vmemmap: Introduce compound pagemaps | expand

Commit Message

Joao Martins Dec. 8, 2020, 5:28 p.m. UTC
When PGMAP_COMPOUND is set, all pages are onlined at a given huge page
alignment and using compound pages to describe them as opposed to a
struct per 4K.

To minimize struct page overhead and given the usage of compound pages we
utilize the fact that most tail pages look the same, we online the
subsection while pointing to the same pages. Thus request VMEMMAP_REUSE
in add_pages.

With VMEMMAP_REUSE, provided we reuse most tail pages the amount of
struct pages we need to initialize is a lot smaller than the total
amount of structs we would normally online. Thus allow an @init_order
to be passed to specify how many pages we want to prep upon creating a
compound page.

Finally when onlining all struct pages in memmap_init_zone_device, make
sure that we only initialize the unique struct pages i.e. the first 2
4K pages from @align which means 128 struct pages out of 32768 for 2M
@align or 262144 for a 1G @align.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 mm/memremap.c   |  4 +++-
 mm/page_alloc.c | 23 ++++++++++++++++++++---
 2 files changed, 23 insertions(+), 4 deletions(-)

Comments

Dan Williams Feb. 20, 2021, 6:17 a.m. UTC | #1
On Tue, Dec 8, 2020 at 9:31 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>
> When PGMAP_COMPOUND is set, all pages are onlined at a given huge page
> alignment and using compound pages to describe them as opposed to a
> struct per 4K.
>

Same s/online/mapped/ comment as other changelogs.

> To minimize struct page overhead and given the usage of compound pages we
> utilize the fact that most tail pages look the same, we online the
> subsection while pointing to the same pages. Thus request VMEMMAP_REUSE
> in add_pages.
>
> With VMEMMAP_REUSE, provided we reuse most tail pages the amount of
> struct pages we need to initialize is a lot smaller that the total
> amount of structs we would normnally online. Thus allow an @init_order
> to be passed to specify how much pages we want to prep upon creating a
> compound page.
>
> Finally when onlining all struct pages in memmap_init_zone_device, make
> sure that we only initialize the unique struct pages i.e. the first 2
> 4K pages from @align which means 128 struct pages out of 32768 for 2M
> @align or 262144 for a 1G @align.
>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> ---
>  mm/memremap.c   |  4 +++-
>  mm/page_alloc.c | 23 ++++++++++++++++++++---
>  2 files changed, 23 insertions(+), 4 deletions(-)
>
> diff --git a/mm/memremap.c b/mm/memremap.c
> index ecfa74848ac6..3eca07916b9d 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -253,8 +253,10 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
>                         goto err_kasan;
>                 }
>
> -               if (pgmap->flags & PGMAP_COMPOUND)
> +               if (pgmap->flags & PGMAP_COMPOUND) {
>                         params->align = pgmap->align;
> +                       params->flags = MEMHP_REUSE_VMEMMAP;

The "reuse" naming is not my favorite. Yes, page reuse is happening,
but what is more relevant is that the vmemmap is in a given minimum
page size mode. So it's less of a flag and more of enum that selects
between PAGE_SIZE, HPAGE_SIZE, and PUD_PAGE_SIZE (GPAGE_SIZE?).

> +               }
>
>                 error = arch_add_memory(nid, range->start, range_len(range),
>                                         params);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 9716ecd58e29..180a7d4e9285 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -691,10 +691,11 @@ void free_compound_page(struct page *page)
>         __free_pages_ok(page, compound_order(page), FPI_NONE);
>  }
>
> -void prep_compound_page(struct page *page, unsigned int order)
> +static void __prep_compound_page(struct page *page, unsigned int order,
> +                                unsigned int init_order)
>  {
>         int i;
> -       int nr_pages = 1 << order;
> +       int nr_pages = 1 << init_order;
>
>         __SetPageHead(page);
>         for (i = 1; i < nr_pages; i++) {
> @@ -711,6 +712,11 @@ void prep_compound_page(struct page *page, unsigned int order)
>                 atomic_set(compound_pincount_ptr(page), 0);
>  }
>
> +void prep_compound_page(struct page *page, unsigned int order)
> +{
> +       __prep_compound_page(page, order, order);
> +}
> +
>  #ifdef CONFIG_DEBUG_PAGEALLOC
>  unsigned int _debug_guardpage_minorder;
>
> @@ -6108,6 +6114,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>  }
>
>  #ifdef CONFIG_ZONE_DEVICE
> +
> +#define MEMMAP_COMPOUND_SIZE (2 * (PAGE_SIZE/sizeof(struct page)))
> +
>  void __ref memmap_init_zone_device(struct zone *zone,
>                                    unsigned long start_pfn,
>                                    unsigned long nr_pages,
> @@ -6138,6 +6147,12 @@ void __ref memmap_init_zone_device(struct zone *zone,
>         for (pfn = start_pfn; pfn < end_pfn; pfn++) {
>                 struct page *page = pfn_to_page(pfn);
>
> +               /* Skip already initialized pages. */
> +               if (compound && (pfn % align >= MEMMAP_COMPOUND_SIZE)) {
> +                       pfn = ALIGN(pfn, align) - 1;
> +                       continue;
> +               }
> +
>                 __init_single_page(page, pfn, zone_idx, nid);
>
>                 /*
> @@ -6175,7 +6190,9 @@ void __ref memmap_init_zone_device(struct zone *zone,
>
>         if (compound) {
>                 for (pfn = start_pfn; pfn < end_pfn; pfn += align)
> -                       prep_compound_page(pfn_to_page(pfn), order_base_2(align));
> +                       __prep_compound_page(pfn_to_page(pfn),
> +                                          order_base_2(align),
> +                                          order_base_2(MEMMAP_COMPOUND_SIZE));
>         }

Alex did quite a bit of work to optimize this path, and this
organization appears to undo it. I'd prefer to keep it all in one loop
so a 'struct page' is only initialized once. Otherwise by the time the
above loop finishes and this one starts the 'struct page's are
probably cache cold again.

So I'd break prep_compound_page into separate head and tail init and
call them at the right time in one loop.
Joao Martins Feb. 22, 2021, 12:01 p.m. UTC | #2
On 2/20/21 6:17 AM, Dan Williams wrote:
> On Tue, Dec 8, 2020 at 9:31 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>>
>> When PGMAP_COMPOUND is set, all pages are onlined at a given huge page
>> alignment and using compound pages to describe them as opposed to a
>> struct per 4K.
>>
> 
> Same s/online/mapped/ comment as other changelogs.
> 
Ack.

>> To minimize struct page overhead and given the usage of compound pages we
>> utilize the fact that most tail pages look the same, we online the
>> subsection while pointing to the same pages. Thus request VMEMMAP_REUSE
>> in add_pages.
>>
>> With VMEMMAP_REUSE, provided we reuse most tail pages the amount of
>> struct pages we need to initialize is a lot smaller that the total
>> amount of structs we would normnally online. Thus allow an @init_order
>> to be passed to specify how much pages we want to prep upon creating a
>> compound page.
>>
>> Finally when onlining all struct pages in memmap_init_zone_device, make
>> sure that we only initialize the unique struct pages i.e. the first 2
>> 4K pages from @align which means 128 struct pages out of 32768 for 2M
>> @align or 262144 for a 1G @align.
>>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> ---
>>  mm/memremap.c   |  4 +++-
>>  mm/page_alloc.c | 23 ++++++++++++++++++++---
>>  2 files changed, 23 insertions(+), 4 deletions(-)
>>
>> diff --git a/mm/memremap.c b/mm/memremap.c
>> index ecfa74848ac6..3eca07916b9d 100644
>> --- a/mm/memremap.c
>> +++ b/mm/memremap.c
>> @@ -253,8 +253,10 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
>>                         goto err_kasan;
>>                 }
>>
>> -               if (pgmap->flags & PGMAP_COMPOUND)
>> +               if (pgmap->flags & PGMAP_COMPOUND) {
>>                         params->align = pgmap->align;
>> +                       params->flags = MEMHP_REUSE_VMEMMAP;
> 
> The "reuse" naming is not my favorite. 

I also dislike it, but couldn't come up with a better one :(

> Yes, page reuse is happening,
> but what is more relevant is that the vmemmap is in a given minimum
> page size mode. So it's less of a flag and more of enum that selects
> between PAGE_SIZE, HPAGE_SIZE, and PUD_PAGE_SIZE (GPAGE_SIZE?).
> 
That does sound cleaner, but at the same time, won't we get confused
with pgmap @align and the vmemmap/memhp @align ?

Hmm, I also I think there's value in having two different attributes as
they have two different intents. A pgmap @align means is 'represent its
metadata as a huge page of a given size' and the vmemmap/memhp @align
lets tell the sparsemem that we are mapping metadata of a given @align.

The compound pages (pgmap->align) might be useful for other ZONE_DEVICE
users. But I am not sure everybody will want to immediately switch to the
'struct page reuse' trick.

>> +               }
>>
>>                 error = arch_add_memory(nid, range->start, range_len(range),
>>                                         params);
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 9716ecd58e29..180a7d4e9285 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -691,10 +691,11 @@ void free_compound_page(struct page *page)
>>         __free_pages_ok(page, compound_order(page), FPI_NONE);
>>  }
>>
>> -void prep_compound_page(struct page *page, unsigned int order)
>> +static void __prep_compound_page(struct page *page, unsigned int order,
>> +                                unsigned int init_order)
>>  {
>>         int i;
>> -       int nr_pages = 1 << order;
>> +       int nr_pages = 1 << init_order;
>>
>>         __SetPageHead(page);
>>         for (i = 1; i < nr_pages; i++) {
>> @@ -711,6 +712,11 @@ void prep_compound_page(struct page *page, unsigned int order)
>>                 atomic_set(compound_pincount_ptr(page), 0);
>>  }
>>
>> +void prep_compound_page(struct page *page, unsigned int order)
>> +{
>> +       __prep_compound_page(page, order, order);
>> +}
>> +
>>  #ifdef CONFIG_DEBUG_PAGEALLOC
>>  unsigned int _debug_guardpage_minorder;
>>
>> @@ -6108,6 +6114,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>>  }
>>
>>  #ifdef CONFIG_ZONE_DEVICE
>> +
>> +#define MEMMAP_COMPOUND_SIZE (2 * (PAGE_SIZE/sizeof(struct page)))
>> +
>>  void __ref memmap_init_zone_device(struct zone *zone,
>>                                    unsigned long start_pfn,
>>                                    unsigned long nr_pages,
>> @@ -6138,6 +6147,12 @@ void __ref memmap_init_zone_device(struct zone *zone,
>>         for (pfn = start_pfn; pfn < end_pfn; pfn++) {
>>                 struct page *page = pfn_to_page(pfn);
>>
>> +               /* Skip already initialized pages. */
>> +               if (compound && (pfn % align >= MEMMAP_COMPOUND_SIZE)) {
>> +                       pfn = ALIGN(pfn, align) - 1;
>> +                       continue;
>> +               }
>> +
>>                 __init_single_page(page, pfn, zone_idx, nid);
>>
>>                 /*
>> @@ -6175,7 +6190,9 @@ void __ref memmap_init_zone_device(struct zone *zone,
>>
>>         if (compound) {
>>                 for (pfn = start_pfn; pfn < end_pfn; pfn += align)
>> -                       prep_compound_page(pfn_to_page(pfn), order_base_2(align));
>> +                       __prep_compound_page(pfn_to_page(pfn),
>> +                                          order_base_2(align),
>> +                                          order_base_2(MEMMAP_COMPOUND_SIZE));
>>         }
> 
> Alex did quite a bit of work to optimize this path, and this
> organization appears to undo it. I'd prefer to keep it all in one loop
> so a 'struct page' is only initialized once. Otherwise by the time the
> above loop finishes and this one starts the 'struct page's are
> probably cache cold again.
> 
> So I'd break prep_compoud_page into separate head and tail  init and
> call them at the right time in one loop.
> 
Ah, makes sense! I'll split into head/tail counterparts -- Might get even
faster than it already is.

Which makes me wonder if we shouldn't replace that line:

"memmap_init_zone_device initialized NNNNNN pages in 0ms\n"

to use 'us' or 'ns' where applicable. That ought to be more useful information
to the user.
diff mbox series

Patch

diff --git a/mm/memremap.c b/mm/memremap.c
index ecfa74848ac6..3eca07916b9d 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -253,8 +253,10 @@  static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
 			goto err_kasan;
 		}
 
-		if (pgmap->flags & PGMAP_COMPOUND)
+		if (pgmap->flags & PGMAP_COMPOUND) {
 			params->align = pgmap->align;
+			params->flags = MEMHP_REUSE_VMEMMAP;
+		}
 
 		error = arch_add_memory(nid, range->start, range_len(range),
 					params);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9716ecd58e29..180a7d4e9285 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -691,10 +691,11 @@  void free_compound_page(struct page *page)
 	__free_pages_ok(page, compound_order(page), FPI_NONE);
 }
 
-void prep_compound_page(struct page *page, unsigned int order)
+static void __prep_compound_page(struct page *page, unsigned int order,
+				 unsigned int init_order)
 {
 	int i;
-	int nr_pages = 1 << order;
+	int nr_pages = 1 << init_order;
 
 	__SetPageHead(page);
 	for (i = 1; i < nr_pages; i++) {
@@ -711,6 +712,11 @@  void prep_compound_page(struct page *page, unsigned int order)
 		atomic_set(compound_pincount_ptr(page), 0);
 }
 
+void prep_compound_page(struct page *page, unsigned int order)
+{
+	__prep_compound_page(page, order, order);
+}
+
 #ifdef CONFIG_DEBUG_PAGEALLOC
 unsigned int _debug_guardpage_minorder;
 
@@ -6108,6 +6114,9 @@  void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 }
 
 #ifdef CONFIG_ZONE_DEVICE
+
+#define MEMMAP_COMPOUND_SIZE (2 * (PAGE_SIZE/sizeof(struct page)))
+
 void __ref memmap_init_zone_device(struct zone *zone,
 				   unsigned long start_pfn,
 				   unsigned long nr_pages,
@@ -6138,6 +6147,12 @@  void __ref memmap_init_zone_device(struct zone *zone,
 	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
 		struct page *page = pfn_to_page(pfn);
 
+		/* Skip already initialized pages. */
+		if (compound && (pfn % align >= MEMMAP_COMPOUND_SIZE)) {
+			pfn = ALIGN(pfn, align) - 1;
+			continue;
+		}
+
 		__init_single_page(page, pfn, zone_idx, nid);
 
 		/*
@@ -6175,7 +6190,9 @@  void __ref memmap_init_zone_device(struct zone *zone,
 
 	if (compound) {
 		for (pfn = start_pfn; pfn < end_pfn; pfn += align)
-			prep_compound_page(pfn_to_page(pfn), order_base_2(align));
+			__prep_compound_page(pfn_to_page(pfn),
+					   order_base_2(align),
+					   order_base_2(MEMMAP_COMPOUND_SIZE));
 	}
 
 	pr_info("%s initialised %lu pages in %ums\n", __func__,