[v2,04/14] mm/memremap: add ZONE_DEVICE support for compound pages

Message ID 20210617184507.3662-5-joao.m.martins@oracle.com
Series: mm, sparse-vmemmap: Introduce compound pagemaps

Commit Message

Joao Martins June 17, 2021, 6:44 p.m. UTC
Add a new geometry property for struct dev_pagemap which specifies that a
pagemap is composed of a set of compound pages of size @geometry, instead
of base pages. When a compound page geometry is requested, all but the
first page are initialised as tail pages instead of order-0 pages.
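
As an illustration of the intended usage, a hypothetical consumer of the new
field might look like the sketch below. Only the @geometry assignment is from
this patch; the type, range, and node values are placeholder scaffolding (the
real device-dax wiring comes later in the series):

#include <linux/memremap.h>
#include <linux/numa.h>
#include <linux/sizes.h>

/* Hypothetical driver fragment: request 2M compound pages. */
static void *toy_map_device(struct dev_pagemap *pgmap, struct range *range)
{
	pgmap->type = MEMORY_DEVICE_GENERIC;
	pgmap->range = *range;
	pgmap->nr_range = 1;
	/* Compound pages of size SZ_2M instead of base pages. */
	pgmap->geometry = SZ_2M;

	return memremap_pages(pgmap, NUMA_NO_NODE);
}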

For certain ZONE_DEVICE users like device-dax, which have a fixed page size,
this creates an opportunity to optimize GUP and GUP-fast walkers by treating
compound pages the same way as THP or hugetlb pages.
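
To sketch why this helps GUP (illustrative only; the walker changes are not
part of this patch): once the metadata is compound, a pin spanning many tail
pfns can be accounted with a single atomic on the head page, as THP already
does. The helper name below is hypothetical:

#include <linux/mm.h>
#include <linux/page_ref.h>

/* Hypothetical: take @refs pins with one atomic on the head page. */
static void toy_grab_compound(struct page *page, int refs)
{
	struct page *head = compound_head(page);

	page_ref_add(head, refs);
}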

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 include/linux/memremap.h | 17 +++++++++++++++++
 mm/memremap.c            |  8 ++++++--
 mm/page_alloc.c          | 34 +++++++++++++++++++++++++++++++++-
 3 files changed, 56 insertions(+), 3 deletions(-)
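
A note on the accessor semantics introduced below: pgmap_geometry() returns
PAGE_SIZE for a NULL pgmap or an unset field, so existing base-page users
need no change. Callers can detect a compound pagemap as in this sketch
(hypothetical helper name):

/* Hypothetical: unset geometry reads back as PAGE_SIZE, i.e. base pages. */
static inline bool toy_pgmap_is_compound(struct dev_pagemap *pgmap)
{
	return pgmap_geometry(pgmap) > PAGE_SIZE;
}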

Patch

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 119f130ef8f1..e5ab6d4525c1 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -99,6 +99,10 @@  struct dev_pagemap_ops {
  * @done: completion for @internal_ref
  * @type: memory type: see MEMORY_* in memory_hotplug.h
  * @flags: PGMAP_* flags to specify detailed behavior
+ * @geometry: structural definition of how the vmemmap metadata is populated.
+ *	A value of zero or PAGE_SIZE defaults to using base pages as the memmap
+ *	metadata representation. A larger value that is also a multiple of
+ *	PAGE_SIZE sets up compound struct pages of the requested geometry size.
  * @ops: method table
  * @owner: an opaque pointer identifying the entity that manages this
  *	instance.  Used by various helpers to make sure that no
@@ -114,6 +118,7 @@  struct dev_pagemap {
 	struct completion done;
 	enum memory_type type;
 	unsigned int flags;
+	unsigned long geometry;
 	const struct dev_pagemap_ops *ops;
 	void *owner;
 	int nr_range;
@@ -130,6 +135,18 @@  static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
 	return NULL;
 }
 
+static inline unsigned long pgmap_geometry(struct dev_pagemap *pgmap)
+{
+	if (!pgmap || !pgmap->geometry)
+		return PAGE_SIZE;
+	return pgmap->geometry;
+}
+
+static inline unsigned long pgmap_pfn_geometry(struct dev_pagemap *pgmap)
+{
+	return PHYS_PFN(pgmap_geometry(pgmap));
+}
+
 #ifdef CONFIG_ZONE_DEVICE
 bool pfn_zone_device_reserved(unsigned long pfn);
 void *memremap_pages(struct dev_pagemap *pgmap, int nid);
diff --git a/mm/memremap.c b/mm/memremap.c
index 805d761740c4..ffcb924eb6a5 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -318,8 +318,12 @@  static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
 	memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
 				PHYS_PFN(range->start),
 				PHYS_PFN(range_len(range)), pgmap);
-	percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
-			- pfn_first(pgmap, range_id));
+	if (pgmap_geometry(pgmap) > PAGE_SIZE)
+		percpu_ref_get_many(pgmap->ref, (pfn_end(pgmap, range_id)
+			- pfn_first(pgmap, range_id)) / pgmap_pfn_geometry(pgmap));
+	else
+		percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
+				- pfn_first(pgmap, range_id));
 	return 0;
 
 err_add_memory:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1264c025becb..42611c206d0a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6605,6 +6605,31 @@  static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
 	}
 }
 
+static void __ref memmap_init_compound(struct page *page, unsigned long pfn,
+					unsigned long zone_idx, int nid,
+					struct dev_pagemap *pgmap,
+					unsigned long nr_pages)
+{
+	unsigned int order_align = order_base_2(nr_pages);
+	unsigned long i;
+
+	__SetPageHead(page);
+
+	for (i = 1; i < nr_pages; i++) {
+		__init_zone_device_page(page + i, pfn + i, zone_idx,
+					nid, pgmap);
+		prep_compound_tail(page, i);
+
+		/*
+		 * The first and second tail pages need to be
+		 * initialized first, hence the head page is
+		 * prepared last.
+		 */
+		if (i == 2)
+			prep_compound_head(page, order_align);
+	}
+}
+
 void __ref memmap_init_zone_device(struct zone *zone,
 				   unsigned long start_pfn,
 				   unsigned long nr_pages,
@@ -6613,6 +6638,7 @@  void __ref memmap_init_zone_device(struct zone *zone,
 	unsigned long pfn, end_pfn = start_pfn + nr_pages;
 	struct pglist_data *pgdat = zone->zone_pgdat;
 	struct vmem_altmap *altmap = pgmap_altmap(pgmap);
+	unsigned int pfns_per_compound = pgmap_pfn_geometry(pgmap);
 	unsigned long zone_idx = zone_idx(zone);
 	unsigned long start = jiffies;
 	int nid = pgdat->node_id;
@@ -6630,10 +6656,16 @@  void __ref memmap_init_zone_device(struct zone *zone,
 		nr_pages = end_pfn - start_pfn;
 	}
 
-	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+	for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {
 		struct page *page = pfn_to_page(pfn);
 
 		__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
+
+		if (pfns_per_compound == 1)
+			continue;
+
+		memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
+				     pfns_per_compound);
 	}
 
 	pr_info("%s initialised %lu pages in %ums\n", __func__,