
[RFC,02/12] net: create a 1G-huge-page-backed allocator

Message ID 20230707183935.997267-3-kuba@kernel.org
State RFC
Delegated to: Netdev Maintainers
Series net: huge page backed page_pool

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Guessed tree name to be net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 1344 this patch: 1347
netdev/cc_maintainers warning 11 maintainers not CCed: tglx@linutronix.de hpa@zytor.com x86@kernel.org mingo@redhat.com ytcoode@gmail.com Jason@zx2c4.com dave.hansen@linux.intel.com davem@davemloft.net jgross@suse.com pabeni@redhat.com bp@alien8.de
netdev/build_clang success Errors and warnings before: 1364 this patch: 1364
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 1367 this patch: 1368
netdev/checkpatch warning WARNING: externs should be avoided in .c files
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Jakub Kicinski July 7, 2023, 6:39 p.m. UTC
Get 1G pages from CMA; the driver will be able to sub-allocate from
those for individual queues. Each Rx queue (taking recycling into
account) needs 32MB - 128MB of memory. With 32 active queues, even
with 2MB pages, we'll end up using a lot of IOTLB entries.
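(For scale, assuming the worst case from the numbers above: 32 queues
times 128MB is 4GB of buffer memory, which is 4 IOTLB entries with
1G pages versus 2048 with 2MB pages.)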

There are some workarounds for 2MB pages like trying to sort
the buffers (i.e. making sure that the buffers used by the NIC
at any time belong to as few 2MB pages as possible). But 1G pages
seem so much simpler.

Grab 4 pages for now; the real thing will probably need some
Kconfigs and command line params, and a lot more uAPI in general.

Also IDK how to hook this properly into early init :(

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 arch/x86/kernel/setup.c |   6 +-
 include/net/dcalloc.h   |  10 ++
 net/core/dcalloc.c      | 225 ++++++++++++++++++++++++++++++++++++++++
 net/core/dcalloc.h      |   3 +
 4 files changed, 243 insertions(+), 1 deletion(-)
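
As a rough illustration of the intended consumer, here is a minimal
usage sketch of the mep_* API declared in include/net/dcalloc.h below.
It is not part of the patch: all "mynic_*" names and the buffer order
are hypothetical, only the mep_* calls and their signatures come from
this series.

/* Minimal usage sketch (not part of the patch): "mynic_*" names are
 * hypothetical; only the mep_* calls come from include/net/dcalloc.h.
 */
#include <linux/device.h>
#include <linux/gfp.h>
#include <net/dcalloc.h>

#define MYNIC_RX_BUF_ORDER	3	/* 32K chunks with 4K pages */

struct mynic_priv {
	struct mem_provider *mep;	/* one provider per device */
};

struct mynic_rx_buf {
	struct page *page;
	dma_addr_t dma;
};

static int mynic_mem_init(struct device *dev, struct mynic_priv *priv)
{
	/* Takes 1G chunks from the "net_mep" CMA area reserved at boot */
	priv->mep = mep_create(dev);
	return priv->mep ? 0 : -ENOMEM;
}

static int mynic_rx_buf_alloc(struct mynic_priv *priv,
			      struct mynic_rx_buf *buf)
{
	/* Sub-allocate from the 1G pages; the pages are already DMA
	 * mapped, the device address comes back via &buf->dma.
	 */
	buf->page = mep_alloc(priv->mep, MYNIC_RX_BUF_ORDER, &buf->dma,
			      GFP_KERNEL);
	return buf->page ? 0 : -ENOMEM;
}

static void mynic_rx_buf_free(struct mynic_priv *priv,
			      struct mynic_rx_buf *buf)
{
	mep_free(priv->mep, buf->page, MYNIC_RX_BUF_ORDER, buf->dma);
}

static void mynic_mem_uninit(struct mynic_priv *priv)
{
	mep_destroy(priv->mep);
}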

Patch

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index fd975a4a5200..cc6acd1fa67a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -843,6 +843,8 @@  static void __init x86_report_nx(void)
 	}
 }
 
+int __init mep_cma_init(void);
+
 /*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures
@@ -1223,8 +1225,10 @@  void __init setup_arch(char **cmdline_p)
 	initmem_init();
 	dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
 
-	if (boot_cpu_has(X86_FEATURE_GBPAGES))
+	if (boot_cpu_has(X86_FEATURE_GBPAGES)) {
 		hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
+		mep_cma_init();
+	}
 
 	/*
 	 * Reserve memory for crash kernel after SRAT is parsed so that it
diff --git a/include/net/dcalloc.h b/include/net/dcalloc.h
index a85c59d7f844..21c0fcaaa163 100644
--- a/include/net/dcalloc.h
+++ b/include/net/dcalloc.h
@@ -15,4 +15,14 @@  void *dma_cocoa_alloc(struct dma_cocoa *cocoa, unsigned long size,
 void dma_cocoa_free(struct dma_cocoa *cocoa, unsigned long size, void *addr,
 		    dma_addr_t dma);
 
+struct mem_provider;
+
+struct mem_provider *mep_create(struct device *dev);
+void mep_destroy(struct mem_provider *mep);
+
+struct page *mep_alloc(struct mem_provider *mep, unsigned int order,
+		       dma_addr_t *dma, gfp_t gfp);
+void mep_free(struct mem_provider *mep, struct page *page,
+	      unsigned int order, dma_addr_t dma);
+
 #endif
diff --git a/net/core/dcalloc.c b/net/core/dcalloc.c
index af9029018353..821b9dbfb655 100644
--- a/net/core/dcalloc.c
+++ b/net/core/dcalloc.c
@@ -388,3 +388,228 @@  void dma_cocoa_free(struct dma_cocoa *cocoa, unsigned long size, void *addr,
 	size = roundup_pow_of_two(size);
 	return dma_sal_free(&cocoa->sal, addr, size, dma);
 }
+
+/*****************************
+ ***   DMA MEP allocator   ***
+ *****************************/
+
+#include <linux/cma.h>
+
+static struct cma *mep_cma;
+static int mep_err;
+
+int __init mep_cma_init(void);
+int __init mep_cma_init(void)
+{
+	int order_per_bit;
+
+	order_per_bit = min(30 - PAGE_SHIFT, MAX_ORDER - 1);
+	order_per_bit = min(order_per_bit, HUGETLB_PAGE_ORDER);
+
+	mep_err = cma_declare_contiguous_nid(0,		/* base */
+					     SZ_4G,	/* size */
+					     0,		/* limit */
+					     SZ_1G,	/* alignment */
+					     order_per_bit,  /* order_per_bit */
+					     false,	/* fixed */
+					     "net_mep",	/* name */
+					     &mep_cma,	/* res_cma */
+					     NUMA_NO_NODE);  /* nid */
+	if (mep_err)
+		pr_warn("Net MEP init failed: %d\n", mep_err);
+	else
+		pr_info("Net MEP reserved 4G of memory\n");
+
+	return 0;
+}
+
+/** ----- MEP (slow / ctrl) allocator ----- */
+
+void mp_huge_split(struct page *page, unsigned int order)
+{
+	int i;
+
+	split_page(page, order);
+	/* The subsequent pages have a poisoned next, and since we only
+	 * OR in the PP_SIGNATURE this will mess up PP detection.
+	 */
+	for (i = 0; i < (1 << order); i++)
+		page[i].pp_magic &= 3UL;
+}
+
+struct mem_provider {
+	struct dma_slow_allocator sal;
+
+	struct work_struct work;
+};
+
+static int
+dma_mep_alloc_fall(struct dma_slow_allocator *sal, struct dma_slow_fall *fb,
+		   unsigned int size, gfp_t gfp)
+{
+	int order = get_order(size);
+
+	fb->addr = alloc_pages(gfp, order);
+	if (!fb->addr)
+		return -ENOMEM;
+
+	fb->dma = dma_map_page_attrs(sal->dev, fb->addr, 0, size,
+				     DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+	if (dma_mapping_error(sal->dev, fb->dma)) {
+		put_page(fb->addr);
+		return -ENOMEM;
+	}
+
+	mp_huge_split(fb->addr, order);
+	return 0;
+}
+
+static void
+dma_mep_free_fall(struct dma_slow_allocator *sal, struct dma_slow_fall *fb)
+{
+	int order = get_order(fb->size);
+	struct page *page;
+	int i;
+
+	page = fb->addr;
+	dma_unmap_page_attrs(sal->dev, fb->dma, fb->size,
+			     DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+	for (i = 0; i < (1 << order); i++)
+		put_page(page + i);
+}
+
+static void mep_release_work(struct work_struct *work)
+{
+	struct mem_provider *mep;
+
+	mep = container_of(work, struct mem_provider, work);
+
+	while (!list_empty(&mep->sal.huge)) {
+		struct dma_slow_buddy *bud;
+		struct dma_slow_huge *shu;
+
+		shu = list_first_entry(&mep->sal.huge, typeof(*shu), huge);
+
+		dma_unmap_page_attrs(mep->sal.dev, shu->dma, SZ_1G,
+				     DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+		cma_release(mep_cma, shu->addr, SZ_1G / PAGE_SIZE);
+
+		bud = list_first_entry_or_null(&shu->buddy_list,
+					       typeof(*bud), list);
+		if (WARN_ON(!bud || bud->size != SZ_1G))
+			continue;
+		kfree(bud);
+
+		list_del(&shu->huge);
+		kfree(shu);
+	}
+	put_device(mep->sal.dev);
+	kfree(mep);
+}
+
+static void dma_mep_release(struct dma_slow_allocator *sal)
+{
+	struct mem_provider *mep;
+
+	mep = container_of(sal, struct mem_provider, sal);
+
+	INIT_WORK(&mep->work, mep_release_work);
+	schedule_work(&mep->work);
+}
+
+struct dma_slow_allocator_ops dma_mep_ops = {
+	.ptr_shf	= PAGE_SHIFT - order_base_2(sizeof(struct page)),
+
+	.alloc_fall	= dma_mep_alloc_fall,
+	.free_fall	= dma_mep_free_fall,
+
+	.release	= dma_mep_release,
+};
+
+struct mem_provider *mep_create(struct device *dev)
+{
+	struct mem_provider *mep;
+	int i;
+
+	mep = kzalloc(sizeof(*mep), GFP_KERNEL);
+	if (!mep)
+		return NULL;
+
+	dma_sal_init(&mep->sal, &dma_mep_ops, dev);
+	get_device(mep->sal.dev);
+
+	if (mep_err)
+		goto done;
+
+	/* Hardcoded for now */
+	for (i = 0; i < 2; i++) {
+		const unsigned int order = 30 - PAGE_SHIFT; /* 1G */
+		struct dma_slow_huge *shu;
+		struct page *page;
+
+		shu = kzalloc(sizeof(*shu), GFP_KERNEL);
+		if (!shu)
+			break;
+
+		page = cma_alloc(mep_cma, SZ_1G / PAGE_SIZE, order, false);
+		if (!page) {
+			pr_err("mep: CMA alloc failed\n");
+			goto err_free_shu;
+		}
+
+		shu->dma = dma_map_page_attrs(mep->sal.dev, page, 0,
+					      PAGE_SIZE << order,
+					      DMA_BIDIRECTIONAL,
+					      DMA_ATTR_SKIP_CPU_SYNC);
+		if (dma_mapping_error(mep->sal.dev, shu->dma)) {
+			pr_err("mep: DMA map failed\n");
+			goto err_free_page;
+		}
+
+		if (dma_slow_huge_init(shu, page, SZ_1G, shu->dma,
+				       GFP_KERNEL)) {
+			pr_err("mep: shu init failed\n");
+			goto err_unmap;
+		}
+
+		mp_huge_split(page, 30 - PAGE_SHIFT);
+
+		list_add(&shu->huge, &mep->sal.huge);
+		continue;
+
+err_unmap:
+		dma_unmap_page_attrs(mep->sal.dev, shu->dma, SZ_1G,
+				     DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+err_free_page:
+		put_page(page);
+err_free_shu:
+		kfree(shu);
+		break;
+	}
+done:
+	if (list_empty(&mep->sal.huge))
+		pr_warn("mep: no huge pages acquired\n");
+
+	return mep;
+}
+EXPORT_SYMBOL_GPL(mep_create);
+
+void mep_destroy(struct mem_provider *mep)
+{
+	dma_slow_put(&mep->sal);
+}
+EXPORT_SYMBOL_GPL(mep_destroy);
+
+struct page *mep_alloc(struct mem_provider *mep, unsigned int order,
+		       dma_addr_t *dma, gfp_t gfp)
+{
+	return dma_sal_alloc(&mep->sal, PAGE_SIZE << order, dma, gfp);
+}
+EXPORT_SYMBOL_GPL(mep_alloc);
+
+void mep_free(struct mem_provider *mep, struct page *page,
+	      unsigned int order, dma_addr_t dma)
+{
+	dma_sal_free(&mep->sal, page, PAGE_SIZE << order, dma);
+}
+EXPORT_SYMBOL_GPL(mep_free);
diff --git a/net/core/dcalloc.h b/net/core/dcalloc.h
index c7e75ef0cb81..2664f933c8e1 100644
--- a/net/core/dcalloc.h
+++ b/net/core/dcalloc.h
@@ -90,4 +90,7 @@  static inline void dma_slow_put(struct dma_slow_allocator *sal)
 		sal->ops->release(sal);
 }
 
+/* misc */
+void mp_huge_split(struct page *page, unsigned int order);
+
 #endif