[v2,bpf,1/3] vmalloc: replace VM_NO_HUGE_VMAP with VM_ALLOW_HUGE_VMAP

Message ID 20220411233549.740157-2-song@kernel.org (mailing list archive)
State New
Series vmalloc: bpf: introduce VM_ALLOW_HUGE_VMAP

Commit Message

Song Liu April 11, 2022, 11:35 p.m. UTC
Huge page backed vmalloc memory can benefit performance in many cases.
Since some users of vmalloc may not be ready to handle huge pages,
VM_NO_HUGE_VMAP was introduced to let vmalloc users opt out of huge
pages. However, it is not easy to add VM_NO_HUGE_VMAP to all the users
that may try to allocate >= PMD_SIZE areas but are not ready to handle
huge pages properly.

Replace VM_NO_HUGE_VMAP with an opt-in flag, VM_ALLOW_HUGE_VMAP, so that
users that benefit from huge pages can ask for them specifically.

Also, replace vmalloc_no_huge() with the opt-in helper vmalloc_huge().
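
A caller that knows its buffer tolerates PMD-sized mappings opts in
explicitly through the new helper. A minimal sketch of a hypothetical
user (not part of this patch; the real in-tree users follow in later
patches of this series):

	#include <linux/vmalloc.h>

	/* Hypothetical opt-in caller: the buffer is >= PMD_SIZE and nothing
	 * using it assumes the area is mapped with 4K PTEs, so huge
	 * mappings are safe to allow. vfree() frees such areas as usual.
	 */
	static void *alloc_big_buffer(void)
	{
		return vmalloc_huge(2UL * PMD_SIZE);	/* may be PMD-mapped */
	}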

Fixes: fac54e2bfb5b ("x86/Kconfig: Select HAVE_ARCH_HUGE_VMALLOC with HAVE_ARCH_HUGE_VMAP")
Link: https://lore.kernel.org/netdev/14444103-d51b-0fb3-ee63-c3f182f0b546@molgen.mpg.de/
Signed-off-by: Song Liu <song@kernel.org>
---
 arch/Kconfig                 |  6 ++----
 arch/powerpc/kernel/module.c |  2 +-
 arch/s390/kvm/pv.c           |  2 +-
 include/linux/vmalloc.h      |  4 ++--
 mm/vmalloc.c                 | 39 +++++++++++++++++++-----------------
 5 files changed, 27 insertions(+), 26 deletions(-)

Comments

Christoph Hellwig April 12, 2022, 4:18 a.m. UTC | #1
On Mon, Apr 11, 2022 at 04:35:46PM -0700, Song Liu wrote:
> Huge page backed vmalloc memory could benefit performance in many cases.
> Since some users of vmalloc may not be ready to handle huge pages,
> VM_NO_HUGE_VMAP was introduced to allow vmalloc users to opt-out huge
> pages. However, it is not easy to add VM_NO_HUGE_VMAP to all the users
> that may try to allocate >= PMD_SIZE pages, but are not ready to handle
> huge pages properly.

This is a good place to document what the problems are, and how they are
hard to track down (e.g. because the allocations are passed down I/O
stacks).
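
One class of user that is not ready for huge mappings, as an
illustrative sketch (based on the old Kconfig note about modules, not
taken from the linked report): code that later restricts permissions on
a PAGE_SIZE sub-range and therefore silently depends on the area being
mapped with 4K PTEs.

	#include <linux/vmalloc.h>
	#include <linux/set_memory.h>

	/* Hypothetical not-ready caller: it assumes protections can be
	 * changed at PAGE_SIZE granularity, which a PMD-mapped area may
	 * not support on every arch.
	 */
	static int not_ready_example(void)
	{
		void *area = __vmalloc(2 * PMD_SIZE, GFP_KERNEL);

		if (!area)
			return -ENOMEM;
		/* expects the first page to have its own 4K PTE */
		return set_memory_ro((unsigned long)area, 1);
	}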

> 
> Replace VM_NO_HUGE_VMAP with an opt-in flag, VM_ALLOW_HUGE_VMAP, so that
> users that benefit from huge pages could ask specificially.
> 
> Also, replace vmalloc_no_huge() with opt-in helper vmalloc_huge().

We still need to find out what the primary users of the large vmalloc
hashes were and convert them.

> +extern void *vmalloc_huge(unsigned long size) __alloc_size(1);

No need for the extern.

> +EXPORT_SYMBOL(vmalloc_huge);

EXPORT_SYMBOL_GPL for all advanced vmalloc functionality, please.

Song Liu April 12, 2022, 6 a.m. UTC | #2
On Mon, Apr 11, 2022 at 9:18 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Mon, Apr 11, 2022 at 04:35:46PM -0700, Song Liu wrote:
> > Huge page backed vmalloc memory could benefit performance in many cases.
> > Since some users of vmalloc may not be ready to handle huge pages,
> > VM_NO_HUGE_VMAP was introduced to allow vmalloc users to opt-out huge
> > pages. However, it is not easy to add VM_NO_HUGE_VMAP to all the users
> > that may try to allocate >= PMD_SIZE pages, but are not ready to handle
> > huge pages properly.
>
> This is a good place to document what the problems are, and how they are
> hard to track down (e.g. because the allocations are passed down I/O
> stacks)

Will add it in v3.

>
> >
> > Replace VM_NO_HUGE_VMAP with an opt-in flag, VM_ALLOW_HUGE_VMAP, so that
> > users that benefit from huge pages could ask specificially.
> >
> > Also, replace vmalloc_no_huge() with opt-in helper vmalloc_huge().
>
> We still need to find out what the primary users of the large vmalloc
> hashes was and convert them.

@ Claudio and Nicholas,

Could you please help identify users of large vmalloc? So far, I found
alloc_large_system_hash(), and something like the following seems to
work:

diff --git i/mm/page_alloc.c w/mm/page_alloc.c
index 6e5b4488a0c5..20d38b8482c4 100644
--- i/mm/page_alloc.c
+++ w/mm/page_alloc.c
@@ -8919,7 +8919,7 @@ void *__init alloc_large_system_hash(const char *tablename,
                                table = memblock_alloc_raw(size,
                                                           SMP_CACHE_BYTES);
                } else if (get_order(size) >= MAX_ORDER || hashdist) {
-                       table = __vmalloc(size, gfp_flags);
+                       table = __vmalloc_huge(size, gfp_flags);
                        virt = true;
                        if (table)
                                huge = is_vm_area_hugepages(table);
diff --git i/mm/vmalloc.c w/mm/vmalloc.c
index 7cc2be6a7554..cbadbe83e6a6 100644
--- i/mm/vmalloc.c
+++ w/mm/vmalloc.c
@@ -3253,6 +3253,14 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(__vmalloc);

+void *__vmalloc_huge(unsigned long size, gfp_t gfp_mask)
+{
+       return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+                                   gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+                                   NUMA_NO_NODE, __builtin_return_address(0));
+}
+EXPORT_SYMBOL_GPL(__vmalloc_huge);
+
 /**
  * vmalloc - allocate virtually contiguous memory
  * @size:    allocation size


>
> > +extern void *vmalloc_huge(unsigned long size) __alloc_size(1);
>
> No need for the extern.
>
> > +EXPORT_SYMBOL(vmalloc_huge);
>
> EXPORT_SYMBOL_GPL for all advanced vmalloc functionality, please.

Will fix these in v3.

Thanks,
Song

Nicholas Piggin April 21, 2022, 2:24 a.m. UTC | #3
Excerpts from Song Liu's message of April 12, 2022 4:00 pm:
> On Mon, Apr 11, 2022 at 9:18 PM Christoph Hellwig <hch@infradead.org> wrote:
>>
>> On Mon, Apr 11, 2022 at 04:35:46PM -0700, Song Liu wrote:
>> > Huge page backed vmalloc memory could benefit performance in many cases.
>> > Since some users of vmalloc may not be ready to handle huge pages,
>> > VM_NO_HUGE_VMAP was introduced to allow vmalloc users to opt-out huge
>> > pages. However, it is not easy to add VM_NO_HUGE_VMAP to all the users
>> > that may try to allocate >= PMD_SIZE pages, but are not ready to handle
>> > huge pages properly.
>>
>> This is a good place to document what the problems are, and how they are
>> hard to track down (e.g. because the allocations are passed down I/O
>> stacks)
> 
> Will add it in v3.
> 
>>
>> >
>> > Replace VM_NO_HUGE_VMAP with an opt-in flag, VM_ALLOW_HUGE_VMAP, so that
>> > users that benefit from huge pages could ask specificially.
>> >
>> > Also, replace vmalloc_no_huge() with opt-in helper vmalloc_huge().
>>
>> We still need to find out what the primary users of the large vmalloc
>> hashes was and convert them.
> 
> @ Claudio and Nicholas,
> 
> Could you please help identify users of large vmalloc? So far, I found
> alloc_large_system_hash(), and something like the following seems to
> work:

The large system hashes were the main ones I was interested in. IIRC
there were a few more in some drivers or tracing things depending on
config but those are less important (to me at least).

Curious what the problem is though. powerpc so far has not required
any special case outside arch/powerpc/ for this so I would much
prefer x86 to fix itself rather than add APIs which non-arch code
really shouldn't need to know about.

Thanks,
Nick

Nicholas Piggin April 21, 2022, 3:35 a.m. UTC | #4
Excerpts from Nicholas Piggin's message of April 21, 2022 12:24 pm:
> Excerpts from Song Liu's message of April 12, 2022 4:00 pm:
>> On Mon, Apr 11, 2022 at 9:18 PM Christoph Hellwig <hch@infradead.org> wrote:
>>>
>>> On Mon, Apr 11, 2022 at 04:35:46PM -0700, Song Liu wrote:
>>> > Huge page backed vmalloc memory could benefit performance in many cases.
>>> > Since some users of vmalloc may not be ready to handle huge pages,
>>> > VM_NO_HUGE_VMAP was introduced to allow vmalloc users to opt-out huge
>>> > pages. However, it is not easy to add VM_NO_HUGE_VMAP to all the users
>>> > that may try to allocate >= PMD_SIZE pages, but are not ready to handle
>>> > huge pages properly.
>>>
>>> This is a good place to document what the problems are, and how they are
>>> hard to track down (e.g. because the allocations are passed down I/O
>>> stacks)
>> 
>> Will add it in v3.
>> 
>>>
>>> >
>>> > Replace VM_NO_HUGE_VMAP with an opt-in flag, VM_ALLOW_HUGE_VMAP, so that
>>> > users that benefit from huge pages could ask specificially.
>>> >
>>> > Also, replace vmalloc_no_huge() with opt-in helper vmalloc_huge().
>>>
>>> We still need to find out what the primary users of the large vmalloc
>>> hashes was and convert them.
>> 
>> @ Claudio and Nicholas,
>> 
>> Could you please help identify users of large vmalloc? So far, I found
>> alloc_large_system_hash(), and something like the following seems to
>> work:
> 
> The large system hashes were the main ones I was interested in. IIRC 
> there was a few more in some drivers or tracing things depending on
> config but those are less important (to me at least).

Oh, there is also a reverse map array in KVM, now that I think of it.

Thanks,
Nick

Patch

diff --git a/arch/Kconfig b/arch/Kconfig
index 29b0167c088b..31c4fdc4a4ba 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -854,10 +854,8 @@  config HAVE_ARCH_HUGE_VMAP
 
 #
 #  Archs that select this would be capable of PMD-sized vmaps (i.e.,
-#  arch_vmap_pmd_supported() returns true), and they must make no assumptions
-#  that vmalloc memory is mapped with PAGE_SIZE ptes. The VM_NO_HUGE_VMAP flag
-#  can be used to prohibit arch-specific allocations from using hugepages to
-#  help with this (e.g., modules may require it).
+#  arch_vmap_pmd_supported() returns true). The VM_ALLOW_HUGE_VMAP flag
+#  must be used to enable allocations to use hugepages.
 #
 config HAVE_ARCH_HUGE_VMALLOC
 	depends on HAVE_ARCH_HUGE_VMAP
diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index 40a583e9d3c7..97a76a8619fb 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -101,7 +101,7 @@  __module_alloc(unsigned long size, unsigned long start, unsigned long end, bool
 	 * too.
 	 */
 	return __vmalloc_node_range(size, 1, start, end, gfp, prot,
-				    VM_FLUSH_RESET_PERMS | VM_NO_HUGE_VMAP,
+				    VM_FLUSH_RESET_PERMS,
 				    NUMA_NO_NODE, __builtin_return_address(0));
 }
 
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index 7f7c0d6af2ce..8afede243903 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -142,7 +142,7 @@  static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
 	 * using large pages for the virtual memory area.
 	 * This is a hardware limitation.
 	 */
-	kvm->arch.pv.stor_var = vmalloc_no_huge(vlen);
+	kvm->arch.pv.stor_var = vmalloc(vlen);
 	if (!kvm->arch.pv.stor_var)
 		goto out_err;
 	return 0;
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 3b1df7da402d..1024517d7937 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -26,7 +26,7 @@  struct notifier_block;		/* in notifier.h */
 #define VM_KASAN		0x00000080      /* has allocated kasan shadow memory */
 #define VM_FLUSH_RESET_PERMS	0x00000100	/* reset direct map and flush TLB on unmap, can't be freed in atomic context */
 #define VM_MAP_PUT_PAGES	0x00000200	/* put pages and free array in vfree */
-#define VM_NO_HUGE_VMAP		0x00000400	/* force PAGE_SIZE pte mapping */
+#define VM_ALLOW_HUGE_VMAP	0x00000400      /* Allow for huge pages on archs with HAVE_ARCH_HUGE_VMALLOC */
 
 #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \
 	!defined(CONFIG_KASAN_VMALLOC)
@@ -153,7 +153,7 @@  extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
 			const void *caller) __alloc_size(1);
 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
 		int node, const void *caller) __alloc_size(1);
-void *vmalloc_no_huge(unsigned long size) __alloc_size(1);
+extern void *vmalloc_huge(unsigned long size) __alloc_size(1);
 
 extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
 extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e163372d3967..7cc2be6a7554 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3106,7 +3106,7 @@  void *__vmalloc_node_range(unsigned long size, unsigned long align,
 		return NULL;
 	}
 
-	if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP)) {
+	if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
 		unsigned long size_per_node;
 
 		/*
@@ -3272,23 +3272,6 @@  void *vmalloc(unsigned long size)
 }
 EXPORT_SYMBOL(vmalloc);
 
-/**
- * vmalloc_no_huge - allocate virtually contiguous memory using small pages
- * @size:    allocation size
- *
- * Allocate enough non-huge pages to cover @size from the page level
- * allocator and map them into contiguous kernel virtual space.
- *
- * Return: pointer to the allocated memory or %NULL on error
- */
-void *vmalloc_no_huge(unsigned long size)
-{
-	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
-				    GFP_KERNEL, PAGE_KERNEL, VM_NO_HUGE_VMAP,
-				    NUMA_NO_NODE, __builtin_return_address(0));
-}
-EXPORT_SYMBOL(vmalloc_no_huge);
-
 /**
  * vzalloc - allocate virtually contiguous memory with zero fill
  * @size:    allocation size
@@ -3347,6 +3330,26 @@  void *vmalloc_node(unsigned long size, int node)
 }
 EXPORT_SYMBOL(vmalloc_node);
 
+/**
+ * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
+ * @size:    allocation size
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * If @size is greater than or equal to PMD_SIZE, allow using
+ * huge pages for the memory.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_huge(unsigned long size)
+{
+	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+				    GFP_KERNEL, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+				    NUMA_NO_NODE, __builtin_return_address(0));
+
+}
+EXPORT_SYMBOL(vmalloc_huge);
+
 /**
  * vzalloc_node - allocate memory on a specific node with zero fill
  * @size:	allocation size