diff mbox series

[v5,bpf-next,1/5] module: introduce module_alloc_huge

Message ID 20220624215712.3050672-2-song@kernel.org (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series bpf_prog_pack followup | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 8 this patch: 8
netdev/cc_maintainers warning 18 maintainers not CCed: tglx@linutronix.de jpoimboe@kernel.org bp@alien8.de songliubraving@fb.com ast@kernel.org akpm@linux-foundation.org bpf@vger.kernel.org x86@kernel.org mingo@redhat.com linux-modules@vger.kernel.org wangkefeng.wang@huawei.com hpa@zytor.com yhs@fb.com john.fastabend@gmail.com kafai@fb.com peterz@infradead.org andrii@kernel.org kpsingh@kernel.org
netdev/build_clang success Errors and warnings before: 7 this patch: 7
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 10 this patch: 10
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 52 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Kernel LATEST on z15 with gcc
bpf/vmtest-bpf-next-VM_Test-1 fail Logs for Kernel LATEST on ubuntu-latest with gcc
bpf/vmtest-bpf-next-VM_Test-2 fail Logs for Kernel LATEST on ubuntu-latest with llvm-15

Commit Message

Song Liu June 24, 2022, 9:57 p.m. UTC
Introduce module_alloc_huge, which allocates huge page backed memory in
module memory space. The primary user of this memory is bpf_prog_pack
(multiple BPF programs sharing a huge page).

Signed-off-by: Song Liu <song@kernel.org>
---
 arch/x86/kernel/module.c     | 21 +++++++++++++++++++++
 include/linux/moduleloader.h |  5 +++++
 kernel/module/main.c         |  8 ++++++++
 3 files changed, 34 insertions(+)

Comments

Luis Chamberlain July 1, 2022, 11:20 p.m. UTC | #1
On Fri, Jun 24, 2022 at 02:57:08PM -0700, Song Liu wrote:
> Introduce module_alloc_huge, which allocates huge page backed memory in
> module memory space. The primary user of this memory is bpf_prog_pack
> (multiple BPF programs sharing a huge page).
> 
> Signed-off-by: Song Liu <song@kernel.org>

I see mm not Cc'd. I'd like review from them.

> ---
>  arch/x86/kernel/module.c     | 21 +++++++++++++++++++++
>  include/linux/moduleloader.h |  5 +++++
>  kernel/module/main.c         |  8 ++++++++
>  3 files changed, 34 insertions(+)
> 
> diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
> index b98ffcf4d250..63f6a16c70dc 100644
> --- a/arch/x86/kernel/module.c
> +++ b/arch/x86/kernel/module.c
> @@ -86,6 +86,27 @@ void *module_alloc(unsigned long size)
>  	return p;
>  }
>  
> +void *module_alloc_huge(unsigned long size)
> +{
> +	gfp_t gfp_mask = GFP_KERNEL;
> +	void *p;
> +
> +	if (PAGE_ALIGN(size) > MODULES_LEN)
> +		return NULL;
> +
> +	p = __vmalloc_node_range(size, MODULE_ALIGN,
> +				 MODULES_VADDR + get_module_load_offset(),
> +				 MODULES_END, gfp_mask, PAGE_KERNEL,
> +				 VM_DEFER_KMEMLEAK | VM_ALLOW_HUGE_VMAP,
> +				 NUMA_NO_NODE, __builtin_return_address(0));
> +	if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
> +		vfree(p);
> +		return NULL;
> +	}
> +
> +	return p;
> +}

1) When things like kernel/bpf/core.c start using a module alloc it
   is time to consider genearlizing this.

2) How we free is important, and each arch does something funky for
   this. This is not addressed here.

And yes I welcome generalizing generic module_alloc() too as suggested
before. The concern on my part is the sloppiness this enables.

  Luis

> +
>  #ifdef CONFIG_X86_32
>  int apply_relocate(Elf32_Shdr *sechdrs,
>  		   const char *strtab,
> diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
> index 9e09d11ffe5b..d34743a88938 100644
> --- a/include/linux/moduleloader.h
> +++ b/include/linux/moduleloader.h
> @@ -26,6 +26,11 @@ unsigned int arch_mod_section_prepend(struct module *mod, unsigned int section);
>     sections.  Returns NULL on failure. */
>  void *module_alloc(unsigned long size);
>  
> +/* Allocator used for allocating memory in module memory space. If size is
> + * greater than PMD_SIZE, allow using huge pages. Returns NULL on failure.
> + */
> +void *module_alloc_huge(unsigned long size);
> +
>  /* Free memory returned from module_alloc. */
>  void module_memfree(void *module_region);
>  
> diff --git a/kernel/module/main.c b/kernel/module/main.c
> index fed58d30725d..349b2a8bd20f 100644
> --- a/kernel/module/main.c
> +++ b/kernel/module/main.c
> @@ -1613,6 +1613,14 @@ void * __weak module_alloc(unsigned long size)
>  			NUMA_NO_NODE, __builtin_return_address(0));
>  }
>  
> +void * __weak module_alloc_huge(unsigned long size)
> +{
> +	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> +				    GFP_KERNEL, PAGE_KERNEL_EXEC,
> +				    VM_FLUSH_RESET_PERMS | VM_ALLOW_HUGE_VMAP,
> +				    NUMA_NO_NODE, __builtin_return_address(0));
> +}
> +
>  bool __weak module_init_section(const char *name)
>  {
>  	return strstarts(name, ".init");
> -- 
> 2.30.2
>
Song Liu July 6, 2022, 4:39 a.m. UTC | #2
> On Jul 1, 2022, at 4:20 PM, Luis Chamberlain <mcgrof@kernel.org> wrote:
> 
> On Fri, Jun 24, 2022 at 02:57:08PM -0700, Song Liu wrote:
>> Introduce module_alloc_huge, which allocates huge page backed memory in
>> module memory space. The primary user of this memory is bpf_prog_pack
>> (multiple BPF programs sharing a huge page).
>> 
>> Signed-off-by: Song Liu <song@kernel.org>
> 
> I see mm not Cc'd. I'd like review from them.

I will CC mm in the next version (or resend). Thanks for the reminder. 

> 
>> ---
>> arch/x86/kernel/module.c | 21 +++++++++++++++++++++
>> include/linux/moduleloader.h | 5 +++++
>> kernel/module/main.c | 8 ++++++++
>> 3 files changed, 34 insertions(+)
>> 
>> diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
>> index b98ffcf4d250..63f6a16c70dc 100644
>> --- a/arch/x86/kernel/module.c
>> +++ b/arch/x86/kernel/module.c
>> @@ -86,6 +86,27 @@ void *module_alloc(unsigned long size)
>> 	return p;
>> }
>> 
>> +void *module_alloc_huge(unsigned long size)
>> +{
>> +	gfp_t gfp_mask = GFP_KERNEL;
>> +	void *p;
>> +
>> +	if (PAGE_ALIGN(size) > MODULES_LEN)
>> +		return NULL;
>> +
>> +	p = __vmalloc_node_range(size, MODULE_ALIGN,
>> +				 MODULES_VADDR + get_module_load_offset(),
>> +				 MODULES_END, gfp_mask, PAGE_KERNEL,
>> +				 VM_DEFER_KMEMLEAK | VM_ALLOW_HUGE_VMAP,
>> +				 NUMA_NO_NODE, __builtin_return_address(0));
>> +	if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
>> +		vfree(p);
>> +		return NULL;
>> +	}
>> +
>> +	return p;
>> +}
> 
> 1) When things like kernel/bpf/core.c start using a module alloc it
> is time to consider genearlizing this.

I am not quite sure what the generalization would look like. IMHO, the
ideal case would have:
  a) A kernel_text_rw_allocator, similar to current module_alloc.
  b) A kernel_text_ro_allocator, similar to current bpf_prog_pack_alloc.
     This is built on top of kernel_text_rw_allocator. Different 
     allocations could share a page, thus it requires text_poke like 
     support from the arch. 
  c) If the arch supports text_poke, kprobes, ftrace trampolines, and
     bpf trampolines should use kernel_text_ro_allocator.
  d) Major archs should support CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC,
     and they should use kernel_text_ro_allocator for module text. 

Does this sound reasonable to you?

I tried to enable CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC for x86_64, 
but that doesn't really work. Do we have plan to make this combination
work?

> 
> 2) How we free is important, and each arch does something funky for
> this. This is not addressed here.

How should we address this? IIUC, x86_64 just calls vfree. 

> 
> And yes I welcome generalizing generic module_alloc() too as suggested
> before. The concern on my part is the sloppiness this enables.

One question I have is, does module_alloc (or kernel_text_*_allocator 
above) belong to module code, or mm code (maybe vmalloc)?

I am planning to let BPF trampoline use bpf_prog_pack on x86_64, which 
is another baby step of c) above. 

Thanks,
Song
Luis Chamberlain July 7, 2022, 8:11 p.m. UTC | #3
On Wed, Jul 06, 2022 at 04:39:13AM +0000, Song Liu wrote:
> > On Jul 1, 2022, at 4:20 PM, Luis Chamberlain <mcgrof@kernel.org> wrote:
> > On Fri, Jun 24, 2022 at 02:57:08PM -0700, Song Liu wrote:
> >> +void *module_alloc_huge(unsigned long size)
> >> +{
> >> +	gfp_t gfp_mask = GFP_KERNEL;
> >> +	void *p;
> >> +
> >> +	if (PAGE_ALIGN(size) > MODULES_LEN)
> >> +		return NULL;
> >> +
> >> +	p = __vmalloc_node_range(size, MODULE_ALIGN,
> >> +				 MODULES_VADDR + get_module_load_offset(),
> >> +				 MODULES_END, gfp_mask, PAGE_KERNEL,
> >> +				 VM_DEFER_KMEMLEAK | VM_ALLOW_HUGE_VMAP,
> >> +				 NUMA_NO_NODE, __builtin_return_address(0));
> >> +	if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
> >> +		vfree(p);
> >> +		return NULL;
> >> +	}
> >> +
> >> +	return p;
> >> +}
> > 
> > 1) When things like kernel/bpf/core.c start using a module alloc it
> > is time to consider genearlizing this.
> 
> I am not quite sure what the generalization would look like. IMHO, the
> ideal case would have:
>   a) A kernel_text_rw_allocator, similar to current module_alloc.
>   b) A kernel_text_ro_allocator, similar to current bpf_prog_pack_alloc.
>      This is built on top of kernel_text_rw_allocator. Different 
>      allocations could share a page, thus it requires text_poke like 
>      support from the arch. 
>   c) If the arch supports text_poke, kprobes, ftrace trampolines, and
>      bpf trampolines should use kernel_text_ro_allocator.
>   d) Major archs should support CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC,
>      and they should use kernel_text_ro_allocator for module text. 
> 
> Does this sound reasonable to you?

Yes, and a respective free call may have an arch specific stuff which
removes exec stuff.

In so far as the bikeshedding, I do think this is generic so
vmalloc_text_*() suffices or vmalloc_exec_*() take your pick for
a starter and let the world throw in their preference.

> I tried to enable CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC for x86_64, 
> but that doesn't really work. Do we have plan to make this combination
> work?

Oh nice.

Good stuff. Perhaps it just requires a little love from mm folks.
Don't beat yourself up if it does not yet. We can work towards that
later.

> > 2) How we free is important, and each arch does something funky for
> > this. This is not addressed here.
> 
> How should we address this? IIUC, x86_64 just calls vfree. 

That's not the case for all archs is it? I'm talking about the generic
module_alloc() too. I'd like to see that go the way we discussed above.

> > And yes I welcome generalizing generic module_alloc() too as suggested
> > before. The concern on my part is the sloppiness this enables.
> 
> One question I have is, does module_alloc (or kernel_text_*_allocator 
> above) belong to module code, or mm code (maybe vmalloc)?

The evolution of its uses indicates it is growing outside of modules and
so mm should be the new home.

> I am planning to let BPF trampoline use bpf_prog_pack on x86_64, which 
> is another baby step of c) above. 

OK!

  Luis
diff mbox series

Patch

diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index b98ffcf4d250..63f6a16c70dc 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -86,6 +86,27 @@  void *module_alloc(unsigned long size)
 	return p;
 }
 
+void *module_alloc_huge(unsigned long size)
+{
+	gfp_t gfp_mask = GFP_KERNEL;
+	void *p;
+
+	if (PAGE_ALIGN(size) > MODULES_LEN)
+		return NULL;
+
+	p = __vmalloc_node_range(size, MODULE_ALIGN,
+				 MODULES_VADDR + get_module_load_offset(),
+				 MODULES_END, gfp_mask, PAGE_KERNEL,
+				 VM_DEFER_KMEMLEAK | VM_ALLOW_HUGE_VMAP,
+				 NUMA_NO_NODE, __builtin_return_address(0));
+	if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
+		vfree(p);
+		return NULL;
+	}
+
+	return p;
+}
+
 #ifdef CONFIG_X86_32
 int apply_relocate(Elf32_Shdr *sechdrs,
 		   const char *strtab,
diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
index 9e09d11ffe5b..d34743a88938 100644
--- a/include/linux/moduleloader.h
+++ b/include/linux/moduleloader.h
@@ -26,6 +26,11 @@  unsigned int arch_mod_section_prepend(struct module *mod, unsigned int section);
    sections.  Returns NULL on failure. */
 void *module_alloc(unsigned long size);
 
+/* Allocator used for allocating memory in module memory space. If size is
+ * greater than PMD_SIZE, allow using huge pages. Returns NULL on failure.
+ */
+void *module_alloc_huge(unsigned long size);
+
 /* Free memory returned from module_alloc. */
 void module_memfree(void *module_region);
 
diff --git a/kernel/module/main.c b/kernel/module/main.c
index fed58d30725d..349b2a8bd20f 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1613,6 +1613,14 @@  void * __weak module_alloc(unsigned long size)
 			NUMA_NO_NODE, __builtin_return_address(0));
 }
 
+void * __weak module_alloc_huge(unsigned long size)
+{
+	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+				    GFP_KERNEL, PAGE_KERNEL_EXEC,
+				    VM_FLUSH_RESET_PERMS | VM_ALLOW_HUGE_VMAP,
+				    NUMA_NO_NODE, __builtin_return_address(0));
+}
+
 bool __weak module_init_section(const char *name)
 {
 	return strstarts(name, ".init");