diff mbox series

[v4,05/15] mm: introduce execmem_alloc() and execmem_free()

Message ID 20240411160051.2093261-6-rppt@kernel.org (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series mm: jit/text allocator | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Guessed tree name to be net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 961 this patch: 961
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 28 maintainers not CCed: yonghong.song@linux.dev naveen.n.rao@linux.ibm.com john.fastabend@gmail.com borntraeger@linux.ibm.com aou@eecs.berkeley.edu bp@alien8.de ast@kernel.org npiggin@gmail.com gor@linux.ibm.com anil.s.keshavamurthy@intel.com jinghao7@illinois.edu haoluo@google.com andrii@kernel.org eddyz87@gmail.com mhiramat@kernel.org svens@linux.ibm.com mingo@redhat.com dave.hansen@linux.intel.com jolsa@kernel.org kpsingh@kernel.org agordeev@linux.ibm.com sdf@google.com andreas@gaisler.com daniel@iogearbox.net martin.lau@linux.dev hpa@zytor.com aneesh.kumar@kernel.org paul.walmsley@sifive.com
netdev/build_clang success Errors and warnings before: 955 this patch: 955
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 974 this patch: 974
netdev/checkpatch warning WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 8 this patch: 8
netdev/source_inline success Was 0 now: 0

Commit Message

Mike Rapoport April 11, 2024, 4 p.m. UTC
From: "Mike Rapoport (IBM)" <rppt@kernel.org>

module_alloc() is used everywhere as a mean to allocate memory for code.

Beside being semantically wrong, this unnecessarily ties all subsystems
that need to allocate code, such as ftrace, kprobes and BPF to modules and
puts the burden of code allocation to the modules code.

Several architectures override module_alloc() because of various
constraints where the executable memory can be located and this causes
additional obstacles for improvements of code allocation.

Start splitting code allocation from modules by introducing execmem_alloc()
and execmem_free() APIs.

Initially, execmem_alloc() is a wrapper for module_alloc() and
execmem_free() is a replacement of module_memfree() to allow updating all
call sites to use the new APIs.

Since architectures define different restrictions on placement,
permissions, alignment and other parameters for memory that can be used by
different subsystems that allocate executable memory, execmem_alloc() takes
a type argument, that will be used to identify the calling subsystem and to
allow architectures define parameters for ranges suitable for that
subsystem.

Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
---
 arch/powerpc/kernel/kprobes.c    |  6 ++--
 arch/s390/kernel/ftrace.c        |  4 +--
 arch/s390/kernel/kprobes.c       |  4 +--
 arch/s390/kernel/module.c        |  5 +--
 arch/sparc/net/bpf_jit_comp_32.c |  8 ++---
 arch/x86/kernel/ftrace.c         |  6 ++--
 arch/x86/kernel/kprobes/core.c   |  4 +--
 include/linux/execmem.h          | 57 ++++++++++++++++++++++++++++++++
 include/linux/moduleloader.h     |  3 --
 kernel/bpf/core.c                |  6 ++--
 kernel/kprobes.c                 |  8 ++---
 kernel/module/Kconfig            |  1 +
 kernel/module/main.c             | 25 +++++---------
 mm/Kconfig                       |  3 ++
 mm/Makefile                      |  1 +
 mm/execmem.c                     | 26 +++++++++++++++
 16 files changed, 122 insertions(+), 45 deletions(-)
 create mode 100644 include/linux/execmem.h
 create mode 100644 mm/execmem.c

Comments

Luis Chamberlain April 11, 2024, 7:42 p.m. UTC | #1
On Thu, Apr 11, 2024 at 07:00:41PM +0300, Mike Rapoport wrote:
> From: "Mike Rapoport (IBM)" <rppt@kernel.org>
> 
> module_alloc() is used everywhere as a mean to allocate memory for code.
> 
> Beside being semantically wrong, this unnecessarily ties all subsystems
> that need to allocate code, such as ftrace, kprobes and BPF to modules and
> puts the burden of code allocation to the modules code.
> 
> Several architectures override module_alloc() because of various
> constraints where the executable memory can be located and this causes
> additional obstacles for improvements of code allocation.
> 
> Start splitting code allocation from modules by introducing execmem_alloc()
> and execmem_free() APIs.
> 
> Initially, execmem_alloc() is a wrapper for module_alloc() and
> execmem_free() is a replacement of module_memfree() to allow updating all
> call sites to use the new APIs.
> 
> Since architectures define different restrictions on placement,
> permissions, alignment and other parameters for memory that can be used by
> different subsystems that allocate executable memory, execmem_alloc() takes
> a type argument, that will be used to identify the calling subsystem and to
> allow architectures define parameters for ranges suitable for that
> subsystem.

It would be good to describe this is a non-fuctional change.

> Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
> ---

> diff --git a/mm/execmem.c b/mm/execmem.c
> new file mode 100644
> index 000000000000..ed2ea41a2543
> --- /dev/null
> +++ b/mm/execmem.c
> @@ -0,0 +1,26 @@
> +// SPDX-License-Identifier: GPL-2.0

And this just needs to copy over the copyright notices from the main.c file.

  Luis
Ingo Molnar April 12, 2024, 9:16 a.m. UTC | #2
* Mike Rapoport <rppt@kernel.org> wrote:

> +/**
> + * enum execmem_type - types of executable memory ranges
> + *
> + * There are several subsystems that allocate executable memory.
> + * Architectures define different restrictions on placement,
> + * permissions, alignment and other parameters for memory that can be used
> + * by these subsystems.
> + * Types in this enum identify subsystems that allocate executable memory
> + * and let architectures define parameters for ranges suitable for
> + * allocations by each subsystem.
> + *
> + * @EXECMEM_DEFAULT: default parameters that would be used for types that
> + * are not explcitly defined.
> + * @EXECMEM_MODULE_TEXT: parameters for module text sections
> + * @EXECMEM_KPROBES: parameters for kprobes
> + * @EXECMEM_FTRACE: parameters for ftrace
> + * @EXECMEM_BPF: parameters for BPF
> + * @EXECMEM_TYPE_MAX:
> + */
> +enum execmem_type {
> +	EXECMEM_DEFAULT,
> +	EXECMEM_MODULE_TEXT = EXECMEM_DEFAULT,
> +	EXECMEM_KPROBES,
> +	EXECMEM_FTRACE,
> +	EXECMEM_BPF,
> +	EXECMEM_TYPE_MAX,
> +};

s/explcitly
 /explicitly

Thanks,

	Ingo
Mike Rapoport April 14, 2024, 6:53 a.m. UTC | #3
On Thu, Apr 11, 2024 at 12:42:05PM -0700, Luis Chamberlain wrote:
> On Thu, Apr 11, 2024 at 07:00:41PM +0300, Mike Rapoport wrote:
> > From: "Mike Rapoport (IBM)" <rppt@kernel.org>
> > 
> > module_alloc() is used everywhere as a mean to allocate memory for code.
> > 
> > Beside being semantically wrong, this unnecessarily ties all subsystems
> > that need to allocate code, such as ftrace, kprobes and BPF to modules and
> > puts the burden of code allocation to the modules code.
> > 
> > Several architectures override module_alloc() because of various
> > constraints where the executable memory can be located and this causes
> > additional obstacles for improvements of code allocation.
> > 
> > Start splitting code allocation from modules by introducing execmem_alloc()
> > and execmem_free() APIs.
> > 
> > Initially, execmem_alloc() is a wrapper for module_alloc() and
> > execmem_free() is a replacement of module_memfree() to allow updating all
> > call sites to use the new APIs.
> > 
> > Since architectures define different restrictions on placement,
> > permissions, alignment and other parameters for memory that can be used by
> > different subsystems that allocate executable memory, execmem_alloc() takes
> > a type argument, that will be used to identify the calling subsystem and to
> > allow architectures define parameters for ranges suitable for that
> > subsystem.
> 
> It would be good to describe this is a non-fuctional change.

Ok.
 
> > Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
> > ---
> 
> > diff --git a/mm/execmem.c b/mm/execmem.c
> > new file mode 100644
> > index 000000000000..ed2ea41a2543
> > --- /dev/null
> > +++ b/mm/execmem.c
> > @@ -0,0 +1,26 @@
> > +// SPDX-License-Identifier: GPL-2.0
> 
> And this just needs to copy over the copyright notices from the main.c file.

Will do.
 
>   Luis
Mike Rapoport April 14, 2024, 6:54 a.m. UTC | #4
On Fri, Apr 12, 2024 at 11:16:10AM +0200, Ingo Molnar wrote:
> 
> * Mike Rapoport <rppt@kernel.org> wrote:
> 
> > +/**
> > + * enum execmem_type - types of executable memory ranges
> > + *
> > + * There are several subsystems that allocate executable memory.
> > + * Architectures define different restrictions on placement,
> > + * permissions, alignment and other parameters for memory that can be used
> > + * by these subsystems.
> > + * Types in this enum identify subsystems that allocate executable memory
> > + * and let architectures define parameters for ranges suitable for
> > + * allocations by each subsystem.
> > + *
> > + * @EXECMEM_DEFAULT: default parameters that would be used for types that
> > + * are not explcitly defined.
> > + * @EXECMEM_MODULE_TEXT: parameters for module text sections
> > + * @EXECMEM_KPROBES: parameters for kprobes
> > + * @EXECMEM_FTRACE: parameters for ftrace
> > + * @EXECMEM_BPF: parameters for BPF
> > + * @EXECMEM_TYPE_MAX:
> > + */
> > +enum execmem_type {
> > +	EXECMEM_DEFAULT,
> > +	EXECMEM_MODULE_TEXT = EXECMEM_DEFAULT,
> > +	EXECMEM_KPROBES,
> > +	EXECMEM_FTRACE,
> > +	EXECMEM_BPF,
> > +	EXECMEM_TYPE_MAX,
> > +};
> 
> s/explcitly
>  /explicitly
 
Sure, thanks

> Thanks,
> 
> 	Ingo
Peter Zijlstra April 15, 2024, 7:52 a.m. UTC | #5
On Thu, Apr 11, 2024 at 07:00:41PM +0300, Mike Rapoport wrote:
> +/**
> + * enum execmem_type - types of executable memory ranges
> + *
> + * There are several subsystems that allocate executable memory.
> + * Architectures define different restrictions on placement,
> + * permissions, alignment and other parameters for memory that can be used
> + * by these subsystems.
> + * Types in this enum identify subsystems that allocate executable memory
> + * and let architectures define parameters for ranges suitable for
> + * allocations by each subsystem.
> + *
> + * @EXECMEM_DEFAULT: default parameters that would be used for types that
> + * are not explcitly defined.
> + * @EXECMEM_MODULE_TEXT: parameters for module text sections
> + * @EXECMEM_KPROBES: parameters for kprobes
> + * @EXECMEM_FTRACE: parameters for ftrace
> + * @EXECMEM_BPF: parameters for BPF
> + * @EXECMEM_TYPE_MAX:
> + */
> +enum execmem_type {
> +	EXECMEM_DEFAULT,
> +	EXECMEM_MODULE_TEXT = EXECMEM_DEFAULT,
> +	EXECMEM_KPROBES,
> +	EXECMEM_FTRACE,
> +	EXECMEM_BPF,
> +	EXECMEM_TYPE_MAX,
> +};

Can we please get a break-down of how all these types are actually
different from one another?

I'm thinking some platforms have a tiny immediate space (arm64 comes to
mind) and has less strict placement constraints for some of them?
Mike Rapoport April 15, 2024, 4:51 p.m. UTC | #6
On Mon, Apr 15, 2024 at 09:52:41AM +0200, Peter Zijlstra wrote:
> On Thu, Apr 11, 2024 at 07:00:41PM +0300, Mike Rapoport wrote:
> > +/**
> > + * enum execmem_type - types of executable memory ranges
> > + *
> > + * There are several subsystems that allocate executable memory.
> > + * Architectures define different restrictions on placement,
> > + * permissions, alignment and other parameters for memory that can be used
> > + * by these subsystems.
> > + * Types in this enum identify subsystems that allocate executable memory
> > + * and let architectures define parameters for ranges suitable for
> > + * allocations by each subsystem.
> > + *
> > + * @EXECMEM_DEFAULT: default parameters that would be used for types that
> > + * are not explcitly defined.
> > + * @EXECMEM_MODULE_TEXT: parameters for module text sections
> > + * @EXECMEM_KPROBES: parameters for kprobes
> > + * @EXECMEM_FTRACE: parameters for ftrace
> > + * @EXECMEM_BPF: parameters for BPF
> > + * @EXECMEM_TYPE_MAX:
> > + */
> > +enum execmem_type {
> > +	EXECMEM_DEFAULT,
> > +	EXECMEM_MODULE_TEXT = EXECMEM_DEFAULT,
> > +	EXECMEM_KPROBES,
> > +	EXECMEM_FTRACE,
> > +	EXECMEM_BPF,
> > +	EXECMEM_TYPE_MAX,
> > +};
> 
> Can we please get a break-down of how all these types are actually
> different from one another?
> 
> I'm thinking some platforms have a tiny immediate space (arm64 comes to
> mind) and has less strict placement constraints for some of them?

loongarch, mips, nios2 and sparc define modules address space different
from vmalloc and use that for modules, kprobes and bpf (where supported).

parisc uses vmalloc range for everything, but it sets permissions to
PAGE_KERNEL_RWX because it's PAGE_KERNEL_EXEC is read only and it lacks
set_memory_* APIs.

arm has an address space for modules, but it fall back to the entire
vmalloc with CONFIG_ARM_MODULE_PLTS=y.

arm64 uses different ranges for modules and bpf/kprobes. For kprobes it
does vmalloc(PAGE_KERNEL_ROX) and for bpf just plain vmalloc().
For modules arm64 first tries to allocated from 128M below kernel_end and
if that fails it uses 2G below kernel_end as a fallback.

powerpc uses vmalloc space for everything for some configurations. For
book3s-32 and 8xx it defines two ranges that are used for module text,
kprobes and bpf and the module data can be allocated anywhere in vmalloc.

riscv has an address space for modules, a different address space for bpf
and uses vmalloc space for kprobes.

s390 and x86 have modules address space and use that space for all
executable allocations.

The EXECMEM_FTRACE type is only used on s390 and x86 and for now it's there
more for completeness rather to denote special constraints or properties.
Mark Rutland April 15, 2024, 5:36 p.m. UTC | #7
On Mon, Apr 15, 2024 at 09:52:41AM +0200, Peter Zijlstra wrote:
> On Thu, Apr 11, 2024 at 07:00:41PM +0300, Mike Rapoport wrote:
> > +/**
> > + * enum execmem_type - types of executable memory ranges
> > + *
> > + * There are several subsystems that allocate executable memory.
> > + * Architectures define different restrictions on placement,
> > + * permissions, alignment and other parameters for memory that can be used
> > + * by these subsystems.
> > + * Types in this enum identify subsystems that allocate executable memory
> > + * and let architectures define parameters for ranges suitable for
> > + * allocations by each subsystem.
> > + *
> > + * @EXECMEM_DEFAULT: default parameters that would be used for types that
> > + * are not explcitly defined.
> > + * @EXECMEM_MODULE_TEXT: parameters for module text sections
> > + * @EXECMEM_KPROBES: parameters for kprobes
> > + * @EXECMEM_FTRACE: parameters for ftrace
> > + * @EXECMEM_BPF: parameters for BPF
> > + * @EXECMEM_TYPE_MAX:
> > + */
> > +enum execmem_type {
> > +	EXECMEM_DEFAULT,
> > +	EXECMEM_MODULE_TEXT = EXECMEM_DEFAULT,
> > +	EXECMEM_KPROBES,
> > +	EXECMEM_FTRACE,
> > +	EXECMEM_BPF,
> > +	EXECMEM_TYPE_MAX,
> > +};
> 
> Can we please get a break-down of how all these types are actually
> different from one another?
> 
> I'm thinking some platforms have a tiny immediate space (arm64 comes to
> mind) and has less strict placement constraints for some of them?

Yeah, and really I'd *much* rather deal with that in arch code, as I have said
several times.

For arm64 we have two bsaic restrictions: 

1) Direct branches can go +/-128M
   We can expand this range by having direct branches go to PLTs, at a
   performance cost.

2) PREL32 relocations can go +/-2G
   We cannot expand this further.

* We don't need to allocate memory for ftrace. We do not use trampolines.

* Kprobes XOL areas don't care about either of those; we don't place any
  PC-relative instructions in those. Maybe we want to in future.

* Modules care about both; we'd *prefer* to place them within +/-128M of all
  other kernel/module code, but if there's no space we can use PLTs and expand
  that to +/-2G. Since modules can refreence other modules, that ends up
  actually being halved, and modules have to fit within some 2G window that
  also covers the kernel.

* I'm not sure about BPF's requirements; it seems happy doing the same as
  modules.

So if we *must* use a common execmem allocator, what we'd reall want is our own
types, e.g.

	EXECMEM_ANYWHERE
	EXECMEM_NOPLT
	EXECMEM_PREL32

... and then we use those in arch code to implement module_alloc() and friends.

Mark.
Mike Rapoport April 16, 2024, 7:22 a.m. UTC | #8
On Mon, Apr 15, 2024 at 06:36:39PM +0100, Mark Rutland wrote:
> On Mon, Apr 15, 2024 at 09:52:41AM +0200, Peter Zijlstra wrote:
> > On Thu, Apr 11, 2024 at 07:00:41PM +0300, Mike Rapoport wrote:
> > > +/**
> > > + * enum execmem_type - types of executable memory ranges
> > > + *
> > > + * There are several subsystems that allocate executable memory.
> > > + * Architectures define different restrictions on placement,
> > > + * permissions, alignment and other parameters for memory that can be used
> > > + * by these subsystems.
> > > + * Types in this enum identify subsystems that allocate executable memory
> > > + * and let architectures define parameters for ranges suitable for
> > > + * allocations by each subsystem.
> > > + *
> > > + * @EXECMEM_DEFAULT: default parameters that would be used for types that
> > > + * are not explcitly defined.
> > > + * @EXECMEM_MODULE_TEXT: parameters for module text sections
> > > + * @EXECMEM_KPROBES: parameters for kprobes
> > > + * @EXECMEM_FTRACE: parameters for ftrace
> > > + * @EXECMEM_BPF: parameters for BPF
> > > + * @EXECMEM_TYPE_MAX:
> > > + */
> > > +enum execmem_type {
> > > +	EXECMEM_DEFAULT,
> > > +	EXECMEM_MODULE_TEXT = EXECMEM_DEFAULT,
> > > +	EXECMEM_KPROBES,
> > > +	EXECMEM_FTRACE,
> > > +	EXECMEM_BPF,
> > > +	EXECMEM_TYPE_MAX,
> > > +};
> > 
> > Can we please get a break-down of how all these types are actually
> > different from one another?
> > 
> > I'm thinking some platforms have a tiny immediate space (arm64 comes to
> > mind) and has less strict placement constraints for some of them?
> 
> Yeah, and really I'd *much* rather deal with that in arch code, as I have said
> several times.
> 
> For arm64 we have two bsaic restrictions: 
> 
> 1) Direct branches can go +/-128M
>    We can expand this range by having direct branches go to PLTs, at a
>    performance cost.
> 
> 2) PREL32 relocations can go +/-2G
>    We cannot expand this further.
> 
> * We don't need to allocate memory for ftrace. We do not use trampolines.
> 
> * Kprobes XOL areas don't care about either of those; we don't place any
>   PC-relative instructions in those. Maybe we want to in future.
> 
> * Modules care about both; we'd *prefer* to place them within +/-128M of all
>   other kernel/module code, but if there's no space we can use PLTs and expand
>   that to +/-2G. Since modules can refreence other modules, that ends up
>   actually being halved, and modules have to fit within some 2G window that
>   also covers the kernel.
> 
> * I'm not sure about BPF's requirements; it seems happy doing the same as
>   modules.

BPF are happy with vmalloc().
 
> So if we *must* use a common execmem allocator, what we'd reall want is our own
> types, e.g.
> 
> 	EXECMEM_ANYWHERE
> 	EXECMEM_NOPLT
> 	EXECMEM_PREL32
> 
> ... and then we use those in arch code to implement module_alloc() and friends.

I'm looking at execmem_types more as definition of the consumers, maybe I
should have named the enum execmem_consumer at the first place.

And the arch constrains defined in struct execmem_range describe how memory
should be allocated for each consumer.

These constraints are defined early at boot and remain static, so
initializing them once and letting a common allocator use them makes
perfect sense to me.

I agree that fallback_{start,end} are not ideal, but we have 3
architectures that have preferred and secondary range for modules. And arm
and powerpc use the same logic for kprobes as well, and I don't see why this
code should be duplicated.

And, for instance, if you decide to place PC-relative instructions if
kprobes XOL areas, you'd only need to update execmem_range for kprobes to
be more like the range for modules.

With central allocator it's easier to deal with the things like
VM_FLUSH_RESET_PERMS and caching of ROX memory and I think it will be more
maintainable that module_alloc(), alloc_insn_page() and
bpf_jit_alloc_exec() spread all over the place.
 
> Mark.
Masami Hiramatsu (Google) April 17, 2024, 9:06 p.m. UTC | #9
On Thu, 11 Apr 2024 19:00:41 +0300
Mike Rapoport <rppt@kernel.org> wrote:

> From: "Mike Rapoport (IBM)" <rppt@kernel.org>
> 
> module_alloc() is used everywhere as a mean to allocate memory for code.
> 
> Beside being semantically wrong, this unnecessarily ties all subsystems
> that need to allocate code, such as ftrace, kprobes and BPF to modules and
> puts the burden of code allocation to the modules code.
> 
> Several architectures override module_alloc() because of various
> constraints where the executable memory can be located and this causes
> additional obstacles for improvements of code allocation.
> 
> Start splitting code allocation from modules by introducing execmem_alloc()
> and execmem_free() APIs.
> 
> Initially, execmem_alloc() is a wrapper for module_alloc() and
> execmem_free() is a replacement of module_memfree() to allow updating all
> call sites to use the new APIs.
> 
> Since architectures define different restrictions on placement,
> permissions, alignment and other parameters for memory that can be used by
> different subsystems that allocate executable memory, execmem_alloc() takes
> a type argument, that will be used to identify the calling subsystem and to
> allow architectures define parameters for ranges suitable for that
> subsystem.
> 

This looks good to me for the kprobe part.

Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Thank you,

> Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
> ---
>  arch/powerpc/kernel/kprobes.c    |  6 ++--
>  arch/s390/kernel/ftrace.c        |  4 +--
>  arch/s390/kernel/kprobes.c       |  4 +--
>  arch/s390/kernel/module.c        |  5 +--
>  arch/sparc/net/bpf_jit_comp_32.c |  8 ++---
>  arch/x86/kernel/ftrace.c         |  6 ++--
>  arch/x86/kernel/kprobes/core.c   |  4 +--
>  include/linux/execmem.h          | 57 ++++++++++++++++++++++++++++++++
>  include/linux/moduleloader.h     |  3 --
>  kernel/bpf/core.c                |  6 ++--
>  kernel/kprobes.c                 |  8 ++---
>  kernel/module/Kconfig            |  1 +
>  kernel/module/main.c             | 25 +++++---------
>  mm/Kconfig                       |  3 ++
>  mm/Makefile                      |  1 +
>  mm/execmem.c                     | 26 +++++++++++++++
>  16 files changed, 122 insertions(+), 45 deletions(-)
>  create mode 100644 include/linux/execmem.h
>  create mode 100644 mm/execmem.c
> 
> diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
> index bbca90a5e2ec..9fcd01bb2ce6 100644
> --- a/arch/powerpc/kernel/kprobes.c
> +++ b/arch/powerpc/kernel/kprobes.c
> @@ -19,8 +19,8 @@
>  #include <linux/extable.h>
>  #include <linux/kdebug.h>
>  #include <linux/slab.h>
> -#include <linux/moduleloader.h>
>  #include <linux/set_memory.h>
> +#include <linux/execmem.h>
>  #include <asm/code-patching.h>
>  #include <asm/cacheflush.h>
>  #include <asm/sstep.h>
> @@ -130,7 +130,7 @@ void *alloc_insn_page(void)
>  {
>  	void *page;
>  
> -	page = module_alloc(PAGE_SIZE);
> +	page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
>  	if (!page)
>  		return NULL;
>  
> @@ -142,7 +142,7 @@ void *alloc_insn_page(void)
>  	}
>  	return page;
>  error:
> -	module_memfree(page);
> +	execmem_free(page);
>  	return NULL;
>  }
>  
> diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
> index c46381ea04ec..798249ef5646 100644
> --- a/arch/s390/kernel/ftrace.c
> +++ b/arch/s390/kernel/ftrace.c
> @@ -7,13 +7,13 @@
>   *   Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
>   */
>  
> -#include <linux/moduleloader.h>
>  #include <linux/hardirq.h>
>  #include <linux/uaccess.h>
>  #include <linux/ftrace.h>
>  #include <linux/kernel.h>
>  #include <linux/types.h>
>  #include <linux/kprobes.h>
> +#include <linux/execmem.h>
>  #include <trace/syscall.h>
>  #include <asm/asm-offsets.h>
>  #include <asm/text-patching.h>
> @@ -220,7 +220,7 @@ static int __init ftrace_plt_init(void)
>  {
>  	const char *start, *end;
>  
> -	ftrace_plt = module_alloc(PAGE_SIZE);
> +	ftrace_plt = execmem_alloc(EXECMEM_FTRACE, PAGE_SIZE);
>  	if (!ftrace_plt)
>  		panic("cannot allocate ftrace plt\n");
>  
> diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
> index f0cf20d4b3c5..3c1b1be744de 100644
> --- a/arch/s390/kernel/kprobes.c
> +++ b/arch/s390/kernel/kprobes.c
> @@ -9,7 +9,6 @@
>  
>  #define pr_fmt(fmt) "kprobes: " fmt
>  
> -#include <linux/moduleloader.h>
>  #include <linux/kprobes.h>
>  #include <linux/ptrace.h>
>  #include <linux/preempt.h>
> @@ -21,6 +20,7 @@
>  #include <linux/slab.h>
>  #include <linux/hardirq.h>
>  #include <linux/ftrace.h>
> +#include <linux/execmem.h>
>  #include <asm/set_memory.h>
>  #include <asm/sections.h>
>  #include <asm/dis.h>
> @@ -38,7 +38,7 @@ void *alloc_insn_page(void)
>  {
>  	void *page;
>  
> -	page = module_alloc(PAGE_SIZE);
> +	page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
>  	if (!page)
>  		return NULL;
>  	set_memory_rox((unsigned long)page, 1);
> diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
> index 42215f9404af..ac97a905e8cd 100644
> --- a/arch/s390/kernel/module.c
> +++ b/arch/s390/kernel/module.c
> @@ -21,6 +21,7 @@
>  #include <linux/moduleloader.h>
>  #include <linux/bug.h>
>  #include <linux/memory.h>
> +#include <linux/execmem.h>
>  #include <asm/alternative.h>
>  #include <asm/nospec-branch.h>
>  #include <asm/facility.h>
> @@ -76,7 +77,7 @@ void *module_alloc(unsigned long size)
>  #ifdef CONFIG_FUNCTION_TRACER
>  void module_arch_cleanup(struct module *mod)
>  {
> -	module_memfree(mod->arch.trampolines_start);
> +	execmem_free(mod->arch.trampolines_start);
>  }
>  #endif
>  
> @@ -510,7 +511,7 @@ static int module_alloc_ftrace_hotpatch_trampolines(struct module *me,
>  
>  	size = FTRACE_HOTPATCH_TRAMPOLINES_SIZE(s->sh_size);
>  	numpages = DIV_ROUND_UP(size, PAGE_SIZE);
> -	start = module_alloc(numpages * PAGE_SIZE);
> +	start = execmem_alloc(EXECMEM_FTRACE, numpages * PAGE_SIZE);
>  	if (!start)
>  		return -ENOMEM;
>  	set_memory_rox((unsigned long)start, numpages);
> diff --git a/arch/sparc/net/bpf_jit_comp_32.c b/arch/sparc/net/bpf_jit_comp_32.c
> index da2df1e84ed4..bda2dbd3f4c5 100644
> --- a/arch/sparc/net/bpf_jit_comp_32.c
> +++ b/arch/sparc/net/bpf_jit_comp_32.c
> @@ -1,10 +1,10 @@
>  // SPDX-License-Identifier: GPL-2.0
> -#include <linux/moduleloader.h>
>  #include <linux/workqueue.h>
>  #include <linux/netdevice.h>
>  #include <linux/filter.h>
>  #include <linux/cache.h>
>  #include <linux/if_vlan.h>
> +#include <linux/execmem.h>
>  
>  #include <asm/cacheflush.h>
>  #include <asm/ptrace.h>
> @@ -713,7 +713,7 @@ cond_branch:			f_offset = addrs[i + filter[i].jf];
>  				if (unlikely(proglen + ilen > oldproglen)) {
>  					pr_err("bpb_jit_compile fatal error\n");
>  					kfree(addrs);
> -					module_memfree(image);
> +					execmem_free(image);
>  					return;
>  				}
>  				memcpy(image + proglen, temp, ilen);
> @@ -736,7 +736,7 @@ cond_branch:			f_offset = addrs[i + filter[i].jf];
>  			break;
>  		}
>  		if (proglen == oldproglen) {
> -			image = module_alloc(proglen);
> +			image = execmem_alloc(EXECMEM_BPF, proglen);
>  			if (!image)
>  				goto out;
>  		}
> @@ -758,7 +758,7 @@ cond_branch:			f_offset = addrs[i + filter[i].jf];
>  void bpf_jit_free(struct bpf_prog *fp)
>  {
>  	if (fp->jited)
> -		module_memfree(fp->bpf_func);
> +		execmem_free(fp->bpf_func);
>  
>  	bpf_prog_unlock_free(fp);
>  }
> diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
> index 70139d9d2e01..c8ddb7abda7c 100644
> --- a/arch/x86/kernel/ftrace.c
> +++ b/arch/x86/kernel/ftrace.c
> @@ -25,6 +25,7 @@
>  #include <linux/memory.h>
>  #include <linux/vmalloc.h>
>  #include <linux/set_memory.h>
> +#include <linux/execmem.h>
>  
>  #include <trace/syscall.h>
>  
> @@ -261,15 +262,14 @@ void arch_ftrace_update_code(int command)
>  #ifdef CONFIG_X86_64
>  
>  #ifdef CONFIG_MODULES
> -#include <linux/moduleloader.h>
>  /* Module allocation simplifies allocating memory for code */
>  static inline void *alloc_tramp(unsigned long size)
>  {
> -	return module_alloc(size);
> +	return execmem_alloc(EXECMEM_FTRACE, size);
>  }
>  static inline void tramp_free(void *tramp)
>  {
> -	module_memfree(tramp);
> +	execmem_free(tramp);
>  }
>  #else
>  /* Trampolines can only be created if modules are supported */
> diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
> index d0e49bd7c6f3..72e6a45e7ec2 100644
> --- a/arch/x86/kernel/kprobes/core.c
> +++ b/arch/x86/kernel/kprobes/core.c
> @@ -40,12 +40,12 @@
>  #include <linux/kgdb.h>
>  #include <linux/ftrace.h>
>  #include <linux/kasan.h>
> -#include <linux/moduleloader.h>
>  #include <linux/objtool.h>
>  #include <linux/vmalloc.h>
>  #include <linux/pgtable.h>
>  #include <linux/set_memory.h>
>  #include <linux/cfi.h>
> +#include <linux/execmem.h>
>  
>  #include <asm/text-patching.h>
>  #include <asm/cacheflush.h>
> @@ -495,7 +495,7 @@ void *alloc_insn_page(void)
>  {
>  	void *page;
>  
> -	page = module_alloc(PAGE_SIZE);
> +	page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
>  	if (!page)
>  		return NULL;
>  
> diff --git a/include/linux/execmem.h b/include/linux/execmem.h
> new file mode 100644
> index 000000000000..43e7995593a1
> --- /dev/null
> +++ b/include/linux/execmem.h
> @@ -0,0 +1,57 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_EXECMEM_ALLOC_H
> +#define _LINUX_EXECMEM_ALLOC_H
> +
> +#include <linux/types.h>
> +#include <linux/moduleloader.h>
> +
> +/**
> + * enum execmem_type - types of executable memory ranges
> + *
> + * There are several subsystems that allocate executable memory.
> + * Architectures define different restrictions on placement,
> + * permissions, alignment and other parameters for memory that can be used
> + * by these subsystems.
> + * Types in this enum identify subsystems that allocate executable memory
> + * and let architectures define parameters for ranges suitable for
> + * allocations by each subsystem.
> + *
> + * @EXECMEM_DEFAULT: default parameters that would be used for types that
> + * are not explcitly defined.
> + * @EXECMEM_MODULE_TEXT: parameters for module text sections
> + * @EXECMEM_KPROBES: parameters for kprobes
> + * @EXECMEM_FTRACE: parameters for ftrace
> + * @EXECMEM_BPF: parameters for BPF
> + * @EXECMEM_TYPE_MAX:
> + */
> +enum execmem_type {
> +	EXECMEM_DEFAULT,
> +	EXECMEM_MODULE_TEXT = EXECMEM_DEFAULT,
> +	EXECMEM_KPROBES,
> +	EXECMEM_FTRACE,
> +	EXECMEM_BPF,
> +	EXECMEM_TYPE_MAX,
> +};
> +
> +/**
> + * execmem_alloc - allocate executable memory
> + * @type: type of the allocation
> + * @size: how many bytes of memory are required
> + *
> + * Allocates memory that will contain executable code, either generated or
> + * loaded from kernel modules.
> + *
> + * The memory will have protections defined by architecture for executable
> + * region of the @type.
> + *
> + * Return: a pointer to the allocated memory or %NULL
> + */
> +void *execmem_alloc(enum execmem_type type, size_t size);
> +
> +/**
> + * execmem_free - free executable memory
> + * @ptr: pointer to the memory that should be freed
> + */
> +void execmem_free(void *ptr);
> +
> +#endif /* _LINUX_EXECMEM_ALLOC_H */
> diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
> index 89b1e0ed9811..a3b8caee9405 100644
> --- a/include/linux/moduleloader.h
> +++ b/include/linux/moduleloader.h
> @@ -29,9 +29,6 @@ unsigned int arch_mod_section_prepend(struct module *mod, unsigned int section);
>     sections.  Returns NULL on failure. */
>  void *module_alloc(unsigned long size);
>  
> -/* Free memory returned from module_alloc. */
> -void module_memfree(void *module_region);
> -
>  /* Determines if the section name is an init section (that is only used during
>   * module loading).
>   */
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 696bc55de8e8..75a54024e2f4 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -22,7 +22,6 @@
>  #include <linux/skbuff.h>
>  #include <linux/vmalloc.h>
>  #include <linux/random.h>
> -#include <linux/moduleloader.h>
>  #include <linux/bpf.h>
>  #include <linux/btf.h>
>  #include <linux/objtool.h>
> @@ -37,6 +36,7 @@
>  #include <linux/nospec.h>
>  #include <linux/bpf_mem_alloc.h>
>  #include <linux/memcontrol.h>
> +#include <linux/execmem.h>
>  
>  #include <asm/barrier.h>
>  #include <asm/unaligned.h>
> @@ -1050,12 +1050,12 @@ void bpf_jit_uncharge_modmem(u32 size)
>  
>  void *__weak bpf_jit_alloc_exec(unsigned long size)
>  {
> -	return module_alloc(size);
> +	return execmem_alloc(EXECMEM_BPF, size);
>  }
>  
>  void __weak bpf_jit_free_exec(void *addr)
>  {
> -	module_memfree(addr);
> +	execmem_free(addr);
>  }
>  
>  struct bpf_binary_header *
> diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> index 9d9095e81792..047ca629ce49 100644
> --- a/kernel/kprobes.c
> +++ b/kernel/kprobes.c
> @@ -26,7 +26,6 @@
>  #include <linux/slab.h>
>  #include <linux/stddef.h>
>  #include <linux/export.h>
> -#include <linux/moduleloader.h>
>  #include <linux/kallsyms.h>
>  #include <linux/freezer.h>
>  #include <linux/seq_file.h>
> @@ -39,6 +38,7 @@
>  #include <linux/jump_label.h>
>  #include <linux/static_call.h>
>  #include <linux/perf_event.h>
> +#include <linux/execmem.h>
>  
>  #include <asm/sections.h>
>  #include <asm/cacheflush.h>
> @@ -113,17 +113,17 @@ enum kprobe_slot_state {
>  void __weak *alloc_insn_page(void)
>  {
>  	/*
> -	 * Use module_alloc() so this page is within +/- 2GB of where the
> +	 * Use execmem_alloc() so this page is within +/- 2GB of where the
>  	 * kernel image and loaded module images reside. This is required
>  	 * for most of the architectures.
>  	 * (e.g. x86-64 needs this to handle the %rip-relative fixups.)
>  	 */
> -	return module_alloc(PAGE_SIZE);
> +	return execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
>  }
>  
>  static void free_insn_page(void *page)
>  {
> -	module_memfree(page);
> +	execmem_free(page);
>  }
>  
>  struct kprobe_insn_cache kprobe_insn_slots = {
> diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
> index f3e0329337f6..744383c1eed1 100644
> --- a/kernel/module/Kconfig
> +++ b/kernel/module/Kconfig
> @@ -2,6 +2,7 @@
>  menuconfig MODULES
>  	bool "Enable loadable module support"
>  	modules
> +	select EXECMEM
>  	help
>  	  Kernel modules are small pieces of compiled code which can
>  	  be inserted in the running kernel, rather than being
> diff --git a/kernel/module/main.c b/kernel/module/main.c
> index 5b82b069e0d3..d56b7df0cbb6 100644
> --- a/kernel/module/main.c
> +++ b/kernel/module/main.c
> @@ -57,6 +57,7 @@
>  #include <linux/audit.h>
>  #include <linux/cfi.h>
>  #include <linux/debugfs.h>
> +#include <linux/execmem.h>
>  #include <uapi/linux/module.h>
>  #include "internal.h"
>  
> @@ -1179,16 +1180,6 @@ resolve_symbol_wait(struct module *mod,
>  	return ksym;
>  }
>  
> -void __weak module_memfree(void *module_region)
> -{
> -	/*
> -	 * This memory may be RO, and freeing RO memory in an interrupt is not
> -	 * supported by vmalloc.
> -	 */
> -	WARN_ON(in_interrupt());
> -	vfree(module_region);
> -}
> -
>  void __weak module_arch_cleanup(struct module *mod)
>  {
>  }
> @@ -1213,7 +1204,7 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
>  	if (mod_mem_use_vmalloc(type))
>  		ptr = vmalloc(size);
>  	else
> -		ptr = module_alloc(size);
> +		ptr = execmem_alloc(EXECMEM_MODULE_TEXT, size);
>  
>  	if (!ptr)
>  		return -ENOMEM;
> @@ -1244,7 +1235,7 @@ static void module_memory_free(struct module *mod, enum mod_mem_type type)
>  	if (mod_mem_use_vmalloc(type))
>  		vfree(ptr);
>  	else
> -		module_memfree(ptr);
> +		execmem_free(ptr);
>  }
>  
>  static void free_mod_mem(struct module *mod)
> @@ -2496,9 +2487,9 @@ static void do_free_init(struct work_struct *w)
>  
>  	llist_for_each_safe(pos, n, list) {
>  		initfree = container_of(pos, struct mod_initfree, node);
> -		module_memfree(initfree->init_text);
> -		module_memfree(initfree->init_data);
> -		module_memfree(initfree->init_rodata);
> +		execmem_free(initfree->init_text);
> +		execmem_free(initfree->init_data);
> +		execmem_free(initfree->init_rodata);
>  		kfree(initfree);
>  	}
>  }
> @@ -2608,10 +2599,10 @@ static noinline int do_init_module(struct module *mod)
>  	 * We want to free module_init, but be aware that kallsyms may be
>  	 * walking this with preempt disabled.  In all the failure paths, we
>  	 * call synchronize_rcu(), but we don't want to slow down the success
> -	 * path. module_memfree() cannot be called in an interrupt, so do the
> +	 * path. execmem_free() cannot be called in an interrupt, so do the
>  	 * work and call synchronize_rcu() in a work queue.
>  	 *
> -	 * Note that module_alloc() on most architectures creates W+X page
> +	 * Note that execmem_alloc() on most architectures creates W+X page
>  	 * mappings which won't be cleaned up until do_free_init() runs.  Any
>  	 * code such as mark_rodata_ro() which depends on those mappings to
>  	 * be cleaned up needs to sync with the queued work by invoking
> diff --git a/mm/Kconfig b/mm/Kconfig
> index b1448aa81e15..f08a216d4793 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -1241,6 +1241,9 @@ config LOCK_MM_AND_FIND_VMA
>  config IOMMU_MM_DATA
>  	bool
>  
> +config EXECMEM
> +	bool
> +
>  source "mm/damon/Kconfig"
>  
>  endmenu
> diff --git a/mm/Makefile b/mm/Makefile
> index 4abb40b911ec..001336c91864 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -133,3 +133,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o
>  obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
>  obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
>  obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
> +obj-$(CONFIG_EXECMEM) += execmem.o
> diff --git a/mm/execmem.c b/mm/execmem.c
> new file mode 100644
> index 000000000000..ed2ea41a2543
> --- /dev/null
> +++ b/mm/execmem.c
> @@ -0,0 +1,26 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/mm.h>
> +#include <linux/vmalloc.h>
> +#include <linux/execmem.h>
> +#include <linux/moduleloader.h>
> +
> +static void *__execmem_alloc(size_t size)
> +{
> +	return module_alloc(size);
> +}
> +
> +void *execmem_alloc(enum execmem_type type, size_t size)
> +{
> +	return __execmem_alloc(size);
> +}
> +
> +void execmem_free(void *ptr)
> +{
> +	/*
> +	 * This memory may be RO, and freeing RO memory in an interrupt is not
> +	 * supported by vmalloc.
> +	 */
> +	WARN_ON(in_interrupt());
> +	vfree(ptr);
> +}
> -- 
> 2.43.0
> 
>
Song Liu April 17, 2024, 11:32 p.m. UTC | #10
On Tue, Apr 16, 2024 at 12:23 AM Mike Rapoport <rppt@kernel.org> wrote:
>
> On Mon, Apr 15, 2024 at 06:36:39PM +0100, Mark Rutland wrote:
> > On Mon, Apr 15, 2024 at 09:52:41AM +0200, Peter Zijlstra wrote:
> > > On Thu, Apr 11, 2024 at 07:00:41PM +0300, Mike Rapoport wrote:
> > > > +/**
> > > > + * enum execmem_type - types of executable memory ranges
> > > > + *
> > > > + * There are several subsystems that allocate executable memory.
> > > > + * Architectures define different restrictions on placement,
> > > > + * permissions, alignment and other parameters for memory that can be used
> > > > + * by these subsystems.
> > > > + * Types in this enum identify subsystems that allocate executable memory
> > > > + * and let architectures define parameters for ranges suitable for
> > > > + * allocations by each subsystem.
> > > > + *
> > > > + * @EXECMEM_DEFAULT: default parameters that would be used for types that
> > > > + * are not explcitly defined.
> > > > + * @EXECMEM_MODULE_TEXT: parameters for module text sections
> > > > + * @EXECMEM_KPROBES: parameters for kprobes
> > > > + * @EXECMEM_FTRACE: parameters for ftrace
> > > > + * @EXECMEM_BPF: parameters for BPF
> > > > + * @EXECMEM_TYPE_MAX:
> > > > + */
> > > > +enum execmem_type {
> > > > + EXECMEM_DEFAULT,
> > > > + EXECMEM_MODULE_TEXT = EXECMEM_DEFAULT,
> > > > + EXECMEM_KPROBES,
> > > > + EXECMEM_FTRACE,
> > > > + EXECMEM_BPF,
> > > > + EXECMEM_TYPE_MAX,
> > > > +};
> > >
> > > Can we please get a break-down of how all these types are actually
> > > different from one another?
> > >
> > > I'm thinking some platforms have a tiny immediate space (arm64 comes to
> > > mind) and has less strict placement constraints for some of them?
> >
> > Yeah, and really I'd *much* rather deal with that in arch code, as I have said
> > several times.
> >
> > For arm64 we have two bsaic restrictions:
> >
> > 1) Direct branches can go +/-128M
> >    We can expand this range by having direct branches go to PLTs, at a
> >    performance cost.
> >
> > 2) PREL32 relocations can go +/-2G
> >    We cannot expand this further.
> >
> > * We don't need to allocate memory for ftrace. We do not use trampolines.
> >
> > * Kprobes XOL areas don't care about either of those; we don't place any
> >   PC-relative instructions in those. Maybe we want to in future.
> >
> > * Modules care about both; we'd *prefer* to place them within +/-128M of all
> >   other kernel/module code, but if there's no space we can use PLTs and expand
> >   that to +/-2G. Since modules can refreence other modules, that ends up
> >   actually being halved, and modules have to fit within some 2G window that
> >   also covers the kernel.

Is +/- 2G enough for all realistic use cases? If so, I guess we don't
really need
EXECMEM_ANYWHERE below?

> >
> > * I'm not sure about BPF's requirements; it seems happy doing the same as
> >   modules.
>
> BPF are happy with vmalloc().
>
> > So if we *must* use a common execmem allocator, what we'd reall want is our own
> > types, e.g.
> >
> >       EXECMEM_ANYWHERE
> >       EXECMEM_NOPLT
> >       EXECMEM_PREL32
> >
> > ... and then we use those in arch code to implement module_alloc() and friends.
>
> I'm looking at execmem_types more as definition of the consumers, maybe I
> should have named the enum execmem_consumer at the first place.

I think looking at execmem_type from consumers' point of view adds
unnecessary complexity. IIUC, for most (if not all) archs, ftrace, kprobe,
and bpf (and maybe also module text) all have the same requirements.
Did I miss something?

IOW, we have

enum execmem_type {
        EXECMEM_DEFAULT,
        EXECMEM_TEXT,
        EXECMEM_KPROBES = EXECMEM_TEXT,
        EXECMEM_FTRACE = EXECMEM_TEXT,
        EXECMEM_BPF = EXECMEM_TEXT,      /* we may end up without
_KPROBE, _FTRACE, _BPF */
        EXECMEM_DATA,  /* rw */
        EXECMEM_RO_DATA,
        EXECMEM_RO_AFTER_INIT,
        EXECMEM_TYPE_MAX,
};

Does this make sense?

Thanks,
Song
Mike Rapoport April 18, 2024, 3:35 p.m. UTC | #11
On Wed, Apr 17, 2024 at 04:32:49PM -0700, Song Liu wrote:
> On Tue, Apr 16, 2024 at 12:23 AM Mike Rapoport <rppt@kernel.org> wrote:
> >
> > On Mon, Apr 15, 2024 at 06:36:39PM +0100, Mark Rutland wrote:
> > > On Mon, Apr 15, 2024 at 09:52:41AM +0200, Peter Zijlstra wrote:
> > > > On Thu, Apr 11, 2024 at 07:00:41PM +0300, Mike Rapoport wrote:
> > > > > +/**
> > > > > + * enum execmem_type - types of executable memory ranges
> > > > > + *
> > > > > + * There are several subsystems that allocate executable memory.
> > > > > + * Architectures define different restrictions on placement,
> > > > > + * permissions, alignment and other parameters for memory that can be used
> > > > > + * by these subsystems.
> > > > > + * Types in this enum identify subsystems that allocate executable memory
> > > > > + * and let architectures define parameters for ranges suitable for
> > > > > + * allocations by each subsystem.
> > > > > + *
> > > > > + * @EXECMEM_DEFAULT: default parameters that would be used for types that
> > > > > + * are not explcitly defined.
> > > > > + * @EXECMEM_MODULE_TEXT: parameters for module text sections
> > > > > + * @EXECMEM_KPROBES: parameters for kprobes
> > > > > + * @EXECMEM_FTRACE: parameters for ftrace
> > > > > + * @EXECMEM_BPF: parameters for BPF
> > > > > + * @EXECMEM_TYPE_MAX:
> > > > > + */
> > > > > +enum execmem_type {
> > > > > + EXECMEM_DEFAULT,
> > > > > + EXECMEM_MODULE_TEXT = EXECMEM_DEFAULT,
> > > > > + EXECMEM_KPROBES,
> > > > > + EXECMEM_FTRACE,
> > > > > + EXECMEM_BPF,
> > > > > + EXECMEM_TYPE_MAX,
> > > > > +};
> > > >
> > > > Can we please get a break-down of how all these types are actually
> > > > different from one another?
> > > >
> > > > I'm thinking some platforms have a tiny immediate space (arm64 comes to
> > > > mind) and has less strict placement constraints for some of them?
> > >
> > > Yeah, and really I'd *much* rather deal with that in arch code, as I have said
> > > several times.
> > >
> > > For arm64 we have two bsaic restrictions:
> > >
> > > 1) Direct branches can go +/-128M
> > >    We can expand this range by having direct branches go to PLTs, at a
> > >    performance cost.
> > >
> > > 2) PREL32 relocations can go +/-2G
> > >    We cannot expand this further.
> > >
> > > * We don't need to allocate memory for ftrace. We do not use trampolines.
> > >
> > > * Kprobes XOL areas don't care about either of those; we don't place any
> > >   PC-relative instructions in those. Maybe we want to in future.
> > >
> > > * Modules care about both; we'd *prefer* to place them within +/-128M of all
> > >   other kernel/module code, but if there's no space we can use PLTs and expand
> > >   that to +/-2G. Since modules can refreence other modules, that ends up
> > >   actually being halved, and modules have to fit within some 2G window that
> > >   also covers the kernel.
> 
> Is +/- 2G enough for all realistic use cases? If so, I guess we don't
> really need
> EXECMEM_ANYWHERE below?
> 
> > >
> > > * I'm not sure about BPF's requirements; it seems happy doing the same as
> > >   modules.
> >
> > BPF are happy with vmalloc().
> >
> > > So if we *must* use a common execmem allocator, what we'd reall want is our own
> > > types, e.g.
> > >
> > >       EXECMEM_ANYWHERE
> > >       EXECMEM_NOPLT
> > >       EXECMEM_PREL32
> > >
> > > ... and then we use those in arch code to implement module_alloc() and friends.
> >
> > I'm looking at execmem_types more as definition of the consumers, maybe I
> > should have named the enum execmem_consumer at the first place.
> 
> I think looking at execmem_type from consumers' point of view adds
> unnecessary complexity. IIUC, for most (if not all) archs, ftrace, kprobe,
> and bpf (and maybe also module text) all have the same requirements.
> Did I miss something?

It's enough to have one architecture with different constrains for kprobes
and bpf to warrant a type for each.

Where do you see unnecessary complexity?
 
> IOW, we have
> 
> enum execmem_type {
>         EXECMEM_DEFAULT,
>         EXECMEM_TEXT,
>         EXECMEM_KPROBES = EXECMEM_TEXT,
>         EXECMEM_FTRACE = EXECMEM_TEXT,
>         EXECMEM_BPF = EXECMEM_TEXT,      /* we may end up without
> _KPROBE, _FTRACE, _BPF */
>         EXECMEM_DATA,  /* rw */
>         EXECMEM_RO_DATA,
>         EXECMEM_RO_AFTER_INIT,
>         EXECMEM_TYPE_MAX,
> };
> 
> Does this make sense?
 
How do you suggest to deal with e.g. riscv that has separate address spaces
for modules, kprobes and bpf?

> Thanks,
> Song
Song Liu April 18, 2024, 4:13 p.m. UTC | #12
On Thu, Apr 18, 2024 at 8:37 AM Mike Rapoport <rppt@kernel.org> wrote:
>
[...]
> >
> > Is +/- 2G enough for all realistic use cases? If so, I guess we don't
> > really need
> > EXECMEM_ANYWHERE below?
> >
> > > >
> > > > * I'm not sure about BPF's requirements; it seems happy doing the same as
> > > >   modules.
> > >
> > > BPF are happy with vmalloc().
> > >
> > > > So if we *must* use a common execmem allocator, what we'd reall want is our own
> > > > types, e.g.
> > > >
> > > >       EXECMEM_ANYWHERE
> > > >       EXECMEM_NOPLT
> > > >       EXECMEM_PREL32
> > > >
> > > > ... and then we use those in arch code to implement module_alloc() and friends.
> > >
> > > I'm looking at execmem_types more as definition of the consumers, maybe I
> > > should have named the enum execmem_consumer at the first place.
> >
> > I think looking at execmem_type from consumers' point of view adds
> > unnecessary complexity. IIUC, for most (if not all) archs, ftrace, kprobe,
> > and bpf (and maybe also module text) all have the same requirements.
> > Did I miss something?
>
> It's enough to have one architecture with different constrains for kprobes
> and bpf to warrant a type for each.
>

AFAICT, some of these constraints can be changed without too much work.

> Where do you see unnecessary complexity?
>
> > IOW, we have
> >
> > enum execmem_type {
> >         EXECMEM_DEFAULT,
> >         EXECMEM_TEXT,
> >         EXECMEM_KPROBES = EXECMEM_TEXT,
> >         EXECMEM_FTRACE = EXECMEM_TEXT,
> >         EXECMEM_BPF = EXECMEM_TEXT,      /* we may end up without
> > _KPROBE, _FTRACE, _BPF */
> >         EXECMEM_DATA,  /* rw */
> >         EXECMEM_RO_DATA,
> >         EXECMEM_RO_AFTER_INIT,
> >         EXECMEM_TYPE_MAX,
> > };
> >
> > Does this make sense?
>
> How do you suggest to deal with e.g. riscv that has separate address spaces
> for modules, kprobes and bpf?

IIUC, modules and bpf use the same address space on riscv, while kprobes use
vmalloc address. I haven't tried this yet, but I think we can let
kprobes use the
same space as modules and bpf, which is:

ffffffff00000000 |  -4     GB | ffffffff7fffffff |    2 GB | modules, BPF

Did I get this right?

Thanks,
Song
Mike Rapoport April 18, 2024, 5:52 p.m. UTC | #13
On Thu, Apr 18, 2024 at 09:13:27AM -0700, Song Liu wrote:
> On Thu, Apr 18, 2024 at 8:37 AM Mike Rapoport <rppt@kernel.org> wrote:
> > > >
> > > > I'm looking at execmem_types more as definition of the consumers, maybe I
> > > > should have named the enum execmem_consumer at the first place.
> > >
> > > I think looking at execmem_type from consumers' point of view adds
> > > unnecessary complexity. IIUC, for most (if not all) archs, ftrace, kprobe,
> > > and bpf (and maybe also module text) all have the same requirements.
> > > Did I miss something?
> >
> > It's enough to have one architecture with different constrains for kprobes
> > and bpf to warrant a type for each.
> 
> AFAICT, some of these constraints can be changed without too much work.

But why?
I honestly don't understand what are you trying to optimize here. A few
lines of initialization in execmem_info?
What is the advantage in forcing architectures to have imposed limits on
kprobes or bpf allocations?

> > Where do you see unnecessary complexity?
> >
> > > IOW, we have
> > >
> > > enum execmem_type {
> > >         EXECMEM_DEFAULT,
> > >         EXECMEM_TEXT,
> > >         EXECMEM_KPROBES = EXECMEM_TEXT,
> > >         EXECMEM_FTRACE = EXECMEM_TEXT,
> > >         EXECMEM_BPF = EXECMEM_TEXT,      /* we may end up without
> > > _KPROBE, _FTRACE, _BPF */
> > >         EXECMEM_DATA,  /* rw */
> > >         EXECMEM_RO_DATA,
> > >         EXECMEM_RO_AFTER_INIT,
> > >         EXECMEM_TYPE_MAX,
> > > };
> > >
> > > Does this make sense?
> >
> > How do you suggest to deal with e.g. riscv that has separate address spaces
> > for modules, kprobes and bpf?
> 
> IIUC, modules and bpf use the same address space on riscv

Not exactly, bpf is a subset of modules on riscv.

> while kprobes use vmalloc address.

The whole point of using the entire vmalloc for kprobes is to avoid
pollution of limited modules space.
 
> Thanks,
> Song
Song Liu April 18, 2024, 9:01 p.m. UTC | #14
On Thu, Apr 18, 2024 at 10:54 AM Mike Rapoport <rppt@kernel.org> wrote:
>
> On Thu, Apr 18, 2024 at 09:13:27AM -0700, Song Liu wrote:
> > On Thu, Apr 18, 2024 at 8:37 AM Mike Rapoport <rppt@kernel.org> wrote:
> > > > >
> > > > > I'm looking at execmem_types more as definition of the consumers, maybe I
> > > > > should have named the enum execmem_consumer at the first place.
> > > >
> > > > I think looking at execmem_type from consumers' point of view adds
> > > > unnecessary complexity. IIUC, for most (if not all) archs, ftrace, kprobe,
> > > > and bpf (and maybe also module text) all have the same requirements.
> > > > Did I miss something?
> > >
> > > It's enough to have one architecture with different constrains for kprobes
> > > and bpf to warrant a type for each.
> >
> > AFAICT, some of these constraints can be changed without too much work.
>
> But why?
> I honestly don't understand what are you trying to optimize here. A few
> lines of initialization in execmem_info?

IIUC, having separate EXECMEM_BPF and EXECMEM_KPROBE makes it
harder for bpf and kprobe to share the same ROX page. In many use cases,
a 2MiB page (assuming x86_64) is enough for all BPF, kprobe, ftrace, and
module text. It is not efficient if we have to allocate separate pages for each
of these use cases. If this is not a problem, the current approach works.

Thanks,
Song
Mike Rapoport April 19, 2024, 6:55 a.m. UTC | #15
On Thu, Apr 18, 2024 at 02:01:22PM -0700, Song Liu wrote:
> On Thu, Apr 18, 2024 at 10:54 AM Mike Rapoport <rppt@kernel.org> wrote:
> >
> > On Thu, Apr 18, 2024 at 09:13:27AM -0700, Song Liu wrote:
> > > On Thu, Apr 18, 2024 at 8:37 AM Mike Rapoport <rppt@kernel.org> wrote:
> > > > > >
> > > > > > I'm looking at execmem_types more as definition of the consumers, maybe I
> > > > > > should have named the enum execmem_consumer at the first place.
> > > > >
> > > > > I think looking at execmem_type from consumers' point of view adds
> > > > > unnecessary complexity. IIUC, for most (if not all) archs, ftrace, kprobe,
> > > > > and bpf (and maybe also module text) all have the same requirements.
> > > > > Did I miss something?
> > > >
> > > > It's enough to have one architecture with different constrains for kprobes
> > > > and bpf to warrant a type for each.
> > >
> > > AFAICT, some of these constraints can be changed without too much work.
> >
> > But why?
> > I honestly don't understand what are you trying to optimize here. A few
> > lines of initialization in execmem_info?
> 
> IIUC, having separate EXECMEM_BPF and EXECMEM_KPROBE makes it
> harder for bpf and kprobe to share the same ROX page. In many use cases,
> a 2MiB page (assuming x86_64) is enough for all BPF, kprobe, ftrace, and
> module text. It is not efficient if we have to allocate separate pages for each
> of these use cases. If this is not a problem, the current approach works.

The caching of large ROX pages does not need to be per type. 

In the POC I've posted for caching of large ROX pages on x86 [1], the cache is
global and to make kprobes and bpf use it it's enough to set a flag in
execmem_info.

[1] https://lore.kernel.org/all/20240411160526.2093408-1-rppt@kernel.org

> Thanks,
> Song
Song Liu April 19, 2024, 3:54 p.m. UTC | #16
On Thu, Apr 18, 2024 at 11:56 PM Mike Rapoport <rppt@kernel.org> wrote:
>
> On Thu, Apr 18, 2024 at 02:01:22PM -0700, Song Liu wrote:
> > On Thu, Apr 18, 2024 at 10:54 AM Mike Rapoport <rppt@kernel.org> wrote:
> > >
> > > On Thu, Apr 18, 2024 at 09:13:27AM -0700, Song Liu wrote:
> > > > On Thu, Apr 18, 2024 at 8:37 AM Mike Rapoport <rppt@kernel.org> wrote:
> > > > > > >
> > > > > > > I'm looking at execmem_types more as definition of the consumers, maybe I
> > > > > > > should have named the enum execmem_consumer at the first place.
> > > > > >
> > > > > > I think looking at execmem_type from consumers' point of view adds
> > > > > > unnecessary complexity. IIUC, for most (if not all) archs, ftrace, kprobe,
> > > > > > and bpf (and maybe also module text) all have the same requirements.
> > > > > > Did I miss something?
> > > > >
> > > > > It's enough to have one architecture with different constrains for kprobes
> > > > > and bpf to warrant a type for each.
> > > >
> > > > AFAICT, some of these constraints can be changed without too much work.
> > >
> > > But why?
> > > I honestly don't understand what are you trying to optimize here. A few
> > > lines of initialization in execmem_info?
> >
> > IIUC, having separate EXECMEM_BPF and EXECMEM_KPROBE makes it
> > harder for bpf and kprobe to share the same ROX page. In many use cases,
> > a 2MiB page (assuming x86_64) is enough for all BPF, kprobe, ftrace, and
> > module text. It is not efficient if we have to allocate separate pages for each
> > of these use cases. If this is not a problem, the current approach works.
>
> The caching of large ROX pages does not need to be per type.
>
> In the POC I've posted for caching of large ROX pages on x86 [1], the cache is
> global and to make kprobes and bpf use it it's enough to set a flag in
> execmem_info.
>
> [1] https://lore.kernel.org/all/20240411160526.2093408-1-rppt@kernel.org

For the ROX to work, we need different users (module text, kprobe, etc.) to have
the same execmem_range. From [1]:

static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
{
...
       p = __execmem_cache_alloc(size);
       if (p)
               return p;
      err = execmem_cache_populate(range, size);
...
}

We are calling __execmem_cache_alloc() without range. For this to work,
we can only call execmem_cache_alloc() with one execmem_range.

Did I miss something?

Thanks,
Song
Mike Rapoport April 19, 2024, 5:02 p.m. UTC | #17
On Fri, Apr 19, 2024 at 08:54:40AM -0700, Song Liu wrote:
> On Thu, Apr 18, 2024 at 11:56 PM Mike Rapoport <rppt@kernel.org> wrote:
> >
> > On Thu, Apr 18, 2024 at 02:01:22PM -0700, Song Liu wrote:
> > > On Thu, Apr 18, 2024 at 10:54 AM Mike Rapoport <rppt@kernel.org> wrote:
> > > >
> > > > On Thu, Apr 18, 2024 at 09:13:27AM -0700, Song Liu wrote:
> > > > > On Thu, Apr 18, 2024 at 8:37 AM Mike Rapoport <rppt@kernel.org> wrote:
> > > > > > > >
> > > > > > > > I'm looking at execmem_types more as definition of the consumers, maybe I
> > > > > > > > should have named the enum execmem_consumer at the first place.
> > > > > > >
> > > > > > > I think looking at execmem_type from consumers' point of view adds
> > > > > > > unnecessary complexity. IIUC, for most (if not all) archs, ftrace, kprobe,
> > > > > > > and bpf (and maybe also module text) all have the same requirements.
> > > > > > > Did I miss something?
> > > > > >
> > > > > > It's enough to have one architecture with different constrains for kprobes
> > > > > > and bpf to warrant a type for each.
> > > > >
> > > > > AFAICT, some of these constraints can be changed without too much work.
> > > >
> > > > But why?
> > > > I honestly don't understand what are you trying to optimize here. A few
> > > > lines of initialization in execmem_info?
> > >
> > > IIUC, having separate EXECMEM_BPF and EXECMEM_KPROBE makes it
> > > harder for bpf and kprobe to share the same ROX page. In many use cases,
> > > a 2MiB page (assuming x86_64) is enough for all BPF, kprobe, ftrace, and
> > > module text. It is not efficient if we have to allocate separate pages for each
> > > of these use cases. If this is not a problem, the current approach works.
> >
> > The caching of large ROX pages does not need to be per type.
> >
> > In the POC I've posted for caching of large ROX pages on x86 [1], the cache is
> > global and to make kprobes and bpf use it it's enough to set a flag in
> > execmem_info.
> >
> > [1] https://lore.kernel.org/all/20240411160526.2093408-1-rppt@kernel.org
> 
> For the ROX to work, we need different users (module text, kprobe, etc.) to have
> the same execmem_range. From [1]:
> 
> static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
> {
> ...
>        p = __execmem_cache_alloc(size);
>        if (p)
>                return p;
>       err = execmem_cache_populate(range, size);
> ...
> }
> 
> We are calling __execmem_cache_alloc() without range. For this to work,
> we can only call execmem_cache_alloc() with one execmem_range.

Actually, on x86 this will "just work" because everything shares the same
address space :)

The 2M pages in the cache will be in the modules space, so
__execmem_cache_alloc() will always return memory from that address space.

For other architectures this indeed needs to be fixed with passing the
range to __execmem_cache_alloc() and limiting search in the cache for that
range.
 
> Did I miss something?
> 
> Thanks,
> Song
Song Liu April 19, 2024, 5:32 p.m. UTC | #18
On Fri, Apr 19, 2024 at 10:03 AM Mike Rapoport <rppt@kernel.org> wrote:
[...]
> > >
> > > [1] https://lore.kernel.org/all/20240411160526.2093408-1-rppt@kernel.org
> >
> > For the ROX to work, we need different users (module text, kprobe, etc.) to have
> > the same execmem_range. From [1]:
> >
> > static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
> > {
> > ...
> >        p = __execmem_cache_alloc(size);
> >        if (p)
> >                return p;
> >       err = execmem_cache_populate(range, size);
> > ...
> > }
> >
> > We are calling __execmem_cache_alloc() without range. For this to work,
> > we can only call execmem_cache_alloc() with one execmem_range.
>
> Actually, on x86 this will "just work" because everything shares the same
> address space :)
>
> The 2M pages in the cache will be in the modules space, so
> __execmem_cache_alloc() will always return memory from that address space.
>
> For other architectures this indeed needs to be fixed with passing the
> range to __execmem_cache_alloc() and limiting search in the cache for that
> range.

I think we at least need the "map to" concept (initially proposed by Thomas)
to get this work. For example, EXECMEM_BPF and EXECMEM_KPROBE
maps to EXECMEM_MODULE_TEXT, so that all these actually share
the same range.

Does this make sense?

Song
Mike Rapoport April 19, 2024, 7:59 p.m. UTC | #19
On Fri, Apr 19, 2024 at 10:32:39AM -0700, Song Liu wrote:
> On Fri, Apr 19, 2024 at 10:03 AM Mike Rapoport <rppt@kernel.org> wrote:
> [...]
> > > >
> > > > [1] https://lore.kernel.org/all/20240411160526.2093408-1-rppt@kernel.org
> > >
> > > For the ROX to work, we need different users (module text, kprobe, etc.) to have
> > > the same execmem_range. From [1]:
> > >
> > > static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
> > > {
> > > ...
> > >        p = __execmem_cache_alloc(size);
> > >        if (p)
> > >                return p;
> > >       err = execmem_cache_populate(range, size);
> > > ...
> > > }
> > >
> > > We are calling __execmem_cache_alloc() without range. For this to work,
> > > we can only call execmem_cache_alloc() with one execmem_range.
> >
> > Actually, on x86 this will "just work" because everything shares the same
> > address space :)
> >
> > The 2M pages in the cache will be in the modules space, so
> > __execmem_cache_alloc() will always return memory from that address space.
> >
> > For other architectures this indeed needs to be fixed with passing the
> > range to __execmem_cache_alloc() and limiting search in the cache for that
> > range.
> 
> I think we at least need the "map to" concept (initially proposed by Thomas)
> to get this work. For example, EXECMEM_BPF and EXECMEM_KPROBE
> maps to EXECMEM_MODULE_TEXT, so that all these actually share
> the same range.

Why?
 
> Does this make sense?
> 
> Song
Song Liu April 19, 2024, 9:42 p.m. UTC | #20
On Fri, Apr 19, 2024 at 1:00 PM Mike Rapoport <rppt@kernel.org> wrote:
>
> On Fri, Apr 19, 2024 at 10:32:39AM -0700, Song Liu wrote:
> > On Fri, Apr 19, 2024 at 10:03 AM Mike Rapoport <rppt@kernel.org> wrote:
> > [...]
> > > > >
> > > > > [1] https://lore.kernel.org/all/20240411160526.2093408-1-rppt@kernel.org
> > > >
> > > > For the ROX to work, we need different users (module text, kprobe, etc.) to have
> > > > the same execmem_range. From [1]:
> > > >
> > > > static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
> > > > {
> > > > ...
> > > >        p = __execmem_cache_alloc(size);
> > > >        if (p)
> > > >                return p;
> > > >       err = execmem_cache_populate(range, size);
> > > > ...
> > > > }
> > > >
> > > > We are calling __execmem_cache_alloc() without range. For this to work,
> > > > we can only call execmem_cache_alloc() with one execmem_range.
> > >
> > > Actually, on x86 this will "just work" because everything shares the same
> > > address space :)
> > >
> > > The 2M pages in the cache will be in the modules space, so
> > > __execmem_cache_alloc() will always return memory from that address space.
> > >
> > > For other architectures this indeed needs to be fixed with passing the
> > > range to __execmem_cache_alloc() and limiting search in the cache for that
> > > range.
> >
> > I think we at least need the "map to" concept (initially proposed by Thomas)
> > to get this work. For example, EXECMEM_BPF and EXECMEM_KPROBE
> > maps to EXECMEM_MODULE_TEXT, so that all these actually share
> > the same range.
>
> Why?

IIUC, we need to update __execmem_cache_alloc() to take a range pointer as
input. module text will use "range" for EXECMEM_MODULE_TEXT, while kprobe
will use "range" for EXECMEM_KPROBE. Without "map to" concept or sharing
the "range" object, we will have to compare different range parameters to check
we can share cached pages between module text and kprobe, which is not
efficient. Did I miss something?

Thanks,
Song
Mike Rapoport April 20, 2024, 4:22 a.m. UTC | #21
On Fri, Apr 19, 2024 at 02:42:16PM -0700, Song Liu wrote:
> On Fri, Apr 19, 2024 at 1:00 PM Mike Rapoport <rppt@kernel.org> wrote:
> >
> > On Fri, Apr 19, 2024 at 10:32:39AM -0700, Song Liu wrote:
> > > On Fri, Apr 19, 2024 at 10:03 AM Mike Rapoport <rppt@kernel.org> wrote:
> > > [...]
> > > > > >
> > > > > > [1] https://lore.kernel.org/all/20240411160526.2093408-1-rppt@kernel.org
> > > > >
> > > > > For the ROX to work, we need different users (module text, kprobe, etc.) to have
> > > > > the same execmem_range. From [1]:
> > > > >
> > > > > static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
> > > > > {
> > > > > ...
> > > > >        p = __execmem_cache_alloc(size);
> > > > >        if (p)
> > > > >                return p;
> > > > >       err = execmem_cache_populate(range, size);
> > > > > ...
> > > > > }
> > > > >
> > > > > We are calling __execmem_cache_alloc() without range. For this to work,
> > > > > we can only call execmem_cache_alloc() with one execmem_range.
> > > >
> > > > Actually, on x86 this will "just work" because everything shares the same
> > > > address space :)
> > > >
> > > > The 2M pages in the cache will be in the modules space, so
> > > > __execmem_cache_alloc() will always return memory from that address space.
> > > >
> > > > For other architectures this indeed needs to be fixed with passing the
> > > > range to __execmem_cache_alloc() and limiting search in the cache for that
> > > > range.
> > >
> > > I think we at least need the "map to" concept (initially proposed by Thomas)
> > > to get this work. For example, EXECMEM_BPF and EXECMEM_KPROBE
> > > maps to EXECMEM_MODULE_TEXT, so that all these actually share
> > > the same range.
> >
> > Why?
> 
> IIUC, we need to update __execmem_cache_alloc() to take a range pointer as
> input. module text will use "range" for EXECMEM_MODULE_TEXT, while kprobe
> will use "range" for EXECMEM_KPROBE. Without "map to" concept or sharing
> the "range" object, we will have to compare different range parameters to check
> we can share cached pages between module text and kprobe, which is not
> efficient. Did I miss something?

We can always share large ROX pages as long as they are within the correct
address space. The permissions for them are ROX and the alignment
differences are due to KASAN and this is handled during allocation of the
large page to refill the cache. __execmem_cache_alloc() only needs to limit
the search for the address space of the range.

And regardless, they way we deal with sharing of the cache can be sorted
out later.

> Thanks,
> Song
Masami Hiramatsu (Google) April 20, 2024, 9:11 a.m. UTC | #22
On Sat, 20 Apr 2024 07:22:50 +0300
Mike Rapoport <rppt@kernel.org> wrote:

> On Fri, Apr 19, 2024 at 02:42:16PM -0700, Song Liu wrote:
> > On Fri, Apr 19, 2024 at 1:00 PM Mike Rapoport <rppt@kernel.org> wrote:
> > >
> > > On Fri, Apr 19, 2024 at 10:32:39AM -0700, Song Liu wrote:
> > > > On Fri, Apr 19, 2024 at 10:03 AM Mike Rapoport <rppt@kernel.org> wrote:
> > > > [...]
> > > > > > >
> > > > > > > [1] https://lore.kernel.org/all/20240411160526.2093408-1-rppt@kernel.org
> > > > > >
> > > > > > For the ROX to work, we need different users (module text, kprobe, etc.) to have
> > > > > > the same execmem_range. From [1]:
> > > > > >
> > > > > > static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
> > > > > > {
> > > > > > ...
> > > > > >        p = __execmem_cache_alloc(size);
> > > > > >        if (p)
> > > > > >                return p;
> > > > > >       err = execmem_cache_populate(range, size);
> > > > > > ...
> > > > > > }
> > > > > >
> > > > > > We are calling __execmem_cache_alloc() without range. For this to work,
> > > > > > we can only call execmem_cache_alloc() with one execmem_range.
> > > > >
> > > > > Actually, on x86 this will "just work" because everything shares the same
> > > > > address space :)
> > > > >
> > > > > The 2M pages in the cache will be in the modules space, so
> > > > > __execmem_cache_alloc() will always return memory from that address space.
> > > > >
> > > > > For other architectures this indeed needs to be fixed with passing the
> > > > > range to __execmem_cache_alloc() and limiting search in the cache for that
> > > > > range.
> > > >
> > > > I think we at least need the "map to" concept (initially proposed by Thomas)
> > > > to get this work. For example, EXECMEM_BPF and EXECMEM_KPROBE
> > > > maps to EXECMEM_MODULE_TEXT, so that all these actually share
> > > > the same range.
> > >
> > > Why?
> > 
> > IIUC, we need to update __execmem_cache_alloc() to take a range pointer as
> > input. module text will use "range" for EXECMEM_MODULE_TEXT, while kprobe
> > will use "range" for EXECMEM_KPROBE. Without "map to" concept or sharing
> > the "range" object, we will have to compare different range parameters to check
> > we can share cached pages between module text and kprobe, which is not
> > efficient. Did I miss something?

Song, thanks for trying to eplain. I think I need to explain why I used
module_alloc() originally.

This depends on how kprobe features are implemented on the architecture, and
how much features are supported on kprobes.

Because kprobe jump optimization and kprobe jump-back optimization need to
use a jump instruction to jump into the trampoline and jump back from the
trampoline directly, if the architecuture jmp instruction supports +-2GB range
like x86, it needs to allocate the trampoline buffer inside such address space.
This requirement is similar to the modules (because module function needs to
call other functions in the kernel etc.), at least kprobes on x86 used
module_alloc().

However, if an architecture only supports breakpoint/trap based kprobe,
it does not need to consider whether the execmem is allocated.

> 
> We can always share large ROX pages as long as they are within the correct
> address space. The permissions for them are ROX and the alignment
> differences are due to KASAN and this is handled during allocation of the
> large page to refill the cache. __execmem_cache_alloc() only needs to limit
> the search for the address space of the range.

So I don't think EXECMEM_KPROBE always same as EXECMEM_MODULE_TEXT, it
should be configured for each arch. Especially, if it is only used for
searching parameter, it looks OK to me.

Thank you,

> 
> And regardless, they way we deal with sharing of the cache can be sorted
> out later.
> 
> > Thanks,
> > Song
> 
> -- 
> Sincerely yours,
> Mike.
>
Song Liu April 22, 2024, 6:32 p.m. UTC | #23
Hi Masami and Mike,

On Sat, Apr 20, 2024 at 2:11 AM Masami Hiramatsu <mhiramat@kernel.org> wrote:
[...]
> > >
> > > IIUC, we need to update __execmem_cache_alloc() to take a range pointer as
> > > input. module text will use "range" for EXECMEM_MODULE_TEXT, while kprobe
> > > will use "range" for EXECMEM_KPROBE. Without "map to" concept or sharing
> > > the "range" object, we will have to compare different range parameters to check
> > > we can share cached pages between module text and kprobe, which is not
> > > efficient. Did I miss something?
>
> Song, thanks for trying to eplain. I think I need to explain why I used
> module_alloc() originally.
>
> This depends on how kprobe features are implemented on the architecture, and
> how much features are supported on kprobes.
>
> Because kprobe jump optimization and kprobe jump-back optimization need to
> use a jump instruction to jump into the trampoline and jump back from the
> trampoline directly, if the architecuture jmp instruction supports +-2GB range
> like x86, it needs to allocate the trampoline buffer inside such address space.
> This requirement is similar to the modules (because module function needs to
> call other functions in the kernel etc.), at least kprobes on x86 used
> module_alloc().
>
> However, if an architecture only supports breakpoint/trap based kprobe,
> it does not need to consider whether the execmem is allocated.
>
> >
> > We can always share large ROX pages as long as they are within the correct
> > address space. The permissions for them are ROX and the alignment
> > differences are due to KASAN and this is handled during allocation of the
> > large page to refill the cache. __execmem_cache_alloc() only needs to limit
> > the search for the address space of the range.
>
> So I don't think EXECMEM_KPROBE always same as EXECMEM_MODULE_TEXT, it
> should be configured for each arch. Especially, if it is only used for
> searching parameter, it looks OK to me.

Thanks for the explanation!

I was thinking "we can have EXECMEM_KPROBE share the same parameters as
EXECMEM_MODULE_TEXT for all architectures". But this thought is built on top
of assumptions on future changes/improvements within multiple sub systems.
At this moment, I have no objections moving forward with current execmem APIs.

Thanks,
Song
diff mbox series

Patch

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index bbca90a5e2ec..9fcd01bb2ce6 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -19,8 +19,8 @@ 
 #include <linux/extable.h>
 #include <linux/kdebug.h>
 #include <linux/slab.h>
-#include <linux/moduleloader.h>
 #include <linux/set_memory.h>
+#include <linux/execmem.h>
 #include <asm/code-patching.h>
 #include <asm/cacheflush.h>
 #include <asm/sstep.h>
@@ -130,7 +130,7 @@  void *alloc_insn_page(void)
 {
 	void *page;
 
-	page = module_alloc(PAGE_SIZE);
+	page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
 	if (!page)
 		return NULL;
 
@@ -142,7 +142,7 @@  void *alloc_insn_page(void)
 	}
 	return page;
 error:
-	module_memfree(page);
+	execmem_free(page);
 	return NULL;
 }
 
diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index c46381ea04ec..798249ef5646 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -7,13 +7,13 @@ 
  *   Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
  */
 
-#include <linux/moduleloader.h>
 #include <linux/hardirq.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/kprobes.h>
+#include <linux/execmem.h>
 #include <trace/syscall.h>
 #include <asm/asm-offsets.h>
 #include <asm/text-patching.h>
@@ -220,7 +220,7 @@  static int __init ftrace_plt_init(void)
 {
 	const char *start, *end;
 
-	ftrace_plt = module_alloc(PAGE_SIZE);
+	ftrace_plt = execmem_alloc(EXECMEM_FTRACE, PAGE_SIZE);
 	if (!ftrace_plt)
 		panic("cannot allocate ftrace plt\n");
 
diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
index f0cf20d4b3c5..3c1b1be744de 100644
--- a/arch/s390/kernel/kprobes.c
+++ b/arch/s390/kernel/kprobes.c
@@ -9,7 +9,6 @@ 
 
 #define pr_fmt(fmt) "kprobes: " fmt
 
-#include <linux/moduleloader.h>
 #include <linux/kprobes.h>
 #include <linux/ptrace.h>
 #include <linux/preempt.h>
@@ -21,6 +20,7 @@ 
 #include <linux/slab.h>
 #include <linux/hardirq.h>
 #include <linux/ftrace.h>
+#include <linux/execmem.h>
 #include <asm/set_memory.h>
 #include <asm/sections.h>
 #include <asm/dis.h>
@@ -38,7 +38,7 @@  void *alloc_insn_page(void)
 {
 	void *page;
 
-	page = module_alloc(PAGE_SIZE);
+	page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
 	if (!page)
 		return NULL;
 	set_memory_rox((unsigned long)page, 1);
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index 42215f9404af..ac97a905e8cd 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -21,6 +21,7 @@ 
 #include <linux/moduleloader.h>
 #include <linux/bug.h>
 #include <linux/memory.h>
+#include <linux/execmem.h>
 #include <asm/alternative.h>
 #include <asm/nospec-branch.h>
 #include <asm/facility.h>
@@ -76,7 +77,7 @@  void *module_alloc(unsigned long size)
 #ifdef CONFIG_FUNCTION_TRACER
 void module_arch_cleanup(struct module *mod)
 {
-	module_memfree(mod->arch.trampolines_start);
+	execmem_free(mod->arch.trampolines_start);
 }
 #endif
 
@@ -510,7 +511,7 @@  static int module_alloc_ftrace_hotpatch_trampolines(struct module *me,
 
 	size = FTRACE_HOTPATCH_TRAMPOLINES_SIZE(s->sh_size);
 	numpages = DIV_ROUND_UP(size, PAGE_SIZE);
-	start = module_alloc(numpages * PAGE_SIZE);
+	start = execmem_alloc(EXECMEM_FTRACE, numpages * PAGE_SIZE);
 	if (!start)
 		return -ENOMEM;
 	set_memory_rox((unsigned long)start, numpages);
diff --git a/arch/sparc/net/bpf_jit_comp_32.c b/arch/sparc/net/bpf_jit_comp_32.c
index da2df1e84ed4..bda2dbd3f4c5 100644
--- a/arch/sparc/net/bpf_jit_comp_32.c
+++ b/arch/sparc/net/bpf_jit_comp_32.c
@@ -1,10 +1,10 @@ 
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/moduleloader.h>
 #include <linux/workqueue.h>
 #include <linux/netdevice.h>
 #include <linux/filter.h>
 #include <linux/cache.h>
 #include <linux/if_vlan.h>
+#include <linux/execmem.h>
 
 #include <asm/cacheflush.h>
 #include <asm/ptrace.h>
@@ -713,7 +713,7 @@  cond_branch:			f_offset = addrs[i + filter[i].jf];
 				if (unlikely(proglen + ilen > oldproglen)) {
 					pr_err("bpb_jit_compile fatal error\n");
 					kfree(addrs);
-					module_memfree(image);
+					execmem_free(image);
 					return;
 				}
 				memcpy(image + proglen, temp, ilen);
@@ -736,7 +736,7 @@  cond_branch:			f_offset = addrs[i + filter[i].jf];
 			break;
 		}
 		if (proglen == oldproglen) {
-			image = module_alloc(proglen);
+			image = execmem_alloc(EXECMEM_BPF, proglen);
 			if (!image)
 				goto out;
 		}
@@ -758,7 +758,7 @@  cond_branch:			f_offset = addrs[i + filter[i].jf];
 void bpf_jit_free(struct bpf_prog *fp)
 {
 	if (fp->jited)
-		module_memfree(fp->bpf_func);
+		execmem_free(fp->bpf_func);
 
 	bpf_prog_unlock_free(fp);
 }
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 70139d9d2e01..c8ddb7abda7c 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -25,6 +25,7 @@ 
 #include <linux/memory.h>
 #include <linux/vmalloc.h>
 #include <linux/set_memory.h>
+#include <linux/execmem.h>
 
 #include <trace/syscall.h>
 
@@ -261,15 +262,14 @@  void arch_ftrace_update_code(int command)
 #ifdef CONFIG_X86_64
 
 #ifdef CONFIG_MODULES
-#include <linux/moduleloader.h>
 /* Module allocation simplifies allocating memory for code */
 static inline void *alloc_tramp(unsigned long size)
 {
-	return module_alloc(size);
+	return execmem_alloc(EXECMEM_FTRACE, size);
 }
 static inline void tramp_free(void *tramp)
 {
-	module_memfree(tramp);
+	execmem_free(tramp);
 }
 #else
 /* Trampolines can only be created if modules are supported */
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index d0e49bd7c6f3..72e6a45e7ec2 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -40,12 +40,12 @@ 
 #include <linux/kgdb.h>
 #include <linux/ftrace.h>
 #include <linux/kasan.h>
-#include <linux/moduleloader.h>
 #include <linux/objtool.h>
 #include <linux/vmalloc.h>
 #include <linux/pgtable.h>
 #include <linux/set_memory.h>
 #include <linux/cfi.h>
+#include <linux/execmem.h>
 
 #include <asm/text-patching.h>
 #include <asm/cacheflush.h>
@@ -495,7 +495,7 @@  void *alloc_insn_page(void)
 {
 	void *page;
 
-	page = module_alloc(PAGE_SIZE);
+	page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
 	if (!page)
 		return NULL;
 
diff --git a/include/linux/execmem.h b/include/linux/execmem.h
new file mode 100644
index 000000000000..43e7995593a1
--- /dev/null
+++ b/include/linux/execmem.h
@@ -0,0 +1,57 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_EXECMEM_ALLOC_H
+#define _LINUX_EXECMEM_ALLOC_H
+
+#include <linux/types.h>
+#include <linux/moduleloader.h>
+
+/**
+ * enum execmem_type - types of executable memory ranges
+ *
+ * There are several subsystems that allocate executable memory.
+ * Architectures define different restrictions on placement,
+ * permissions, alignment and other parameters for memory that can be used
+ * by these subsystems.
+ * Types in this enum identify subsystems that allocate executable memory
+ * and let architectures define parameters for ranges suitable for
+ * allocations by each subsystem.
+ *
+ * @EXECMEM_DEFAULT: default parameters that would be used for types that
+ * are not explcitly defined.
+ * @EXECMEM_MODULE_TEXT: parameters for module text sections
+ * @EXECMEM_KPROBES: parameters for kprobes
+ * @EXECMEM_FTRACE: parameters for ftrace
+ * @EXECMEM_BPF: parameters for BPF
+ * @EXECMEM_TYPE_MAX:
+ */
+enum execmem_type {
+	EXECMEM_DEFAULT,
+	EXECMEM_MODULE_TEXT = EXECMEM_DEFAULT,
+	EXECMEM_KPROBES,
+	EXECMEM_FTRACE,
+	EXECMEM_BPF,
+	EXECMEM_TYPE_MAX,
+};
+
+/**
+ * execmem_alloc - allocate executable memory
+ * @type: type of the allocation
+ * @size: how many bytes of memory are required
+ *
+ * Allocates memory that will contain executable code, either generated or
+ * loaded from kernel modules.
+ *
+ * The memory will have protections defined by architecture for executable
+ * region of the @type.
+ *
+ * Return: a pointer to the allocated memory or %NULL
+ */
+void *execmem_alloc(enum execmem_type type, size_t size);
+
+/**
+ * execmem_free - free executable memory
+ * @ptr: pointer to the memory that should be freed
+ */
+void execmem_free(void *ptr);
+
+#endif /* _LINUX_EXECMEM_ALLOC_H */
diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
index 89b1e0ed9811..a3b8caee9405 100644
--- a/include/linux/moduleloader.h
+++ b/include/linux/moduleloader.h
@@ -29,9 +29,6 @@  unsigned int arch_mod_section_prepend(struct module *mod, unsigned int section);
    sections.  Returns NULL on failure. */
 void *module_alloc(unsigned long size);
 
-/* Free memory returned from module_alloc. */
-void module_memfree(void *module_region);
-
 /* Determines if the section name is an init section (that is only used during
  * module loading).
  */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 696bc55de8e8..75a54024e2f4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -22,7 +22,6 @@ 
 #include <linux/skbuff.h>
 #include <linux/vmalloc.h>
 #include <linux/random.h>
-#include <linux/moduleloader.h>
 #include <linux/bpf.h>
 #include <linux/btf.h>
 #include <linux/objtool.h>
@@ -37,6 +36,7 @@ 
 #include <linux/nospec.h>
 #include <linux/bpf_mem_alloc.h>
 #include <linux/memcontrol.h>
+#include <linux/execmem.h>
 
 #include <asm/barrier.h>
 #include <asm/unaligned.h>
@@ -1050,12 +1050,12 @@  void bpf_jit_uncharge_modmem(u32 size)
 
 void *__weak bpf_jit_alloc_exec(unsigned long size)
 {
-	return module_alloc(size);
+	return execmem_alloc(EXECMEM_BPF, size);
 }
 
 void __weak bpf_jit_free_exec(void *addr)
 {
-	module_memfree(addr);
+	execmem_free(addr);
 }
 
 struct bpf_binary_header *
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9d9095e81792..047ca629ce49 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -26,7 +26,6 @@ 
 #include <linux/slab.h>
 #include <linux/stddef.h>
 #include <linux/export.h>
-#include <linux/moduleloader.h>
 #include <linux/kallsyms.h>
 #include <linux/freezer.h>
 #include <linux/seq_file.h>
@@ -39,6 +38,7 @@ 
 #include <linux/jump_label.h>
 #include <linux/static_call.h>
 #include <linux/perf_event.h>
+#include <linux/execmem.h>
 
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
@@ -113,17 +113,17 @@  enum kprobe_slot_state {
 void __weak *alloc_insn_page(void)
 {
 	/*
-	 * Use module_alloc() so this page is within +/- 2GB of where the
+	 * Use execmem_alloc() so this page is within +/- 2GB of where the
 	 * kernel image and loaded module images reside. This is required
 	 * for most of the architectures.
 	 * (e.g. x86-64 needs this to handle the %rip-relative fixups.)
 	 */
-	return module_alloc(PAGE_SIZE);
+	return execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
 }
 
 static void free_insn_page(void *page)
 {
-	module_memfree(page);
+	execmem_free(page);
 }
 
 struct kprobe_insn_cache kprobe_insn_slots = {
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index f3e0329337f6..744383c1eed1 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -2,6 +2,7 @@ 
 menuconfig MODULES
 	bool "Enable loadable module support"
 	modules
+	select EXECMEM
 	help
 	  Kernel modules are small pieces of compiled code which can
 	  be inserted in the running kernel, rather than being
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 5b82b069e0d3..d56b7df0cbb6 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -57,6 +57,7 @@ 
 #include <linux/audit.h>
 #include <linux/cfi.h>
 #include <linux/debugfs.h>
+#include <linux/execmem.h>
 #include <uapi/linux/module.h>
 #include "internal.h"
 
@@ -1179,16 +1180,6 @@  resolve_symbol_wait(struct module *mod,
 	return ksym;
 }
 
-void __weak module_memfree(void *module_region)
-{
-	/*
-	 * This memory may be RO, and freeing RO memory in an interrupt is not
-	 * supported by vmalloc.
-	 */
-	WARN_ON(in_interrupt());
-	vfree(module_region);
-}
-
 void __weak module_arch_cleanup(struct module *mod)
 {
 }
@@ -1213,7 +1204,7 @@  static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
 	if (mod_mem_use_vmalloc(type))
 		ptr = vmalloc(size);
 	else
-		ptr = module_alloc(size);
+		ptr = execmem_alloc(EXECMEM_MODULE_TEXT, size);
 
 	if (!ptr)
 		return -ENOMEM;
@@ -1244,7 +1235,7 @@  static void module_memory_free(struct module *mod, enum mod_mem_type type)
 	if (mod_mem_use_vmalloc(type))
 		vfree(ptr);
 	else
-		module_memfree(ptr);
+		execmem_free(ptr);
 }
 
 static void free_mod_mem(struct module *mod)
@@ -2496,9 +2487,9 @@  static void do_free_init(struct work_struct *w)
 
 	llist_for_each_safe(pos, n, list) {
 		initfree = container_of(pos, struct mod_initfree, node);
-		module_memfree(initfree->init_text);
-		module_memfree(initfree->init_data);
-		module_memfree(initfree->init_rodata);
+		execmem_free(initfree->init_text);
+		execmem_free(initfree->init_data);
+		execmem_free(initfree->init_rodata);
 		kfree(initfree);
 	}
 }
@@ -2608,10 +2599,10 @@  static noinline int do_init_module(struct module *mod)
 	 * We want to free module_init, but be aware that kallsyms may be
 	 * walking this with preempt disabled.  In all the failure paths, we
 	 * call synchronize_rcu(), but we don't want to slow down the success
-	 * path. module_memfree() cannot be called in an interrupt, so do the
+	 * path. execmem_free() cannot be called in an interrupt, so do the
 	 * work and call synchronize_rcu() in a work queue.
 	 *
-	 * Note that module_alloc() on most architectures creates W+X page
+	 * Note that execmem_alloc() on most architectures creates W+X page
 	 * mappings which won't be cleaned up until do_free_init() runs.  Any
 	 * code such as mark_rodata_ro() which depends on those mappings to
 	 * be cleaned up needs to sync with the queued work by invoking
diff --git a/mm/Kconfig b/mm/Kconfig
index b1448aa81e15..f08a216d4793 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1241,6 +1241,9 @@  config LOCK_MM_AND_FIND_VMA
 config IOMMU_MM_DATA
 	bool
 
+config EXECMEM
+	bool
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 4abb40b911ec..001336c91864 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -133,3 +133,4 @@  obj-$(CONFIG_IO_MAPPING) += io-mapping.o
 obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
 obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
 obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
+obj-$(CONFIG_EXECMEM) += execmem.o
diff --git a/mm/execmem.c b/mm/execmem.c
new file mode 100644
index 000000000000..ed2ea41a2543
--- /dev/null
+++ b/mm/execmem.c
@@ -0,0 +1,26 @@ 
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/execmem.h>
+#include <linux/moduleloader.h>
+
+static void *__execmem_alloc(size_t size)
+{
+	return module_alloc(size);
+}
+
+void *execmem_alloc(enum execmem_type type, size_t size)
+{
+	return __execmem_alloc(size);
+}
+
+void execmem_free(void *ptr)
+{
+	/*
+	 * This memory may be RO, and freeing RO memory in an interrupt is not
+	 * supported by vmalloc.
+	 */
+	WARN_ON(in_interrupt());
+	vfree(ptr);
+}