Message ID | 20220608000014.3054333-1-jarkko@profian.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | kprobes: Enable tracing for monolithic kernel images | expand |
. On Wed, Jun 8, 2022 at 8:02 AM Jarkko Sakkinen <jarkko@profian.com> wrote: > > Tracing with kprobes while running a monolithic kernel is currently > impossible because CONFIG_KPROBES is dependent on CONFIG_MODULES. This > dependency is a result of kprobes code using the module allocator for the > trampoline code. > > Detaching kprobes from modules helps to squeeze down the user space, > e.g. when developing new core kernel features, while still having all > the nice tracing capabilities. > > For kernel/ and arch/*, move module_alloc() and module_memfree() to > module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES > or CONFIG_KPROBES is enabled. In addition, flag kernel module specific > code with CONFIG_MODULES. > > As a result, kprobes can be used with a monolithic kernel. It's strange when MODULES is n, but vmlinux still obtains module_alloc. Maybe we need a kprobe_alloc, right? > > Signed-off-by: Jarkko Sakkinen <jarkko@profian.com> > --- > Tested with the help of BuildRoot and QEMU: > - arm (function tracer) > - arm64 (function tracer) > - mips (function tracer) > - powerpc (function tracer) > - riscv (function tracer) > - s390 (function tracer) > - sparc (function tracer) > - x86 (function tracer) > - sh (function tracer, for the "pure" kernel/module_alloc.c path) > --- > arch/Kconfig | 1 - > arch/arm/kernel/Makefile | 5 +++ > arch/arm/kernel/module.c | 32 ---------------- > arch/arm/kernel/module_alloc.c | 42 ++++++++++++++++++++ > arch/arm64/kernel/Makefile | 5 +++ > arch/arm64/kernel/module.c | 47 ----------------- > arch/arm64/kernel/module_alloc.c | 57 ++++++++++++++++++++++++++++ > arch/mips/kernel/Makefile | 5 +++ > arch/mips/kernel/module.c | 9 ----- > arch/mips/kernel/module_alloc.c | 18 +++++++++ > arch/parisc/kernel/Makefile | 5 +++ > arch/parisc/kernel/module.c | 11 ------ > arch/parisc/kernel/module_alloc.c | 23 +++++++++++ > arch/powerpc/kernel/Makefile | 5 +++ > arch/powerpc/kernel/module.c | 37 ------------------ 
> arch/powerpc/kernel/module_alloc.c | 47 +++++++++++++++++++++++ > arch/riscv/kernel/Makefile | 5 +++ > arch/riscv/kernel/module.c | 10 ----- > arch/riscv/kernel/module_alloc.c | 19 ++++++++++ > arch/s390/kernel/Makefile | 5 +++ > arch/s390/kernel/module.c | 17 --------- > arch/s390/kernel/module_alloc.c | 33 ++++++++++++++++ > arch/sparc/kernel/Makefile | 5 +++ > arch/sparc/kernel/module.c | 30 --------------- > arch/sparc/kernel/module_alloc.c | 39 +++++++++++++++++++ > arch/x86/kernel/Makefile | 5 +++ > arch/x86/kernel/module.c | 50 ------------------------ > arch/x86/kernel/module_alloc.c | 61 ++++++++++++++++++++++++++++++ > kernel/Makefile | 5 +++ > kernel/kprobes.c | 10 +++++ > kernel/module/main.c | 17 --------- > kernel/module_alloc.c | 26 +++++++++++++ > kernel/trace/trace_kprobe.c | 10 ++++- > 33 files changed, 434 insertions(+), 262 deletions(-) > create mode 100644 arch/arm/kernel/module_alloc.c > create mode 100644 arch/arm64/kernel/module_alloc.c > create mode 100644 arch/mips/kernel/module_alloc.c > create mode 100644 arch/parisc/kernel/module_alloc.c > create mode 100644 arch/powerpc/kernel/module_alloc.c > create mode 100644 arch/riscv/kernel/module_alloc.c > create mode 100644 arch/s390/kernel/module_alloc.c > create mode 100644 arch/sparc/kernel/module_alloc.c > create mode 100644 arch/x86/kernel/module_alloc.c > create mode 100644 kernel/module_alloc.c > > diff --git a/arch/Kconfig b/arch/Kconfig > index fcf9a41a4ef5..e8e3e7998a2e 100644 > --- a/arch/Kconfig > +++ b/arch/Kconfig > @@ -39,7 +39,6 @@ config GENERIC_ENTRY > > config KPROBES > bool "Kprobes" > - depends on MODULES > depends on HAVE_KPROBES > select KALLSYMS > select TASKS_RCU if PREEMPTION > diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile > index 553866751e1a..d2bb954cd54f 100644 > --- a/arch/arm/kernel/Makefile > +++ b/arch/arm/kernel/Makefile > @@ -44,6 +44,11 @@ obj-$(CONFIG_CPU_IDLE) += cpuidle.o > obj-$(CONFIG_ISA_DMA_API) += dma.o > obj-$(CONFIG_FIQ) += 
fiq.o fiqasm.o > obj-$(CONFIG_MODULES) += armksyms.o module.o > +ifeq ($(CONFIG_MODULES),y) > +obj-y += module_alloc.o > +else > +obj-$(CONFIG_KPROBES) += module_alloc.o > +endif > obj-$(CONFIG_ARM_MODULE_PLTS) += module-plts.o > obj-$(CONFIG_ISA_DMA) += dma-isa.o > obj-$(CONFIG_PCI) += bios32.o isa.o > diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c > index d59c36dc0494..054e799e7091 100644 > --- a/arch/arm/kernel/module.c > +++ b/arch/arm/kernel/module.c > @@ -22,38 +22,6 @@ > #include <asm/unwind.h> > #include <asm/opcodes.h> > > -#ifdef CONFIG_XIP_KERNEL > -/* > - * The XIP kernel text is mapped in the module area for modules and > - * some other stuff to work without any indirect relocations. > - * MODULES_VADDR is redefined here and not in asm/memory.h to avoid > - * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off. > - */ > -#undef MODULES_VADDR > -#define MODULES_VADDR (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK) > -#endif > - > -#ifdef CONFIG_MMU > -void *module_alloc(unsigned long size) > -{ > - gfp_t gfp_mask = GFP_KERNEL; > - void *p; > - > - /* Silence the initial allocation */ > - if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) > - gfp_mask |= __GFP_NOWARN; > - > - p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, > - gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, > - __builtin_return_address(0)); > - if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p) > - return p; > - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, > - GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, > - __builtin_return_address(0)); > -} > -#endif > - > bool module_init_section(const char *name) > { > return strstarts(name, ".init") || > diff --git a/arch/arm/kernel/module_alloc.c b/arch/arm/kernel/module_alloc.c > new file mode 100644 > index 000000000000..c3cfb2b60575 > --- /dev/null > +++ b/arch/arm/kernel/module_alloc.c > @@ -0,0 +1,42 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * Copyright (C) 2002 Russell 
King. > + * Modified for nommu by Hyok S. Choi > + * > + * Module allocation method suggested by Andi Kleen. > + */ > + > +#include <linux/mm.h> > +#include <linux/vmalloc.h> > + > +#ifdef CONFIG_XIP_KERNEL > +/* > + * The XIP kernel text is mapped in the module area for modules and > + * some other stuff to work without any indirect relocations. > + * MODULES_VADDR is redefined here and not in asm/memory.h to avoid > + * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off. > + */ > +#undef MODULES_VADDR > +#define MODULES_VADDR (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK) > +#endif > + > +#ifdef CONFIG_MMU > +void *module_alloc(unsigned long size) > +{ > + gfp_t gfp_mask = GFP_KERNEL; > + void *p; > + > + /* Silence the initial allocation */ > + if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) > + gfp_mask |= __GFP_NOWARN; > + > + p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, > + gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, > + __builtin_return_address(0)); > + if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p) > + return p; > + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, > + GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, > + __builtin_return_address(0)); > +} > +#endif > diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile > index fa7981d0d917..d61c02da857e 100644 > --- a/arch/arm64/kernel/Makefile > +++ b/arch/arm64/kernel/Makefile > @@ -43,6 +43,11 @@ obj-$(CONFIG_COMPAT) += sigreturn32.o > obj-$(CONFIG_KUSER_HELPERS) += kuser32.o > obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o > obj-$(CONFIG_MODULES) += module.o > +ifeq ($(CONFIG_MODULES),y) > +obj-y += module_alloc.o > +else > +obj-$(CONFIG_KPROBES) += module_alloc.o > +endif > obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o > obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o > obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o > diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c > index f2d4bb14bfab..d2a979746da0 
100644 > --- a/arch/arm64/kernel/module.c > +++ b/arch/arm64/kernel/module.c > @@ -20,53 +20,6 @@ > #include <asm/insn.h> > #include <asm/sections.h> > > -void *module_alloc(unsigned long size) > -{ > - u64 module_alloc_end = module_alloc_base + MODULES_VSIZE; > - gfp_t gfp_mask = GFP_KERNEL; > - void *p; > - > - /* Silence the initial allocation */ > - if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS)) > - gfp_mask |= __GFP_NOWARN; > - > - if (IS_ENABLED(CONFIG_KASAN_GENERIC) || > - IS_ENABLED(CONFIG_KASAN_SW_TAGS)) > - /* don't exceed the static module region - see below */ > - module_alloc_end = MODULES_END; > - > - p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, > - module_alloc_end, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK, > - NUMA_NO_NODE, __builtin_return_address(0)); > - > - if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && > - (IS_ENABLED(CONFIG_KASAN_VMALLOC) || > - (!IS_ENABLED(CONFIG_KASAN_GENERIC) && > - !IS_ENABLED(CONFIG_KASAN_SW_TAGS)))) > - /* > - * KASAN without KASAN_VMALLOC can only deal with module > - * allocations being served from the reserved module region, > - * since the remainder of the vmalloc region is already > - * backed by zero shadow pages, and punching holes into it > - * is non-trivial. Since the module region is not randomized > - * when KASAN is enabled without KASAN_VMALLOC, it is even > - * less likely that the module region gets exhausted, so we > - * can simply omit this fallback in that case. > - */ > - p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, > - module_alloc_base + SZ_2G, GFP_KERNEL, > - PAGE_KERNEL, 0, NUMA_NO_NODE, > - __builtin_return_address(0)); > - > - if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { > - vfree(p); > - return NULL; > - } > - > - /* Memory is intended to be executable, reset the pointer tag. 
*/ > - return kasan_reset_tag(p); > -} > - > enum aarch64_reloc_op { > RELOC_OP_NONE, > RELOC_OP_ABS, > diff --git a/arch/arm64/kernel/module_alloc.c b/arch/arm64/kernel/module_alloc.c > new file mode 100644 > index 000000000000..0340c8c0b782 > --- /dev/null > +++ b/arch/arm64/kernel/module_alloc.c > @@ -0,0 +1,57 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * Copyright (C) 2012 ARM Limited > + * > + * Author: Will Deacon <will.deacon@arm.com> > + */ > + > +#include <linux/mm.h> > +#include <linux/moduleloader.h> > +#include <linux/vmalloc.h> > + > +void *module_alloc(unsigned long size) > +{ > + u64 module_alloc_end = module_alloc_base + MODULES_VSIZE; > + gfp_t gfp_mask = GFP_KERNEL; > + void *p; > + > + /* Silence the initial allocation */ > + if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS)) > + gfp_mask |= __GFP_NOWARN; > + > + if (IS_ENABLED(CONFIG_KASAN_GENERIC) || > + IS_ENABLED(CONFIG_KASAN_SW_TAGS)) > + /* don't exceed the static module region - see below */ > + module_alloc_end = MODULES_END; > + > + p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, > + module_alloc_end, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK, > + NUMA_NO_NODE, __builtin_return_address(0)); > + > + if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && > + (IS_ENABLED(CONFIG_KASAN_VMALLOC) || > + (!IS_ENABLED(CONFIG_KASAN_GENERIC) && > + !IS_ENABLED(CONFIG_KASAN_SW_TAGS)))) > + /* > + * KASAN without KASAN_VMALLOC can only deal with module > + * allocations being served from the reserved module region, > + * since the remainder of the vmalloc region is already > + * backed by zero shadow pages, and punching holes into it > + * is non-trivial. Since the module region is not randomized > + * when KASAN is enabled without KASAN_VMALLOC, it is even > + * less likely that the module region gets exhausted, so we > + * can simply omit this fallback in that case. 
> + */ > + p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, > + module_alloc_base + SZ_2G, GFP_KERNEL, > + PAGE_KERNEL, 0, NUMA_NO_NODE, > + __builtin_return_address(0)); > + > + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { > + vfree(p); > + return NULL; > + } > + > + /* Memory is intended to be executable, reset the pointer tag. */ > + return kasan_reset_tag(p); > +} > diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile > index 7c96282bff2e..cb9297b613a1 100644 > --- a/arch/mips/kernel/Makefile > +++ b/arch/mips/kernel/Makefile > @@ -38,6 +38,11 @@ obj-$(CONFIG_SYNC_R4K) += sync-r4k.o > obj-$(CONFIG_DEBUG_FS) += segment.o > obj-$(CONFIG_STACKTRACE) += stacktrace.o > obj-$(CONFIG_MODULES) += module.o > +ifeq ($(CONFIG_MODULES),y) > +obj-y += module_alloc.o > +else > +obj-$(CONFIG_KPROBES) += module_alloc.o > +endif > > obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o > obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o > diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c > index 14f46d17500a..214b2d1868a5 100644 > --- a/arch/mips/kernel/module.c > +++ b/arch/mips/kernel/module.c > @@ -31,15 +31,6 @@ struct mips_hi16 { > static LIST_HEAD(dbe_list); > static DEFINE_SPINLOCK(dbe_lock); > > -#ifdef MODULE_START > -void *module_alloc(unsigned long size) > -{ > - return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END, > - GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, > - __builtin_return_address(0)); > -} > -#endif > - > static void apply_r_mips_32(u32 *location, u32 base, Elf_Addr v) > { > *location = base + v; > diff --git a/arch/mips/kernel/module_alloc.c b/arch/mips/kernel/module_alloc.c > new file mode 100644 > index 000000000000..582ab5ed8916 > --- /dev/null > +++ b/arch/mips/kernel/module_alloc.c > @@ -0,0 +1,18 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* > + * Copyright (C) 2001 Rusty Russell. 
> + * Copyright (C) 2003, 2004 Ralf Baechle (ralf@linux-mips.org) > + * Copyright (C) 2005 Thiemo Seufer > + */ > + > +#include <linux/mm.h> > +#include <linux/vmalloc.h> > + > +#ifdef MODULE_START > +void *module_alloc(unsigned long size) > +{ > + return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END, > + GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, > + __builtin_return_address(0)); > +} > +#endif > diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile > index d0bfac89a842..cd38083ae077 100644 > --- a/arch/parisc/kernel/Makefile > +++ b/arch/parisc/kernel/Makefile > @@ -25,6 +25,11 @@ obj-$(CONFIG_SMP) += smp.o > obj-$(CONFIG_PA11) += pci-dma.o > obj-$(CONFIG_PCI) += pci.o > obj-$(CONFIG_MODULES) += module.o > +ifeq ($(CONFIG_MODULES),y) > +obj-y += module_alloc.o > +else > +obj-$(CONFIG_KPROBES) += module_alloc.o > +endif > obj-$(CONFIG_64BIT) += sys_parisc32.o signal32.o > obj-$(CONFIG_STACKTRACE)+= stacktrace.o > obj-$(CONFIG_AUDIT) += audit.o > diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c > index 7df140545b22..c81e63e2549b 100644 > --- a/arch/parisc/kernel/module.c > +++ b/arch/parisc/kernel/module.c > @@ -192,17 +192,6 @@ static inline int reassemble_22(int as22) > ((as22 & 0x0003ff) << 3)); > } > > -void *module_alloc(unsigned long size) > -{ > - /* using RWX means less protection for modules, but it's > - * easier than trying to map the text, data, init_text and > - * init_data correctly */ > - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, > - GFP_KERNEL, > - PAGE_KERNEL_RWX, 0, NUMA_NO_NODE, > - __builtin_return_address(0)); > -} > - > #ifndef CONFIG_64BIT > static inline unsigned long count_gots(const Elf_Rela *rela, unsigned long n) > { > diff --git a/arch/parisc/kernel/module_alloc.c b/arch/parisc/kernel/module_alloc.c > new file mode 100644 > index 000000000000..4e4e4ce1132e > --- /dev/null > +++ b/arch/parisc/kernel/module_alloc.c > @@ -0,0 +1,23 @@ > +// SPDX-License-Identifier: 
GPL-2.0-or-later > +/* > + * Linux/PA-RISC Project > + * Copyright (C) 2003 Randolph Chung <tausq at debian . org> > + * Copyright (C) 2008 Helge Deller <deller@gmx.de> > + */ > + > +#include <linux/mm.h> > +#include <linux/moduleloader.h> > +#include <linux/vmalloc.h> > + > +void *module_alloc(unsigned long size) > +{ > + /* > + * Using RWX means less protection for modules, but it's > + * easier than trying to map the text, data, init_text and > + * init_data correctly. > + */ > + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, > + GFP_KERNEL, > + PAGE_KERNEL_RWX, 0, NUMA_NO_NODE, > + __builtin_return_address(0)); > +} > diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile > index 2e2a2a9bcf43..5a811cdf230b 100644 > --- a/arch/powerpc/kernel/Makefile > +++ b/arch/powerpc/kernel/Makefile > @@ -103,6 +103,11 @@ obj-$(CONFIG_HIBERNATION) += swsusp_$(BITS).o > endif > obj64-$(CONFIG_HIBERNATION) += swsusp_asm64.o > obj-$(CONFIG_MODULES) += module.o module_$(BITS).o > +ifeq ($(CONFIG_MODULES),y) > +obj-y += module_alloc.o > +else > +obj-$(CONFIG_KPROBES) += module_alloc.o > +endif > obj-$(CONFIG_44x) += cpu_setup_44x.o > obj-$(CONFIG_PPC_FSL_BOOK3E) += cpu_setup_fsl_booke.o > obj-$(CONFIG_PPC_DOORBELL) += dbell.o > diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c > index f6d6ae0a1692..b30e00964a60 100644 > --- a/arch/powerpc/kernel/module.c > +++ b/arch/powerpc/kernel/module.c > @@ -88,40 +88,3 @@ int module_finalize(const Elf_Ehdr *hdr, > > return 0; > } > - > -static __always_inline void * > -__module_alloc(unsigned long size, unsigned long start, unsigned long end, bool nowarn) > -{ > - pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC; > - gfp_t gfp = GFP_KERNEL | (nowarn ? __GFP_NOWARN : 0); > - > - /* > - * Don't do huge page allocations for modules yet until more testing > - * is done. STRICT_MODULE_RWX may require extra work to support this > - * too. 
> - */ > - return __vmalloc_node_range(size, 1, start, end, gfp, prot, > - VM_FLUSH_RESET_PERMS, > - NUMA_NO_NODE, __builtin_return_address(0)); > -} > - > -void *module_alloc(unsigned long size) > -{ > -#ifdef MODULES_VADDR > - unsigned long limit = (unsigned long)_etext - SZ_32M; > - void *ptr = NULL; > - > - BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR); > - > - /* First try within 32M limit from _etext to avoid branch trampolines */ > - if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit) > - ptr = __module_alloc(size, limit, MODULES_END, true); > - > - if (!ptr) > - ptr = __module_alloc(size, MODULES_VADDR, MODULES_END, false); > - > - return ptr; > -#else > - return __module_alloc(size, VMALLOC_START, VMALLOC_END, false); > -#endif > -} > diff --git a/arch/powerpc/kernel/module_alloc.c b/arch/powerpc/kernel/module_alloc.c > new file mode 100644 > index 000000000000..48541c27ce46 > --- /dev/null > +++ b/arch/powerpc/kernel/module_alloc.c > @@ -0,0 +1,47 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* > + * Kernel module help for powerpc. > + * Copyright (C) 2001, 2003 Rusty Russell IBM Corporation. > + * Copyright (C) 2008 Freescale Semiconductor, Inc. > + */ > + > +#include <linux/mm.h> > +#include <linux/moduleloader.h> > +#include <linux/vmalloc.h> > + > +static __always_inline void * > +__module_alloc(unsigned long size, unsigned long start, unsigned long end, bool nowarn) > +{ > + pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC; > + gfp_t gfp = GFP_KERNEL | (nowarn ? __GFP_NOWARN : 0); > + > + /* > + * Don't do huge page allocations for modules yet until more testing > + * is done. STRICT_MODULE_RWX may require extra work to support this > + * too. 
> + */ > + return __vmalloc_node_range(size, 1, start, end, gfp, prot, > + VM_FLUSH_RESET_PERMS, > + NUMA_NO_NODE, __builtin_return_address(0)); > +} > + > +void *module_alloc(unsigned long size) > +{ > +#ifdef MODULES_VADDR > + unsigned long limit = (unsigned long)_etext - SZ_32M; > + void *ptr = NULL; > + > + BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR); > + > + /* First try within 32M limit from _etext to avoid branch trampolines */ > + if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit) > + ptr = __module_alloc(size, limit, MODULES_END, true); > + > + if (!ptr) > + ptr = __module_alloc(size, MODULES_VADDR, MODULES_END, false); > + > + return ptr; > +#else > + return __module_alloc(size, VMALLOC_START, VMALLOC_END, false); > +#endif > +} > diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile > index c71d6591d539..7af346ae575e 100644 > --- a/arch/riscv/kernel/Makefile > +++ b/arch/riscv/kernel/Makefile > @@ -61,6 +61,11 @@ obj-$(CONFIG_SMP) += cpu_ops.o > > obj-$(CONFIG_RISCV_BOOT_SPINWAIT) += cpu_ops_spinwait.o > obj-$(CONFIG_MODULES) += module.o > +ifeq ($(CONFIG_MODULES),y) > +obj-y += module_alloc.o > +else > +obj-$(CONFIG_KPROBES) += module_alloc.o > +endif > obj-$(CONFIG_MODULE_SECTIONS) += module-sections.o > > obj-$(CONFIG_CPU_PM) += suspend_entry.o suspend.o > diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c > index 91fe16bfaa07..1621d5e458f7 100644 > --- a/arch/riscv/kernel/module.c > +++ b/arch/riscv/kernel/module.c > @@ -419,16 +419,6 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, > return 0; > } > > -#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) > -void *module_alloc(unsigned long size) > -{ > - return __vmalloc_node_range(size, 1, MODULES_VADDR, > - MODULES_END, GFP_KERNEL, > - PAGE_KERNEL, 0, NUMA_NO_NODE, > - __builtin_return_address(0)); > -} > -#endif > - > static const Elf_Shdr *find_section(const Elf_Ehdr *hdr, > const Elf_Shdr *sechdrs, > const char *name) > diff --git 
a/arch/riscv/kernel/module_alloc.c b/arch/riscv/kernel/module_alloc.c > new file mode 100644 > index 000000000000..ed87ed048713 > --- /dev/null > +++ b/arch/riscv/kernel/module_alloc.c > @@ -0,0 +1,19 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* > + * Copyright (C) 2017 Zihao Yu > + */ > + > +#include <linux/mm.h> > +#include <linux/moduleloader.h> > +#include <linux/vmalloc.h> > +#include <asm/sections.h> > + > +#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) > +void *module_alloc(unsigned long size) > +{ > + return __vmalloc_node_range(size, 1, MODULES_VADDR, > + MODULES_END, GFP_KERNEL, > + PAGE_KERNEL, 0, NUMA_NO_NODE, > + __builtin_return_address(0)); > +} > +#endif > diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile > index 27d6b3c7aa06..71e73e34b441 100644 > --- a/arch/s390/kernel/Makefile > +++ b/arch/s390/kernel/Makefile > @@ -48,6 +48,11 @@ obj-$(CONFIG_SYSFS) += nospec-sysfs.o > CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE) > > obj-$(CONFIG_MODULES) += module.o > +ifeq ($(CONFIG_MODULES),y) > +obj-y += module_alloc.o > +else > +obj-$(CONFIG_KPROBES) += module_alloc.o > +endif > obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o > obj-$(CONFIG_NUMA) += numa.o > obj-$(CONFIG_AUDIT) += audit.o > diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c > index 26125a9c436d..e15f09fa50e2 100644 > --- a/arch/s390/kernel/module.c > +++ b/arch/s390/kernel/module.c > @@ -35,23 +35,6 @@ > > #define PLT_ENTRY_SIZE 22 > > -void *module_alloc(unsigned long size) > -{ > - gfp_t gfp_mask = GFP_KERNEL; > - void *p; > - > - if (PAGE_ALIGN(size) > MODULES_LEN) > - return NULL; > - p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, > - gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, > - __builtin_return_address(0)); > - if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { > - vfree(p); > - return NULL; > - } > - return p; > -} > - > #ifdef CONFIG_FUNCTION_TRACER > void 
module_arch_cleanup(struct module *mod) > { > diff --git a/arch/s390/kernel/module_alloc.c b/arch/s390/kernel/module_alloc.c > new file mode 100644 > index 000000000000..e4c4175fb759 > --- /dev/null > +++ b/arch/s390/kernel/module_alloc.c > @@ -0,0 +1,33 @@ > +// SPDX-License-Identifier: GPL-2.0+ > +/* > + * Kernel module help for s390. > + * > + * S390 version > + * Copyright IBM Corp. 2002, 2003 > + * Author(s): Arnd Bergmann (arndb@de.ibm.com) > + * Martin Schwidefsky (schwidefsky@de.ibm.com) > + * > + * based on i386 version > + * Copyright (C) 2001 Rusty Russell. > + */ > + > +#include <linux/mm.h> > +#include <linux/moduleloader.h> > +#include <linux/vmalloc.h> > + > +void *module_alloc(unsigned long size) > +{ > + gfp_t gfp_mask = GFP_KERNEL; > + void *p; > + > + if (PAGE_ALIGN(size) > MODULES_LEN) > + return NULL; > + p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, > + gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, > + __builtin_return_address(0)); > + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { > + vfree(p); > + return NULL; > + } > + return p; > +} > diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile > index d3a0e072ebe8..53905c00a094 100644 > --- a/arch/sparc/kernel/Makefile > +++ b/arch/sparc/kernel/Makefile > @@ -89,6 +89,11 @@ obj-$(CONFIG_SUN_PM) += apc.o pmc.o > > obj-$(CONFIG_MODULES) += module.o > obj-$(CONFIG_MODULES) += sparc_ksyms.o > +ifeq ($(CONFIG_MODULES),y) > +obj-y += module_alloc.o > +else > +obj-$(CONFIG_KPROBES) += module_alloc.o > +endif > obj-$(CONFIG_SPARC_LED) += led.o > obj-$(CONFIG_KGDB) += kgdb_$(BITS).o > > diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c > index df39580f398d..f2babc69f189 100644 > --- a/arch/sparc/kernel/module.c > +++ b/arch/sparc/kernel/module.c > @@ -21,36 +21,6 @@ > > #include "entry.h" > > -#ifdef CONFIG_SPARC64 > - > -#include <linux/jump_label.h> > - > -static void *module_map(unsigned long size) > -{ > - if 
(PAGE_ALIGN(size) > MODULES_LEN) > - return NULL; > - return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, > - GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, > - __builtin_return_address(0)); > -} > -#else > -static void *module_map(unsigned long size) > -{ > - return vmalloc(size); > -} > -#endif /* CONFIG_SPARC64 */ > - > -void *module_alloc(unsigned long size) > -{ > - void *ret; > - > - ret = module_map(size); > - if (ret) > - memset(ret, 0, size); > - > - return ret; > -} > - > /* Make generic code ignore STT_REGISTER dummy undefined symbols. */ > int module_frob_arch_sections(Elf_Ehdr *hdr, > Elf_Shdr *sechdrs, > diff --git a/arch/sparc/kernel/module_alloc.c b/arch/sparc/kernel/module_alloc.c > new file mode 100644 > index 000000000000..4bacf8cff012 > --- /dev/null > +++ b/arch/sparc/kernel/module_alloc.c > @@ -0,0 +1,39 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* Kernel module help for sparc64. > + * > + * Copyright (C) 2001 Rusty Russell. > + * Copyright (C) 2002 David S. Miller. 
> + */ > + > +#include <linux/mm.h> > +#include <linux/moduleloader.h> > +#include <linux/vmalloc.h> > + > +#ifdef CONFIG_SPARC64 > +#include <linux/jump_label.h> > + > +static void *module_map(unsigned long size) > +{ > + if (PAGE_ALIGN(size) > MODULES_LEN) > + return NULL; > + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, > + GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, > + __builtin_return_address(0)); > +} > +#else > +static void *module_map(unsigned long size) > +{ > + return vmalloc(size); > +} > +#endif /* CONFIG_SPARC64 */ > + > +void *module_alloc(unsigned long size) > +{ > + void *ret; > + > + ret = module_map(size); > + if (ret) > + memset(ret, 0, size); > + > + return ret; > +} > diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile > index 03364dc40d8d..1f0304140f44 100644 > --- a/arch/x86/kernel/Makefile > +++ b/arch/x86/kernel/Makefile > @@ -108,6 +108,11 @@ obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o > obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o > obj-y += kprobes/ > obj-$(CONFIG_MODULES) += module.o > +ifeq ($(CONFIG_MODULES),y) > +obj-y += module_alloc.o > +else > +obj-$(CONFIG_KPROBES) += module_alloc.o > +endif > obj-$(CONFIG_X86_32) += doublefault_32.o > obj-$(CONFIG_KGDB) += kgdb.o > obj-$(CONFIG_VM86) += vm86_32.o > diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c > index b98ffcf4d250..40bb10b5a68d 100644 > --- a/arch/x86/kernel/module.c > +++ b/arch/x86/kernel/module.c > @@ -36,56 +36,6 @@ do { \ > } while (0) > #endif > > -#ifdef CONFIG_RANDOMIZE_BASE > -static unsigned long module_load_offset; > - > -/* Mutex protects the module_load_offset. */ > -static DEFINE_MUTEX(module_kaslr_mutex); > - > -static unsigned long int get_module_load_offset(void) > -{ > - if (kaslr_enabled()) { > - mutex_lock(&module_kaslr_mutex); > - /* > - * Calculate the module_load_offset the first time this > - * code is called. Once calculated it stays the same until > - * reboot. 
> - */ > - if (module_load_offset == 0) > - module_load_offset = > - (get_random_int() % 1024 + 1) * PAGE_SIZE; > - mutex_unlock(&module_kaslr_mutex); > - } > - return module_load_offset; > -} > -#else > -static unsigned long int get_module_load_offset(void) > -{ > - return 0; > -} > -#endif > - > -void *module_alloc(unsigned long size) > -{ > - gfp_t gfp_mask = GFP_KERNEL; > - void *p; > - > - if (PAGE_ALIGN(size) > MODULES_LEN) > - return NULL; > - > - p = __vmalloc_node_range(size, MODULE_ALIGN, > - MODULES_VADDR + get_module_load_offset(), > - MODULES_END, gfp_mask, > - PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, > - __builtin_return_address(0)); > - if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { > - vfree(p); > - return NULL; > - } > - > - return p; > -} > - > #ifdef CONFIG_X86_32 > int apply_relocate(Elf32_Shdr *sechdrs, > const char *strtab, > diff --git a/arch/x86/kernel/module_alloc.c b/arch/x86/kernel/module_alloc.c > new file mode 100644 > index 000000000000..a7df2af1467a > --- /dev/null > +++ b/arch/x86/kernel/module_alloc.c > @@ -0,0 +1,61 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* > + * Kernel module help for x86. > + * Copyright (C) 2001 Rusty Russell. > + */ > + > +#include <linux/mm.h> > +#include <linux/moduleloader.h> > +#include <linux/random.h> > +#include <linux/vmalloc.h> > +#include <asm/setup.h> > + > +#ifdef CONFIG_RANDOMIZE_BASE > +static unsigned long module_load_offset; > + > +/* Mutex protects the module_load_offset. */ > +static DEFINE_MUTEX(module_kaslr_mutex); > + > +static unsigned long get_module_load_offset(void) > +{ > + if (kaslr_enabled()) { > + mutex_lock(&module_kaslr_mutex); > + /* > + * Calculate the module_load_offset the first time this > + * code is called. Once calculated it stays the same until > + * reboot. 
> + */ > + if (module_load_offset == 0) > + module_load_offset = > + (get_random_int() % 1024 + 1) * PAGE_SIZE; > + mutex_unlock(&module_kaslr_mutex); > + } > + return module_load_offset; > +} > +#else > +static unsigned long get_module_load_offset(void) > +{ > + return 0; > +} > +#endif > + > +void *module_alloc(unsigned long size) > +{ > + gfp_t gfp_mask = GFP_KERNEL; > + void *p; > + > + if (PAGE_ALIGN(size) > MODULES_LEN) > + return NULL; > + > + p = __vmalloc_node_range(size, MODULE_ALIGN, > + MODULES_VADDR + get_module_load_offset(), > + MODULES_END, gfp_mask, > + PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, > + __builtin_return_address(0)); > + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { > + vfree(p); > + return NULL; > + } > + > + return p; > +} > diff --git a/kernel/Makefile b/kernel/Makefile > index 318789c728d3..2981fe42060d 100644 > --- a/kernel/Makefile > +++ b/kernel/Makefile > @@ -53,6 +53,11 @@ obj-y += livepatch/ > obj-y += dma/ > obj-y += entry/ > obj-$(CONFIG_MODULES) += module/ > +ifeq ($(CONFIG_MODULES),y) > +obj-y += module_alloc.o > +else > +obj-$(CONFIG_KPROBES) += module_alloc.o > +endif > > obj-$(CONFIG_KCMP) += kcmp.o > obj-$(CONFIG_FREEZER) += freezer.o > diff --git a/kernel/kprobes.c b/kernel/kprobes.c > index f214f8c088ed..3f9876374cd3 100644 > --- a/kernel/kprobes.c > +++ b/kernel/kprobes.c > @@ -1569,6 +1569,7 @@ static int check_kprobe_address_safe(struct kprobe *p, > goto out; > } > > +#ifdef CONFIG_MODULES > /* Check if 'p' is probing a module. 
*/ > *probed_mod = __module_text_address((unsigned long) p->addr); > if (*probed_mod) { > @@ -1592,6 +1593,8 @@ static int check_kprobe_address_safe(struct kprobe *p, > ret = -ENOENT; > } > } > +#endif > + > out: > preempt_enable(); > jump_label_unlock(); > @@ -2475,6 +2478,7 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end) > return 0; > } > > +#ifdef CONFIG_MODULES > /* Remove all symbols in given area from kprobe blacklist */ > static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) > { > @@ -2492,6 +2496,7 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry) > { > kprobe_remove_area_blacklist(entry, entry + 1); > } > +#endif /* CONFIG_MODULES */ > > int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, > char *type, char *sym) > @@ -2557,6 +2562,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start, > return ret ? : arch_populate_kprobe_blacklist(); > } > > +#ifdef CONFIG_MODULES > static void add_module_kprobe_blacklist(struct module *mod) > { > unsigned long start, end; > @@ -2658,6 +2664,7 @@ static struct notifier_block kprobe_module_nb = { > .notifier_call = kprobes_module_callback, > .priority = 0 > }; > +#endif /* CONFIG_MODULES */ > > void kprobe_free_init_mem(void) > { > @@ -2717,8 +2724,11 @@ static int __init init_kprobes(void) > err = arch_init_kprobes(); > if (!err) > err = register_die_notifier(&kprobe_exceptions_nb); > + > +#ifdef CONFIG_MODULES > if (!err) > err = register_module_notifier(&kprobe_module_nb); > +#endif > > kprobes_initialized = (err == 0); > kprobe_sysctls_init(); > diff --git a/kernel/module/main.c b/kernel/module/main.c > index fed58d30725d..7fa182b78550 100644 > --- a/kernel/module/main.c > +++ b/kernel/module/main.c > @@ -1121,16 +1121,6 @@ resolve_symbol_wait(struct module *mod, > return ksym; > } > > -void __weak module_memfree(void *module_region) > -{ > - /* > - * This memory may be RO, and freeing RO memory in an 
interrupt is not > - * supported by vmalloc. > - */ > - WARN_ON(in_interrupt()); > - vfree(module_region); > -} > - > void __weak module_arch_cleanup(struct module *mod) > { > } > @@ -1606,13 +1596,6 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) > ddebug_remove_module(mod->name); > } > > -void * __weak module_alloc(unsigned long size) > -{ > - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, > - GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, > - NUMA_NO_NODE, __builtin_return_address(0)); > -} > - > bool __weak module_init_section(const char *name) > { > return strstarts(name, ".init"); > diff --git a/kernel/module_alloc.c b/kernel/module_alloc.c > new file mode 100644 > index 000000000000..26a4c60998ad > --- /dev/null > +++ b/kernel/module_alloc.c > @@ -0,0 +1,26 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* > + * Copyright (C) 2002 Richard Henderson > + * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. > + */ > + > +#include <linux/mm.h> > +#include <linux/moduleloader.h> > +#include <linux/vmalloc.h> > + > +void * __weak module_alloc(unsigned long size) > +{ > + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, > + GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, > + NUMA_NO_NODE, __builtin_return_address(0)); > +} > + > +void __weak module_memfree(void *module_region) > +{ > + /* > + * This memory may be RO, and freeing RO memory in an interrupt is not > + * supported by vmalloc. 
> + */ > + WARN_ON(in_interrupt()); > + vfree(module_region); > +} > diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c > index 93507330462c..050b2975332e 100644 > --- a/kernel/trace/trace_kprobe.c > +++ b/kernel/trace/trace_kprobe.c > @@ -101,6 +101,7 @@ static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk) > return kprobe_gone(&tk->rp.kp); > } > > +#ifdef CONFIG_MODULES > static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, > struct module *mod) > { > @@ -109,11 +110,13 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, > > return strncmp(module_name(mod), name, len) == 0 && name[len] == ':'; > } > +#endif /* CONFIG_MODULES */ > > static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) > { > + bool ret = false; > +#ifdef CONFIG_MODULES > char *p; > - bool ret; > > if (!tk->symbol) > return false; > @@ -125,6 +128,7 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) > ret = !!find_module(tk->symbol); > rcu_read_unlock_sched(); > *p = ':'; > +#endif /* CONFIG_MODULES */ > > return ret; > } > @@ -668,6 +672,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) > return ret; > } > > +#ifdef CONFIG_MODULES > /* Module notifier call back, checking event on the module */ > static int trace_kprobe_module_callback(struct notifier_block *nb, > unsigned long val, void *data) > @@ -702,6 +707,7 @@ static struct notifier_block trace_kprobe_module_nb = { > .notifier_call = trace_kprobe_module_callback, > .priority = 1 /* Invoked after kprobe module callback */ > }; > +#endif /* CONFIG_MODULES */ > > static int __trace_kprobe_create(int argc, const char *argv[]) > { > @@ -1896,8 +1902,10 @@ static __init int init_kprobe_trace_early(void) > if (ret) > return ret; > > +#ifdef CONFIG_MODULES > if (register_module_notifier(&trace_kprobe_module_nb)) > return -EINVAL; > +#endif /* CONFIG_MODULES */ > > return 0; > } > -- > 
2.36.1 >
On Wed, Jun 08, 2022 at 10:35:42AM +0800, Guo Ren wrote: > . > > On Wed, Jun 8, 2022 at 8:02 AM Jarkko Sakkinen <jarkko@profian.com> wrote: > > > > Tracing with kprobes while running a monolithic kernel is currently > > impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This > > dependency is a result of kprobes code using the module allocator for the > > trampoline code. > > > > Detaching kprobes from modules helps to squeeze down the user space, > > e.g. when developing new core kernel features, while still having all > > the nice tracing capabilities. > > > > For kernel/ and arch/*, move module_alloc() and module_memfree() to > > module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES > > or CONFIG_KPROBES is enabled. In addition, flag kernel module specific > > code with CONFIG_MODULES. > > > > As the result, kprobes can be used with a monolithic kernel. > It's strange when MODULES is n, but vmlinux still obtains module_alloc. > > Maybe we need a kprobe_alloc, right? Perhaps not the best name but at least it documents the fact that they use the same allocator. A few years ago I carved up something "half-way there" for kprobes, and I used the name text_alloc() [*]. [*] https://lore.kernel.org/all/20200724050553.1724168-1-jarkko.sakkinen@linux.intel.com/ BR, Jarkko
Hi Jarkko, On Wed, 8 Jun 2022 08:25:38 +0300 Jarkko Sakkinen <jarkko@kernel.org> wrote: > On Wed, Jun 08, 2022 at 10:35:42AM +0800, Guo Ren wrote: > > . > > > > On Wed, Jun 8, 2022 at 8:02 AM Jarkko Sakkinen <jarkko@profian.com> wrote: > > > > > > Tracing with kprobes while running a monolithic kernel is currently > > > impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This > > > dependency is a result of kprobes code using the module allocator for the > > > trampoline code. > > > > > > Detaching kprobes from modules helps to squeeze down the user space, > > > e.g. when developing new core kernel features, while still having all > > > the nice tracing capabilities. > > > > > > For kernel/ and arch/*, move module_alloc() and module_memfree() to > > > module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES > > > or CONFIG_KPROBES is enabled. In addition, flag kernel module specific > > > code with CONFIG_MODULES. > > > > > > As the result, kprobes can be used with a monolithic kernel. > > It's strange when MODULES is n, but vmlinux still obtains module_alloc. > > > > Maybe we need a kprobe_alloc, right? > > Perhaps not the best name but at least it documents the fact that > they use the same allocator. > > Few years ago I carved up something "half-way there" for kprobes, > and I used the name text_alloc() [*]. > > [*] https://lore.kernel.org/all/20200724050553.1724168-1-jarkko.sakkinen@linux.intel.com/ Yeah, I remember that. Thank you for updating your patch! The idea (splitting module_alloc() out from CONFIG_MODULES) sounds good to me. If the module support maintainers think this name is not good, you may be able to rename it to text_alloc() and make module_alloc() a wrapper of it. Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> for the kprobe side. Thank you,
On Wed, Jun 8, 2022 at 7:21 AM Masami Hiramatsu <mhiramat@kernel.org> wrote: > > Hi Jarkko, > > On Wed, 8 Jun 2022 08:25:38 +0300 > Jarkko Sakkinen <jarkko@kernel.org> wrote: > > > On Wed, Jun 08, 2022 at 10:35:42AM +0800, Guo Ren wrote: > > > . > > > > > > On Wed, Jun 8, 2022 at 8:02 AM Jarkko Sakkinen <jarkko@profian.com> wrote: > > > > > > > > Tracing with kprobes while running a monolithic kernel is currently > > > > impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This > > > > dependency is a result of kprobes code using the module allocator for the > > > > trampoline code. > > > > > > > > Detaching kprobes from modules helps to squeeze down the user space, > > > > e.g. when developing new core kernel features, while still having all > > > > the nice tracing capabilities. > > > > > > > > For kernel/ and arch/*, move module_alloc() and module_memfree() to > > > > module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES > > > > or CONFIG_KPROBES is enabled. In addition, flag kernel module specific > > > > code with CONFIG_MODULES. > > > > > > > > As the result, kprobes can be used with a monolithic kernel. > > > It's strange when MODULES is n, but vmlinux still obtains module_alloc. > > > > > > Maybe we need a kprobe_alloc, right? > > > > Perhaps not the best name but at least it documents the fact that > > they use the same allocator. > > > > Few years ago I carved up something "half-way there" for kprobes, > > and I used the name text_alloc() [*]. > > > > [*] https://lore.kernel.org/all/20200724050553.1724168-1-jarkko.sakkinen@linux.intel.com/ > > Yeah, I remember that. Thank you for updating your patch! > I think the idea (split module_alloc() from CONFIG_MODULE) is good to me. > If module support maintainers think this name is not good, you may be > able to rename it as text_alloc() and make the module_alloc() as a > wrapper of it. 
IIUC, most users of module_alloc() use it to allocate memory for text, except that module code uses it for both text and data. Therefore, I guess calling it text_alloc() is not 100% accurate until we change the module code (to use a different API to allocate memory for data). Thanks, Song > > Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> > for kprobe side. > > Thank you, > > -- > Masami Hiramatsu (Google) <mhiramat@kernel.org>
Hello Jarkko, On Wed, 8 Jun 2022 at 02:02, Jarkko Sakkinen <jarkko@profian.com> wrote: > > Tracing with kprobes while running a monolithic kernel is currently > impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This > dependency is a result of kprobes code using the module allocator for the > trampoline code. > > Detaching kprobes from modules helps to squeeze down the user space, > e.g. when developing new core kernel features, while still having all > the nice tracing capabilities. > > For kernel/ and arch/*, move module_alloc() and module_memfree() to > module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES > or CONFIG_KPROBES is enabled. In addition, flag kernel module specific > code with CONFIG_MODULES. > > As the result, kprobes can be used with a monolithic kernel. I think I may have mentioned this the previous time as well, but I don't think this is the right approach. Kprobes uses alloc_insn_page() to allocate executable memory, but the requirements for this memory are radically different from those of loadable modules, which need to be within an arch-specific distance of the core kernel, need KASAN backing, etc. This is why arm64, for instance, does not implement alloc_insn_page() in terms of module_alloc() [and arm64 likely does not belong in this patch for that reason]. Is there any reason kprobes cannot simply use vmalloc()? 
> > Signed-off-by: Jarkko Sakkinen <jarkko@profian.com> > --- > Tested with the help of BuildRoot and QEMU: > - arm (function tracer) > - arm64 (function tracer) > - mips (function tracer) > - powerpc (function tracer) > - riscv (function tracer) > - s390 (function tracer) > - sparc (function tracer) > - x86 (function tracer) > - sh (function tracer, for the "pure" kernel/modules_alloc.c path) > --- > arch/Kconfig | 1 - > arch/arm/kernel/Makefile | 5 +++ > arch/arm/kernel/module.c | 32 ---------------- > arch/arm/kernel/module_alloc.c | 42 ++++++++++++++++++++ > arch/arm64/kernel/Makefile | 5 +++ > arch/arm64/kernel/module.c | 47 ----------------------- > arch/arm64/kernel/module_alloc.c | 57 ++++++++++++++++++++++++++++ > arch/mips/kernel/Makefile | 5 +++ > arch/mips/kernel/module.c | 9 ----- > arch/mips/kernel/module_alloc.c | 18 +++++++++ > arch/parisc/kernel/Makefile | 5 +++ > arch/parisc/kernel/module.c | 11 ------ > arch/parisc/kernel/module_alloc.c | 23 +++++++++++ > arch/powerpc/kernel/Makefile | 5 +++ > arch/powerpc/kernel/module.c | 37 ------------------ > arch/powerpc/kernel/module_alloc.c | 47 +++++++++++++++++++++++ > arch/riscv/kernel/Makefile | 5 +++ > arch/riscv/kernel/module.c | 10 ----- > arch/riscv/kernel/module_alloc.c | 19 ++++++++++ > arch/s390/kernel/Makefile | 5 +++ > arch/s390/kernel/module.c | 17 --------- > arch/s390/kernel/module_alloc.c | 33 ++++++++++++++++ > arch/sparc/kernel/Makefile | 5 +++ > arch/sparc/kernel/module.c | 30 --------------- > arch/sparc/kernel/module_alloc.c | 39 +++++++++++++++++++ > arch/x86/kernel/Makefile | 5 +++ > arch/x86/kernel/module.c | 50 ------------------------ > arch/x86/kernel/module_alloc.c | 61 ++++++++++++++++++++++++++++++ > kernel/Makefile | 5 +++ > kernel/kprobes.c | 10 +++++ > kernel/module/main.c | 17 --------- > kernel/module_alloc.c | 26 +++++++++++++ > kernel/trace/trace_kprobe.c | 10 ++++- > 33 files changed, 434 insertions(+), 262 deletions(-) > create mode 100644 
arch/arm/kernel/module_alloc.c > create mode 100644 arch/arm64/kernel/module_alloc.c > create mode 100644 arch/mips/kernel/module_alloc.c > create mode 100644 arch/parisc/kernel/module_alloc.c > create mode 100644 arch/powerpc/kernel/module_alloc.c > create mode 100644 arch/riscv/kernel/module_alloc.c > create mode 100644 arch/s390/kernel/module_alloc.c > create mode 100644 arch/sparc/kernel/module_alloc.c > create mode 100644 arch/x86/kernel/module_alloc.c > create mode 100644 kernel/module_alloc.c > > diff --git a/arch/Kconfig b/arch/Kconfig > index fcf9a41a4ef5..e8e3e7998a2e 100644 > --- a/arch/Kconfig > +++ b/arch/Kconfig > @@ -39,7 +39,6 @@ config GENERIC_ENTRY > > config KPROBES > bool "Kprobes" > - depends on MODULES > depends on HAVE_KPROBES > select KALLSYMS > select TASKS_RCU if PREEMPTION > diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile > index 553866751e1a..d2bb954cd54f 100644 > --- a/arch/arm/kernel/Makefile > +++ b/arch/arm/kernel/Makefile > @@ -44,6 +44,11 @@ obj-$(CONFIG_CPU_IDLE) += cpuidle.o > obj-$(CONFIG_ISA_DMA_API) += dma.o > obj-$(CONFIG_FIQ) += fiq.o fiqasm.o > obj-$(CONFIG_MODULES) += armksyms.o module.o > +ifeq ($(CONFIG_MODULES),y) > +obj-y += module_alloc.o > +else > +obj-$(CONFIG_KPROBES) += module_alloc.o > +endif > obj-$(CONFIG_ARM_MODULE_PLTS) += module-plts.o > obj-$(CONFIG_ISA_DMA) += dma-isa.o > obj-$(CONFIG_PCI) += bios32.o isa.o > diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c > index d59c36dc0494..054e799e7091 100644 > --- a/arch/arm/kernel/module.c > +++ b/arch/arm/kernel/module.c > @@ -22,38 +22,6 @@ > #include <asm/unwind.h> > #include <asm/opcodes.h> > > -#ifdef CONFIG_XIP_KERNEL > -/* > - * The XIP kernel text is mapped in the module area for modules and > - * some other stuff to work without any indirect relocations. > - * MODULES_VADDR is redefined here and not in asm/memory.h to avoid > - * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off. 
> - */ > -#undef MODULES_VADDR > -#define MODULES_VADDR (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK) > -#endif > - > -#ifdef CONFIG_MMU > -void *module_alloc(unsigned long size) > -{ > - gfp_t gfp_mask = GFP_KERNEL; > - void *p; > - > - /* Silence the initial allocation */ > - if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) > - gfp_mask |= __GFP_NOWARN; > - > - p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, > - gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, > - __builtin_return_address(0)); > - if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p) > - return p; > - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, > - GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, > - __builtin_return_address(0)); > -} > -#endif > - > bool module_init_section(const char *name) > { > return strstarts(name, ".init") || > diff --git a/arch/arm/kernel/module_alloc.c b/arch/arm/kernel/module_alloc.c > new file mode 100644 > index 000000000000..c3cfb2b60575 > --- /dev/null > +++ b/arch/arm/kernel/module_alloc.c > @@ -0,0 +1,42 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * Copyright (C) 2002 Russell King. > + * Modified for nommu by Hyok S. Choi > + * > + * Module allocation method suggested by Andi Kleen. > + */ > + > +#include <linux/mm.h> > +#include <linux/vmalloc.h> > + > +#ifdef CONFIG_XIP_KERNEL > +/* > + * The XIP kernel text is mapped in the module area for modules and > + * some other stuff to work without any indirect relocations. > + * MODULES_VADDR is redefined here and not in asm/memory.h to avoid > + * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off. 
> + */ > +#undef MODULES_VADDR > +#define MODULES_VADDR (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK) > +#endif > + > +#ifdef CONFIG_MMU > +void *module_alloc(unsigned long size) > +{ > + gfp_t gfp_mask = GFP_KERNEL; > + void *p; > + > + /* Silence the initial allocation */ > + if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) > + gfp_mask |= __GFP_NOWARN; > + > + p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, > + gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, > + __builtin_return_address(0)); > + if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p) > + return p; > + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, > + GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, > + __builtin_return_address(0)); > +} > +#endif > diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile > index fa7981d0d917..d61c02da857e 100644 > --- a/arch/arm64/kernel/Makefile > +++ b/arch/arm64/kernel/Makefile > @@ -43,6 +43,11 @@ obj-$(CONFIG_COMPAT) += sigreturn32.o > obj-$(CONFIG_KUSER_HELPERS) += kuser32.o > obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o > obj-$(CONFIG_MODULES) += module.o > +ifeq ($(CONFIG_MODULES),y) > +obj-y += module_alloc.o > +else > +obj-$(CONFIG_KPROBES) += module_alloc.o > +endif > obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o > obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o > obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o > diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c > index f2d4bb14bfab..d2a979746da0 100644 > --- a/arch/arm64/kernel/module.c > +++ b/arch/arm64/kernel/module.c > @@ -20,53 +20,6 @@ > #include <asm/insn.h> > #include <asm/sections.h> > > -void *module_alloc(unsigned long size) > -{ > - u64 module_alloc_end = module_alloc_base + MODULES_VSIZE; > - gfp_t gfp_mask = GFP_KERNEL; > - void *p; > - > - /* Silence the initial allocation */ > - if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS)) > - gfp_mask |= __GFP_NOWARN; > - > - if (IS_ENABLED(CONFIG_KASAN_GENERIC) || > - 
IS_ENABLED(CONFIG_KASAN_SW_TAGS)) > - /* don't exceed the static module region - see below */ > - module_alloc_end = MODULES_END; > - > - p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, > - module_alloc_end, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK, > - NUMA_NO_NODE, __builtin_return_address(0)); > - > - if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && > - (IS_ENABLED(CONFIG_KASAN_VMALLOC) || > - (!IS_ENABLED(CONFIG_KASAN_GENERIC) && > - !IS_ENABLED(CONFIG_KASAN_SW_TAGS)))) > - /* > - * KASAN without KASAN_VMALLOC can only deal with module > - * allocations being served from the reserved module region, > - * since the remainder of the vmalloc region is already > - * backed by zero shadow pages, and punching holes into it > - * is non-trivial. Since the module region is not randomized > - * when KASAN is enabled without KASAN_VMALLOC, it is even > - * less likely that the module region gets exhausted, so we > - * can simply omit this fallback in that case. > - */ > - p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, > - module_alloc_base + SZ_2G, GFP_KERNEL, > - PAGE_KERNEL, 0, NUMA_NO_NODE, > - __builtin_return_address(0)); > - > - if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { > - vfree(p); > - return NULL; > - } > - > - /* Memory is intended to be executable, reset the pointer tag. 
*/ > - return kasan_reset_tag(p); > -} > - > enum aarch64_reloc_op { > RELOC_OP_NONE, > RELOC_OP_ABS, > diff --git a/arch/arm64/kernel/module_alloc.c b/arch/arm64/kernel/module_alloc.c > new file mode 100644 > index 000000000000..0340c8c0b782 > --- /dev/null > +++ b/arch/arm64/kernel/module_alloc.c > @@ -0,0 +1,57 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * Copyright (C) 2012 ARM Limited > + * > + * Author: Will Deacon <will.deacon@arm.com> > + */ > + > +#include <linux/mm.h> > +#include <linux/moduleloader.h> > +#include <linux/vmalloc.h> > + > +void *module_alloc(unsigned long size) > +{ > + u64 module_alloc_end = module_alloc_base + MODULES_VSIZE; > + gfp_t gfp_mask = GFP_KERNEL; > + void *p; > + > + /* Silence the initial allocation */ > + if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS)) > + gfp_mask |= __GFP_NOWARN; > + > + if (IS_ENABLED(CONFIG_KASAN_GENERIC) || > + IS_ENABLED(CONFIG_KASAN_SW_TAGS)) > + /* don't exceed the static module region - see below */ > + module_alloc_end = MODULES_END; > + > + p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, > + module_alloc_end, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK, > + NUMA_NO_NODE, __builtin_return_address(0)); > + > + if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && > + (IS_ENABLED(CONFIG_KASAN_VMALLOC) || > + (!IS_ENABLED(CONFIG_KASAN_GENERIC) && > + !IS_ENABLED(CONFIG_KASAN_SW_TAGS)))) > + /* > + * KASAN without KASAN_VMALLOC can only deal with module > + * allocations being served from the reserved module region, > + * since the remainder of the vmalloc region is already > + * backed by zero shadow pages, and punching holes into it > + * is non-trivial. Since the module region is not randomized > + * when KASAN is enabled without KASAN_VMALLOC, it is even > + * less likely that the module region gets exhausted, so we > + * can simply omit this fallback in that case. 
> + */ > + p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, > + module_alloc_base + SZ_2G, GFP_KERNEL, > + PAGE_KERNEL, 0, NUMA_NO_NODE, > + __builtin_return_address(0)); > + > + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { > + vfree(p); > + return NULL; > + } > + > + /* Memory is intended to be executable, reset the pointer tag. */ > + return kasan_reset_tag(p); > +} > diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile > index 7c96282bff2e..cb9297b613a1 100644 > --- a/arch/mips/kernel/Makefile > +++ b/arch/mips/kernel/Makefile > @@ -38,6 +38,11 @@ obj-$(CONFIG_SYNC_R4K) += sync-r4k.o > obj-$(CONFIG_DEBUG_FS) += segment.o > obj-$(CONFIG_STACKTRACE) += stacktrace.o > obj-$(CONFIG_MODULES) += module.o > +ifeq ($(CONFIG_MODULES),y) > +obj-y += module_alloc.o > +else > +obj-$(CONFIG_KPROBES) += module_alloc.o > +endif > > obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o > obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o > diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c > index 14f46d17500a..214b2d1868a5 100644 > --- a/arch/mips/kernel/module.c > +++ b/arch/mips/kernel/module.c > @@ -31,15 +31,6 @@ struct mips_hi16 { > static LIST_HEAD(dbe_list); > static DEFINE_SPINLOCK(dbe_lock); > > -#ifdef MODULE_START > -void *module_alloc(unsigned long size) > -{ > - return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END, > - GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, > - __builtin_return_address(0)); > -} > -#endif > - > static void apply_r_mips_32(u32 *location, u32 base, Elf_Addr v) > { > *location = base + v; > diff --git a/arch/mips/kernel/module_alloc.c b/arch/mips/kernel/module_alloc.c > new file mode 100644 > index 000000000000..582ab5ed8916 > --- /dev/null > +++ b/arch/mips/kernel/module_alloc.c > @@ -0,0 +1,18 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* > + * Copyright (C) 2001 Rusty Russell. 
> + * Copyright (C) 2003, 2004 Ralf Baechle (ralf@linux-mips.org) > + * Copyright (C) 2005 Thiemo Seufer > + */ > + > +#include <linux/mm.h> > +#include <linux/vmalloc.h> > + > +#ifdef MODULE_START > +void *module_alloc(unsigned long size) > +{ > + return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END, > + GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, > + __builtin_return_address(0)); > +} > +#endif > diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile > index d0bfac89a842..cd38083ae077 100644 > --- a/arch/parisc/kernel/Makefile > +++ b/arch/parisc/kernel/Makefile > @@ -25,6 +25,11 @@ obj-$(CONFIG_SMP) += smp.o > obj-$(CONFIG_PA11) += pci-dma.o > obj-$(CONFIG_PCI) += pci.o > obj-$(CONFIG_MODULES) += module.o > +ifeq ($(CONFIG_MODULES),y) > +obj-y += module_alloc.o > +else > +obj-$(CONFIG_KPROBES) += module_alloc.o > +endif > obj-$(CONFIG_64BIT) += sys_parisc32.o signal32.o > obj-$(CONFIG_STACKTRACE)+= stacktrace.o > obj-$(CONFIG_AUDIT) += audit.o > diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c > index 7df140545b22..c81e63e2549b 100644 > --- a/arch/parisc/kernel/module.c > +++ b/arch/parisc/kernel/module.c > @@ -192,17 +192,6 @@ static inline int reassemble_22(int as22) > ((as22 & 0x0003ff) << 3)); > } > > -void *module_alloc(unsigned long size) > -{ > - /* using RWX means less protection for modules, but it's > - * easier than trying to map the text, data, init_text and > - * init_data correctly */ > - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, > - GFP_KERNEL, > - PAGE_KERNEL_RWX, 0, NUMA_NO_NODE, > - __builtin_return_address(0)); > -} > - > #ifndef CONFIG_64BIT > static inline unsigned long count_gots(const Elf_Rela *rela, unsigned long n) > { > diff --git a/arch/parisc/kernel/module_alloc.c b/arch/parisc/kernel/module_alloc.c > new file mode 100644 > index 000000000000..4e4e4ce1132e > --- /dev/null > +++ b/arch/parisc/kernel/module_alloc.c > @@ -0,0 +1,23 @@ > +// SPDX-License-Identifier: 
GPL-2.0-or-later > +/* > + * Linux/PA-RISC Project > + * Copyright (C) 2003 Randolph Chung <tausq at debian . org> > + * Copyright (C) 2008 Helge Deller <deller@gmx.de> > + */ > + > +#include <linux/mm.h> > +#include <linux/moduleloader.h> > +#include <linux/vmalloc.h> > + > +void *module_alloc(unsigned long size) > +{ > + /* > + * Using RWX means less protection for modules, but it's > + * easier than trying to map the text, data, init_text and > + * init_data correctly. > + */ > + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, > + GFP_KERNEL, > + PAGE_KERNEL_RWX, 0, NUMA_NO_NODE, > + __builtin_return_address(0)); > +} > diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile > index 2e2a2a9bcf43..5a811cdf230b 100644 > --- a/arch/powerpc/kernel/Makefile > +++ b/arch/powerpc/kernel/Makefile > @@ -103,6 +103,11 @@ obj-$(CONFIG_HIBERNATION) += swsusp_$(BITS).o > endif > obj64-$(CONFIG_HIBERNATION) += swsusp_asm64.o > obj-$(CONFIG_MODULES) += module.o module_$(BITS).o > +ifeq ($(CONFIG_MODULES),y) > +obj-y += module_alloc.o > +else > +obj-$(CONFIG_KPROBES) += module_alloc.o > +endif > obj-$(CONFIG_44x) += cpu_setup_44x.o > obj-$(CONFIG_PPC_FSL_BOOK3E) += cpu_setup_fsl_booke.o > obj-$(CONFIG_PPC_DOORBELL) += dbell.o > diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c > index f6d6ae0a1692..b30e00964a60 100644 > --- a/arch/powerpc/kernel/module.c > +++ b/arch/powerpc/kernel/module.c > @@ -88,40 +88,3 @@ int module_finalize(const Elf_Ehdr *hdr, > > return 0; > } > - > -static __always_inline void * > -__module_alloc(unsigned long size, unsigned long start, unsigned long end, bool nowarn) > -{ > - pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC; > - gfp_t gfp = GFP_KERNEL | (nowarn ? __GFP_NOWARN : 0); > - > - /* > - * Don't do huge page allocations for modules yet until more testing > - * is done. STRICT_MODULE_RWX may require extra work to support this > - * too. 
> - */ > - return __vmalloc_node_range(size, 1, start, end, gfp, prot, > - VM_FLUSH_RESET_PERMS, > - NUMA_NO_NODE, __builtin_return_address(0)); > -} > - > -void *module_alloc(unsigned long size) > -{ > -#ifdef MODULES_VADDR > - unsigned long limit = (unsigned long)_etext - SZ_32M; > - void *ptr = NULL; > - > - BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR); > - > - /* First try within 32M limit from _etext to avoid branch trampolines */ > - if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit) > - ptr = __module_alloc(size, limit, MODULES_END, true); > - > - if (!ptr) > - ptr = __module_alloc(size, MODULES_VADDR, MODULES_END, false); > - > - return ptr; > -#else > - return __module_alloc(size, VMALLOC_START, VMALLOC_END, false); > -#endif > -} > diff --git a/arch/powerpc/kernel/module_alloc.c b/arch/powerpc/kernel/module_alloc.c > new file mode 100644 > index 000000000000..48541c27ce46 > --- /dev/null > +++ b/arch/powerpc/kernel/module_alloc.c > @@ -0,0 +1,47 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* > + * Kernel module help for powerpc. > + * Copyright (C) 2001, 2003 Rusty Russell IBM Corporation. > + * Copyright (C) 2008 Freescale Semiconductor, Inc. > + */ > + > +#include <linux/mm.h> > +#include <linux/moduleloader.h> > +#include <linux/vmalloc.h> > + > +static __always_inline void * > +__module_alloc(unsigned long size, unsigned long start, unsigned long end, bool nowarn) > +{ > + pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC; > + gfp_t gfp = GFP_KERNEL | (nowarn ? __GFP_NOWARN : 0); > + > + /* > + * Don't do huge page allocations for modules yet until more testing > + * is done. STRICT_MODULE_RWX may require extra work to support this > + * too. 
> + */ > + return __vmalloc_node_range(size, 1, start, end, gfp, prot, > + VM_FLUSH_RESET_PERMS, > + NUMA_NO_NODE, __builtin_return_address(0)); > +} > + > +void *module_alloc(unsigned long size) > +{ > +#ifdef MODULES_VADDR > + unsigned long limit = (unsigned long)_etext - SZ_32M; > + void *ptr = NULL; > + > + BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR); > + > + /* First try within 32M limit from _etext to avoid branch trampolines */ > + if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit) > + ptr = __module_alloc(size, limit, MODULES_END, true); > + > + if (!ptr) > + ptr = __module_alloc(size, MODULES_VADDR, MODULES_END, false); > + > + return ptr; > +#else > + return __module_alloc(size, VMALLOC_START, VMALLOC_END, false); > +#endif > +} > diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile > index c71d6591d539..7af346ae575e 100644 > --- a/arch/riscv/kernel/Makefile > +++ b/arch/riscv/kernel/Makefile > @@ -61,6 +61,11 @@ obj-$(CONFIG_SMP) += cpu_ops.o > > obj-$(CONFIG_RISCV_BOOT_SPINWAIT) += cpu_ops_spinwait.o > obj-$(CONFIG_MODULES) += module.o > +ifeq ($(CONFIG_MODULES),y) > +obj-y += module_alloc.o > +else > +obj-$(CONFIG_KPROBES) += module_alloc.o > +endif > obj-$(CONFIG_MODULE_SECTIONS) += module-sections.o > > obj-$(CONFIG_CPU_PM) += suspend_entry.o suspend.o > diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c > index 91fe16bfaa07..1621d5e458f7 100644 > --- a/arch/riscv/kernel/module.c > +++ b/arch/riscv/kernel/module.c > @@ -419,16 +419,6 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, > return 0; > } > > -#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) > -void *module_alloc(unsigned long size) > -{ > - return __vmalloc_node_range(size, 1, MODULES_VADDR, > - MODULES_END, GFP_KERNEL, > - PAGE_KERNEL, 0, NUMA_NO_NODE, > - __builtin_return_address(0)); > -} > -#endif > - > static const Elf_Shdr *find_section(const Elf_Ehdr *hdr, > const Elf_Shdr *sechdrs, > const char *name) > diff --git 
a/arch/riscv/kernel/module_alloc.c b/arch/riscv/kernel/module_alloc.c > new file mode 100644 > index 000000000000..ed87ed048713 > --- /dev/null > +++ b/arch/riscv/kernel/module_alloc.c > @@ -0,0 +1,19 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* > + * Copyright (C) 2017 Zihao Yu > + */ > + > +#include <linux/mm.h> > +#include <linux/moduleloader.h> > +#include <linux/vmalloc.h> > +#include <asm/sections.h> > + > +#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) > +void *module_alloc(unsigned long size) > +{ > + return __vmalloc_node_range(size, 1, MODULES_VADDR, > + MODULES_END, GFP_KERNEL, > + PAGE_KERNEL, 0, NUMA_NO_NODE, > + __builtin_return_address(0)); > +} > +#endif > diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile > index 27d6b3c7aa06..71e73e34b441 100644 > --- a/arch/s390/kernel/Makefile > +++ b/arch/s390/kernel/Makefile > @@ -48,6 +48,11 @@ obj-$(CONFIG_SYSFS) += nospec-sysfs.o > CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE) > > obj-$(CONFIG_MODULES) += module.o > +ifeq ($(CONFIG_MODULES),y) > +obj-y += module_alloc.o > +else > +obj-$(CONFIG_KPROBES) += module_alloc.o > +endif > obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o > obj-$(CONFIG_NUMA) += numa.o > obj-$(CONFIG_AUDIT) += audit.o > diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c > index 26125a9c436d..e15f09fa50e2 100644 > --- a/arch/s390/kernel/module.c > +++ b/arch/s390/kernel/module.c > @@ -35,23 +35,6 @@ > > #define PLT_ENTRY_SIZE 22 > > -void *module_alloc(unsigned long size) > -{ > - gfp_t gfp_mask = GFP_KERNEL; > - void *p; > - > - if (PAGE_ALIGN(size) > MODULES_LEN) > - return NULL; > - p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, > - gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, > - __builtin_return_address(0)); > - if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { > - vfree(p); > - return NULL; > - } > - return p; > -} > - > #ifdef CONFIG_FUNCTION_TRACER > void 
module_arch_cleanup(struct module *mod) > { > diff --git a/arch/s390/kernel/module_alloc.c b/arch/s390/kernel/module_alloc.c > new file mode 100644 > index 000000000000..e4c4175fb759 > --- /dev/null > +++ b/arch/s390/kernel/module_alloc.c > @@ -0,0 +1,33 @@ > +// SPDX-License-Identifier: GPL-2.0+ > +/* > + * Kernel module help for s390. > + * > + * S390 version > + * Copyright IBM Corp. 2002, 2003 > + * Author(s): Arnd Bergmann (arndb@de.ibm.com) > + * Martin Schwidefsky (schwidefsky@de.ibm.com) > + * > + * based on i386 version > + * Copyright (C) 2001 Rusty Russell. > + */ > + > +#include <linux/mm.h> > +#include <linux/moduleloader.h> > +#include <linux/vmalloc.h> > + > +void *module_alloc(unsigned long size) > +{ > + gfp_t gfp_mask = GFP_KERNEL; > + void *p; > + > + if (PAGE_ALIGN(size) > MODULES_LEN) > + return NULL; > + p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, > + gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, > + __builtin_return_address(0)); > + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { > + vfree(p); > + return NULL; > + } > + return p; > +} > diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile > index d3a0e072ebe8..53905c00a094 100644 > --- a/arch/sparc/kernel/Makefile > +++ b/arch/sparc/kernel/Makefile > @@ -89,6 +89,11 @@ obj-$(CONFIG_SUN_PM) += apc.o pmc.o > > obj-$(CONFIG_MODULES) += module.o > obj-$(CONFIG_MODULES) += sparc_ksyms.o > +ifeq ($(CONFIG_MODULES),y) > +obj-y += module_alloc.o > +else > +obj-$(CONFIG_KPROBES) += module_alloc.o > +endif > obj-$(CONFIG_SPARC_LED) += led.o > obj-$(CONFIG_KGDB) += kgdb_$(BITS).o > > diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c > index df39580f398d..f2babc69f189 100644 > --- a/arch/sparc/kernel/module.c > +++ b/arch/sparc/kernel/module.c > @@ -21,36 +21,6 @@ > > #include "entry.h" > > -#ifdef CONFIG_SPARC64 > - > -#include <linux/jump_label.h> > - > -static void *module_map(unsigned long size) > -{ > - if 
(PAGE_ALIGN(size) > MODULES_LEN) > - return NULL; > - return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, > - GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, > - __builtin_return_address(0)); > -} > -#else > -static void *module_map(unsigned long size) > -{ > - return vmalloc(size); > -} > -#endif /* CONFIG_SPARC64 */ > - > -void *module_alloc(unsigned long size) > -{ > - void *ret; > - > - ret = module_map(size); > - if (ret) > - memset(ret, 0, size); > - > - return ret; > -} > - > /* Make generic code ignore STT_REGISTER dummy undefined symbols. */ > int module_frob_arch_sections(Elf_Ehdr *hdr, > Elf_Shdr *sechdrs, > diff --git a/arch/sparc/kernel/module_alloc.c b/arch/sparc/kernel/module_alloc.c > new file mode 100644 > index 000000000000..4bacf8cff012 > --- /dev/null > +++ b/arch/sparc/kernel/module_alloc.c > @@ -0,0 +1,39 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* Kernel module help for sparc64. > + * > + * Copyright (C) 2001 Rusty Russell. > + * Copyright (C) 2002 David S. Miller. 
> + */ > + > +#include <linux/mm.h> > +#include <linux/moduleloader.h> > +#include <linux/vmalloc.h> > + > +#ifdef CONFIG_SPARC64 > +#include <linux/jump_label.h> > + > +static void *module_map(unsigned long size) > +{ > + if (PAGE_ALIGN(size) > MODULES_LEN) > + return NULL; > + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, > + GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, > + __builtin_return_address(0)); > +} > +#else > +static void *module_map(unsigned long size) > +{ > + return vmalloc(size); > +} > +#endif /* CONFIG_SPARC64 */ > + > +void *module_alloc(unsigned long size) > +{ > + void *ret; > + > + ret = module_map(size); > + if (ret) > + memset(ret, 0, size); > + > + return ret; > +} > diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile > index 03364dc40d8d..1f0304140f44 100644 > --- a/arch/x86/kernel/Makefile > +++ b/arch/x86/kernel/Makefile > @@ -108,6 +108,11 @@ obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o > obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o > obj-y += kprobes/ > obj-$(CONFIG_MODULES) += module.o > +ifeq ($(CONFIG_MODULES),y) > +obj-y += module_alloc.o > +else > +obj-$(CONFIG_KPROBES) += module_alloc.o > +endif > obj-$(CONFIG_X86_32) += doublefault_32.o > obj-$(CONFIG_KGDB) += kgdb.o > obj-$(CONFIG_VM86) += vm86_32.o > diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c > index b98ffcf4d250..40bb10b5a68d 100644 > --- a/arch/x86/kernel/module.c > +++ b/arch/x86/kernel/module.c > @@ -36,56 +36,6 @@ do { \ > } while (0) > #endif > > -#ifdef CONFIG_RANDOMIZE_BASE > -static unsigned long module_load_offset; > - > -/* Mutex protects the module_load_offset. */ > -static DEFINE_MUTEX(module_kaslr_mutex); > - > -static unsigned long int get_module_load_offset(void) > -{ > - if (kaslr_enabled()) { > - mutex_lock(&module_kaslr_mutex); > - /* > - * Calculate the module_load_offset the first time this > - * code is called. Once calculated it stays the same until > - * reboot. 
> - */ > - if (module_load_offset == 0) > - module_load_offset = > - (get_random_int() % 1024 + 1) * PAGE_SIZE; > - mutex_unlock(&module_kaslr_mutex); > - } > - return module_load_offset; > -} > -#else > -static unsigned long int get_module_load_offset(void) > -{ > - return 0; > -} > -#endif > - > -void *module_alloc(unsigned long size) > -{ > - gfp_t gfp_mask = GFP_KERNEL; > - void *p; > - > - if (PAGE_ALIGN(size) > MODULES_LEN) > - return NULL; > - > - p = __vmalloc_node_range(size, MODULE_ALIGN, > - MODULES_VADDR + get_module_load_offset(), > - MODULES_END, gfp_mask, > - PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, > - __builtin_return_address(0)); > - if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { > - vfree(p); > - return NULL; > - } > - > - return p; > -} > - > #ifdef CONFIG_X86_32 > int apply_relocate(Elf32_Shdr *sechdrs, > const char *strtab, > diff --git a/arch/x86/kernel/module_alloc.c b/arch/x86/kernel/module_alloc.c > new file mode 100644 > index 000000000000..a7df2af1467a > --- /dev/null > +++ b/arch/x86/kernel/module_alloc.c > @@ -0,0 +1,61 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* > + * Kernel module help for x86. > + * Copyright (C) 2001 Rusty Russell. > + */ > + > +#include <linux/mm.h> > +#include <linux/moduleloader.h> > +#include <linux/random.h> > +#include <linux/vmalloc.h> > +#include <asm/setup.h> > + > +#ifdef CONFIG_RANDOMIZE_BASE > +static unsigned long module_load_offset; > + > +/* Mutex protects the module_load_offset. */ > +static DEFINE_MUTEX(module_kaslr_mutex); > + > +static unsigned long get_module_load_offset(void) > +{ > + if (kaslr_enabled()) { > + mutex_lock(&module_kaslr_mutex); > + /* > + * Calculate the module_load_offset the first time this > + * code is called. Once calculated it stays the same until > + * reboot. 
> + */ > + if (module_load_offset == 0) > + module_load_offset = > + (get_random_int() % 1024 + 1) * PAGE_SIZE; > + mutex_unlock(&module_kaslr_mutex); > + } > + return module_load_offset; > +} > +#else > +static unsigned long get_module_load_offset(void) > +{ > + return 0; > +} > +#endif > + > +void *module_alloc(unsigned long size) > +{ > + gfp_t gfp_mask = GFP_KERNEL; > + void *p; > + > + if (PAGE_ALIGN(size) > MODULES_LEN) > + return NULL; > + > + p = __vmalloc_node_range(size, MODULE_ALIGN, > + MODULES_VADDR + get_module_load_offset(), > + MODULES_END, gfp_mask, > + PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, > + __builtin_return_address(0)); > + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { > + vfree(p); > + return NULL; > + } > + > + return p; > +} > diff --git a/kernel/Makefile b/kernel/Makefile > index 318789c728d3..2981fe42060d 100644 > --- a/kernel/Makefile > +++ b/kernel/Makefile > @@ -53,6 +53,11 @@ obj-y += livepatch/ > obj-y += dma/ > obj-y += entry/ > obj-$(CONFIG_MODULES) += module/ > +ifeq ($(CONFIG_MODULES),y) > +obj-y += module_alloc.o > +else > +obj-$(CONFIG_KPROBES) += module_alloc.o > +endif > > obj-$(CONFIG_KCMP) += kcmp.o > obj-$(CONFIG_FREEZER) += freezer.o > diff --git a/kernel/kprobes.c b/kernel/kprobes.c > index f214f8c088ed..3f9876374cd3 100644 > --- a/kernel/kprobes.c > +++ b/kernel/kprobes.c > @@ -1569,6 +1569,7 @@ static int check_kprobe_address_safe(struct kprobe *p, > goto out; > } > > +#ifdef CONFIG_MODULES > /* Check if 'p' is probing a module. 
*/ > *probed_mod = __module_text_address((unsigned long) p->addr); > if (*probed_mod) { > @@ -1592,6 +1593,8 @@ static int check_kprobe_address_safe(struct kprobe *p, > ret = -ENOENT; > } > } > +#endif > + > out: > preempt_enable(); > jump_label_unlock(); > @@ -2475,6 +2478,7 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end) > return 0; > } > > +#ifdef CONFIG_MODULES > /* Remove all symbols in given area from kprobe blacklist */ > static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) > { > @@ -2492,6 +2496,7 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry) > { > kprobe_remove_area_blacklist(entry, entry + 1); > } > +#endif /* CONFIG_MODULES */ > > int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, > char *type, char *sym) > @@ -2557,6 +2562,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start, > return ret ? : arch_populate_kprobe_blacklist(); > } > > +#ifdef CONFIG_MODULES > static void add_module_kprobe_blacklist(struct module *mod) > { > unsigned long start, end; > @@ -2658,6 +2664,7 @@ static struct notifier_block kprobe_module_nb = { > .notifier_call = kprobes_module_callback, > .priority = 0 > }; > +#endif /* CONFIG_MODULES */ > > void kprobe_free_init_mem(void) > { > @@ -2717,8 +2724,11 @@ static int __init init_kprobes(void) > err = arch_init_kprobes(); > if (!err) > err = register_die_notifier(&kprobe_exceptions_nb); > + > +#ifdef CONFIG_MODULES > if (!err) > err = register_module_notifier(&kprobe_module_nb); > +#endif > > kprobes_initialized = (err == 0); > kprobe_sysctls_init(); > diff --git a/kernel/module/main.c b/kernel/module/main.c > index fed58d30725d..7fa182b78550 100644 > --- a/kernel/module/main.c > +++ b/kernel/module/main.c > @@ -1121,16 +1121,6 @@ resolve_symbol_wait(struct module *mod, > return ksym; > } > > -void __weak module_memfree(void *module_region) > -{ > - /* > - * This memory may be RO, and freeing RO memory in an 
interrupt is not > - * supported by vmalloc. > - */ > - WARN_ON(in_interrupt()); > - vfree(module_region); > -} > - > void __weak module_arch_cleanup(struct module *mod) > { > } > @@ -1606,13 +1596,6 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) > ddebug_remove_module(mod->name); > } > > -void * __weak module_alloc(unsigned long size) > -{ > - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, > - GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, > - NUMA_NO_NODE, __builtin_return_address(0)); > -} > - > bool __weak module_init_section(const char *name) > { > return strstarts(name, ".init"); > diff --git a/kernel/module_alloc.c b/kernel/module_alloc.c > new file mode 100644 > index 000000000000..26a4c60998ad > --- /dev/null > +++ b/kernel/module_alloc.c > @@ -0,0 +1,26 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* > + * Copyright (C) 2002 Richard Henderson > + * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. > + */ > + > +#include <linux/mm.h> > +#include <linux/moduleloader.h> > +#include <linux/vmalloc.h> > + > +void * __weak module_alloc(unsigned long size) > +{ > + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, > + GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, > + NUMA_NO_NODE, __builtin_return_address(0)); > +} > + > +void __weak module_memfree(void *module_region) > +{ > + /* > + * This memory may be RO, and freeing RO memory in an interrupt is not > + * supported by vmalloc. 
> + */ > + WARN_ON(in_interrupt()); > + vfree(module_region); > +} > diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c > index 93507330462c..050b2975332e 100644 > --- a/kernel/trace/trace_kprobe.c > +++ b/kernel/trace/trace_kprobe.c > @@ -101,6 +101,7 @@ static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk) > return kprobe_gone(&tk->rp.kp); > } > > +#ifdef CONFIG_MODULES > static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, > struct module *mod) > { > @@ -109,11 +110,13 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, > > return strncmp(module_name(mod), name, len) == 0 && name[len] == ':'; > } > +#endif /* CONFIG_MODULES */ > > static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) > { > + bool ret = false; > +#ifdef CONFIG_MODULES > char *p; > - bool ret; > > if (!tk->symbol) > return false; > @@ -125,6 +128,7 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) > ret = !!find_module(tk->symbol); > rcu_read_unlock_sched(); > *p = ':'; > +#endif /* CONFIG_MODULES */ > > return ret; > } > @@ -668,6 +672,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) > return ret; > } > > +#ifdef CONFIG_MODULES > /* Module notifier call back, checking event on the module */ > static int trace_kprobe_module_callback(struct notifier_block *nb, > unsigned long val, void *data) > @@ -702,6 +707,7 @@ static struct notifier_block trace_kprobe_module_nb = { > .notifier_call = trace_kprobe_module_callback, > .priority = 1 /* Invoked after kprobe module callback */ > }; > +#endif /* CONFIG_MODULES */ > > static int __trace_kprobe_create(int argc, const char *argv[]) > { > @@ -1896,8 +1902,10 @@ static __init int init_kprobe_trace_early(void) > if (ret) > return ret; > > +#ifdef CONFIG_MODULES > if (register_module_notifier(&trace_kprobe_module_nb)) > return -EINVAL; > +#endif /* CONFIG_MODULES */ > > return 0; > } > -- > 
2.36.1 >
On Wed, Jun 8, 2022 at 9:28 AM Ard Biesheuvel <ardb@kernel.org> wrote: > > Hello Jarkko, > > On Wed, 8 Jun 2022 at 02:02, Jarkko Sakkinen <jarkko@profian.com> wrote: > > > > Tracing with kprobes while running a monolithic kernel is currently > > impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This > > dependency is a result of kprobes code using the module allocator for the > > trampoline code. > > > > Detaching kprobes from modules helps to squeeze down the user space, > > e.g. when developing new core kernel features, while still having all > > the nice tracing capabilities. > > > > For kernel/ and arch/*, move module_alloc() and module_memfree() to > > module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES > > or CONFIG_KPROBES is enabled. In addition, flag kernel module specific > > code with CONFIG_MODULES. > > > > As the result, kprobes can be used with a monolithic kernel. > > I think I may have mentioned this the previous time as well, but I > don't think this is the right approach. > > Kprobes uses alloc_insn_page() to allocate executable memory, but the > requirements for this memory are radically different compared to > loadable modules, which need to be within an arch-specific distance of > the core kernel, need KASAN backing etc etc. I think the distance of core kernel requirement is the same for kprobe alloc_insn_page and modules, no? Thanks, Song > > This is why arm64, for instance, does not implement alloc_insn_page() > in terms of module_alloc() [and likely does not belong in this patch > for that reason] > > Is there any reason kprobes cannot simply use vmalloc()? >
On Wed, Jun 8, 2022 at 9:12 AM Song Liu <song@kernel.org> wrote: > > On Wed, Jun 8, 2022 at 7:21 AM Masami Hiramatsu <mhiramat@kernel.org> wrote: > > > > Hi Jarkko, > > > > On Wed, 8 Jun 2022 08:25:38 +0300 > > Jarkko Sakkinen <jarkko@kernel.org> wrote: > > > > > On Wed, Jun 08, 2022 at 10:35:42AM +0800, Guo Ren wrote: > > > > . > > > > > > > > On Wed, Jun 8, 2022 at 8:02 AM Jarkko Sakkinen <jarkko@profian.com> wrote: > > > > > > > > > > Tracing with kprobes while running a monolithic kernel is currently > > > > > impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This > > > > > dependency is a result of kprobes code using the module allocator for the > > > > > trampoline code. > > > > > > > > > > Detaching kprobes from modules helps to squeeze down the user space, > > > > > e.g. when developing new core kernel features, while still having all > > > > > the nice tracing capabilities. > > > > > > > > > > For kernel/ and arch/*, move module_alloc() and module_memfree() to > > > > > module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES > > > > > or CONFIG_KPROBES is enabled. In addition, flag kernel module specific > > > > > code with CONFIG_MODULES. > > > > > > > > > > As the result, kprobes can be used with a monolithic kernel. > > > > It's strange when MODULES is n, but vmlinux still obtains module_alloc. > > > > > > > > Maybe we need a kprobe_alloc, right? > > > > > > Perhaps not the best name but at least it documents the fact that > > > they use the same allocator. > > > > > > Few years ago I carved up something "half-way there" for kprobes, > > > and I used the name text_alloc() [*]. > > > > > > [*] https://lore.kernel.org/all/20200724050553.1724168-1-jarkko.sakkinen@linux.intel.com/ > > > > Yeah, I remember that. Thank you for updating your patch! > > I think the idea (split module_alloc() from CONFIG_MODULE) is good to me. 
> > If module support maintainers think this name is not good, you may be > > able to rename it as text_alloc() and make the module_alloc() as a > > wrapper of it. > > IIUC, most users of module_alloc() use it to allocate memory for text, except > that module code uses it for both text and data. Therefore, I guess calling it > text_alloc() is not 100% accurate until we change the module code (to use > a different API to allocate memory for data). Git history showed me 7a0e27b2a0ce mm: remove vmalloc_exec I guess we are somehow going back in time... Song > > Thanks, > Song > > > > > Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> > > for kprobe side. > > > > Thank you, > > > > -- > > Masami Hiramatsu (Google) <mhiramat@kernel.org>
On Wed, Jun 08, 2022 at 11:20:53AM -0700, Song Liu wrote: > On Wed, Jun 8, 2022 at 9:12 AM Song Liu <song@kernel.org> wrote: > > On Wed, Jun 8, 2022 at 7:21 AM Masami Hiramatsu <mhiramat@kernel.org> wrote: > > > On Wed, 8 Jun 2022 08:25:38 +0300 > > > Jarkko Sakkinen <jarkko@kernel.org> wrote: > > > > On Wed, Jun 08, 2022 at 10:35:42AM +0800, Guo Ren wrote: > > > > > On Wed, Jun 8, 2022 at 8:02 AM Jarkko Sakkinen <jarkko@profian.com> wrote: > > > > > > As the result, kprobes can be used with a monolithic kernel. > > > > > It's strange when MODULES is n, but vmlinux still obtains module_alloc. > > > > > > > > > > Maybe we need a kprobe_alloc, right? > > > > > > > > Perhaps not the best name but at least it documents the fact that > > > > they use the same allocator. > > > > > > > > Few years ago I carved up something "half-way there" for kprobes, > > > > and I used the name text_alloc() [*]. > > > > > > > > [*] https://lore.kernel.org/all/20200724050553.1724168-1-jarkko.sakkinen@linux.intel.com/ > > > > > > Yeah, I remember that. Thank you for updating your patch! > > > I think the idea (split module_alloc() from CONFIG_MODULE) is good to me. > > > If module support maintainers think this name is not good, you may be > > > able to rename it as text_alloc() and make the module_alloc() as a > > > wrapper of it. > > > > IIUC, most users of module_alloc() use it to allocate memory for text, except > > that module code uses it for both text and data. Therefore, I guess calling it > > text_alloc() is not 100% accurate until we change the module code (to use > > a different API to allocate memory for data). > > Git history showed me > > 7a0e27b2a0ce mm: remove vmalloc_exec > > I guess we are somehow going back in time... No, that was removed because it has only one user. The real hard work to generalize vmalloc_exec() with all the arch special sauce was not done. To do this properly architectures must be able to override it. 
We can use the old vmalloc_exec() or text_alloc(). I think vmalloc_exec() is more in line with mm stuff, but it would be our first __weak mm call from what I can tell. Anyway patches welcomed. Luis
On Wed, Jun 08, 2022 at 01:26:19PM -0700, Luis Chamberlain wrote:
> No, that was removed because it has only one user.
That is only part of the story. The other part is that the overall
kernel simply does not have any business allocating executable memory.
Executable memory is a very special concept for modules or module-like
code like kprobes, and should not be exposed as a general concept.
Especially as executable memory really should not also be writable
for security reasons. In other words, we should actually never
allocate executable memory, ever.  We might seal memory and then
mark it executable after having written to it, which is how modules
and kprobes are implemented on all modern Linux ports anyway.
On Wed, Jun 08, 2022 at 06:27:51PM +0200, Ard Biesheuvel wrote: > Hello Jarkko, > > On Wed, 8 Jun 2022 at 02:02, Jarkko Sakkinen <jarkko@profian.com> wrote: > > > > Tracing with kprobes while running a monolithic kernel is currently > > impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This > > dependency is a result of kprobes code using the module allocator for the > > trampoline code. > > > > Detaching kprobes from modules helps to squeeze down the user space, > > e.g. when developing new core kernel features, while still having all > > the nice tracing capabilities. > > > > For kernel/ and arch/*, move module_alloc() and module_memfree() to > > module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES > > or CONFIG_KPROBES is enabled. In addition, flag kernel module specific > > code with CONFIG_MODULES. > > > > As the result, kprobes can be used with a monolithic kernel. > > I think I may have mentioned this the previous time as well, but I > don't think this is the right approach. OK, I apologize for my ignorance. It's been a while. > Kprobes uses alloc_insn_page() to allocate executable memory, but the > requirements for this memory are radically different compared to > loadable modules, which need to be within an arch-specific distance of > the core kernel, need KASAN backing etc etc. > > This is why arm64, for instance, does not implement alloc_insn_page() > in terms of module_alloc() [and likely does not belong in this patch > for that reason] > > Is there any reason kprobes cannot simply use vmalloc()? All arch's, except nios2 use vmalloc() in the end for module_alloc(). nios2 uses kmalloc() for the reasons that I'm not aware of, but it does not support kprobes in the first place. Based on this, I think that could work out just fine. I could cope with that. BR, Jarkko
On Wed, Jun 08, 2022 at 02:59:27AM +0300, Jarkko Sakkinen wrote: > diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile > index 553866751e1a..d2bb954cd54f 100644 > --- a/arch/arm/kernel/Makefile > +++ b/arch/arm/kernel/Makefile > @@ -44,6 +44,11 @@ obj-$(CONFIG_CPU_IDLE) += cpuidle.o > obj-$(CONFIG_ISA_DMA_API) += dma.o > obj-$(CONFIG_FIQ) += fiq.o fiqasm.o > obj-$(CONFIG_MODULES) += armksyms.o module.o > +ifeq ($(CONFIG_MODULES),y) > +obj-y += module_alloc.o > +else > +obj-$(CONFIG_KPROBES) += module_alloc.o > +endif Doesn't: obj-$(CONFIG_MODULES) += module_alloc.o obj-$(CONFIG_KPROBES) += module_alloc.o work just as well? The kbuild modules.rst documentation says: The order of files in $(obj-y) is significant. Duplicates in the lists are allowed: the first instance will be linked into built-in.a and succeeding instances will be ignored. so you should be fine... or the documentation is wrong!
On Thu, Jun 09, 2022 at 08:47:38AM +0100, Russell King (Oracle) wrote: > On Wed, Jun 08, 2022 at 02:59:27AM +0300, Jarkko Sakkinen wrote: > > diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile > > index 553866751e1a..d2bb954cd54f 100644 > > --- a/arch/arm/kernel/Makefile > > +++ b/arch/arm/kernel/Makefile > > @@ -44,6 +44,11 @@ obj-$(CONFIG_CPU_IDLE) += cpuidle.o > > obj-$(CONFIG_ISA_DMA_API) += dma.o > > obj-$(CONFIG_FIQ) += fiq.o fiqasm.o > > obj-$(CONFIG_MODULES) += armksyms.o module.o > > +ifeq ($(CONFIG_MODULES),y) > > +obj-y += module_alloc.o > > +else > > +obj-$(CONFIG_KPROBES) += module_alloc.o > > +endif > > Doesn't: > > obj-$(CONFIG_MODULES) += module_alloc.o > obj-$(CONFIG_KPROBES) += module_alloc.o > > work just as well? The kbuild modules.rst documentation says: > > The order of files in $(obj-y) is significant. Duplicates in > the lists are allowed: the first instance will be linked into > built-in.a and succeeding instances will be ignored. > > so you should be fine... or the documentation is wrong! OK, I did not know this. Thanks for the tip! BR, Jarkko
On Thu, Jun 09, 2022 at 08:30:12AM +0000, Christophe Leroy wrote: > > > Le 08/06/2022 à 01:59, Jarkko Sakkinen a écrit : > > [You don't often get email from jarkko@profian.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ] > > > > Tracing with kprobes while running a monolithic kernel is currently > > impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This > > dependency is a result of kprobes code using the module allocator for the > > trampoline code. > > > > Detaching kprobes from modules helps to squeeze down the user space, > > e.g. when developing new core kernel features, while still having all > > the nice tracing capabilities. > > Nice idea, could also be nice to have BPF without MODULES. Yeah, for sure. You have to start from somewhere :-) I'd guess this a step forward also for BPF. > > > > For kernel/ and arch/*, move module_alloc() and module_memfree() to > > module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES > > or CONFIG_KPROBES is enabled. In addition, flag kernel module specific > > code with CONFIG_MODULES. > > Nice, but that's not enough. You have to audit every peace of code that > depends on CONFIG_MODULES and see if it needs to be activated for your > case as well. For instance some powerpc configurations don't honor exec > page faults on kernel pages when CONFIG_MODULES is not selected. Thanks for pointing this out. With "every peace of code" you probably are referring to the 13 arch-folders, which support kprobes in the first place (just checking)? > > As the result, kprobes can be used with a monolithic kernel. > > > > Signed-off-by: Jarkko Sakkinen <jarkko@profian.com> > > I think this patch should be split in a several patches, one (or even > one per architectures ?) to make modules_alloc() independant of > CONFIG_MODULES, then a patch to make CONFIG_KPROBES independant on > CONFIG_MOUDLES. Agreed. And also because of your previous remark, i.e. 
each arch needs it own conclusions of the changes. I purposely did this first as a one patch in order to get a better picture of the situation. > > --- > > Tested with the help of BuildRoot and QEMU: > > - arm (function tracer) > > - arm64 (function tracer) > > - mips (function tracer) > > - powerpc (function tracer) > > - riscv (function tracer) > > - s390 (function tracer) > > - sparc (function tracer) > > - x86 (function tracer) > > - sh (function tracer, for the "pure" kernel/modules_alloc.c path) > > --- > > arch/Kconfig | 1 - > > arch/arm/kernel/Makefile | 5 +++ > > arch/arm/kernel/module.c | 32 ---------------- > > arch/arm/kernel/module_alloc.c | 42 ++++++++++++++++++++ > > arch/arm64/kernel/Makefile | 5 +++ > > arch/arm64/kernel/module.c | 47 ----------------------- > > arch/arm64/kernel/module_alloc.c | 57 ++++++++++++++++++++++++++++ > > arch/mips/kernel/Makefile | 5 +++ > > arch/mips/kernel/module.c | 9 ----- > > arch/mips/kernel/module_alloc.c | 18 +++++++++ > > arch/parisc/kernel/Makefile | 5 +++ > > arch/parisc/kernel/module.c | 11 ------ > > arch/parisc/kernel/module_alloc.c | 23 +++++++++++ > > arch/powerpc/kernel/Makefile | 5 +++ > > arch/powerpc/kernel/module.c | 37 ------------------ > > arch/powerpc/kernel/module_alloc.c | 47 +++++++++++++++++++++++ > > You are missing necessary changes for powerpc. > > On powerpc 8xx or powerpc 603, software TLB handlers don't honor > instruction TLB miss when CONFIG_MODULES are not set, look into > head_8xx.S and head_book3s_32.S > > On powerpc book3s/32, all kernel space is set to NX except the module > segment. When CONFIG_MODULES is all space is set NX. See > mmu_mark_initmem_nx() and is_module_segment(). Thank you! I'll go this through and also try to build an environment with BuildRoot where I can test-run this configuration. 
> > arch/riscv/kernel/Makefile | 5 +++ > > arch/riscv/kernel/module.c | 10 ----- > > arch/riscv/kernel/module_alloc.c | 19 ++++++++++ > > arch/s390/kernel/Makefile | 5 +++ > > arch/s390/kernel/module.c | 17 --------- > > arch/s390/kernel/module_alloc.c | 33 ++++++++++++++++ > > arch/sparc/kernel/Makefile | 5 +++ > > arch/sparc/kernel/module.c | 30 --------------- > > arch/sparc/kernel/module_alloc.c | 39 +++++++++++++++++++ > > arch/x86/kernel/Makefile | 5 +++ > > arch/x86/kernel/module.c | 50 ------------------------ > > arch/x86/kernel/module_alloc.c | 61 ++++++++++++++++++++++++++++++ > > kernel/Makefile | 5 +++ > > kernel/kprobes.c | 10 +++++ > > kernel/module/main.c | 17 --------- > > kernel/module_alloc.c | 26 +++++++++++++ > > kernel/trace/trace_kprobe.c | 10 ++++- > > 33 files changed, 434 insertions(+), 262 deletions(-) > > create mode 100644 arch/arm/kernel/module_alloc.c > > create mode 100644 arch/arm64/kernel/module_alloc.c > > create mode 100644 arch/mips/kernel/module_alloc.c > > create mode 100644 arch/parisc/kernel/module_alloc.c > > create mode 100644 arch/powerpc/kernel/module_alloc.c > > create mode 100644 arch/riscv/kernel/module_alloc.c > > create mode 100644 arch/s390/kernel/module_alloc.c > > create mode 100644 arch/sparc/kernel/module_alloc.c > > create mode 100644 arch/x86/kernel/module_alloc.c > > create mode 100644 kernel/module_alloc.c > > > > diff --git a/arch/Kconfig b/arch/Kconfig > > index fcf9a41a4ef5..e8e3e7998a2e 100644 > > --- a/arch/Kconfig > > +++ b/arch/Kconfig > > @@ -39,7 +39,6 @@ config GENERIC_ENTRY > > > > config KPROBES > > bool "Kprobes" > > - depends on MODULES > > depends on HAVE_KPROBES > > select KALLSYMS > > select TASKS_RCU if PREEMPTION > > > diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile > > index 2e2a2a9bcf43..5a811cdf230b 100644 > > --- a/arch/powerpc/kernel/Makefile > > +++ b/arch/powerpc/kernel/Makefile > > @@ -103,6 +103,11 @@ obj-$(CONFIG_HIBERNATION) += swsusp_$(BITS).o > > 
endif > > obj64-$(CONFIG_HIBERNATION) += swsusp_asm64.o > > obj-$(CONFIG_MODULES) += module.o module_$(BITS).o > > +ifeq ($(CONFIG_MODULES),y) > > +obj-y += module_alloc.o > > +else > > +obj-$(CONFIG_KPROBES) += module_alloc.o > > +endif > > Why not just do: > > obj-$(CONFIG_MODULES) += module_alloc.o > obj-$(CONFIG_KPROBES) += module_alloc.o > > However, a new hidden config item (eg: CONFIG_DYNAMIC_TEXT) selected by > both CONFIG_MODULES and CONFIG_KPROBES would make like easier when > you'll come to do the changes required. I'll do this. Russell King also pointed out the same thing. > > obj-$(CONFIG_44x) += cpu_setup_44x.o > > obj-$(CONFIG_PPC_FSL_BOOK3E) += cpu_setup_fsl_booke.o > > obj-$(CONFIG_PPC_DOORBELL) += dbell.o > > diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c > > index f6d6ae0a1692..b30e00964a60 100644 > > --- a/arch/powerpc/kernel/module.c > > +++ b/arch/powerpc/kernel/module.c > > @@ -88,40 +88,3 @@ int module_finalize(const Elf_Ehdr *hdr, > > > > return 0; > > } > > - > > -static __always_inline void * > > -__module_alloc(unsigned long size, unsigned long start, unsigned long end, bool nowarn) > > -{ > > - pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC; > > - gfp_t gfp = GFP_KERNEL | (nowarn ? __GFP_NOWARN : 0); > > - > > - /* > > - * Don't do huge page allocations for modules yet until more testing > > - * is done. STRICT_MODULE_RWX may require extra work to support this > > - * too. 
> > - */ > > - return __vmalloc_node_range(size, 1, start, end, gfp, prot, > > - VM_FLUSH_RESET_PERMS, > > - NUMA_NO_NODE, __builtin_return_address(0)); > > -} > > - > > -void *module_alloc(unsigned long size) > > -{ > > -#ifdef MODULES_VADDR > > - unsigned long limit = (unsigned long)_etext - SZ_32M; > > - void *ptr = NULL; > > - > > - BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR); > > - > > - /* First try within 32M limit from _etext to avoid branch trampolines */ > > - if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit) > > - ptr = __module_alloc(size, limit, MODULES_END, true); > > - > > - if (!ptr) > > - ptr = __module_alloc(size, MODULES_VADDR, MODULES_END, false); > > - > > - return ptr; > > -#else > > - return __module_alloc(size, VMALLOC_START, VMALLOC_END, false); > > -#endif > > -} > > diff --git a/arch/powerpc/kernel/module_alloc.c b/arch/powerpc/kernel/module_alloc.c > > new file mode 100644 > > index 000000000000..48541c27ce46 > > --- /dev/null > > +++ b/arch/powerpc/kernel/module_alloc.c > > @@ -0,0 +1,47 @@ > > +// SPDX-License-Identifier: GPL-2.0-or-later > > +/* > > + * Kernel module help for powerpc. > > + * Copyright (C) 2001, 2003 Rusty Russell IBM Corporation. > > + * Copyright (C) 2008 Freescale Semiconductor, Inc. > > + */ > > + > > +#include <linux/mm.h> > > +#include <linux/moduleloader.h> > > +#include <linux/vmalloc.h> > > + > > +static __always_inline void * > > +__module_alloc(unsigned long size, unsigned long start, unsigned long end, bool nowarn) > > +{ > > + pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC; > > + gfp_t gfp = GFP_KERNEL | (nowarn ? __GFP_NOWARN : 0); > > + > > + /* > > + * Don't do huge page allocations for modules yet until more testing > > + * is done. STRICT_MODULE_RWX may require extra work to support this > > + * too. 
> > + */ > > + return __vmalloc_node_range(size, 1, start, end, gfp, prot, > > + VM_FLUSH_RESET_PERMS, > > + NUMA_NO_NODE, __builtin_return_address(0)); > > +} > > + > > +void *module_alloc(unsigned long size) > > +{ > > +#ifdef MODULES_VADDR > > Is MODULES_VADDR defined even when CONFIG_MODULES is not ? Yes, by this in ppc's asm/pgtable.h: #ifdef CONFIG_PPC_BOOK3S #include <asm/book3s/pgtable.h> #else #include <asm/nohash/pgtable.h> #endif /* !CONFIG_PPC_BOOK3S */ > > + unsigned long limit = (unsigned long)_etext - SZ_32M; > > + void *ptr = NULL; > > + > > + BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR); > > + > > + /* First try within 32M limit from _etext to avoid branch trampolines */ > > + if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit) > > + ptr = __module_alloc(size, limit, MODULES_END, true); > > + > > + if (!ptr) > > + ptr = __module_alloc(size, MODULES_VADDR, MODULES_END, false); > > + > > + return ptr; > > +#else > > + return __module_alloc(size, VMALLOC_START, VMALLOC_END, false); > > +#endif > > +} > > > diff --git a/kernel/Makefile b/kernel/Makefile > > index 318789c728d3..2981fe42060d 100644 > > --- a/kernel/Makefile > > +++ b/kernel/Makefile > > @@ -53,6 +53,11 @@ obj-y += livepatch/ > > obj-y += dma/ > > obj-y += entry/ > > obj-$(CONFIG_MODULES) += module/ > > +ifeq ($(CONFIG_MODULES),y) > > +obj-y += module_alloc.o > > +else > > +obj-$(CONFIG_KPROBES) += module_alloc.o > > +endif > > Same comment, could be: > > obj-$(CONFIG_MODULES) += module_alloc.o > obj-$(CONFIG_KPROBES) += module_alloc.o Ditto. > > > > > obj-$(CONFIG_KCMP) += kcmp.o > > obj-$(CONFIG_FREEZER) += freezer.o > > diff --git a/kernel/kprobes.c b/kernel/kprobes.c > > index f214f8c088ed..3f9876374cd3 100644 > > --- a/kernel/kprobes.c > > +++ b/kernel/kprobes.c > > @@ -1569,6 +1569,7 @@ static int check_kprobe_address_safe(struct kprobe *p, > > goto out; > > } > > > > +#ifdef CONFIG_MODULES > > /* Check if 'p' is probing a module. 
*/ > > *probed_mod = __module_text_address((unsigned long) p->addr); > > if (*probed_mod) { > > @@ -1592,6 +1593,8 @@ static int check_kprobe_address_safe(struct kprobe *p, > > ret = -ENOENT; > > } > > } > > +#endif > > + > > out: > > preempt_enable(); > > jump_label_unlock(); > > @@ -2475,6 +2478,7 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end) > > return 0; > > } > > > > +#ifdef CONFIG_MODULES > > /* Remove all symbols in given area from kprobe blacklist */ > > static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) > > { > > @@ -2492,6 +2496,7 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry) > > { > > kprobe_remove_area_blacklist(entry, entry + 1); > > } > > +#endif /* CONFIG_MODULES */ > > > > int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, > > char *type, char *sym) > > @@ -2557,6 +2562,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start, > > return ret ? : arch_populate_kprobe_blacklist(); > > } > > > > +#ifdef CONFIG_MODULES > > static void add_module_kprobe_blacklist(struct module *mod) > > { > > unsigned long start, end; > > @@ -2658,6 +2664,7 @@ static struct notifier_block kprobe_module_nb = { > > .notifier_call = kprobes_module_callback, > > .priority = 0 > > }; > > +#endif /* CONFIG_MODULES */ > > > > void kprobe_free_init_mem(void) > > { > > @@ -2717,8 +2724,11 @@ static int __init init_kprobes(void) > > err = arch_init_kprobes(); > > if (!err) > > err = register_die_notifier(&kprobe_exceptions_nb); > > + > > +#ifdef CONFIG_MODULES > > if (!err) > > err = register_module_notifier(&kprobe_module_nb); > > +#endif > > > > kprobes_initialized = (err == 0); > > kprobe_sysctls_init(); > > diff --git a/kernel/module/main.c b/kernel/module/main.c > > index fed58d30725d..7fa182b78550 100644 > > --- a/kernel/module/main.c > > +++ b/kernel/module/main.c > > @@ -1121,16 +1121,6 @@ resolve_symbol_wait(struct module *mod, > > return ksym; > > } > > > 
> -void __weak module_memfree(void *module_region) > > -{ > > - /* > > - * This memory may be RO, and freeing RO memory in an interrupt is not > > - * supported by vmalloc. > > - */ > > - WARN_ON(in_interrupt()); > > - vfree(module_region); > > -} > > - > > void __weak module_arch_cleanup(struct module *mod) > > { > > } > > @@ -1606,13 +1596,6 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) > > ddebug_remove_module(mod->name); > > } > > > > -void * __weak module_alloc(unsigned long size) > > -{ > > - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, > > - GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, > > - NUMA_NO_NODE, __builtin_return_address(0)); > > -} > > - > > bool __weak module_init_section(const char *name) > > { > > return strstarts(name, ".init"); > > diff --git a/kernel/module_alloc.c b/kernel/module_alloc.c > > new file mode 100644 > > index 000000000000..26a4c60998ad > > --- /dev/null > > +++ b/kernel/module_alloc.c > > @@ -0,0 +1,26 @@ > > +// SPDX-License-Identifier: GPL-2.0-or-later > > +/* > > + * Copyright (C) 2002 Richard Henderson > > + * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. > > + */ > > + > > +#include <linux/mm.h> > > +#include <linux/moduleloader.h> > > +#include <linux/vmalloc.h> > > + > > +void * __weak module_alloc(unsigned long size) > > +{ > > + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, > > + GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, > > + NUMA_NO_NODE, __builtin_return_address(0)); > > +} > > + > > +void __weak module_memfree(void *module_region) > > +{ > > + /* > > + * This memory may be RO, and freeing RO memory in an interrupt is not > > + * supported by vmalloc. 
> > + */ > > + WARN_ON(in_interrupt()); > > + vfree(module_region); > > +} > > diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c > > index 93507330462c..050b2975332e 100644 > > --- a/kernel/trace/trace_kprobe.c > > +++ b/kernel/trace/trace_kprobe.c > > @@ -101,6 +101,7 @@ static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk) > > return kprobe_gone(&tk->rp.kp); > > } > > > > +#ifdef CONFIG_MODULES > > static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, > > struct module *mod) > > { > > @@ -109,11 +110,13 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, > > > > return strncmp(module_name(mod), name, len) == 0 && name[len] == ':'; > > } > > +#endif /* CONFIG_MODULES */ > > > > static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) > > { > > + bool ret = false; > > +#ifdef CONFIG_MODULES > > char *p; > > - bool ret; > > > > if (!tk->symbol) > > return false; > > @@ -125,6 +128,7 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) > > ret = !!find_module(tk->symbol); > > rcu_read_unlock_sched(); > > *p = ':'; > > +#endif /* CONFIG_MODULES */ > > > > return ret; > > } > > @@ -668,6 +672,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) > > return ret; > > } > > > > +#ifdef CONFIG_MODULES > > /* Module notifier call back, checking event on the module */ > > static int trace_kprobe_module_callback(struct notifier_block *nb, > > unsigned long val, void *data) > > @@ -702,6 +707,7 @@ static struct notifier_block trace_kprobe_module_nb = { > > .notifier_call = trace_kprobe_module_callback, > > .priority = 1 /* Invoked after kprobe module callback */ > > }; > > +#endif /* CONFIG_MODULES */ > > > > static int __trace_kprobe_create(int argc, const char *argv[]) > > { > > @@ -1896,8 +1902,10 @@ static __init int init_kprobe_trace_early(void) > > if (ret) > > return ret; > > > > +#ifdef CONFIG_MODULES > > if 
(register_module_notifier(&trace_kprobe_module_nb)) > > return -EINVAL; > > +#endif /* CONFIG_MODULES */ > > > > return 0; > > } > > -- > > 2.36.1 > > Thanks for the well-considered remarks! BR, Jarkko
On Wed, Jun 08, 2022 at 11:21:15PM +0900, Masami Hiramatsu wrote: > Hi Jarkko, > > On Wed, 8 Jun 2022 08:25:38 +0300 > Jarkko Sakkinen <jarkko@kernel.org> wrote: > > > On Wed, Jun 08, 2022 at 10:35:42AM +0800, Guo Ren wrote: > > > . > > > > > > On Wed, Jun 8, 2022 at 8:02 AM Jarkko Sakkinen <jarkko@profian.com> wrote: > > > > > > > > Tracing with kprobes while running a monolithic kernel is currently > > > > impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This > > > > dependency is a result of kprobes code using the module allocator for the > > > > trampoline code. > > > > > > > > Detaching kprobes from modules helps to squeeze down the user space, > > > > e.g. when developing new core kernel features, while still having all > > > > the nice tracing capabilities. > > > > > > > > For kernel/ and arch/*, move module_alloc() and module_memfree() to > > > > module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES > > > > or CONFIG_KPROBES is enabled. In addition, flag kernel module specific > > > > code with CONFIG_MODULES. > > > > > > > > As the result, kprobes can be used with a monolithic kernel. > > > It's strange when MODULES is n, but vmlinux still obtains module_alloc. > > > > > > Maybe we need a kprobe_alloc, right? > > > > Perhaps not the best name but at least it documents the fact that > > they use the same allocator. > > > > Few years ago I carved up something "half-way there" for kprobes, > > and I used the name text_alloc() [*]. > > > > [*] https://lore.kernel.org/all/20200724050553.1724168-1-jarkko.sakkinen@linux.intel.com/ > > Yeah, I remember that. Thank you for updating your patch! > I think the idea (split module_alloc() from CONFIG_MODULE) is good to me. > If module support maintainers think this name is not good, you may be > able to rename it as text_alloc() and make the module_alloc() as a > wrapper of it. > > Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> > for kprobe side. Thanks a lot! 
If I split that code into its own patch with no code changes, can I attach this to the patch? I.e. most likely I'll split archs into their own patches. > Thank you, > > -- > Masami Hiramatsu (Google) <mhiramat@kernel.org> BR, Jarkko
On Wed, Jun 08, 2022 at 09:12:34AM -0700, Song Liu wrote: > On Wed, Jun 8, 2022 at 7:21 AM Masami Hiramatsu <mhiramat@kernel.org> wrote: > > > > Hi Jarkko, > > > > On Wed, 8 Jun 2022 08:25:38 +0300 > > Jarkko Sakkinen <jarkko@kernel.org> wrote: > > > > > On Wed, Jun 08, 2022 at 10:35:42AM +0800, Guo Ren wrote: > > > > . > > > > > > > > On Wed, Jun 8, 2022 at 8:02 AM Jarkko Sakkinen <jarkko@profian.com> wrote: > > > > > > > > > > Tracing with kprobes while running a monolithic kernel is currently > > > > > impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This > > > > > dependency is a result of kprobes code using the module allocator for the > > > > > trampoline code. > > > > > > > > > > Detaching kprobes from modules helps to squeeze down the user space, > > > > > e.g. when developing new core kernel features, while still having all > > > > > the nice tracing capabilities. > > > > > > > > > > For kernel/ and arch/*, move module_alloc() and module_memfree() to > > > > > module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES > > > > > or CONFIG_KPROBES is enabled. In addition, flag kernel module specific > > > > > code with CONFIG_MODULES. > > > > > > > > > > As the result, kprobes can be used with a monolithic kernel. > > > > It's strange when MODULES is n, but vmlinux still obtains module_alloc. > > > > > > > > Maybe we need a kprobe_alloc, right? > > > > > > Perhaps not the best name but at least it documents the fact that > > > they use the same allocator. > > > > > > Few years ago I carved up something "half-way there" for kprobes, > > > and I used the name text_alloc() [*]. > > > > > > [*] https://lore.kernel.org/all/20200724050553.1724168-1-jarkko.sakkinen@linux.intel.com/ > > > > Yeah, I remember that. Thank you for updating your patch! > > I think the idea (split module_alloc() from CONFIG_MODULE) is good to me. 
> > If module support maintainers think this name is not good, you may be > > able to rename it as text_alloc() and make the module_alloc() as a > > wrapper of it. > > IIUC, most users of module_alloc() use it to allocate memory for text, except > that module code uses it for both text and data. Therefore, I guess calling it > text_alloc() is not 100% accurate until we change the module code (to use > a different API to allocate memory for data). After reading the feedback, I'd stay on using module_alloc() because it has arch-specific quirks baked in. Easier to deal with them in one place. > Thanks, > Song BR, Jarkko
On Thu, 9 Jun 2022 at 15:14, Jarkko Sakkinen <jarkko@kernel.org> wrote: > > On Wed, Jun 08, 2022 at 09:12:34AM -0700, Song Liu wrote: > > On Wed, Jun 8, 2022 at 7:21 AM Masami Hiramatsu <mhiramat@kernel.org> wrote: > > > > > > Hi Jarkko, > > > > > > On Wed, 8 Jun 2022 08:25:38 +0300 > > > Jarkko Sakkinen <jarkko@kernel.org> wrote: > > > > > > > On Wed, Jun 08, 2022 at 10:35:42AM +0800, Guo Ren wrote: > > > > > . > > > > > > > > > > On Wed, Jun 8, 2022 at 8:02 AM Jarkko Sakkinen <jarkko@profian.com> wrote: > > > > > > > > > > > > Tracing with kprobes while running a monolithic kernel is currently > > > > > > impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This > > > > > > dependency is a result of kprobes code using the module allocator for the > > > > > > trampoline code. > > > > > > > > > > > > Detaching kprobes from modules helps to squeeze down the user space, > > > > > > e.g. when developing new core kernel features, while still having all > > > > > > the nice tracing capabilities. > > > > > > > > > > > > For kernel/ and arch/*, move module_alloc() and module_memfree() to > > > > > > module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES > > > > > > or CONFIG_KPROBES is enabled. In addition, flag kernel module specific > > > > > > code with CONFIG_MODULES. > > > > > > > > > > > > As the result, kprobes can be used with a monolithic kernel. > > > > > It's strange when MODULES is n, but vmlinux still obtains module_alloc. > > > > > > > > > > Maybe we need a kprobe_alloc, right? > > > > > > > > Perhaps not the best name but at least it documents the fact that > > > > they use the same allocator. > > > > > > > > Few years ago I carved up something "half-way there" for kprobes, > > > > and I used the name text_alloc() [*]. > > > > > > > > [*] https://lore.kernel.org/all/20200724050553.1724168-1-jarkko.sakkinen@linux.intel.com/ > > > > > > Yeah, I remember that. Thank you for updating your patch! 
> > > I think the idea (split module_alloc() from CONFIG_MODULE) is good to me. > > > If module support maintainers think this name is not good, you may be > > > able to rename it as text_alloc() and make the module_alloc() as a > > > wrapper of it. > > > > IIUC, most users of module_alloc() use it to allocate memory for text, except > > that module code uses it for both text and data. Therefore, I guess calling it > > text_alloc() is not 100% accurate until we change the module code (to use > > a different API to allocate memory for data). > > After reading the feedback, I'd stay on using module_alloc() because > it has arch-specific quirks baked in. Easier to deal with them in one > place. > In that case, please ensure that you enable this only on architectures where it is needed. arm64 implements alloc_insn_page() without relying on module_alloc() so I would not expect to see any changes there.
On Thu, Jun 09, 2022 at 05:48:52AM +0200, Christoph Hellwig wrote: > On Wed, Jun 08, 2022 at 01:26:19PM -0700, Luis Chamberlain wrote: > > No, that was removed because it has only one user. > > That is only part of the story. The other part is that the overall > kernel simply does not have any business allocating executable memory. > Executable memory is a very special concept for modules or module-like > code like kprobes, and should not be exposed as a general concept. It is not just modules and kprobes, it is also ftrace and bpf too now. So while it should not be used everywhere calling it module_alloc() is just confusing at this point. Likewise, module_alloc_huge() is being proposed too and I'd rather we deal with this properly in alignment of taking care of the rename as well. If the concern is to restrict access we can use the module namespace stuff so to ensure only intended users get access to it. > Especially as executable memory really should not also be writable > for security reasons. In other words, we should actually never > allocate executable memory, ever. We might seal memory and then > mark it executable after having written to it, which is how modules > and kprobes are implemented on all modern Linux ports anyway. The respective free *should* do the executable bits, and there is no generic way to do this for all archs and so it is open coded today. In fact some architectures need further work / help and so split up the module data and exect already on v5.19+ with the new ARCH_WANTS_MODULES_DATA_IN_VMALLOC. See this thread for details: https://lkml.kernel.org/r/Yo1XTN441qbNTLGR@bombadil.infradead.org Doing this work is not easy, but if we're going to do it, it must be done right. Luis
On Thu, Jun 09, 2022 at 08:47:38AM +0100, Russell King (Oracle) wrote: > On Wed, Jun 08, 2022 at 02:59:27AM +0300, Jarkko Sakkinen wrote: > > diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile > > index 553866751e1a..d2bb954cd54f 100644 > > --- a/arch/arm/kernel/Makefile > > +++ b/arch/arm/kernel/Makefile > > @@ -44,6 +44,11 @@ obj-$(CONFIG_CPU_IDLE) += cpuidle.o > > obj-$(CONFIG_ISA_DMA_API) += dma.o > > obj-$(CONFIG_FIQ) += fiq.o fiqasm.o > > obj-$(CONFIG_MODULES) += armksyms.o module.o > > +ifeq ($(CONFIG_MODULES),y) > > +obj-y += module_alloc.o > > +else > > +obj-$(CONFIG_KPROBES) += module_alloc.o > > +endif > > Doesn't: > > obj-$(CONFIG_MODULES) += module_alloc.o > obj-$(CONFIG_KPROBES) += module_alloc.o That just begs for a new kconfig symbol for the object, and for the object then to be built with it. The archs which override the default can use ARCH_HAS_VM_ALLOC_EXEC. Please note that the respective free is important as well and its not clear if we need an another define for the free. Someone has to do that work. We want to ensure to noexec the code on free and this can vary on each arch. > work just as well? The kbuild modules.rst documentation says: > > The order of files in $(obj-y) is significant. Duplicates in > the lists are allowed: the first instance will be linked into > built-in.a and succeeding instances will be ignored. > > so you should be fine... or the documentation is wrong! Agreed, but this is just sloppy, better to use a new kconfig symbol to represent what is actually being required. Luis
On Thu, 2022-06-09 at 06:24 -0700, Luis Chamberlain wrote: > On Thu, Jun 09, 2022 at 05:48:52AM +0200, Christoph Hellwig wrote: > > On Wed, Jun 08, 2022 at 01:26:19PM -0700, Luis Chamberlain wrote: > > > No, that was removed because it has only one user. > > > > That is only part of the story. The other part is that the overall > > kernel simply does not have any business allocating exutable > > memory. > > Executable memory is a very special concept for modules or module- > > like > > code like kprobes, and should not be exposed as a general concept. > > It is not just modules and kprobes, it is also ftrace and bpf too > now. > So while it should not be used everywhere calling it module_alloc() > is just confusing at this point. Likewise, module_alloc_huge() is > being proposed too and I'd rather we deal with this properly in > aligment > of taking care of the rename as well. > > If the concern is to restrict access we can use the module namespace > stuff > so to ensure only intended users get access to it. BPF even has multiple uses for text allocation. It has its own trampoline feature that puts different type of text in the allocation, with its own allocation routine. It looks like there are even more little allocators in there. So yea, there seems to be a lot of the kernel in the business of dynamically generated text, for better or worse. I agree that it needs to be done carefully. However, these usages always seem to have the same problems (W^X, arch eccentricities, etc). So I don't think we should hide away the pieces. Instead we should have something with guard rails on it, so they can't get the allocation part wrong. But I guess the question here is: what should we do in the meantime? It is kind of similar to the questions that came up around the bpf prog pack allocator. Should we hold up allocator related work until underlying problems are resolved and there is some mature core solution? 
Personally I had thought we would need to do some clean switch to a much different interface. I still think someday it will be required, but it seems to be evolving naturally for the time being. Like say for a next step we moved prog pack out of bpf into core code, gave it its own copy of module_alloc(), and then made kprobes use it. Then we would have something with improved W^X guard rails, and kprobes would not depend on modules anymore. I think maybe it's a step in the right direction, even if it's not perfect.
On Thu, Jun 9, 2022 at 1:34 AM Christophe Leroy <christophe.leroy@csgroup.eu> wrote: > > > > Le 08/06/2022 à 18:12, Song Liu a écrit : > > On Wed, Jun 8, 2022 at 7:21 AM Masami Hiramatsu <mhiramat@kernel.org> wrote: > >> > >> Hi Jarkko, > >> > >> On Wed, 8 Jun 2022 08:25:38 +0300 > >> Jarkko Sakkinen <jarkko@kernel.org> wrote: > >> > >>> On Wed, Jun 08, 2022 at 10:35:42AM +0800, Guo Ren wrote: > >>>> . > >>>> > >>>> On Wed, Jun 8, 2022 at 8:02 AM Jarkko Sakkinen <jarkko@profian.com> wrote: > >>>>> > >>>>> Tracing with kprobes while running a monolithic kernel is currently > >>>>> impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This > >>>>> dependency is a result of kprobes code using the module allocator for the > >>>>> trampoline code. > >>>>> > >>>>> Detaching kprobes from modules helps to squeeze down the user space, > >>>>> e.g. when developing new core kernel features, while still having all > >>>>> the nice tracing capabilities. > >>>>> > >>>>> For kernel/ and arch/*, move module_alloc() and module_memfree() to > >>>>> module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES > >>>>> or CONFIG_KPROBES is enabled. In addition, flag kernel module specific > >>>>> code with CONFIG_MODULES. > >>>>> > >>>>> As the result, kprobes can be used with a monolithic kernel. > >>>> It's strange when MODULES is n, but vmlinux still obtains module_alloc. > >>>> > >>>> Maybe we need a kprobe_alloc, right? > >>> > >>> Perhaps not the best name but at least it documents the fact that > >>> they use the same allocator. > >>> > >>> Few years ago I carved up something "half-way there" for kprobes, > >>> and I used the name text_alloc() [*]. > >>> > >>> [*] https://lore.kernel.org/all/20200724050553.1724168-1-jarkko.sakkinen@linux.intel.com/ > >> > >> Yeah, I remember that. Thank you for updating your patch! > >> I think the idea (split module_alloc() from CONFIG_MODULE) is good to me. 
> >> If module support maintainers think this name is not good, you may be > >> able to rename it as text_alloc() and make the module_alloc() as a > >> wrapper of it. > > > > IIUC, most users of module_alloc() use it to allocate memory for text, except > > that module code uses it for both text and data. Therefore, I guess calling it > > text_alloc() is not 100% accurate until we change the module code (to use > > a different API to allocate memory for data). > > When CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC, module code uses > module_alloc() for text and vmalloc() for data, see function > move_module() in kernel/module/main.c Thanks for the pointer! I will play with it. Song
On Thu, Jun 9, 2022 at 11:41 AM Edgecombe, Rick P <rick.p.edgecombe@intel.com> wrote: > > On Thu, 2022-06-09 at 06:24 -0700, Luis Chamberlain wrote: [...] > > Like say for a next step we moved prog pack out of bpf into core code, > gave it it's own copy of module_alloc(), and then made kprobes use it. > Then we would have something with improved W^X guard rails, and kprobes > would not depend on modules anymore. I think maybe it's a step in the > right direction, even if it's not perfect. I was also thinking about where to put the prog pack allocator eventually. Shall it be part of mm code or module code? Btw, prog pack has more challenges with W^X requirement: 1. The arch needs to have a text poke mechanism; 2. Some users of prog pack need to learn to use the ROX buffer with text poke, for example, how to calculate relative address differently. Thanks, Song
On Thu, 9 Jun 2022 15:23:16 +0200 Ard Biesheuvel <ardb@kernel.org> wrote: > On Thu, 9 Jun 2022 at 15:14, Jarkko Sakkinen <jarkko@kernel.org> wrote: > > > > On Wed, Jun 08, 2022 at 09:12:34AM -0700, Song Liu wrote: > > > On Wed, Jun 8, 2022 at 7:21 AM Masami Hiramatsu <mhiramat@kernel.org> wrote: > > > > > > > > Hi Jarkko, > > > > > > > > On Wed, 8 Jun 2022 08:25:38 +0300 > > > > Jarkko Sakkinen <jarkko@kernel.org> wrote: > > > > > > > > > On Wed, Jun 08, 2022 at 10:35:42AM +0800, Guo Ren wrote: > > > > > > . > > > > > > > > > > > > On Wed, Jun 8, 2022 at 8:02 AM Jarkko Sakkinen <jarkko@profian.com> wrote: > > > > > > > > > > > > > > Tracing with kprobes while running a monolithic kernel is currently > > > > > > > impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This > > > > > > > dependency is a result of kprobes code using the module allocator for the > > > > > > > trampoline code. > > > > > > > > > > > > > > Detaching kprobes from modules helps to squeeze down the user space, > > > > > > > e.g. when developing new core kernel features, while still having all > > > > > > > the nice tracing capabilities. > > > > > > > > > > > > > > For kernel/ and arch/*, move module_alloc() and module_memfree() to > > > > > > > module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES > > > > > > > or CONFIG_KPROBES is enabled. In addition, flag kernel module specific > > > > > > > code with CONFIG_MODULES. > > > > > > > > > > > > > > As the result, kprobes can be used with a monolithic kernel. > > > > > > It's strange when MODULES is n, but vmlinux still obtains module_alloc. > > > > > > > > > > > > Maybe we need a kprobe_alloc, right? > > > > > > > > > > Perhaps not the best name but at least it documents the fact that > > > > > they use the same allocator. > > > > > > > > > > Few years ago I carved up something "half-way there" for kprobes, > > > > > and I used the name text_alloc() [*]. 
> > > > > > > > > > [*] https://lore.kernel.org/all/20200724050553.1724168-1-jarkko.sakkinen@linux.intel.com/ > > > > > > > > Yeah, I remember that. Thank you for updating your patch! > > > > I think the idea (split module_alloc() from CONFIG_MODULE) is good to me. > > > > If module support maintainers think this name is not good, you may be > > > > able to rename it as text_alloc() and make the module_alloc() as a > > > > wrapper of it. > > > > > > IIUC, most users of module_alloc() use it to allocate memory for text, except > > > that module code uses it for both text and data. Therefore, I guess calling it > > > text_alloc() is not 100% accurate until we change the module code (to use > > > a different API to allocate memory for data). > > > > After reading the feedback, I'd stay on using module_alloc() because > > it has arch-specific quirks baked in. Easier to deal with them in one > > place. > > > > In that case, please ensure that you enable this only on architectures > where it is needed. arm64 implements alloc_insn_page() without relying > on module_alloc() so I would not expect to see any changes there. Hmm, what about adding CONFIG_ARCH_HAVE_ALLOC_INSN_PAGE and check it? If it is defined, kprobes will not define the __weak function, but if not, it will use module_alloc()? Thank you,
On Wed, 8 Jun 2022 11:19:19 -0700 Song Liu <song@kernel.org> wrote: > On Wed, Jun 8, 2022 at 9:28 AM Ard Biesheuvel <ardb@kernel.org> wrote: > > > > Hello Jarkko, > > > > On Wed, 8 Jun 2022 at 02:02, Jarkko Sakkinen <jarkko@profian.com> wrote: > > > > > > Tracing with kprobes while running a monolithic kernel is currently > > > impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This > > > dependency is a result of kprobes code using the module allocator for the > > > trampoline code. > > > > > > Detaching kprobes from modules helps to squeeze down the user space, > > > e.g. when developing new core kernel features, while still having all > > > the nice tracing capabilities. > > > > > > For kernel/ and arch/*, move module_alloc() and module_memfree() to > > > module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES > > > or CONFIG_KPROBES is enabled. In addition, flag kernel module specific > > > code with CONFIG_MODULES. > > > > > > As the result, kprobes can be used with a monolithic kernel. > > > > I think I may have mentioned this the previous time as well, but I > > don't think this is the right approach. > > > > Kprobes uses alloc_insn_page() to allocate executable memory, but the > > requirements for this memory are radically different compared to > > loadable modules, which need to be within an arch-specific distance of > > the core kernel, need KASAN backing etc etc. > > I think the distance of core kernel requirement is the same for kprobe > alloc_insn_page and modules, no? This strongly depends on how kprobes (software breakpoint and single-step) is implemented on the arch. For example, x86 implements the so-called "kprobe-booster" which jumps back from the single stepping trampoline buffer. Then the buffer address must be within the range where it can jump to the original address. However, if the arch implements single-step as an instruction emulation, it has no such limitation. 
As far as I know, arm64 will do emulation for the instructions which change PC register and will do direct execution with another software breakpoint for other instructions. The reason I'm using module_alloc() for a generic function is that it can cover the limitation most widely. Thus, if we have CONFIG_ARCH_HAVE_ALLOC_INSN_PAGE flag and kprobes can check it instead of using __weak function, the kprobes may not need to depend on module_alloc() in general. Thank you, > > Thanks, > Song > > > > > > This is why arm64, for instance, does not implement alloc_insn_page() > > in terms of module_alloc() [and likely does not belong in this patch > > for that reason] > > > > > > > Is there any reason kprobes cannot simply use vmalloc()? > >
On Thu, Jun 09, 2022 at 03:23:16PM +0200, Ard Biesheuvel wrote: > On Thu, 9 Jun 2022 at 15:14, Jarkko Sakkinen <jarkko@kernel.org> wrote: > > > > On Wed, Jun 08, 2022 at 09:12:34AM -0700, Song Liu wrote: > > > On Wed, Jun 8, 2022 at 7:21 AM Masami Hiramatsu <mhiramat@kernel.org> wrote: > > > > > > > > Hi Jarkko, > > > > > > > > On Wed, 8 Jun 2022 08:25:38 +0300 > > > > Jarkko Sakkinen <jarkko@kernel.org> wrote: > > > > > > > > > On Wed, Jun 08, 2022 at 10:35:42AM +0800, Guo Ren wrote: > > > > > > . > > > > > > > > > > > > On Wed, Jun 8, 2022 at 8:02 AM Jarkko Sakkinen <jarkko@profian.com> wrote: > > > > > > > > > > > > > > Tracing with kprobes while running a monolithic kernel is currently > > > > > > > impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This > > > > > > > dependency is a result of kprobes code using the module allocator for the > > > > > > > trampoline code. > > > > > > > > > > > > > > Detaching kprobes from modules helps to squeeze down the user space, > > > > > > > e.g. when developing new core kernel features, while still having all > > > > > > > the nice tracing capabilities. > > > > > > > > > > > > > > For kernel/ and arch/*, move module_alloc() and module_memfree() to > > > > > > > module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES > > > > > > > or CONFIG_KPROBES is enabled. In addition, flag kernel module specific > > > > > > > code with CONFIG_MODULES. > > > > > > > > > > > > > > As the result, kprobes can be used with a monolithic kernel. > > > > > > It's strange when MODULES is n, but vmlinux still obtains module_alloc. > > > > > > > > > > > > Maybe we need a kprobe_alloc, right? > > > > > > > > > > Perhaps not the best name but at least it documents the fact that > > > > > they use the same allocator. > > > > > > > > > > Few years ago I carved up something "half-way there" for kprobes, > > > > > and I used the name text_alloc() [*]. 
> > > > > > > > > > [*] https://lore.kernel.org/all/20200724050553.1724168-1-jarkko.sakkinen@linux.intel.com/ > > > > > > > > Yeah, I remember that. Thank you for updating your patch! > > > > I think the idea (split module_alloc() from CONFIG_MODULE) is good to me. > > > > If module support maintainers think this name is not good, you may be > > > > able to rename it as text_alloc() and make the module_alloc() as a > > > > wrapper of it. > > > > > > IIUC, most users of module_alloc() use it to allocate memory for text, except > > > that module code uses it for both text and data. Therefore, I guess calling it > > > text_alloc() is not 100% accurate until we change the module code (to use > > > a different API to allocate memory for data). > > > > After reading the feedback, I'd stay on using module_alloc() because > > it has arch-specific quirks baked in. Easier to deal with them in one > > place. > > > > In that case, please ensure that you enable this only on architectures > where it is needed. arm64 implements alloc_insn_page() without relying > on module_alloc() so I would not expect to see any changes there. Right, got it, thanks for remark. BR, Jarkko
On Thu, Jun 09, 2022 at 06:44:45AM -0700, Luis Chamberlain wrote: > On Thu, Jun 09, 2022 at 08:47:38AM +0100, Russell King (Oracle) wrote: > > On Wed, Jun 08, 2022 at 02:59:27AM +0300, Jarkko Sakkinen wrote: > > > diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile > > > index 553866751e1a..d2bb954cd54f 100644 > > > --- a/arch/arm/kernel/Makefile > > > +++ b/arch/arm/kernel/Makefile > > > @@ -44,6 +44,11 @@ obj-$(CONFIG_CPU_IDLE) += cpuidle.o > > > obj-$(CONFIG_ISA_DMA_API) += dma.o > > > obj-$(CONFIG_FIQ) += fiq.o fiqasm.o > > > obj-$(CONFIG_MODULES) += armksyms.o module.o > > > +ifeq ($(CONFIG_MODULES),y) > > > +obj-y += module_alloc.o > > > +else > > > +obj-$(CONFIG_KPROBES) += module_alloc.o > > > +endif > > > > Doesn't: > > > > obj-$(CONFIG_MODULES) += module_alloc.o > > obj-$(CONFIG_KPROBES) += module_alloc.o > > That just begs for a new kconfig symbol for the object, and for > the object then to be built with it. > > The archs which override the default can use ARCH_HAS_VM_ALLOC_EXEC. > Please note that the respective free is important as well and its > not clear if we need an another define for the free. Someone has > to do that work. We want to ensure to noexec the code on free and > this can vary on each arch. Let me check if I understand this (not 100% sure). So if an arch defines ARCH_HAS_VMALLOC_EXEC, then this would set the config flag CONFIG_VMALLOC_EXEC, which would be used to include the compilation unit? BR, Jarkko
On Sun, Jun 12, 2022 at 09:30:41PM +0900, Masami Hiramatsu wrote: > On Wed, 8 Jun 2022 11:19:19 -0700 > Song Liu <song@kernel.org> wrote: > > > On Wed, Jun 8, 2022 at 9:28 AM Ard Biesheuvel <ardb@kernel.org> wrote: > > > > > > Hello Jarkko, > > > > > > On Wed, 8 Jun 2022 at 02:02, Jarkko Sakkinen <jarkko@profian.com> wrote: > > > > > > > > Tracing with kprobes while running a monolithic kernel is currently > > > > impossible because CONFIG_KPROBES is dependent of CONFIG_MODULES. This > > > > dependency is a result of kprobes code using the module allocator for the > > > > trampoline code. > > > > > > > > Detaching kprobes from modules helps to squeeze down the user space, > > > > e.g. when developing new core kernel features, while still having all > > > > the nice tracing capabilities. > > > > > > > > For kernel/ and arch/*, move module_alloc() and module_memfree() to > > > > module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES > > > > or CONFIG_KPROBES is enabled. In addition, flag kernel module specific > > > > code with CONFIG_MODULES. > > > > > > > > As the result, kprobes can be used with a monolithic kernel. > > > > > > I think I may have mentioned this the previous time as well, but I > > > don't think this is the right approach. > > > > > > Kprobes uses alloc_insn_page() to allocate executable memory, but the > > > requirements for this memory are radically different compared to > > > loadable modules, which need to be within an arch-specific distance of > > > the core kernel, need KASAN backing etc etc. > > > > I think the distance of core kernel requirement is the same for kprobe > > alloc_insn_page and modules, no? > > This strongly depends on how kprobes (software breakpoint and > single-step) is implemented on the arch. For example, x86 implements > the so-called "kprobe-booster" which jumps back from the single > stepping trampoline buffer. 
Then the buffer address must be within > the range where it can jump to the original address. > However, if the arch implements single-step as an instruction > emulation, it has no such limitation. As far as I know, arm64 > will do emulation for the instructions which change PC register > and will do direct execution with another software breakpoint > for other instructions. > > Why I'm using module_alloc() for a generic function, is that > can cover the limitation most widely. > Thus, if we have CONFIG_ARCH_HAVE_ALLOC_INSN_PAGE flag and > kprobes can check it instead of using __weak function, the > kprobes may not need to depend on module_alloc() in general. OK, I guess this is what Luis meant. I'll try to carve up something based on this. BR, Jarkko
On Thu, Jun 09, 2022 at 06:41:36PM +0000, Edgecombe, Rick P wrote: > On Thu, 2022-06-09 at 06:24 -0700, Luis Chamberlain wrote: > > On Thu, Jun 09, 2022 at 05:48:52AM +0200, Christoph Hellwig wrote: > > > On Wed, Jun 08, 2022 at 01:26:19PM -0700, Luis Chamberlain wrote: > > > > No, that was removed because it has only one user. > > > > > > That is only part of the story. The other part is that the overall > > > kernel simply does not have any business allocating exutable > > > memory. > > > Executable memory is a very special concept for modules or module- > > > like > > > code like kprobes, and should not be exposed as a general concept. > > > > It is not just modules and kprobes, it is also ftrace and bpf too > > now. > > So while it should not be used everywhere calling it module_alloc() > > is just confusing at this point. Likewise, module_alloc_huge() is > > being proposed too and I'd rather we deal with this properly in > > aligment > > of taking care of the rename as well. > > > > If the concern is to restrict access we can use the module namespace > > stuff > > so to ensure only intended users get access to it. > > BPF even has multiple uses for text allocation. It has its own > trampoline feature that puts different type of text in the allocation, > with its own allocation routine. I looks like there are even more > little allocators in there. > > So yea, there seems to be a lot of the kernel in the business of > dynamically generated text, for better or worse. I agree that it needs > to be done carefully. However, these usages always seem to have the > same problems (W^X, arch eccentricities, etc). So I don't think we > should hide away the pieces. Instead we should have something with > guard rails on it, so they can't get the allocation part wrong. > > But I guess the question here is: what should we do in the meantime? It > is kind of similar to the questions that came up around the bpf prog > pack allocator. 
Should we hold up allocator related work until > underlying problems are resolved and there is some mature core > solution? > > Personally I had thought we would need to do some clean switch to a > much different interface. I still think someday it will be required, > but it seems to be evolving naturally for the time being. > > Like say for a next step we moved prog pack out of bpf into core code, > gave it it's own copy of module_alloc(), and then made kprobes use it. > Then we would have something with improved W^X guard rails, and kprobes > would not depend on modules anymore. I think maybe it's a step in the > right direction, even if it's not perfect. So you're saying that I should (as a first step) basically clone the module_alloc() implementation for kprobes, and in the future for BPF use, in order to get a clean starting point? BR, Jarkko
On Tue, Jun 14, 2022 at 03:32:38PM +0300, jarkko@kernel.org wrote: > > Like say for a next step we moved prog pack out of bpf into core code, > > gave it it's own copy of module_alloc(), and then made kprobes use it. > > Then we would have something with improved W^X guard rails, and kprobes > > would not depend on modules anymore. I think maybe it's a step in the > > right direction, even if it's not perfect. > > So you're saying that I should (as a first step) basically clone > module_alloc() implementation for kprobes, and future for BPF > use, in order to get a clean starting point? I don't think cloning the code helps anyone. The fact that except for the eBPF mess everyone uses module_alloc and the related infrastructure is a feature and not a bug. The interface should become better than what we have right now, but there is few enough users that this can be done in one go. So assuming we really care deeply enough about fancy tracing without modules (and I'm not sure we do, even if you don't use modules it doesn't hurt to just build the modules code, I do that all the time for my test machines), the general approach in your series is the right one.
On Tue, Jun 14, 2022 at 12:36:25PM +0000, Christophe Leroy wrote: > > > Le 14/06/2022 à 14:26, Jarkko Sakkinen a écrit : > > On Thu, Jun 09, 2022 at 06:44:45AM -0700, Luis Chamberlain wrote: > >> On Thu, Jun 09, 2022 at 08:47:38AM +0100, Russell King (Oracle) wrote: > >>> On Wed, Jun 08, 2022 at 02:59:27AM +0300, Jarkko Sakkinen wrote: > >>>> diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile > >>>> index 553866751e1a..d2bb954cd54f 100644 > >>>> --- a/arch/arm/kernel/Makefile > >>>> +++ b/arch/arm/kernel/Makefile > >>>> @@ -44,6 +44,11 @@ obj-$(CONFIG_CPU_IDLE) += cpuidle.o > >>>> obj-$(CONFIG_ISA_DMA_API) += dma.o > >>>> obj-$(CONFIG_FIQ) += fiq.o fiqasm.o > >>>> obj-$(CONFIG_MODULES) += armksyms.o module.o > >>>> +ifeq ($(CONFIG_MODULES),y) > >>>> +obj-y += module_alloc.o > >>>> +else > >>>> +obj-$(CONFIG_KPROBES) += module_alloc.o > >>>> +endif > >>> > >>> Doesn't: > >>> > >>> obj-$(CONFIG_MODULES) += module_alloc.o > >>> obj-$(CONFIG_KPROBES) += module_alloc.o > >> > >> That just begs for a new kconfig symbol for the object, and for > >> the object then to be built with it. > >> > >> The archs which override the default can use ARCH_HAS_VM_ALLOC_EXEC. > >> Please note that the respective free is important as well and its > >> not clear if we need an another define for the free. Someone has > >> to do that work. We want to ensure to noexec the code on free and > >> this can vary on each arch. > > > > Let me check if I understand this (not 100% sure). > > > > So if arch define ARCH_HAS_VMALLOC_EXEC, then this would set > > config flag CONFIG_VMALLOC_EXEC, which would be used to include > > the compilation unit? > > > > I guess you have two possible approaches. 
> > Either architectures select CONFIG_ARCH_HAS_VMALLOC_EXEC at all time and > then you add a CONFIG_VMALLOC_EXEC which depends on > CONFIG_ARCH_HAS_VMALLOC_EXEC and CONFIG_MODULES or CONFIG_KPROBES, > > Or architectures select CONFIG_ARCH_HAS_VMALLOC_EXEC only when either > CONFIG_MODULES or CONFIG_KPROBES is selected, in that case there is no > need for a CONFIG_VMALLOC_EXEC. Right, got it now. Thanks for the elaboration. > Christophe BR, Jarkko
On Wed, Jun 15, 2022 at 08:37:07AM +0200, hch@lst.de wrote: > On Tue, Jun 14, 2022 at 03:32:38PM +0300, jarkko@kernel.org wrote: > > > Like say for a next step we moved prog pack out of bpf into core code, > > > gave it it's own copy of module_alloc(), and then made kprobes use it. > > > Then we would have something with improved W^X guard rails, and kprobes > > > would not depend on modules anymore. I think maybe it's a step in the > > > right direction, even if it's not perfect. > > > > So you're saying that I should (as a first step) basically clone > > module_alloc() implementation for kprobes, and future for BPF > > use, in order to get a clean starting point? > > I don't think cloning the code helps anyone. The fact that except > for the eBPF mess everyone uses module_alloc and the related > infrastructure is a feature and not a bug. The interface should > become better than what we have right now, but there is few enough > users that this can be done in one go. > > So assuming we really care deeply enough about fancy tracing without > modules (and I'm not sure we do, even if you don't use modules it > doesn't hurt to just build the modules code, I do that all the time > for my test machines), the general approach in your series is the > right one. OK, thanks for the elaboration! However I bake it, I doubt that the next version is going to be the final version, given all the angles. Therefore, I'll mostly follow Christophe's suggestions on compilation flags, and also split this into per-arch patches. That should be at least a step in the right direction. BR, Jarkko
diff --git a/arch/Kconfig b/arch/Kconfig index fcf9a41a4ef5..e8e3e7998a2e 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -39,7 +39,6 @@ config GENERIC_ENTRY config KPROBES bool "Kprobes" - depends on MODULES depends on HAVE_KPROBES select KALLSYMS select TASKS_RCU if PREEMPTION diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index 553866751e1a..d2bb954cd54f 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -44,6 +44,11 @@ obj-$(CONFIG_CPU_IDLE) += cpuidle.o obj-$(CONFIG_ISA_DMA_API) += dma.o obj-$(CONFIG_FIQ) += fiq.o fiqasm.o obj-$(CONFIG_MODULES) += armksyms.o module.o +ifeq ($(CONFIG_MODULES),y) +obj-y += module_alloc.o +else +obj-$(CONFIG_KPROBES) += module_alloc.o +endif obj-$(CONFIG_ARM_MODULE_PLTS) += module-plts.o obj-$(CONFIG_ISA_DMA) += dma-isa.o obj-$(CONFIG_PCI) += bios32.o isa.o diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index d59c36dc0494..054e799e7091 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -22,38 +22,6 @@ #include <asm/unwind.h> #include <asm/opcodes.h> -#ifdef CONFIG_XIP_KERNEL -/* - * The XIP kernel text is mapped in the module area for modules and - * some other stuff to work without any indirect relocations. - * MODULES_VADDR is redefined here and not in asm/memory.h to avoid - * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off. 
- */ -#undef MODULES_VADDR -#define MODULES_VADDR (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK) -#endif - -#ifdef CONFIG_MMU -void *module_alloc(unsigned long size) -{ - gfp_t gfp_mask = GFP_KERNEL; - void *p; - - /* Silence the initial allocation */ - if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) - gfp_mask |= __GFP_NOWARN; - - p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, - __builtin_return_address(0)); - if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p) - return p; - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, - __builtin_return_address(0)); -} -#endif - bool module_init_section(const char *name) { return strstarts(name, ".init") || diff --git a/arch/arm/kernel/module_alloc.c b/arch/arm/kernel/module_alloc.c new file mode 100644 index 000000000000..c3cfb2b60575 --- /dev/null +++ b/arch/arm/kernel/module_alloc.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2002 Russell King. + * Modified for nommu by Hyok S. Choi + * + * Module allocation method suggested by Andi Kleen. + */ + +#include <linux/mm.h> +#include <linux/vmalloc.h> + +#ifdef CONFIG_XIP_KERNEL +/* + * The XIP kernel text is mapped in the module area for modules and + * some other stuff to work without any indirect relocations. + * MODULES_VADDR is redefined here and not in asm/memory.h to avoid + * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off. 
+ */ +#undef MODULES_VADDR +#define MODULES_VADDR (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK) +#endif + +#ifdef CONFIG_MMU +void *module_alloc(unsigned long size) +{ + gfp_t gfp_mask = GFP_KERNEL; + void *p; + + /* Silence the initial allocation */ + if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) + gfp_mask |= __GFP_NOWARN; + + p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, + __builtin_return_address(0)); + if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p) + return p; + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, + __builtin_return_address(0)); +} +#endif diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index fa7981d0d917..d61c02da857e 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -43,6 +43,11 @@ obj-$(CONFIG_COMPAT) += sigreturn32.o obj-$(CONFIG_KUSER_HELPERS) += kuser32.o obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o obj-$(CONFIG_MODULES) += module.o +ifeq ($(CONFIG_MODULES),y) +obj-y += module_alloc.o +else +obj-$(CONFIG_KPROBES) += module_alloc.o +endif obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index f2d4bb14bfab..d2a979746da0 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -20,53 +20,6 @@ #include <asm/insn.h> #include <asm/sections.h> -void *module_alloc(unsigned long size) -{ - u64 module_alloc_end = module_alloc_base + MODULES_VSIZE; - gfp_t gfp_mask = GFP_KERNEL; - void *p; - - /* Silence the initial allocation */ - if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS)) - gfp_mask |= __GFP_NOWARN; - - if (IS_ENABLED(CONFIG_KASAN_GENERIC) || - IS_ENABLED(CONFIG_KASAN_SW_TAGS)) - /* don't exceed the static module region - see below */ - module_alloc_end = MODULES_END; - - p = 
__vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, - module_alloc_end, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK, - NUMA_NO_NODE, __builtin_return_address(0)); - - if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && - (IS_ENABLED(CONFIG_KASAN_VMALLOC) || - (!IS_ENABLED(CONFIG_KASAN_GENERIC) && - !IS_ENABLED(CONFIG_KASAN_SW_TAGS)))) - /* - * KASAN without KASAN_VMALLOC can only deal with module - * allocations being served from the reserved module region, - * since the remainder of the vmalloc region is already - * backed by zero shadow pages, and punching holes into it - * is non-trivial. Since the module region is not randomized - * when KASAN is enabled without KASAN_VMALLOC, it is even - * less likely that the module region gets exhausted, so we - * can simply omit this fallback in that case. - */ - p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, - module_alloc_base + SZ_2G, GFP_KERNEL, - PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); - - if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { - vfree(p); - return NULL; - } - - /* Memory is intended to be executable, reset the pointer tag. 
*/ - return kasan_reset_tag(p); -} - enum aarch64_reloc_op { RELOC_OP_NONE, RELOC_OP_ABS, diff --git a/arch/arm64/kernel/module_alloc.c b/arch/arm64/kernel/module_alloc.c new file mode 100644 index 000000000000..0340c8c0b782 --- /dev/null +++ b/arch/arm64/kernel/module_alloc.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 ARM Limited + * + * Author: Will Deacon <will.deacon@arm.com> + */ + +#include <linux/mm.h> +#include <linux/moduleloader.h> +#include <linux/vmalloc.h> + +void *module_alloc(unsigned long size) +{ + u64 module_alloc_end = module_alloc_base + MODULES_VSIZE; + gfp_t gfp_mask = GFP_KERNEL; + void *p; + + /* Silence the initial allocation */ + if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS)) + gfp_mask |= __GFP_NOWARN; + + if (IS_ENABLED(CONFIG_KASAN_GENERIC) || + IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + /* don't exceed the static module region - see below */ + module_alloc_end = MODULES_END; + + p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, + module_alloc_end, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK, + NUMA_NO_NODE, __builtin_return_address(0)); + + if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && + (IS_ENABLED(CONFIG_KASAN_VMALLOC) || + (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + !IS_ENABLED(CONFIG_KASAN_SW_TAGS)))) + /* + * KASAN without KASAN_VMALLOC can only deal with module + * allocations being served from the reserved module region, + * since the remainder of the vmalloc region is already + * backed by zero shadow pages, and punching holes into it + * is non-trivial. Since the module region is not randomized + * when KASAN is enabled without KASAN_VMALLOC, it is even + * less likely that the module region gets exhausted, so we + * can simply omit this fallback in that case. 
+ */ + p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, + module_alloc_base + SZ_2G, GFP_KERNEL, + PAGE_KERNEL, 0, NUMA_NO_NODE, + __builtin_return_address(0)); + + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { + vfree(p); + return NULL; + } + + /* Memory is intended to be executable, reset the pointer tag. */ + return kasan_reset_tag(p); +} diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile index 7c96282bff2e..cb9297b613a1 100644 --- a/arch/mips/kernel/Makefile +++ b/arch/mips/kernel/Makefile @@ -38,6 +38,11 @@ obj-$(CONFIG_SYNC_R4K) += sync-r4k.o obj-$(CONFIG_DEBUG_FS) += segment.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_MODULES) += module.o +ifeq ($(CONFIG_MODULES),y) +obj-y += module_alloc.o +else +obj-$(CONFIG_KPROBES) += module_alloc.o +endif obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 14f46d17500a..214b2d1868a5 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -31,15 +31,6 @@ struct mips_hi16 { static LIST_HEAD(dbe_list); static DEFINE_SPINLOCK(dbe_lock); -#ifdef MODULE_START -void *module_alloc(unsigned long size) -{ - return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END, - GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); -} -#endif - static void apply_r_mips_32(u32 *location, u32 base, Elf_Addr v) { *location = base + v; diff --git a/arch/mips/kernel/module_alloc.c b/arch/mips/kernel/module_alloc.c new file mode 100644 index 000000000000..582ab5ed8916 --- /dev/null +++ b/arch/mips/kernel/module_alloc.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2001 Rusty Russell. 
+ * Copyright (C) 2003, 2004 Ralf Baechle (ralf@linux-mips.org) + * Copyright (C) 2005 Thiemo Seufer + */ + +#include <linux/mm.h> +#include <linux/vmalloc.h> + +#ifdef MODULE_START +void *module_alloc(unsigned long size) +{ + return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END, + GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, + __builtin_return_address(0)); +} +#endif diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile index d0bfac89a842..cd38083ae077 100644 --- a/arch/parisc/kernel/Makefile +++ b/arch/parisc/kernel/Makefile @@ -25,6 +25,11 @@ obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PA11) += pci-dma.o obj-$(CONFIG_PCI) += pci.o obj-$(CONFIG_MODULES) += module.o +ifeq ($(CONFIG_MODULES),y) +obj-y += module_alloc.o +else +obj-$(CONFIG_KPROBES) += module_alloc.o +endif obj-$(CONFIG_64BIT) += sys_parisc32.o signal32.o obj-$(CONFIG_STACKTRACE)+= stacktrace.o obj-$(CONFIG_AUDIT) += audit.o diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index 7df140545b22..c81e63e2549b 100644 --- a/arch/parisc/kernel/module.c +++ b/arch/parisc/kernel/module.c @@ -192,17 +192,6 @@ static inline int reassemble_22(int as22) ((as22 & 0x0003ff) << 3)); } -void *module_alloc(unsigned long size) -{ - /* using RWX means less protection for modules, but it's - * easier than trying to map the text, data, init_text and - * init_data correctly */ - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL, - PAGE_KERNEL_RWX, 0, NUMA_NO_NODE, - __builtin_return_address(0)); -} - #ifndef CONFIG_64BIT static inline unsigned long count_gots(const Elf_Rela *rela, unsigned long n) { diff --git a/arch/parisc/kernel/module_alloc.c b/arch/parisc/kernel/module_alloc.c new file mode 100644 index 000000000000..4e4e4ce1132e --- /dev/null +++ b/arch/parisc/kernel/module_alloc.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Linux/PA-RISC Project + * Copyright (C) 2003 Randolph Chung <tausq at debian . 
org> + * Copyright (C) 2008 Helge Deller <deller@gmx.de> + */ + +#include <linux/mm.h> +#include <linux/moduleloader.h> +#include <linux/vmalloc.h> + +void *module_alloc(unsigned long size) +{ + /* + * Using RWX means less protection for modules, but it's + * easier than trying to map the text, data, init_text and + * init_data correctly. + */ + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, + PAGE_KERNEL_RWX, 0, NUMA_NO_NODE, + __builtin_return_address(0)); +} diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 2e2a2a9bcf43..5a811cdf230b 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -103,6 +103,11 @@ obj-$(CONFIG_HIBERNATION) += swsusp_$(BITS).o endif obj64-$(CONFIG_HIBERNATION) += swsusp_asm64.o obj-$(CONFIG_MODULES) += module.o module_$(BITS).o +ifeq ($(CONFIG_MODULES),y) +obj-y += module_alloc.o +else +obj-$(CONFIG_KPROBES) += module_alloc.o +endif obj-$(CONFIG_44x) += cpu_setup_44x.o obj-$(CONFIG_PPC_FSL_BOOK3E) += cpu_setup_fsl_booke.o obj-$(CONFIG_PPC_DOORBELL) += dbell.o diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c index f6d6ae0a1692..b30e00964a60 100644 --- a/arch/powerpc/kernel/module.c +++ b/arch/powerpc/kernel/module.c @@ -88,40 +88,3 @@ int module_finalize(const Elf_Ehdr *hdr, return 0; } - -static __always_inline void * -__module_alloc(unsigned long size, unsigned long start, unsigned long end, bool nowarn) -{ - pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC; - gfp_t gfp = GFP_KERNEL | (nowarn ? __GFP_NOWARN : 0); - - /* - * Don't do huge page allocations for modules yet until more testing - * is done. STRICT_MODULE_RWX may require extra work to support this - * too. 
- */ - return __vmalloc_node_range(size, 1, start, end, gfp, prot, - VM_FLUSH_RESET_PERMS, - NUMA_NO_NODE, __builtin_return_address(0)); -} - -void *module_alloc(unsigned long size) -{ -#ifdef MODULES_VADDR - unsigned long limit = (unsigned long)_etext - SZ_32M; - void *ptr = NULL; - - BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR); - - /* First try within 32M limit from _etext to avoid branch trampolines */ - if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit) - ptr = __module_alloc(size, limit, MODULES_END, true); - - if (!ptr) - ptr = __module_alloc(size, MODULES_VADDR, MODULES_END, false); - - return ptr; -#else - return __module_alloc(size, VMALLOC_START, VMALLOC_END, false); -#endif -} diff --git a/arch/powerpc/kernel/module_alloc.c b/arch/powerpc/kernel/module_alloc.c new file mode 100644 index 000000000000..48541c27ce46 --- /dev/null +++ b/arch/powerpc/kernel/module_alloc.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Kernel module help for powerpc. + * Copyright (C) 2001, 2003 Rusty Russell IBM Corporation. + * Copyright (C) 2008 Freescale Semiconductor, Inc. + */ + +#include <linux/mm.h> +#include <linux/moduleloader.h> +#include <linux/vmalloc.h> + +static __always_inline void * +__module_alloc(unsigned long size, unsigned long start, unsigned long end, bool nowarn) +{ + pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC; + gfp_t gfp = GFP_KERNEL | (nowarn ? __GFP_NOWARN : 0); + + /* + * Don't do huge page allocations for modules yet until more testing + * is done. STRICT_MODULE_RWX may require extra work to support this + * too. 
+ */ + return __vmalloc_node_range(size, 1, start, end, gfp, prot, + VM_FLUSH_RESET_PERMS, + NUMA_NO_NODE, __builtin_return_address(0)); +} + +void *module_alloc(unsigned long size) +{ +#ifdef MODULES_VADDR + unsigned long limit = (unsigned long)_etext - SZ_32M; + void *ptr = NULL; + + BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR); + + /* First try within 32M limit from _etext to avoid branch trampolines */ + if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit) + ptr = __module_alloc(size, limit, MODULES_END, true); + + if (!ptr) + ptr = __module_alloc(size, MODULES_VADDR, MODULES_END, false); + + return ptr; +#else + return __module_alloc(size, VMALLOC_START, VMALLOC_END, false); +#endif +} diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index c71d6591d539..7af346ae575e 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -61,6 +61,11 @@ obj-$(CONFIG_SMP) += cpu_ops.o obj-$(CONFIG_RISCV_BOOT_SPINWAIT) += cpu_ops_spinwait.o obj-$(CONFIG_MODULES) += module.o +ifeq ($(CONFIG_MODULES),y) +obj-y += module_alloc.o +else +obj-$(CONFIG_KPROBES) += module_alloc.o +endif obj-$(CONFIG_MODULE_SECTIONS) += module-sections.o obj-$(CONFIG_CPU_PM) += suspend_entry.o suspend.o diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c index 91fe16bfaa07..1621d5e458f7 100644 --- a/arch/riscv/kernel/module.c +++ b/arch/riscv/kernel/module.c @@ -419,16 +419,6 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, return 0; } -#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) -void *module_alloc(unsigned long size) -{ - return __vmalloc_node_range(size, 1, MODULES_VADDR, - MODULES_END, GFP_KERNEL, - PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); -} -#endif - static const Elf_Shdr *find_section(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, const char *name) diff --git a/arch/riscv/kernel/module_alloc.c b/arch/riscv/kernel/module_alloc.c new file mode 100644 index 000000000000..ed87ed048713 --- /dev/null +++ 
b/arch/riscv/kernel/module_alloc.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017 Zihao Yu + */ + +#include <linux/mm.h> +#include <linux/moduleloader.h> +#include <linux/vmalloc.h> +#include <asm/sections.h> + +#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) +void *module_alloc(unsigned long size) +{ + return __vmalloc_node_range(size, 1, MODULES_VADDR, + MODULES_END, GFP_KERNEL, + PAGE_KERNEL, 0, NUMA_NO_NODE, + __builtin_return_address(0)); +} +#endif diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 27d6b3c7aa06..71e73e34b441 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -48,6 +48,11 @@ obj-$(CONFIG_SYSFS) += nospec-sysfs.o CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE) obj-$(CONFIG_MODULES) += module.o +ifeq ($(CONFIG_MODULES),y) +obj-y += module_alloc.o +else +obj-$(CONFIG_KPROBES) += module_alloc.o +endif obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_AUDIT) += audit.o diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 26125a9c436d..e15f09fa50e2 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -35,23 +35,6 @@ #define PLT_ENTRY_SIZE 22 -void *module_alloc(unsigned long size) -{ - gfp_t gfp_mask = GFP_KERNEL; - void *p; - - if (PAGE_ALIGN(size) > MODULES_LEN) - return NULL; - p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, - gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, - __builtin_return_address(0)); - if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { - vfree(p); - return NULL; - } - return p; -} - #ifdef CONFIG_FUNCTION_TRACER void module_arch_cleanup(struct module *mod) { diff --git a/arch/s390/kernel/module_alloc.c b/arch/s390/kernel/module_alloc.c new file mode 100644 index 000000000000..e4c4175fb759 --- /dev/null +++ b/arch/s390/kernel/module_alloc.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Kernel 
module help for s390. + * + * S390 version + * Copyright IBM Corp. 2002, 2003 + * Author(s): Arnd Bergmann (arndb@de.ibm.com) + * Martin Schwidefsky (schwidefsky@de.ibm.com) + * + * based on i386 version + * Copyright (C) 2001 Rusty Russell. + */ + +#include <linux/mm.h> +#include <linux/moduleloader.h> +#include <linux/vmalloc.h> + +void *module_alloc(unsigned long size) +{ + gfp_t gfp_mask = GFP_KERNEL; + void *p; + + if (PAGE_ALIGN(size) > MODULES_LEN) + return NULL; + p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, + gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, + __builtin_return_address(0)); + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { + vfree(p); + return NULL; + } + return p; +} diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile index d3a0e072ebe8..53905c00a094 100644 --- a/arch/sparc/kernel/Makefile +++ b/arch/sparc/kernel/Makefile @@ -89,6 +89,11 @@ obj-$(CONFIG_SUN_PM) += apc.o pmc.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULES) += sparc_ksyms.o +ifeq ($(CONFIG_MODULES),y) +obj-y += module_alloc.o +else +obj-$(CONFIG_KPROBES) += module_alloc.o +endif obj-$(CONFIG_SPARC_LED) += led.o obj-$(CONFIG_KGDB) += kgdb_$(BITS).o diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c index df39580f398d..f2babc69f189 100644 --- a/arch/sparc/kernel/module.c +++ b/arch/sparc/kernel/module.c @@ -21,36 +21,6 @@ #include "entry.h" -#ifdef CONFIG_SPARC64 - -#include <linux/jump_label.h> - -static void *module_map(unsigned long size) -{ - if (PAGE_ALIGN(size) > MODULES_LEN) - return NULL; - return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); -} -#else -static void *module_map(unsigned long size) -{ - return vmalloc(size); -} -#endif /* CONFIG_SPARC64 */ - -void *module_alloc(unsigned long size) -{ - void *ret; - - ret = module_map(size); - if (ret) - memset(ret, 0, size); - - return 
ret; -} - /* Make generic code ignore STT_REGISTER dummy undefined symbols. */ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, diff --git a/arch/sparc/kernel/module_alloc.c b/arch/sparc/kernel/module_alloc.c new file mode 100644 index 000000000000..4bacf8cff012 --- /dev/null +++ b/arch/sparc/kernel/module_alloc.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Kernel module help for sparc64. + * + * Copyright (C) 2001 Rusty Russell. + * Copyright (C) 2002 David S. Miller. + */ + +#include <linux/mm.h> +#include <linux/moduleloader.h> +#include <linux/vmalloc.h> + +#ifdef CONFIG_SPARC64 +#include <linux/jump_label.h> + +static void *module_map(unsigned long size) +{ + if (PAGE_ALIGN(size) > MODULES_LEN) + return NULL; + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, + __builtin_return_address(0)); +} +#else +static void *module_map(unsigned long size) +{ + return vmalloc(size); +} +#endif /* CONFIG_SPARC64 */ + +void *module_alloc(unsigned long size) +{ + void *ret; + + ret = module_map(size); + if (ret) + memset(ret, 0, size); + + return ret; +} diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 03364dc40d8d..1f0304140f44 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -108,6 +108,11 @@ obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o +ifeq ($(CONFIG_MODULES),y) +obj-y += module_alloc.o +else +obj-$(CONFIG_KPROBES) += module_alloc.o +endif obj-$(CONFIG_X86_32) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index b98ffcf4d250..40bb10b5a68d 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -36,56 +36,6 @@ do { \ } while (0) #endif -#ifdef CONFIG_RANDOMIZE_BASE -static unsigned long module_load_offset; - -/* Mutex protects 
the module_load_offset. */ -static DEFINE_MUTEX(module_kaslr_mutex); - -static unsigned long int get_module_load_offset(void) -{ - if (kaslr_enabled()) { - mutex_lock(&module_kaslr_mutex); - /* - * Calculate the module_load_offset the first time this - * code is called. Once calculated it stays the same until - * reboot. - */ - if (module_load_offset == 0) - module_load_offset = - (get_random_int() % 1024 + 1) * PAGE_SIZE; - mutex_unlock(&module_kaslr_mutex); - } - return module_load_offset; -} -#else -static unsigned long int get_module_load_offset(void) -{ - return 0; -} -#endif - -void *module_alloc(unsigned long size) -{ - gfp_t gfp_mask = GFP_KERNEL; - void *p; - - if (PAGE_ALIGN(size) > MODULES_LEN) - return NULL; - - p = __vmalloc_node_range(size, MODULE_ALIGN, - MODULES_VADDR + get_module_load_offset(), - MODULES_END, gfp_mask, - PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, - __builtin_return_address(0)); - if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { - vfree(p); - return NULL; - } - - return p; -} - #ifdef CONFIG_X86_32 int apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, diff --git a/arch/x86/kernel/module_alloc.c b/arch/x86/kernel/module_alloc.c new file mode 100644 index 000000000000..a7df2af1467a --- /dev/null +++ b/arch/x86/kernel/module_alloc.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Kernel module help for x86. + * Copyright (C) 2001 Rusty Russell. + */ + +#include <linux/mm.h> +#include <linux/moduleloader.h> +#include <linux/random.h> +#include <linux/vmalloc.h> +#include <asm/setup.h> + +#ifdef CONFIG_RANDOMIZE_BASE +static unsigned long module_load_offset; + +/* Mutex protects the module_load_offset. */ +static DEFINE_MUTEX(module_kaslr_mutex); + +static unsigned long get_module_load_offset(void) +{ + if (kaslr_enabled()) { + mutex_lock(&module_kaslr_mutex); + /* + * Calculate the module_load_offset the first time this + * code is called. Once calculated it stays the same until + * reboot. 
+ */ + if (module_load_offset == 0) + module_load_offset = + (get_random_int() % 1024 + 1) * PAGE_SIZE; + mutex_unlock(&module_kaslr_mutex); + } + return module_load_offset; +} +#else +static unsigned long get_module_load_offset(void) +{ + return 0; +} +#endif + +void *module_alloc(unsigned long size) +{ + gfp_t gfp_mask = GFP_KERNEL; + void *p; + + if (PAGE_ALIGN(size) > MODULES_LEN) + return NULL; + + p = __vmalloc_node_range(size, MODULE_ALIGN, + MODULES_VADDR + get_module_load_offset(), + MODULES_END, gfp_mask, + PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, + __builtin_return_address(0)); + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { + vfree(p); + return NULL; + } + + return p; +} diff --git a/kernel/Makefile b/kernel/Makefile index 318789c728d3..2981fe42060d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -53,6 +53,11 @@ obj-y += livepatch/ obj-y += dma/ obj-y += entry/ obj-$(CONFIG_MODULES) += module/ +ifeq ($(CONFIG_MODULES),y) +obj-y += module_alloc.o +else +obj-$(CONFIG_KPROBES) += module_alloc.o +endif obj-$(CONFIG_KCMP) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f214f8c088ed..3f9876374cd3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1569,6 +1569,7 @@ static int check_kprobe_address_safe(struct kprobe *p, goto out; } +#ifdef CONFIG_MODULES /* Check if 'p' is probing a module. 
*/ *probed_mod = __module_text_address((unsigned long) p->addr); if (*probed_mod) { @@ -1592,6 +1593,8 @@ static int check_kprobe_address_safe(struct kprobe *p, ret = -ENOENT; } } +#endif + out: preempt_enable(); jump_label_unlock(); @@ -2475,6 +2478,7 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end) return 0; } +#ifdef CONFIG_MODULES /* Remove all symbols in given area from kprobe blacklist */ static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) { @@ -2492,6 +2496,7 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry) { kprobe_remove_area_blacklist(entry, entry + 1); } +#endif /* CONFIG_MODULES */ int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, char *type, char *sym) @@ -2557,6 +2562,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start, return ret ? : arch_populate_kprobe_blacklist(); } +#ifdef CONFIG_MODULES static void add_module_kprobe_blacklist(struct module *mod) { unsigned long start, end; @@ -2658,6 +2664,7 @@ static struct notifier_block kprobe_module_nb = { .notifier_call = kprobes_module_callback, .priority = 0 }; +#endif /* CONFIG_MODULES */ void kprobe_free_init_mem(void) { @@ -2717,8 +2724,11 @@ static int __init init_kprobes(void) err = arch_init_kprobes(); if (!err) err = register_die_notifier(&kprobe_exceptions_nb); + +#ifdef CONFIG_MODULES if (!err) err = register_module_notifier(&kprobe_module_nb); +#endif kprobes_initialized = (err == 0); kprobe_sysctls_init(); diff --git a/kernel/module/main.c b/kernel/module/main.c index fed58d30725d..7fa182b78550 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1121,16 +1121,6 @@ resolve_symbol_wait(struct module *mod, return ksym; } -void __weak module_memfree(void *module_region) -{ - /* - * This memory may be RO, and freeing RO memory in an interrupt is not - * supported by vmalloc. 
- */ - WARN_ON(in_interrupt()); - vfree(module_region); -} - void __weak module_arch_cleanup(struct module *mod) { } @@ -1606,13 +1596,6 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) ddebug_remove_module(mod->name); } -void * __weak module_alloc(unsigned long size) -{ - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, - NUMA_NO_NODE, __builtin_return_address(0)); -} - bool __weak module_init_section(const char *name) { return strstarts(name, ".init"); diff --git a/kernel/module_alloc.c b/kernel/module_alloc.c new file mode 100644 index 000000000000..26a4c60998ad --- /dev/null +++ b/kernel/module_alloc.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2002 Richard Henderson + * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. + */ + +#include <linux/mm.h> +#include <linux/moduleloader.h> +#include <linux/vmalloc.h> + +void * __weak module_alloc(unsigned long size) +{ + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, + NUMA_NO_NODE, __builtin_return_address(0)); +} + +void __weak module_memfree(void *module_region) +{ + /* + * This memory may be RO, and freeing RO memory in an interrupt is not + * supported by vmalloc. 
+ */ + WARN_ON(in_interrupt()); + vfree(module_region); +} diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 93507330462c..050b2975332e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -101,6 +101,7 @@ static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk) return kprobe_gone(&tk->rp.kp); } +#ifdef CONFIG_MODULES static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, struct module *mod) { @@ -109,11 +110,13 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, return strncmp(module_name(mod), name, len) == 0 && name[len] == ':'; } +#endif /* CONFIG_MODULES */ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) { + bool ret = false; +#ifdef CONFIG_MODULES char *p; - bool ret; if (!tk->symbol) return false; @@ -125,6 +128,7 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) ret = !!find_module(tk->symbol); rcu_read_unlock_sched(); *p = ':'; +#endif /* CONFIG_MODULES */ return ret; } @@ -668,6 +672,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) return ret; } +#ifdef CONFIG_MODULES /* Module notifier call back, checking event on the module */ static int trace_kprobe_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -702,6 +707,7 @@ static struct notifier_block trace_kprobe_module_nb = { .notifier_call = trace_kprobe_module_callback, .priority = 1 /* Invoked after kprobe module callback */ }; +#endif /* CONFIG_MODULES */ static int __trace_kprobe_create(int argc, const char *argv[]) { @@ -1896,8 +1902,10 @@ static __init int init_kprobe_trace_early(void) if (ret) return ret; +#ifdef CONFIG_MODULES if (register_module_notifier(&trace_kprobe_module_nb)) return -EINVAL; +#endif /* CONFIG_MODULES */ return 0; }
Tracing with kprobes while running a monolithic kernel is currently impossible because CONFIG_KPROBES is dependent on CONFIG_MODULES. This dependency is a result of kprobes code using the module allocator for the trampoline code. Detaching kprobes from modules helps to squeeze down the user space, e.g. when developing new core kernel features, while still having all the nice tracing capabilities. For kernel/ and arch/*, move module_alloc() and module_memfree() to module_alloc.c, and compile as part of vmlinux when either CONFIG_MODULES or CONFIG_KPROBES is enabled. In addition, flag kernel module specific code with CONFIG_MODULES. As a result, kprobes can be used with a monolithic kernel. Signed-off-by: Jarkko Sakkinen <jarkko@profian.com> --- Tested with the help of BuildRoot and QEMU: - arm (function tracer) - arm64 (function tracer) - mips (function tracer) - powerpc (function tracer) - riscv (function tracer) - s390 (function tracer) - sparc (function tracer) - x86 (function tracer) - sh (function tracer, for the "pure" kernel/module_alloc.c path) --- arch/Kconfig | 1 - arch/arm/kernel/Makefile | 5 +++ arch/arm/kernel/module.c | 32 ---------------- arch/arm/kernel/module_alloc.c | 42 ++++++++++++++++++++ arch/arm64/kernel/Makefile | 5 +++ arch/arm64/kernel/module.c | 47 ----------------------- arch/arm64/kernel/module_alloc.c | 57 ++++++++++++++++++++++++++++ arch/mips/kernel/Makefile | 5 +++ arch/mips/kernel/module.c | 9 ----- arch/mips/kernel/module_alloc.c | 18 +++++++++ arch/parisc/kernel/Makefile | 5 +++ arch/parisc/kernel/module.c | 11 ------ arch/parisc/kernel/module_alloc.c | 23 +++++++++++ arch/powerpc/kernel/Makefile | 5 +++ arch/powerpc/kernel/module.c | 37 ------------------ arch/powerpc/kernel/module_alloc.c | 47 +++++++++++++++++++++++ arch/riscv/kernel/Makefile | 5 +++ arch/riscv/kernel/module.c | 10 ----- arch/riscv/kernel/module_alloc.c | 19 ++++++++++ arch/s390/kernel/Makefile | 5 +++ arch/s390/kernel/module.c | 17 --------- 
arch/s390/kernel/module_alloc.c | 33 ++++++++++++++++ arch/sparc/kernel/Makefile | 5 +++ arch/sparc/kernel/module.c | 30 --------------- arch/sparc/kernel/module_alloc.c | 39 +++++++++++++++++++ arch/x86/kernel/Makefile | 5 +++ arch/x86/kernel/module.c | 50 ------------------------ arch/x86/kernel/module_alloc.c | 61 ++++++++++++++++++++++++++++++ kernel/Makefile | 5 +++ kernel/kprobes.c | 10 +++++ kernel/module/main.c | 17 --------- kernel/module_alloc.c | 26 +++++++++++++ kernel/trace/trace_kprobe.c | 10 ++++- 33 files changed, 434 insertions(+), 262 deletions(-) create mode 100644 arch/arm/kernel/module_alloc.c create mode 100644 arch/arm64/kernel/module_alloc.c create mode 100644 arch/mips/kernel/module_alloc.c create mode 100644 arch/parisc/kernel/module_alloc.c create mode 100644 arch/powerpc/kernel/module_alloc.c create mode 100644 arch/riscv/kernel/module_alloc.c create mode 100644 arch/s390/kernel/module_alloc.c create mode 100644 arch/sparc/kernel/module_alloc.c create mode 100644 arch/x86/kernel/module_alloc.c create mode 100644 kernel/module_alloc.c