diff mbox series

[RFC,3/5] modules, x86: use vmalloc_exec for module core

Message ID 20220818224218.2399791-4-song@kernel.org (mailing list archive)
State New
Headers show
Series vmalloc_exec for modules and BPF programs | expand

Commit Message

Song Liu Aug. 18, 2022, 10:42 p.m. UTC
This is a prototype that allows modules to share 2MB text pages with other
modules and BPF programs.

Current version only covers core_layout.
---
 arch/x86/Kconfig              |  1 +
 arch/x86/kernel/alternative.c | 30 ++++++++++++++++++++++++------
 arch/x86/kernel/module.c      |  1 +
 kernel/module/main.c          | 23 +++++++++++++----------
 kernel/module/strict_rwx.c    |  3 ---
 kernel/trace/ftrace.c         |  3 ++-
 6 files changed, 41 insertions(+), 20 deletions(-)

Comments

Luis Chamberlain Oct. 6, 2022, 11:38 p.m. UTC | #1
On Thu, Aug 18, 2022 at 03:42:16PM -0700, Song Liu wrote:
> diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
> index 100446ffdc1d..570af623e28f 100644
> --- a/arch/x86/kernel/module.c
> +++ b/arch/x86/kernel/module.c
> @@ -229,6 +229,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
>  	bool early = me->state == MODULE_STATE_UNFORMED;
>  	void *(*write)(void *, const void *, size_t) = memcpy;
>  
> +	early = false;
>  	if (!early) {
>  		write = text_poke;
>  		mutex_lock(&text_mutex);

As per 88fc078a7a8f6 ("x86/module: Use text_poke() for late
relocations") I'm curious why we have to take the live patching
path now all the time?

  Luis
Song Liu Oct. 7, 2022, 6:46 a.m. UTC | #2
> On Oct 6, 2022, at 4:38 PM, Luis Chamberlain <mcgrof@kernel.org> wrote:
> 
> On Thu, Aug 18, 2022 at 03:42:16PM -0700, Song Liu wrote:
>> diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
>> index 100446ffdc1d..570af623e28f 100644
>> --- a/arch/x86/kernel/module.c
>> +++ b/arch/x86/kernel/module.c
>> @@ -229,6 +229,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
>> 	bool early = me->state == MODULE_STATE_UNFORMED;
>> 	void *(*write)(void *, const void *, size_t) = memcpy;
>> 
>> +	early = false;
>> 	if (!early) {
>> 		write = text_poke;
>> 		mutex_lock(&text_mutex);
> 
> As per 88fc078a7a8f6 ("x86/module: Use text_poke() for late
> relocations") I'm curious why we have to take the live patching
> path now all the time?

Since vmalloc_exec returns read-only memory, we need text_poke()
for any operation to it. 

Does this answer your question?

Song
diff mbox series

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fb5900e2c29a..e932bceb7f23 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -91,6 +91,7 @@  config X86
 	select ARCH_HAS_SET_DIRECT_MAP
 	select ARCH_HAS_STRICT_KERNEL_RWX
 	select ARCH_HAS_STRICT_MODULE_RWX
+	select ARCH_WANTS_MODULES_DATA_IN_VMALLOC	if X86_64
 	select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
 	select ARCH_HAS_SYSCALL_WRAPPER
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 62f6b8b7c4a5..c83888ec232b 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -332,7 +332,13 @@  void __init_or_module noinline apply_alternatives(struct alt_instr *start,
 
 		DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
 
-		text_poke_early(instr, insn_buff, insn_buff_sz);
+		if (system_state < SYSTEM_RUNNING) {
+			text_poke_early(instr, insn_buff, insn_buff_sz);
+		} else {
+			mutex_lock(&text_mutex);
+			text_poke(instr, insn_buff, insn_buff_sz);
+			mutex_unlock(&text_mutex);
+		}
 
 next:
 		optimize_nops(instr, a->instrlen);
@@ -503,7 +509,13 @@  void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
 			optimize_nops(bytes, len);
 			DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
 			DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
-			text_poke_early(addr, bytes, len);
+			if (system_state == SYSTEM_BOOTING) {
+				text_poke_early(addr, bytes, len);
+			} else {
+				mutex_lock(&text_mutex);
+				text_poke(addr, bytes, len);
+				mutex_unlock(&text_mutex);
+			}
 		}
 	}
 }
@@ -568,7 +580,13 @@  void __init_or_module noinline apply_returns(s32 *start, s32 *end)
 		if (len == insn.length) {
 			DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
 			DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
-			text_poke_early(addr, bytes, len);
+			if (unlikely(system_state == SYSTEM_BOOTING)) {
+				text_poke_early(addr, bytes, len);
+			} else {
+				mutex_lock(&text_mutex);
+				text_poke(addr, bytes, len);
+				mutex_unlock(&text_mutex);
+			}
 		}
 	}
 }
@@ -609,7 +627,7 @@  void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end)
 		 */
 		DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr);
 		DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr);
-		text_poke_early(addr, &poison, 4);
+		text_poke(addr, &poison, 4);
 	}
 }
 
@@ -791,7 +809,7 @@  void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
 
 		/* Pad the rest with nops */
 		add_nops(insn_buff + used, p->len - used);
-		text_poke_early(p->instr, insn_buff, p->len);
+		text_poke(p->instr, insn_buff, p->len);
 	}
 }
 extern struct paravirt_patch_site __start_parainstructions[],
@@ -1698,7 +1716,7 @@  void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *
 	struct text_poke_loc tp;
 
 	if (unlikely(system_state == SYSTEM_BOOTING)) {
-		text_poke_early(addr, opcode, len);
+		text_poke(addr, opcode, len);
 		return;
 	}
 
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 100446ffdc1d..570af623e28f 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -229,6 +229,7 @@  int apply_relocate_add(Elf64_Shdr *sechdrs,
 	bool early = me->state == MODULE_STATE_UNFORMED;
 	void *(*write)(void *, const void *, size_t) = memcpy;
 
+	early = false;
 	if (!early) {
 		write = text_poke;
 		mutex_lock(&text_mutex);
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 57fc2821be63..c51dafa1089a 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -53,6 +53,7 @@ 
 #include <linux/bsearch.h>
 #include <linux/dynamic_debug.h>
 #include <linux/audit.h>
+#include <linux/bpf.h>
 #include <uapi/linux/module.h>
 #include "internal.h"
 
@@ -1198,7 +1199,7 @@  static void free_module(struct module *mod)
 	lockdep_free_key_range(mod->data_layout.base, mod->data_layout.size);
 
 	/* Finally, free the core (containing the module structure) */
-	module_memfree(mod->core_layout.base);
+	vfree_exec(mod->core_layout.base);
 #ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
 	vfree(mod->data_layout.base);
 #endif
@@ -1316,7 +1317,8 @@  static int simplify_symbols(struct module *mod, const struct load_info *info)
 			ksym = resolve_symbol_wait(mod, info, name);
 			/* Ok if resolved.  */
 			if (ksym && !IS_ERR(ksym)) {
-				sym[i].st_value = kernel_symbol_value(ksym);
+				unsigned long val = kernel_symbol_value(ksym);
+				bpf_arch_text_copy(&sym[i].st_value, &val, sizeof(val));
 				break;
 			}
 
@@ -1337,7 +1339,8 @@  static int simplify_symbols(struct module *mod, const struct load_info *info)
 				secbase = (unsigned long)mod_percpu(mod);
 			else
 				secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
-			sym[i].st_value += secbase;
+			secbase += sym[i].st_value;
+			bpf_arch_text_copy(&sym[i].st_value, &secbase, sizeof(secbase));
 			break;
 		}
 	}
@@ -2118,7 +2121,7 @@  static int move_module(struct module *mod, struct load_info *info)
 	void *ptr;
 
 	/* Do the allocs. */
-	ptr = module_alloc(mod->core_layout.size);
+	ptr = vmalloc_exec(mod->core_layout.size, PAGE_SIZE);
 	/*
 	 * The pointer to this block is stored in the module structure
 	 * which is inside the block. Just mark it as not being a
@@ -2128,7 +2131,7 @@  static int move_module(struct module *mod, struct load_info *info)
 	if (!ptr)
 		return -ENOMEM;
 
-	memset(ptr, 0, mod->core_layout.size);
+/* 	memset(ptr, 0, mod->core_layout.size); */
 	mod->core_layout.base = ptr;
 
 	if (mod->init_layout.size) {
@@ -2141,7 +2144,7 @@  static int move_module(struct module *mod, struct load_info *info)
 		 */
 		kmemleak_ignore(ptr);
 		if (!ptr) {
-			module_memfree(mod->core_layout.base);
+			vfree_exec(mod->core_layout.base);
 			return -ENOMEM;
 		}
 		memset(ptr, 0, mod->init_layout.size);
@@ -2151,7 +2154,7 @@  static int move_module(struct module *mod, struct load_info *info)
 
 #ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
 	/* Do the allocs. */
-	ptr = vmalloc(mod->data_layout.size);
+	ptr = module_alloc(mod->data_layout.size);
 	/*
 	 * The pointer to this block is stored in the module structure
 	 * which is inside the block. Just mark it as not being a
@@ -2159,7 +2162,7 @@  static int move_module(struct module *mod, struct load_info *info)
 	 */
 	kmemleak_not_leak(ptr);
 	if (!ptr) {
-		module_memfree(mod->core_layout.base);
+		vfree_exec(mod->core_layout.base);
 		module_memfree(mod->init_layout.base);
 		return -ENOMEM;
 	}
@@ -2185,7 +2188,7 @@  static int move_module(struct module *mod, struct load_info *info)
 			dest = mod->core_layout.base + shdr->sh_entsize;
 
 		if (shdr->sh_type != SHT_NOBITS)
-			memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
+			bpf_arch_text_copy(dest, (void *)shdr->sh_addr, shdr->sh_size);
 		/* Update sh_addr to point to copy in image. */
 		shdr->sh_addr = (unsigned long)dest;
 		pr_debug("\t0x%lx %s\n",
@@ -2341,7 +2344,7 @@  static void module_deallocate(struct module *mod, struct load_info *info)
 	percpu_modfree(mod);
 	module_arch_freeing_init(mod);
 	module_memfree(mod->init_layout.base);
-	module_memfree(mod->core_layout.base);
+	vfree_exec(mod->core_layout.base);
 #ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
 	vfree(mod->data_layout.base);
 #endif
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
index 14fbea66f12f..d392eb7bf574 100644
--- a/kernel/module/strict_rwx.c
+++ b/kernel/module/strict_rwx.c
@@ -85,7 +85,6 @@  void module_enable_x(const struct module *mod)
 	    !PAGE_ALIGNED(mod->init_layout.base))
 		return;
 
-	frob_text(&mod->core_layout, set_memory_x);
 	frob_text(&mod->init_layout, set_memory_x);
 }
 
@@ -98,9 +97,7 @@  void module_enable_ro(const struct module *mod, bool after_init)
 		return;
 #endif
 
-	set_vm_flush_reset_perms(mod->core_layout.base);
 	set_vm_flush_reset_perms(mod->init_layout.base);
-	frob_text(&mod->core_layout, set_memory_ro);
 
 	frob_rodata(&mod->data_layout, set_memory_ro);
 	frob_text(&mod->init_layout, set_memory_ro);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index bc921a3f7ea8..8cd31dc9ac84 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3177,6 +3177,7 @@  static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
 	if (mod)
 		rec_flags |= FTRACE_FL_DISABLED;
 
+	ftrace_arch_code_modify_prepare();
 	for (pg = new_pgs; pg; pg = pg->next) {
 
 		for (i = 0; i < pg->index; i++) {
@@ -3198,7 +3199,7 @@  static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
 			update_cnt++;
 		}
 	}
-
+	ftrace_arch_code_modify_post_process();
 	stop = ftrace_now(raw_smp_processor_id());
 	ftrace_update_time = stop - start;
 	ftrace_update_tot_cnt += update_cnt;