diff mbox series

[bpf-next,v6,1/2] bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules

Message ID e627742ab86ed28632bc9b6c56ef65d7f98eadbc.1676542796.git.vmalik@redhat.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series Fix attaching fentry/fexit/fmod_ret/lsm to modules | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1427 this patch: 1427
netdev/cc_maintainers warning 1 maintainers not CCed: linux-modules@vger.kernel.org
netdev/build_clang success Errors and warnings before: 152 this patch: 152
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1422 this patch: 1422
netdev/checkpatch warning WARNING: line length of 86 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-36 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 fail Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_progs_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-7 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-37 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-14 fail Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for test_progs on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-19 fail Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 fail Logs for test_progs_no_alu32 on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on aarch64 with llvm-16

Commit Message

Viktor Malik Feb. 16, 2023, 10:32 a.m. UTC
This resolves two problems with attachment of fentry/fexit/fmod_ret/lsm
to functions located in modules:

1. The verifier tries to find the address to attach to in kallsyms. This
   is always done by searching the entire kallsyms, not respecting the
   module in which the function is located. Such approach causes an
   incorrect attachment address to be computed if the function to attach
   to is shadowed by a function of the same name located earlier in
   kallsyms.

2. If the address to attach to is located in a module, the module
   reference is only acquired in register_fentry. If the module is
   unloaded between the place where the address is found
   (bpf_check_attach_target in the verifier) and register_fentry, it is
   possible that another module is loaded to the same address which may
   lead to potential errors.

Since the attachment must contain the BTF of the program to attach to,
we extract the module from it and search for the function address in the
correct module (resolving problem no. 1). Then, the module reference is
taken directly in bpf_check_attach_target and stored in the bpf program
(in bpf_prog_aux). The reference is only released when the program is
unloaded (resolving problem no. 2).

Signed-off-by: Viktor Malik <vmalik@redhat.com>
---
 include/linux/bpf.h      |  1 +
 kernel/bpf/syscall.c     |  2 ++
 kernel/bpf/trampoline.c  | 27 ---------------------------
 kernel/bpf/verifier.c    | 20 +++++++++++++++++++-
 kernel/module/internal.h |  5 +++++
 5 files changed, 27 insertions(+), 28 deletions(-)

Comments

Jiri Olsa Feb. 16, 2023, 1:50 p.m. UTC | #1
On Thu, Feb 16, 2023 at 11:32:41AM +0100, Viktor Malik wrote:
> This resolves two problems with attachment of fentry/fexit/fmod_ret/lsm
> to functions located in modules:
> 
> 1. The verifier tries to find the address to attach to in kallsyms. This
>    is always done by searching the entire kallsyms, not respecting the
>    module in which the function is located. Such approach causes an
>    incorrect attachment address to be computed if the function to attach
>    to is shadowed by a function of the same name located earlier in
>    kallsyms.
> 
> 2. If the address to attach to is located in a module, the module
>    reference is only acquired in register_fentry. If the module is
>    unloaded between the place where the address is found
>    (bpf_check_attach_target in the verifier) and register_fentry, it is
>    possible that another module is loaded to the same address which may
>    lead to potential errors.
> 
> Since the attachment must contain the BTF of the program to attach to,
> we extract the module from it and search for the function address in the
> correct module (resolving problem no. 1). Then, the module reference is
> taken directly in bpf_check_attach_target and stored in the bpf program
> (in bpf_prog_aux). The reference is only released when the program is
> unloaded (resolving problem no. 2).
> 
> Signed-off-by: Viktor Malik <vmalik@redhat.com>
> ---
>  include/linux/bpf.h      |  1 +
>  kernel/bpf/syscall.c     |  2 ++
>  kernel/bpf/trampoline.c  | 27 ---------------------------
>  kernel/bpf/verifier.c    | 20 +++++++++++++++++++-
>  kernel/module/internal.h |  5 +++++
>  5 files changed, 27 insertions(+), 28 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 4385418118f6..2aadc78fe3c1 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1330,6 +1330,7 @@ struct bpf_prog_aux {
>  	 * main prog always has linfo_idx == 0
>  	 */
>  	u32 linfo_idx;
> +	struct module *mod;
>  	u32 num_exentries;
>  	struct exception_table_entry *extable;
>  	union {
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index cda8d00f3762..5b8227e08182 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -2064,6 +2064,8 @@ static void bpf_prog_put_deferred(struct work_struct *work)
>  	prog = aux->prog;
>  	perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
>  	bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
> +	if (aux->mod)
> +		module_put(aux->mod);

you can call just module_put, there's != NULL check inside

also should we call it from __bpf_prog_put_noref instead
to cover bpf_prog_load error path?

>  	bpf_prog_free_id(prog);
>  	__bpf_prog_put_noref(prog, true);
>  }
> diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
> index d0ed7d6f5eec..ebb20bf252c7 100644
> --- a/kernel/bpf/trampoline.c
> +++ b/kernel/bpf/trampoline.c
> @@ -172,26 +172,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
>  	return tr;
>  }
>  
> -static int bpf_trampoline_module_get(struct bpf_trampoline *tr)
> -{
> -	struct module *mod;
> -	int err = 0;
> -
> -	preempt_disable();
> -	mod = __module_text_address((unsigned long) tr->func.addr);
> -	if (mod && !try_module_get(mod))
> -		err = -ENOENT;
> -	preempt_enable();
> -	tr->mod = mod;
> -	return err;
> -}
> -
> -static void bpf_trampoline_module_put(struct bpf_trampoline *tr)
> -{
> -	module_put(tr->mod);
> -	tr->mod = NULL;
> -}
> -
>  static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
>  {
>  	void *ip = tr->func.addr;
> @@ -202,8 +182,6 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
>  	else
>  		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
>  
> -	if (!ret)
> -		bpf_trampoline_module_put(tr);
>  	return ret;
>  }
>  
> @@ -238,9 +216,6 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
>  		tr->func.ftrace_managed = true;
>  	}
>  
> -	if (bpf_trampoline_module_get(tr))
> -		return -ENOENT;
> -
>  	if (tr->func.ftrace_managed) {
>  		ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
>  		ret = register_ftrace_direct_multi(tr->fops, (long)new_addr);
> @@ -248,8 +223,6 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
>  		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
>  	}
>  
> -	if (ret)
> -		bpf_trampoline_module_put(tr);
>  	return ret;
>  }
>  
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 388245e8826e..6a19bd450558 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -24,6 +24,7 @@
>  #include <linux/bpf_lsm.h>
>  #include <linux/btf_ids.h>
>  #include <linux/poison.h>
> +#include "../module/internal.h"
>  
>  #include "disasm.h"
>  
> @@ -16868,6 +16869,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
>  	const char *tname;
>  	struct btf *btf;
>  	long addr = 0;
> +	struct module *mod = NULL;
>  
>  	if (!btf_id) {
>  		bpf_log(log, "Tracing programs must provide btf_id\n");
> @@ -17041,7 +17043,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
>  			else
>  				addr = (long) tgt_prog->aux->func[subprog]->bpf_func;
>  		} else {
> -			addr = kallsyms_lookup_name(tname);
> +			if (btf_is_module(btf)) {
> +				preempt_disable();

btf_try_get_module takes mutex, so you can't preempt_disable in here,
I got this when running the test:

[  691.916989][ T2585] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:580

> +				mod = btf_try_get_module(btf);
> +				if (mod)
> +					addr = find_kallsyms_symbol_value(mod, tname);
> +				else
> +					addr = 0;
> +				preempt_enable();
> +			} else {
> +				addr = kallsyms_lookup_name(tname);
> +			}
>  			if (!addr) {
>  				bpf_log(log,
>  					"The address of function %s cannot be found\n",
> @@ -17105,6 +17117,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
>  	tgt_info->tgt_addr = addr;
>  	tgt_info->tgt_name = tname;
>  	tgt_info->tgt_type = t;
> +	if (mod) {
> +		if (!prog->aux->mod)
> +			prog->aux->mod = mod;

can this actually happen? would it be better to have bpf_check_attach_target
just to take take the module ref and return it in tgt_info->tgt_mod and it'd
be up to caller to decide what to do with that

thanks,
jirka

> +		else
> +			module_put(mod);
> +	}
>  	return 0;
>  }
>  
> diff --git a/kernel/module/internal.h b/kernel/module/internal.h
> index 2e2bf236f558..5cb103a46018 100644
> --- a/kernel/module/internal.h
> +++ b/kernel/module/internal.h
> @@ -256,6 +256,11 @@ static inline bool sect_empty(const Elf_Shdr *sect)
>  static inline void init_build_id(struct module *mod, const struct load_info *info) { }
>  static inline void layout_symtab(struct module *mod, struct load_info *info) { }
>  static inline void add_kallsyms(struct module *mod, const struct load_info *info) { }
> +static inline unsigned long find_kallsyms_symbol_value(struct module *mod
> +						       const char *name)
> +{
> +	return 0;
> +}
>  #endif /* CONFIG_KALLSYMS */
>  
>  #ifdef CONFIG_SYSFS
> -- 
> 2.39.1
>
Viktor Malik Feb. 16, 2023, 2:45 p.m. UTC | #2
On 2/16/23 14:50, Jiri Olsa wrote:
> On Thu, Feb 16, 2023 at 11:32:41AM +0100, Viktor Malik wrote:
>> This resolves two problems with attachment of fentry/fexit/fmod_ret/lsm
>> to functions located in modules:
>>
>> 1. The verifier tries to find the address to attach to in kallsyms. This
>>     is always done by searching the entire kallsyms, not respecting the
>>     module in which the function is located. Such approach causes an
>>     incorrect attachment address to be computed if the function to attach
>>     to is shadowed by a function of the same name located earlier in
>>     kallsyms.
>>
>> 2. If the address to attach to is located in a module, the module
>>     reference is only acquired in register_fentry. If the module is
>>     unloaded between the place where the address is found
>>     (bpf_check_attach_target in the verifier) and register_fentry, it is
>>     possible that another module is loaded to the same address which may
>>     lead to potential errors.
>>
>> Since the attachment must contain the BTF of the program to attach to,
>> we extract the module from it and search for the function address in the
>> correct module (resolving problem no. 1). Then, the module reference is
>> taken directly in bpf_check_attach_target and stored in the bpf program
>> (in bpf_prog_aux). The reference is only released when the program is
>> unloaded (resolving problem no. 2).
>>
>> Signed-off-by: Viktor Malik <vmalik@redhat.com>
>> ---
>>   include/linux/bpf.h      |  1 +
>>   kernel/bpf/syscall.c     |  2 ++
>>   kernel/bpf/trampoline.c  | 27 ---------------------------
>>   kernel/bpf/verifier.c    | 20 +++++++++++++++++++-
>>   kernel/module/internal.h |  5 +++++
>>   5 files changed, 27 insertions(+), 28 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 4385418118f6..2aadc78fe3c1 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -1330,6 +1330,7 @@ struct bpf_prog_aux {
>>   	 * main prog always has linfo_idx == 0
>>   	 */
>>   	u32 linfo_idx;
>> +	struct module *mod;
>>   	u32 num_exentries;
>>   	struct exception_table_entry *extable;
>>   	union {
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index cda8d00f3762..5b8227e08182 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -2064,6 +2064,8 @@ static void bpf_prog_put_deferred(struct work_struct *work)
>>   	prog = aux->prog;
>>   	perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
>>   	bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
>> +	if (aux->mod)
>> +		module_put(aux->mod);
> 
> you can call just module_put, there's != NULL check inside
> 
> also should we call it from __bpf_prog_put_noref instead
> to cover bpf_prog_load error path?

Yes, good point, will do that.

> 
>>   	bpf_prog_free_id(prog);
>>   	__bpf_prog_put_noref(prog, true);
>>   }
>> diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
>> index d0ed7d6f5eec..ebb20bf252c7 100644
>> --- a/kernel/bpf/trampoline.c
>> +++ b/kernel/bpf/trampoline.c
>> @@ -172,26 +172,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
>>   	return tr;
>>   }
>>   
>> -static int bpf_trampoline_module_get(struct bpf_trampoline *tr)
>> -{
>> -	struct module *mod;
>> -	int err = 0;
>> -
>> -	preempt_disable();
>> -	mod = __module_text_address((unsigned long) tr->func.addr);
>> -	if (mod && !try_module_get(mod))
>> -		err = -ENOENT;
>> -	preempt_enable();
>> -	tr->mod = mod;
>> -	return err;
>> -}
>> -
>> -static void bpf_trampoline_module_put(struct bpf_trampoline *tr)
>> -{
>> -	module_put(tr->mod);
>> -	tr->mod = NULL;
>> -}
>> -
>>   static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
>>   {
>>   	void *ip = tr->func.addr;
>> @@ -202,8 +182,6 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
>>   	else
>>   		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
>>   
>> -	if (!ret)
>> -		bpf_trampoline_module_put(tr);
>>   	return ret;
>>   }
>>   
>> @@ -238,9 +216,6 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
>>   		tr->func.ftrace_managed = true;
>>   	}
>>   
>> -	if (bpf_trampoline_module_get(tr))
>> -		return -ENOENT;
>> -
>>   	if (tr->func.ftrace_managed) {
>>   		ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
>>   		ret = register_ftrace_direct_multi(tr->fops, (long)new_addr);
>> @@ -248,8 +223,6 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
>>   		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
>>   	}
>>   
>> -	if (ret)
>> -		bpf_trampoline_module_put(tr);
>>   	return ret;
>>   }
>>   
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 388245e8826e..6a19bd450558 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -24,6 +24,7 @@
>>   #include <linux/bpf_lsm.h>
>>   #include <linux/btf_ids.h>
>>   #include <linux/poison.h>
>> +#include "../module/internal.h"
>>   
>>   #include "disasm.h"
>>   
>> @@ -16868,6 +16869,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
>>   	const char *tname;
>>   	struct btf *btf;
>>   	long addr = 0;
>> +	struct module *mod = NULL;
>>   
>>   	if (!btf_id) {
>>   		bpf_log(log, "Tracing programs must provide btf_id\n");
>> @@ -17041,7 +17043,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
>>   			else
>>   				addr = (long) tgt_prog->aux->func[subprog]->bpf_func;
>>   		} else {
>> -			addr = kallsyms_lookup_name(tname);
>> +			if (btf_is_module(btf)) {
>> +				preempt_disable();
> 
> btf_try_get_module takes mutex, so you can't preempt_disable in here,
> I got this when running the test:
> 
> [  691.916989][ T2585] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:580
> 

Hm, do we even need to preempt_disable? IIUC, preempt_disable is used
in module kallsyms to prevent taking the module lock b/c kallsyms are
used in the oops path. That shouldn't be an issue here, is that correct?

>> +				mod = btf_try_get_module(btf);
>> +				if (mod)
>> +					addr = find_kallsyms_symbol_value(mod, tname);
>> +				else
>> +					addr = 0;
>> +				preempt_enable();
>> +			} else {
>> +				addr = kallsyms_lookup_name(tname);
>> +			}
>>   			if (!addr) {
>>   				bpf_log(log,
>>   					"The address of function %s cannot be found\n",
>> @@ -17105,6 +17117,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
>>   	tgt_info->tgt_addr = addr;
>>   	tgt_info->tgt_name = tname;
>>   	tgt_info->tgt_type = t;
>> +	if (mod) {
>> +		if (!prog->aux->mod)
>> +			prog->aux->mod = mod;
> 
> can this actually happen? would it be better to have bpf_check_attach_target
> just to take take the module ref and return it in tgt_info->tgt_mod and it'd
> be up to caller to decide what to do with that

Ok, I'll try to do it that way.

Thanks for the review!
Viktor

> 
> thanks,
> jirka
> 
>> +		else
>> +			module_put(mod);
>> +	}
>>   	return 0;
>>   }
>>   
>> diff --git a/kernel/module/internal.h b/kernel/module/internal.h
>> index 2e2bf236f558..5cb103a46018 100644
>> --- a/kernel/module/internal.h
>> +++ b/kernel/module/internal.h
>> @@ -256,6 +256,11 @@ static inline bool sect_empty(const Elf_Shdr *sect)
>>   static inline void init_build_id(struct module *mod, const struct load_info *info) { }
>>   static inline void layout_symtab(struct module *mod, struct load_info *info) { }
>>   static inline void add_kallsyms(struct module *mod, const struct load_info *info) { }
>> +static inline unsigned long find_kallsyms_symbol_value(struct module *mod
>> +						       const char *name)
>> +{
>> +	return 0;
>> +}
>>   #endif /* CONFIG_KALLSYMS */
>>   
>>   #ifdef CONFIG_SYSFS
>> -- 
>> 2.39.1
>>
>
Jiri Olsa Feb. 16, 2023, 3:50 p.m. UTC | #3
On Thu, Feb 16, 2023 at 03:45:11PM +0100, Viktor Malik wrote:

SNIP

> > > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > > index 388245e8826e..6a19bd450558 100644
> > > --- a/kernel/bpf/verifier.c
> > > +++ b/kernel/bpf/verifier.c
> > > @@ -24,6 +24,7 @@
> > >   #include <linux/bpf_lsm.h>
> > >   #include <linux/btf_ids.h>
> > >   #include <linux/poison.h>
> > > +#include "../module/internal.h"
> > >   #include "disasm.h"
> > > @@ -16868,6 +16869,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
> > >   	const char *tname;
> > >   	struct btf *btf;
> > >   	long addr = 0;
> > > +	struct module *mod = NULL;
> > >   	if (!btf_id) {
> > >   		bpf_log(log, "Tracing programs must provide btf_id\n");
> > > @@ -17041,7 +17043,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
> > >   			else
> > >   				addr = (long) tgt_prog->aux->func[subprog]->bpf_func;
> > >   		} else {
> > > -			addr = kallsyms_lookup_name(tname);
> > > +			if (btf_is_module(btf)) {
> > > +				preempt_disable();
> > 
> > btf_try_get_module takes mutex, so you can't preempt_disable in here,
> > I got this when running the test:
> > 
> > [  691.916989][ T2585] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:580
> > 
> 
> Hm, do we even need to preempt_disable? IIUC, preempt_disable is used
> in module kallsyms to prevent taking the module lock b/c kallsyms are
> used in the oops path. That shouldn't be an issue here, is that correct?

btf_try_get_module calls try_module_get which disables the preemption,
so no need to call it in here

jirka

> 
> > > +				mod = btf_try_get_module(btf);
> > > +				if (mod)
> > > +					addr = find_kallsyms_symbol_value(mod, tname);
> > > +				else
> > > +					addr = 0;
> > > +				preempt_enable();
> > > +			} else {
> > > +				addr = kallsyms_lookup_name(tname);
> > > +			}
> > >   			if (!addr) {
> > >   				bpf_log(log,
> > >   					"The address of function %s cannot be found\n",
> > > @@ -17105,6 +17117,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
> > >   	tgt_info->tgt_addr = addr;
> > >   	tgt_info->tgt_name = tname;
> > >   	tgt_info->tgt_type = t;
> > > +	if (mod) {
> > > +		if (!prog->aux->mod)
> > > +			prog->aux->mod = mod;
> > 
> > can this actually happen? would it be better to have bpf_check_attach_target
> > just to take take the module ref and return it in tgt_info->tgt_mod and it'd
> > be up to caller to decide what to do with that
> 
> Ok, I'll try to do it that way.
> 
> Thanks for the review!
> Viktor
> 
> > 
> > thanks,
> > jirka
> > 
> > > +		else
> > > +			module_put(mod);
> > > +	}
> > >   	return 0;
> > >   }
> > > diff --git a/kernel/module/internal.h b/kernel/module/internal.h
> > > index 2e2bf236f558..5cb103a46018 100644
> > > --- a/kernel/module/internal.h
> > > +++ b/kernel/module/internal.h
> > > @@ -256,6 +256,11 @@ static inline bool sect_empty(const Elf_Shdr *sect)
> > >   static inline void init_build_id(struct module *mod, const struct load_info *info) { }
> > >   static inline void layout_symtab(struct module *mod, struct load_info *info) { }
> > >   static inline void add_kallsyms(struct module *mod, const struct load_info *info) { }
> > > +static inline unsigned long find_kallsyms_symbol_value(struct module *mod
> > > +						       const char *name)
> > > +{
> > > +	return 0;
> > > +}
> > >   #endif /* CONFIG_KALLSYMS */
> > >   #ifdef CONFIG_SYSFS
> > > -- 
> > > 2.39.1
> > > 
> > 
>
Artem Savkov March 22, 2023, 9:49 a.m. UTC | #4
On Thu, Feb 16, 2023 at 04:50:41PM +0100, Jiri Olsa wrote:
> On Thu, Feb 16, 2023 at 03:45:11PM +0100, Viktor Malik wrote:
> 
> SNIP
> 
> > > > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > > > index 388245e8826e..6a19bd450558 100644
> > > > --- a/kernel/bpf/verifier.c
> > > > +++ b/kernel/bpf/verifier.c
> > > > @@ -24,6 +24,7 @@
> > > >   #include <linux/bpf_lsm.h>
> > > >   #include <linux/btf_ids.h>
> > > >   #include <linux/poison.h>
> > > > +#include "../module/internal.h"
> > > >   #include "disasm.h"
> > > > @@ -16868,6 +16869,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
> > > >   	const char *tname;
> > > >   	struct btf *btf;
> > > >   	long addr = 0;
> > > > +	struct module *mod = NULL;
> > > >   	if (!btf_id) {
> > > >   		bpf_log(log, "Tracing programs must provide btf_id\n");
> > > > @@ -17041,7 +17043,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
> > > >   			else
> > > >   				addr = (long) tgt_prog->aux->func[subprog]->bpf_func;
> > > >   		} else {
> > > > -			addr = kallsyms_lookup_name(tname);
> > > > +			if (btf_is_module(btf)) {
> > > > +				preempt_disable();
> > > 
> > > btf_try_get_module takes mutex, so you can't preempt_disable in here,
> > > I got this when running the test:
> > > 
> > > [  691.916989][ T2585] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:580
> > > 
> > 
> > Hm, do we even need to preempt_disable? IIUC, preempt_disable is used
> > in module kallsyms to prevent taking the module lock b/c kallsyms are
> > used in the oops path. That shouldn't be an issue here, is that correct?
> 
> btf_try_get_module calls try_module_get which disables the preemption,
> so no need to call it in here

It does, but it reenables preemption right away so it is enabled by the
time we call find_kallsyms_symbol_value(). I am getting the following
lockdep splat while running module_fentry_shadow test from test_progs.

[   12.017973][  T488] =============================                                                          
[   12.018529][  T488] WARNING: suspicious RCU usage                                                          
[   12.018987][  T488] 6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804 Tainted: G           OE                        
[   12.019898][  T488] -----------------------------                                                          
[   12.020391][  T488] kernel/module/kallsyms.c:448 suspicious rcu_dereference_check() usage!                
[   12.021335][  T488]                                                                                        
[   12.021335][  T488] other info that might help us debug this:                                              
[   12.021335][  T488]                                                                                        
[   12.022416][  T488]                                                                                        
[   12.022416][  T488] rcu_scheduler_active = 2, debug_locks = 1                                              
[   12.023297][  T488] no locks held by test_progs/488.                                                       
[   12.023854][  T488]                                                                                        
[   12.023854][  T488] stack backtrace:                                                                       
[   12.024336][  T488] CPU: 0 PID: 488 Comm: test_progs Tainted: G           OE      6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804
[   12.025290][  T488] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.1-2.fc37 04/01/2014                                                                                                               
[   12.026108][  T488] Call Trace:                                                                            
[   12.026381][  T488]  <TASK>                                                                                
[   12.026649][  T488]  dump_stack_lvl+0xb4/0x110                                                             
[   12.027060][  T488]  lockdep_rcu_suspicious+0x158/0x1f0                                                    
[   12.027541][  T488]  find_kallsyms_symbol_value+0xe8/0x110                                                 
[   12.028028][  T488]  bpf_check_attach_target+0x838/0xa20                                                   
[   12.028511][  T488]  check_attach_btf_id+0x144/0x3f0                                                      
[   12.028957][  T488]  ? __pfx_cmp_subprogs+0x10/0x10                                                       
[   12.029408][  T488]  bpf_check+0xeec/0x1850                                                                
[   12.029799][  T488]  ? ktime_get_with_offset+0x124/0x1d0                                                   
[   12.030247][  T488]  bpf_prog_load+0x87a/0xed0                                                             
[   12.030627][  T488]  ? __lock_release+0x5f/0x160                                                           
[   12.031010][  T488]  ? __might_fault+0x53/0xb0                                                            
[   12.031394][  T488]  ? selinux_bpf+0x6c/0xa0                                                              
[   12.031756][  T488]  __sys_bpf+0x53c/0x1240                                                                
[   12.032115][  T488]  __x64_sys_bpf+0x27/0x40                                                              
[   12.032476][  T488]  do_syscall_64+0x3e/0x90                                                              
[   12.032835][  T488]  entry_SYSCALL_64_after_hwframe+0x72/0xdc                                             
[   12.033313][  T488] RIP: 0033:0x7f174ea0e92d                                                              
[   12.033668][  T488] Code: 5d c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d d3 e4 0c 00 f7 d8 64 89 0
1 48
[   12.035197][  T488] RSP: 002b:00007ffee5cefc68 EFLAGS: 00000202 ORIG_RAX: 0000000000000141                
[   12.035864][  T488] RAX: ffffffffffffffda RBX: 00007ffee5cf02a8 RCX: 00007f174ea0e92d                     
[   12.036495][  T488] RDX: 0000000000000080 RSI: 00007ffee5cefd20 RDI: 0000000000000005                     
[   12.037123][  T488] RBP: 00007ffee5cefc80 R08: 00007ffee5cefea0 R09: 00007ffee5cefd20                     
[   12.037752][  T488] R10: 0000000000000002 R11: 0000000000000202 R12: 0000000000000000                     
[   12.038382][  T488] R13: 00007ffee5cf02c8 R14: 0000000000f2edb0 R15: 00007f174eb59000                     
[   12.039022][  T488]  </TASK>                       


> jirka
> 
> > 
> > > > +				mod = btf_try_get_module(btf);
> > > > +				if (mod)
> > > > +					addr = find_kallsyms_symbol_value(mod, tname);
> > > > +				else
> > > > +					addr = 0;
> > > > +				preempt_enable();
> > > > +			} else {
> > > > +				addr = kallsyms_lookup_name(tname);
> > > > +			}
> > > >   			if (!addr) {
> > > >   				bpf_log(log,
> > > >   					"The address of function %s cannot be found\n",
> > > > @@ -17105,6 +17117,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
> > > >   	tgt_info->tgt_addr = addr;
> > > >   	tgt_info->tgt_name = tname;
> > > >   	tgt_info->tgt_type = t;
> > > > +	if (mod) {
> > > > +		if (!prog->aux->mod)
> > > > +			prog->aux->mod = mod;
> > > 
> > > can this actually happen? would it be better to have bpf_check_attach_target
> > > just to take take the module ref and return it in tgt_info->tgt_mod and it'd
> > > be up to caller to decide what to do with that
> > 
> > Ok, I'll try to do it that way.
> > 
> > Thanks for the review!
> > Viktor
> > 
> > > 
> > > thanks,
> > > jirka
> > > 
> > > > +		else
> > > > +			module_put(mod);
> > > > +	}
> > > >   	return 0;
> > > >   }
> > > > diff --git a/kernel/module/internal.h b/kernel/module/internal.h
> > > > index 2e2bf236f558..5cb103a46018 100644
> > > > --- a/kernel/module/internal.h
> > > > +++ b/kernel/module/internal.h
> > > > @@ -256,6 +256,11 @@ static inline bool sect_empty(const Elf_Shdr *sect)
> > > >   static inline void init_build_id(struct module *mod, const struct load_info *info) { }
> > > >   static inline void layout_symtab(struct module *mod, struct load_info *info) { }
> > > >   static inline void add_kallsyms(struct module *mod, const struct load_info *info) { }
> > > > +static inline unsigned long find_kallsyms_symbol_value(struct module *mod
> > > > +						       const char *name)
> > > > +{
> > > > +	return 0;
> > > > +}
> > > >   #endif /* CONFIG_KALLSYMS */
> > > >   #ifdef CONFIG_SYSFS
> > > > -- 
> > > > 2.39.1
> > > > 
> > > 
> >
Jiri Olsa March 22, 2023, 12:14 p.m. UTC | #5
On Wed, Mar 22, 2023 at 10:49:38AM +0100, Artem Savkov wrote:

SNIP

> > > Hm, do we even need to preempt_disable? IIUC, preempt_disable is used
> > > in module kallsyms to prevent taking the module lock b/c kallsyms are
> > > used in the oops path. That shouldn't be an issue here, is that correct?
> > 
> > btf_try_get_module calls try_module_get which disables the preemption,
> > so no need to call it in here
> 
> It does, but it reenables preemption right away so it is enabled by the
> time we call find_kallsyms_symbol_value(). I am getting the following
> lockdep splat while running module_fentry_shadow test from test_progs.
> 
> [   12.017973][  T488] =============================                                                          
> [   12.018529][  T488] WARNING: suspicious RCU usage                                                          
> [   12.018987][  T488] 6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804 Tainted: G           OE                        
> [   12.019898][  T488] -----------------------------                                                          
> [   12.020391][  T488] kernel/module/kallsyms.c:448 suspicious rcu_dereference_check() usage!                
> [   12.021335][  T488]                                                                                        
> [   12.021335][  T488] other info that might help us debug this:                                              
> [   12.021335][  T488]                                                                                        
> [   12.022416][  T488]                                                                                        
> [   12.022416][  T488] rcu_scheduler_active = 2, debug_locks = 1                                              
> [   12.023297][  T488] no locks held by test_progs/488.                                                       
> [   12.023854][  T488]                                                                                        
> [   12.023854][  T488] stack backtrace:                                                                       
> [   12.024336][  T488] CPU: 0 PID: 488 Comm: test_progs Tainted: G           OE      6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804
> [   12.025290][  T488] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.1-2.fc37 04/01/2014                                                                                                               
> [   12.026108][  T488] Call Trace:                                                                            
> [   12.026381][  T488]  <TASK>                                                                                
> [   12.026649][  T488]  dump_stack_lvl+0xb4/0x110                                                             
> [   12.027060][  T488]  lockdep_rcu_suspicious+0x158/0x1f0                                                    
> [   12.027541][  T488]  find_kallsyms_symbol_value+0xe8/0x110                                                 
> [   12.028028][  T488]  bpf_check_attach_target+0x838/0xa20                                                   
> [   12.028511][  T488]  check_attach_btf_id+0x144/0x3f0                                                      
> [   12.028957][  T488]  ? __pfx_cmp_subprogs+0x10/0x10                                                       
> [   12.029408][  T488]  bpf_check+0xeec/0x1850                                                                
> [   12.029799][  T488]  ? ktime_get_with_offset+0x124/0x1d0                                                   
> [   12.030247][  T488]  bpf_prog_load+0x87a/0xed0                                                             
> [   12.030627][  T488]  ? __lock_release+0x5f/0x160                                                           
> [   12.031010][  T488]  ? __might_fault+0x53/0xb0                                                            
> [   12.031394][  T488]  ? selinux_bpf+0x6c/0xa0                                                              
> [   12.031756][  T488]  __sys_bpf+0x53c/0x1240                                                                
> [   12.032115][  T488]  __x64_sys_bpf+0x27/0x40                                                              
> [   12.032476][  T488]  do_syscall_64+0x3e/0x90                                                              
> [   12.032835][  T488]  entry_SYSCALL_64_after_hwframe+0x72/0xdc                                             


hum, for some reason I can't reproduce, but looks like we need to disable
preemption for find_kallsyms_symbol_value.. could you please check with
patch below?

also could you please share your .config? not sure why I can't reproduce

thanks,
jirka


---
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
index ab2376a1be88..bdc911dbcde5 100644
--- a/kernel/module/kallsyms.c
+++ b/kernel/module/kallsyms.c
@@ -442,7 +442,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 }
 
 /* Given a module and name of symbol, find and return the symbol's value */
-unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
+static unsigned long __find_kallsyms_symbol_value(struct module *mod, const char *name)
 {
 	unsigned int i;
 	struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
@@ -466,7 +466,7 @@ static unsigned long __module_kallsyms_lookup_name(const char *name)
 	if (colon) {
 		mod = find_module_all(name, colon - name, false);
 		if (mod)
-			return find_kallsyms_symbol_value(mod, colon + 1);
+			return __find_kallsyms_symbol_value(mod, colon + 1);
 		return 0;
 	}
 
@@ -475,7 +475,7 @@ static unsigned long __module_kallsyms_lookup_name(const char *name)
 
 		if (mod->state == MODULE_STATE_UNFORMED)
 			continue;
-		ret = find_kallsyms_symbol_value(mod, name);
+		ret = __find_kallsyms_symbol_value(mod, name);
 		if (ret)
 			return ret;
 	}
@@ -494,6 +494,16 @@ unsigned long module_kallsyms_lookup_name(const char *name)
 	return ret;
 }
 
+unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
+{
+	unsigned long ret;
+
+	preempt_disable();
+	ret = __find_kallsyms_symbol_value(mod, name);
+	preempt_enable();
+	return ret;
+}
+
 int module_kallsyms_on_each_symbol(const char *modname,
 				   int (*fn)(void *, const char *,
 					     struct module *, unsigned long),
Alexei Starovoitov March 22, 2023, 4:03 p.m. UTC | #6
On Wed, Mar 22, 2023 at 5:14 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Wed, Mar 22, 2023 at 10:49:38AM +0100, Artem Savkov wrote:
>
> SNIP
>
> > > > Hm, do we even need to preempt_disable? IIUC, preempt_disable is used
> > > > in module kallsyms to prevent taking the module lock b/c kallsyms are
> > > > used in the oops path. That shouldn't be an issue here, is that correct?
> > >
> > > btf_try_get_module calls try_module_get which disables the preemption,
> > > so no need to call it in here
> >
> > It does, but it reenables preemption right away so it is enabled by the
> > time we call find_kallsyms_symbol_value(). I am getting the following
> > lockdep splat while running module_fentry_shadow test from test_progs.
> >
> > [   12.017973][  T488] =============================
> > [   12.018529][  T488] WARNING: suspicious RCU usage
> > [   12.018987][  T488] 6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804 Tainted: G           OE
> > [   12.019898][  T488] -----------------------------
> > [   12.020391][  T488] kernel/module/kallsyms.c:448 suspicious rcu_dereference_check() usage!
> > [   12.021335][  T488]
> > [   12.021335][  T488] other info that might help us debug this:
> > [   12.021335][  T488]
> > [   12.022416][  T488]
> > [   12.022416][  T488] rcu_scheduler_active = 2, debug_locks = 1
> > [   12.023297][  T488] no locks held by test_progs/488.
> > [   12.023854][  T488]
> > [   12.023854][  T488] stack backtrace:
> > [   12.024336][  T488] CPU: 0 PID: 488 Comm: test_progs Tainted: G           OE      6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804
> > [   12.025290][  T488] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.1-2.fc37 04/01/2014
> > [   12.026108][  T488] Call Trace:
> > [   12.026381][  T488]  <TASK>
> > [   12.026649][  T488]  dump_stack_lvl+0xb4/0x110
> > [   12.027060][  T488]  lockdep_rcu_suspicious+0x158/0x1f0
> > [   12.027541][  T488]  find_kallsyms_symbol_value+0xe8/0x110
> > [   12.028028][  T488]  bpf_check_attach_target+0x838/0xa20
> > [   12.028511][  T488]  check_attach_btf_id+0x144/0x3f0
> > [   12.028957][  T488]  ? __pfx_cmp_subprogs+0x10/0x10
> > [   12.029408][  T488]  bpf_check+0xeec/0x1850
> > [   12.029799][  T488]  ? ktime_get_with_offset+0x124/0x1d0
> > [   12.030247][  T488]  bpf_prog_load+0x87a/0xed0
> > [   12.030627][  T488]  ? __lock_release+0x5f/0x160
> > [   12.031010][  T488]  ? __might_fault+0x53/0xb0
> > [   12.031394][  T488]  ? selinux_bpf+0x6c/0xa0
> > [   12.031756][  T488]  __sys_bpf+0x53c/0x1240
> > [   12.032115][  T488]  __x64_sys_bpf+0x27/0x40
> > [   12.032476][  T488]  do_syscall_64+0x3e/0x90
> > [   12.032835][  T488]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
>
>
> hum, for some reason I can't reproduce, but looks like we need to disable
> preemption for find_kallsyms_symbol_value.. could you please check with
> patch below?
>
> also could you please share your .config? not sure why I can't reproduce
>
> thanks,
> jirka
>
>
> ---
> diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
> index ab2376a1be88..bdc911dbcde5 100644
> --- a/kernel/module/kallsyms.c
> +++ b/kernel/module/kallsyms.c
> @@ -442,7 +442,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
>  }
>
>  /* Given a module and name of symbol, find and return the symbol's value */
> -unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
> +static unsigned long __find_kallsyms_symbol_value(struct module *mod, const char *name)
>  {
>         unsigned int i;
>         struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
> @@ -466,7 +466,7 @@ static unsigned long __module_kallsyms_lookup_name(const char *name)
>         if (colon) {
>                 mod = find_module_all(name, colon - name, false);
>                 if (mod)
> -                       return find_kallsyms_symbol_value(mod, colon + 1);
> +                       return __find_kallsyms_symbol_value(mod, colon + 1);
>                 return 0;
>         }
>
> @@ -475,7 +475,7 @@ static unsigned long __module_kallsyms_lookup_name(const char *name)
>
>                 if (mod->state == MODULE_STATE_UNFORMED)
>                         continue;
> -               ret = find_kallsyms_symbol_value(mod, name);
> +               ret = __find_kallsyms_symbol_value(mod, name);
>                 if (ret)
>                         return ret;
>         }
> @@ -494,6 +494,16 @@ unsigned long module_kallsyms_lookup_name(const char *name)
>         return ret;
>  }
>
> +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
> +{
> +       unsigned long ret;
> +
> +       preempt_disable();
> +       ret = __find_kallsyms_symbol_value(mod, name);
> +       preempt_enable();
> +       return ret;
> +}

That doesn't look right.
I think the issue is misuse of rcu_dereference_sched in
find_kallsyms_symbol_value.
Jiri Olsa March 23, 2023, 2 p.m. UTC | #7
On Wed, Mar 22, 2023 at 09:03:46AM -0700, Alexei Starovoitov wrote:
> On Wed, Mar 22, 2023 at 5:14 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> >
> > On Wed, Mar 22, 2023 at 10:49:38AM +0100, Artem Savkov wrote:
> >
> > SNIP
> >
> > > > > Hm, do we even need to preempt_disable? IIUC, preempt_disable is used
> > > > > in module kallsyms to prevent taking the module lock b/c kallsyms are
> > > > > used in the oops path. That shouldn't be an issue here, is that correct?
> > > >
> > > > btf_try_get_module calls try_module_get which disables the preemption,
> > > > so no need to call it in here
> > >
> > > It does, but it reenables preemption right away so it is enabled by the
> > > time we call find_kallsyms_symbol_value(). I am getting the following
> > > lockdep splat while running module_fentry_shadow test from test_progs.
> > >
> > > [   12.017973][  T488] =============================
> > > [   12.018529][  T488] WARNING: suspicious RCU usage
> > > [   12.018987][  T488] 6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804 Tainted: G           OE
> > > [   12.019898][  T488] -----------------------------
> > > [   12.020391][  T488] kernel/module/kallsyms.c:448 suspicious rcu_dereference_check() usage!
> > > [   12.021335][  T488]
> > > [   12.021335][  T488] other info that might help us debug this:
> > > [   12.021335][  T488]
> > > [   12.022416][  T488]
> > > [   12.022416][  T488] rcu_scheduler_active = 2, debug_locks = 1
> > > [   12.023297][  T488] no locks held by test_progs/488.
> > > [   12.023854][  T488]
> > > [   12.023854][  T488] stack backtrace:
> > > [   12.024336][  T488] CPU: 0 PID: 488 Comm: test_progs Tainted: G           OE      6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804
> > > [   12.025290][  T488] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.1-2.fc37 04/01/2014
> > > [   12.026108][  T488] Call Trace:
> > > [   12.026381][  T488]  <TASK>
> > > [   12.026649][  T488]  dump_stack_lvl+0xb4/0x110
> > > [   12.027060][  T488]  lockdep_rcu_suspicious+0x158/0x1f0
> > > [   12.027541][  T488]  find_kallsyms_symbol_value+0xe8/0x110
> > > [   12.028028][  T488]  bpf_check_attach_target+0x838/0xa20
> > > [   12.028511][  T488]  check_attach_btf_id+0x144/0x3f0
> > > [   12.028957][  T488]  ? __pfx_cmp_subprogs+0x10/0x10
> > > [   12.029408][  T488]  bpf_check+0xeec/0x1850
> > > [   12.029799][  T488]  ? ktime_get_with_offset+0x124/0x1d0
> > > [   12.030247][  T488]  bpf_prog_load+0x87a/0xed0
> > > [   12.030627][  T488]  ? __lock_release+0x5f/0x160
> > > [   12.031010][  T488]  ? __might_fault+0x53/0xb0
> > > [   12.031394][  T488]  ? selinux_bpf+0x6c/0xa0
> > > [   12.031756][  T488]  __sys_bpf+0x53c/0x1240
> > > [   12.032115][  T488]  __x64_sys_bpf+0x27/0x40
> > > [   12.032476][  T488]  do_syscall_64+0x3e/0x90
> > > [   12.032835][  T488]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
> >
> >
> > hum, for some reason I can't reproduce, but looks like we need to disable
> > preemption for find_kallsyms_symbol_value.. could you please check with
> > patch below?
> >
> > also could you please share your .config? not sure why I can't reproduce
> >
> > thanks,
> > jirka
> >
> >
> > ---
> > diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
> > index ab2376a1be88..bdc911dbcde5 100644
> > --- a/kernel/module/kallsyms.c
> > +++ b/kernel/module/kallsyms.c
> > @@ -442,7 +442,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
> >  }
> >
> >  /* Given a module and name of symbol, find and return the symbol's value */
> > -unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
> > +static unsigned long __find_kallsyms_symbol_value(struct module *mod, const char *name)
> >  {
> >         unsigned int i;
> >         struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
> > @@ -466,7 +466,7 @@ static unsigned long __module_kallsyms_lookup_name(const char *name)
> >         if (colon) {
> >                 mod = find_module_all(name, colon - name, false);
> >                 if (mod)
> > -                       return find_kallsyms_symbol_value(mod, colon + 1);
> > +                       return __find_kallsyms_symbol_value(mod, colon + 1);
> >                 return 0;
> >         }
> >
> > @@ -475,7 +475,7 @@ static unsigned long __module_kallsyms_lookup_name(const char *name)
> >
> >                 if (mod->state == MODULE_STATE_UNFORMED)
> >                         continue;
> > -               ret = find_kallsyms_symbol_value(mod, name);
> > +               ret = __find_kallsyms_symbol_value(mod, name);
> >                 if (ret)
> >                         return ret;
> >         }
> > @@ -494,6 +494,16 @@ unsigned long module_kallsyms_lookup_name(const char *name)
> >         return ret;
> >  }
> >
> > +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
> > +{
> > +       unsigned long ret;
> > +
> > +       preempt_disable();
> > +       ret = __find_kallsyms_symbol_value(mod, name);
> > +       preempt_enable();
> > +       return ret;
> > +}
> 
> That doesn't look right.
> I think the issue is misuse of rcu_dereference_sched in
> find_kallsyms_symbol_value.

it seems to be using rcu pointer to keep symbols for module init time and
then core symbols for after init.. and switch between them when module is
loaded, hence the strange rcu usage I think

Petr, Zhen, any idea/insight?

thanks,
jirka
Jiri Olsa March 30, 2023, 7:29 a.m. UTC | #8
ping,

Petr, Zhen, any comment on discussion below?

thanks,
jirka

On Thu, Mar 23, 2023 at 03:00:25PM +0100, Jiri Olsa wrote:
> On Wed, Mar 22, 2023 at 09:03:46AM -0700, Alexei Starovoitov wrote:
> > On Wed, Mar 22, 2023 at 5:14 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> > >
> > > On Wed, Mar 22, 2023 at 10:49:38AM +0100, Artem Savkov wrote:
> > >
> > > SNIP
> > >
> > > > > > Hm, do we even need to preempt_disable? IIUC, preempt_disable is used
> > > > > > in module kallsyms to prevent taking the module lock b/c kallsyms are
> > > > > > used in the oops path. That shouldn't be an issue here, is that correct?
> > > > >
> > > > > btf_try_get_module calls try_module_get which disables the preemption,
> > > > > so no need to call it in here
> > > >
> > > > It does, but it reenables preemption right away so it is enabled by the
> > > > time we call find_kallsyms_symbol_value(). I am getting the following
> > > > lockdep splat while running module_fentry_shadow test from test_progs.
> > > >
> > > > [   12.017973][  T488] =============================
> > > > [   12.018529][  T488] WARNING: suspicious RCU usage
> > > > [   12.018987][  T488] 6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804 Tainted: G           OE
> > > > [   12.019898][  T488] -----------------------------
> > > > [   12.020391][  T488] kernel/module/kallsyms.c:448 suspicious rcu_dereference_check() usage!
> > > > [   12.021335][  T488]
> > > > [   12.021335][  T488] other info that might help us debug this:
> > > > [   12.021335][  T488]
> > > > [   12.022416][  T488]
> > > > [   12.022416][  T488] rcu_scheduler_active = 2, debug_locks = 1
> > > > [   12.023297][  T488] no locks held by test_progs/488.
> > > > [   12.023854][  T488]
> > > > [   12.023854][  T488] stack backtrace:
> > > > [   12.024336][  T488] CPU: 0 PID: 488 Comm: test_progs Tainted: G           OE      6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804
> > > > [   12.025290][  T488] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.1-2.fc37 04/01/2014
> > > > [   12.026108][  T488] Call Trace:
> > > > [   12.026381][  T488]  <TASK>
> > > > [   12.026649][  T488]  dump_stack_lvl+0xb4/0x110
> > > > [   12.027060][  T488]  lockdep_rcu_suspicious+0x158/0x1f0
> > > > [   12.027541][  T488]  find_kallsyms_symbol_value+0xe8/0x110
> > > > [   12.028028][  T488]  bpf_check_attach_target+0x838/0xa20
> > > > [   12.028511][  T488]  check_attach_btf_id+0x144/0x3f0
> > > > [   12.028957][  T488]  ? __pfx_cmp_subprogs+0x10/0x10
> > > > [   12.029408][  T488]  bpf_check+0xeec/0x1850
> > > > [   12.029799][  T488]  ? ktime_get_with_offset+0x124/0x1d0
> > > > [   12.030247][  T488]  bpf_prog_load+0x87a/0xed0
> > > > [   12.030627][  T488]  ? __lock_release+0x5f/0x160
> > > > [   12.031010][  T488]  ? __might_fault+0x53/0xb0
> > > > [   12.031394][  T488]  ? selinux_bpf+0x6c/0xa0
> > > > [   12.031756][  T488]  __sys_bpf+0x53c/0x1240
> > > > [   12.032115][  T488]  __x64_sys_bpf+0x27/0x40
> > > > [   12.032476][  T488]  do_syscall_64+0x3e/0x90
> > > > [   12.032835][  T488]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
> > >
> > >
> > > hum, for some reason I can't reproduce, but looks like we need to disable
> > > preemption for find_kallsyms_symbol_value.. could you please check with
> > > patch below?
> > >
> > > also could you please share your .config? not sure why I can't reproduce
> > >
> > > thanks,
> > > jirka
> > >
> > >
> > > ---
> > > diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
> > > index ab2376a1be88..bdc911dbcde5 100644
> > > --- a/kernel/module/kallsyms.c
> > > +++ b/kernel/module/kallsyms.c
> > > @@ -442,7 +442,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
> > >  }
> > >
> > >  /* Given a module and name of symbol, find and return the symbol's value */
> > > -unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
> > > +static unsigned long __find_kallsyms_symbol_value(struct module *mod, const char *name)
> > >  {
> > >         unsigned int i;
> > >         struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
> > > @@ -466,7 +466,7 @@ static unsigned long __module_kallsyms_lookup_name(const char *name)
> > >         if (colon) {
> > >                 mod = find_module_all(name, colon - name, false);
> > >                 if (mod)
> > > -                       return find_kallsyms_symbol_value(mod, colon + 1);
> > > +                       return __find_kallsyms_symbol_value(mod, colon + 1);
> > >                 return 0;
> > >         }
> > >
> > > @@ -475,7 +475,7 @@ static unsigned long __module_kallsyms_lookup_name(const char *name)
> > >
> > >                 if (mod->state == MODULE_STATE_UNFORMED)
> > >                         continue;
> > > -               ret = find_kallsyms_symbol_value(mod, name);
> > > +               ret = __find_kallsyms_symbol_value(mod, name);
> > >                 if (ret)
> > >                         return ret;
> > >         }
> > > @@ -494,6 +494,16 @@ unsigned long module_kallsyms_lookup_name(const char *name)
> > >         return ret;
> > >  }
> > >
> > > +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
> > > +{
> > > +       unsigned long ret;
> > > +
> > > +       preempt_disable();
> > > +       ret = __find_kallsyms_symbol_value(mod, name);
> > > +       preempt_enable();
> > > +       return ret;
> > > +}
> > 
> > That doesn't look right.
> > I think the issue is misuse of rcu_dereference_sched in
> > find_kallsyms_symbol_value.
> 
> it seems to be using rcu pointer to keep symbols for module init time and
> then core symbols for after init.. and switch between them when module is
> loaded, hence the strange rcu usage I think
> 
> Petr, Zhen, any idea/insight?
> 
> thanks,
> jirka
Zhen Lei March 30, 2023, 12:26 p.m. UTC | #9
On 2023/3/30 15:29, Jiri Olsa wrote:
> ping,
> 
> Petr, Zhen, any comment on discussion below?
> 
> thanks,
> jirka
> 
> On Thu, Mar 23, 2023 at 03:00:25PM +0100, Jiri Olsa wrote:
>> On Wed, Mar 22, 2023 at 09:03:46AM -0700, Alexei Starovoitov wrote:
>>> On Wed, Mar 22, 2023 at 5:14 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>>>>
>>>> On Wed, Mar 22, 2023 at 10:49:38AM +0100, Artem Savkov wrote:
>>>>
>>>> SNIP
>>>>
>>>>>>> Hm, do we even need to preempt_disable? IIUC, preempt_disable is used
>>>>>>> in module kallsyms to prevent taking the module lock b/c kallsyms are
>>>>>>> used in the oops path. That shouldn't be an issue here, is that correct?
>>>>>>
>>>>>> btf_try_get_module calls try_module_get which disables the preemption,
>>>>>> so no need to call it in here
>>>>>
>>>>> It does, but it reenables preemption right away so it is enabled by the
>>>>> time we call find_kallsyms_symbol_value(). I am getting the following
>>>>> lockdep splat while running module_fentry_shadow test from test_progs.
>>>>>
>>>>> [   12.017973][  T488] =============================
>>>>> [   12.018529][  T488] WARNING: suspicious RCU usage
>>>>> [   12.018987][  T488] 6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804 Tainted: G           OE
>>>>> [   12.019898][  T488] -----------------------------
>>>>> [   12.020391][  T488] kernel/module/kallsyms.c:448 suspicious rcu_dereference_check() usage!
>>>>> [   12.021335][  T488]
>>>>> [   12.021335][  T488] other info that might help us debug this:
>>>>> [   12.021335][  T488]
>>>>> [   12.022416][  T488]
>>>>> [   12.022416][  T488] rcu_scheduler_active = 2, debug_locks = 1
>>>>> [   12.023297][  T488] no locks held by test_progs/488.
>>>>> [   12.023854][  T488]
>>>>> [   12.023854][  T488] stack backtrace:
>>>>> [   12.024336][  T488] CPU: 0 PID: 488 Comm: test_progs Tainted: G           OE      6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804
>>>>> [   12.025290][  T488] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.1-2.fc37 04/01/2014
>>>>> [   12.026108][  T488] Call Trace:
>>>>> [   12.026381][  T488]  <TASK>
>>>>> [   12.026649][  T488]  dump_stack_lvl+0xb4/0x110
>>>>> [   12.027060][  T488]  lockdep_rcu_suspicious+0x158/0x1f0
>>>>> [   12.027541][  T488]  find_kallsyms_symbol_value+0xe8/0x110
>>>>> [   12.028028][  T488]  bpf_check_attach_target+0x838/0xa20
>>>>> [   12.028511][  T488]  check_attach_btf_id+0x144/0x3f0
>>>>> [   12.028957][  T488]  ? __pfx_cmp_subprogs+0x10/0x10
>>>>> [   12.029408][  T488]  bpf_check+0xeec/0x1850
>>>>> [   12.029799][  T488]  ? ktime_get_with_offset+0x124/0x1d0
>>>>> [   12.030247][  T488]  bpf_prog_load+0x87a/0xed0
>>>>> [   12.030627][  T488]  ? __lock_release+0x5f/0x160
>>>>> [   12.031010][  T488]  ? __might_fault+0x53/0xb0
>>>>> [   12.031394][  T488]  ? selinux_bpf+0x6c/0xa0
>>>>> [   12.031756][  T488]  __sys_bpf+0x53c/0x1240
>>>>> [   12.032115][  T488]  __x64_sys_bpf+0x27/0x40
>>>>> [   12.032476][  T488]  do_syscall_64+0x3e/0x90
>>>>> [   12.032835][  T488]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
>>>>
>>>>
>>>> hum, for some reason I can't reproduce, but looks like we need to disable
>>>> preemption for find_kallsyms_symbol_value.. could you please check with
>>>> patch below?
>>>>
>>>> also could you please share your .config? not sure why I can't reproduce
>>>>
>>>> thanks,
>>>> jirka
>>>>
>>>>
>>>> ---
>>>> diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
>>>> index ab2376a1be88..bdc911dbcde5 100644
>>>> --- a/kernel/module/kallsyms.c
>>>> +++ b/kernel/module/kallsyms.c
>>>> @@ -442,7 +442,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
>>>>  }
>>>>
>>>>  /* Given a module and name of symbol, find and return the symbol's value */
>>>> -unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
>>>> +static unsigned long __find_kallsyms_symbol_value(struct module *mod, const char *name)
>>>>  {
>>>>         unsigned int i;
>>>>         struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
>>>> @@ -466,7 +466,7 @@ static unsigned long __module_kallsyms_lookup_name(const char *name)
>>>>         if (colon) {
>>>>                 mod = find_module_all(name, colon - name, false);
>>>>                 if (mod)
>>>> -                       return find_kallsyms_symbol_value(mod, colon + 1);
>>>> +                       return __find_kallsyms_symbol_value(mod, colon + 1);
>>>>                 return 0;
>>>>         }
>>>>
>>>> @@ -475,7 +475,7 @@ static unsigned long __module_kallsyms_lookup_name(const char *name)
>>>>
>>>>                 if (mod->state == MODULE_STATE_UNFORMED)
>>>>                         continue;
>>>> -               ret = find_kallsyms_symbol_value(mod, name);
>>>> +               ret = __find_kallsyms_symbol_value(mod, name);
>>>>                 if (ret)
>>>>                         return ret;
>>>>         }
>>>> @@ -494,6 +494,16 @@ unsigned long module_kallsyms_lookup_name(const char *name)
>>>>         return ret;
>>>>  }
>>>>
>>>> +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
>>>> +{
>>>> +       unsigned long ret;
>>>> +
>>>> +       preempt_disable();
>>>> +       ret = __find_kallsyms_symbol_value(mod, name);
>>>> +       preempt_enable();
>>>> +       return ret;
>>>> +}
>>>
>>> That doesn't look right.
>>> I think the issue is misuse of rcu_dereference_sched in
>>> find_kallsyms_symbol_value.
>>
>> it seems to be using rcu pointer to keep symbols for module init time and
>> then core symbols for after init.. and switch between them when module is
>> loaded, hence the strange rcu usage I think
>>
>> Petr, Zhen, any idea/insight?

Commit 91fb02f31505 ("module: Move kallsyms support into a separate file") hides
the answer. find_kallsyms_symbol_value() was originally a static function, and it
is only called by module_kallsyms_lookup_name() and is preemptive-protected.

Now that we've added a call to function find_kallsyms_symbol_value(), it seems like
we should do the same thing as function module_kallsyms_lookup_name().

Like this?
+				mod = btf_try_get_module(btf);
+				if (mod) {
+					preempt_disable();
+					addr = find_kallsyms_symbol_value(mod, tname);
+					preempt_enable();
+				} else
+					addr = 0;


>>
>> thanks,
>> jirka
> .
>
Jiri Olsa March 30, 2023, 8:59 p.m. UTC | #10
On Thu, Mar 30, 2023 at 08:26:41PM +0800, Leizhen (ThunderTown) wrote:
> 
> 
> On 2023/3/30 15:29, Jiri Olsa wrote:
> > ping,
> > 
> > Petr, Zhen, any comment on discussion below?
> > 
> > thanks,
> > jirka
> > 
> > On Thu, Mar 23, 2023 at 03:00:25PM +0100, Jiri Olsa wrote:
> >> On Wed, Mar 22, 2023 at 09:03:46AM -0700, Alexei Starovoitov wrote:
> >>> On Wed, Mar 22, 2023 at 5:14 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> >>>>
> >>>> On Wed, Mar 22, 2023 at 10:49:38AM +0100, Artem Savkov wrote:
> >>>>
> >>>> SNIP
> >>>>
> >>>>>>> Hm, do we even need to preempt_disable? IIUC, preempt_disable is used
> >>>>>>> in module kallsyms to prevent taking the module lock b/c kallsyms are
> >>>>>>> used in the oops path. That shouldn't be an issue here, is that correct?
> >>>>>>
> >>>>>> btf_try_get_module calls try_module_get which disables the preemption,
> >>>>>> so no need to call it in here
> >>>>>
> >>>>> It does, but it reenables preemption right away so it is enabled by the
> >>>>> time we call find_kallsyms_symbol_value(). I am getting the following
> >>>>> lockdep splat while running module_fentry_shadow test from test_progs.
> >>>>>
> >>>>> [   12.017973][  T488] =============================
> >>>>> [   12.018529][  T488] WARNING: suspicious RCU usage
> >>>>> [   12.018987][  T488] 6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804 Tainted: G           OE
> >>>>> [   12.019898][  T488] -----------------------------
> >>>>> [   12.020391][  T488] kernel/module/kallsyms.c:448 suspicious rcu_dereference_check() usage!
> >>>>> [   12.021335][  T488]
> >>>>> [   12.021335][  T488] other info that might help us debug this:
> >>>>> [   12.021335][  T488]
> >>>>> [   12.022416][  T488]
> >>>>> [   12.022416][  T488] rcu_scheduler_active = 2, debug_locks = 1
> >>>>> [   12.023297][  T488] no locks held by test_progs/488.
> >>>>> [   12.023854][  T488]
> >>>>> [   12.023854][  T488] stack backtrace:
> >>>>> [   12.024336][  T488] CPU: 0 PID: 488 Comm: test_progs Tainted: G           OE      6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804
> >>>>> [   12.025290][  T488] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.1-2.fc37 04/01/2014
> >>>>> [   12.026108][  T488] Call Trace:
> >>>>> [   12.026381][  T488]  <TASK>
> >>>>> [   12.026649][  T488]  dump_stack_lvl+0xb4/0x110
> >>>>> [   12.027060][  T488]  lockdep_rcu_suspicious+0x158/0x1f0
> >>>>> [   12.027541][  T488]  find_kallsyms_symbol_value+0xe8/0x110
> >>>>> [   12.028028][  T488]  bpf_check_attach_target+0x838/0xa20
> >>>>> [   12.028511][  T488]  check_attach_btf_id+0x144/0x3f0
> >>>>> [   12.028957][  T488]  ? __pfx_cmp_subprogs+0x10/0x10
> >>>>> [   12.029408][  T488]  bpf_check+0xeec/0x1850
> >>>>> [   12.029799][  T488]  ? ktime_get_with_offset+0x124/0x1d0
> >>>>> [   12.030247][  T488]  bpf_prog_load+0x87a/0xed0
> >>>>> [   12.030627][  T488]  ? __lock_release+0x5f/0x160
> >>>>> [   12.031010][  T488]  ? __might_fault+0x53/0xb0
> >>>>> [   12.031394][  T488]  ? selinux_bpf+0x6c/0xa0
> >>>>> [   12.031756][  T488]  __sys_bpf+0x53c/0x1240
> >>>>> [   12.032115][  T488]  __x64_sys_bpf+0x27/0x40
> >>>>> [   12.032476][  T488]  do_syscall_64+0x3e/0x90
> >>>>> [   12.032835][  T488]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
> >>>>
> >>>>
> >>>> hum, for some reason I can't reproduce, but looks like we need to disable
> >>>> preemption for find_kallsyms_symbol_value.. could you please check with
> >>>> patch below?
> >>>>
> >>>> also could you please share your .config? not sure why I can't reproduce
> >>>>
> >>>> thanks,
> >>>> jirka
> >>>>
> >>>>
> >>>> ---
> >>>> diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
> >>>> index ab2376a1be88..bdc911dbcde5 100644
> >>>> --- a/kernel/module/kallsyms.c
> >>>> +++ b/kernel/module/kallsyms.c
> >>>> @@ -442,7 +442,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
> >>>>  }
> >>>>
> >>>>  /* Given a module and name of symbol, find and return the symbol's value */
> >>>> -unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
> >>>> +static unsigned long __find_kallsyms_symbol_value(struct module *mod, const char *name)
> >>>>  {
> >>>>         unsigned int i;
> >>>>         struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
> >>>> @@ -466,7 +466,7 @@ static unsigned long __module_kallsyms_lookup_name(const char *name)
> >>>>         if (colon) {
> >>>>                 mod = find_module_all(name, colon - name, false);
> >>>>                 if (mod)
> >>>> -                       return find_kallsyms_symbol_value(mod, colon + 1);
> >>>> +                       return __find_kallsyms_symbol_value(mod, colon + 1);
> >>>>                 return 0;
> >>>>         }
> >>>>
> >>>> @@ -475,7 +475,7 @@ static unsigned long __module_kallsyms_lookup_name(const char *name)
> >>>>
> >>>>                 if (mod->state == MODULE_STATE_UNFORMED)
> >>>>                         continue;
> >>>> -               ret = find_kallsyms_symbol_value(mod, name);
> >>>> +               ret = __find_kallsyms_symbol_value(mod, name);
> >>>>                 if (ret)
> >>>>                         return ret;
> >>>>         }
> >>>> @@ -494,6 +494,16 @@ unsigned long module_kallsyms_lookup_name(const char *name)
> >>>>         return ret;
> >>>>  }
> >>>>
> >>>> +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
> >>>> +{
> >>>> +       unsigned long ret;
> >>>> +
> >>>> +       preempt_disable();
> >>>> +       ret = __find_kallsyms_symbol_value(mod, name);
> >>>> +       preempt_enable();
> >>>> +       return ret;
> >>>> +}
> >>>
> >>> That doesn't look right.
> >>> I think the issue is misuse of rcu_dereference_sched in
> >>> find_kallsyms_symbol_value.
> >>
> >> it seems to be using rcu pointer to keep symbols for module init time and
> >> then core symbols for after init.. and switch between them when module is
> >> loaded, hence the strange rcu usage I think
> >>
> >> Petr, Zhen, any idea/insight?
> 
> Commit 91fb02f31505 ("module: Move kallsyms support into a separate file") hides
> the answer. find_kallsyms_symbol_value() was originally a static function, and it
> is only called by module_kallsyms_lookup_name() and is preemptive-protected.
> 
> Now that we've added a call to function find_kallsyms_symbol_value(), it seems like
> we should do the same thing as function module_kallsyms_lookup_name().
> 
> Like this?
> +				mod = btf_try_get_module(btf);
> +				if (mod) {
> +					preempt_disable();
> +					addr = find_kallsyms_symbol_value(mod, tname);
> +					preempt_enable();
> +				} else
> +					addr = 0;

yes, that's what I did above, but I was just curious about the strange
RCU usage Alexei commented on earlier:

	>>> +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
	>>> +{
	>>> +       unsigned long ret;
	>>> +
	>>> +       preempt_disable();
	>>> +       ret = __find_kallsyms_symbol_value(mod, name);
	>>> +       preempt_enable();
	>>> +       return ret;
	>>> +}
	>>
	>> That doesn't look right.
	>> I think the issue is misuse of rcu_dereference_sched in
	>> find_kallsyms_symbol_value.
	>
	> it seems to be using rcu pointer to keep symbols for module init time and
	> then core symbols for after init.. and switch between them when module is
	> loaded, hence the strange rcu usage I think
	>
	> Petr, Zhen, any idea/insight?

thanks,
jirka
Petr Mladek March 31, 2023, 8:31 a.m. UTC | #11
On Thu 2023-03-30 22:59:12, Jiri Olsa wrote:
> On Thu, Mar 30, 2023 at 08:26:41PM +0800, Leizhen (ThunderTown) wrote:
> > 
> > 
> > On 2023/3/30 15:29, Jiri Olsa wrote:
> > > ping,
> > > 
> > > Petr, Zhen, any comment on discussion below?
> > > 
> > > thanks,
> > > jirka
> > > 
> > > On Thu, Mar 23, 2023 at 03:00:25PM +0100, Jiri Olsa wrote:
> > >> On Wed, Mar 22, 2023 at 09:03:46AM -0700, Alexei Starovoitov wrote:
> > >>> On Wed, Mar 22, 2023 at 5:14 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> > >>>>
> > >>>> On Wed, Mar 22, 2023 at 10:49:38AM +0100, Artem Savkov wrote:
> > >>>>
> > >>>> SNIP
> > >>>>
> > >>>>>>> Hm, do we even need to preempt_disable? IIUC, preempt_disable is used
> > >>>>>>> in module kallsyms to prevent taking the module lock b/c kallsyms are
> > >>>>>>> used in the oops path. That shouldn't be an issue here, is that correct?
> > >>>>>>
> > >>>>>> btf_try_get_module calls try_module_get which disables the preemption,
> > >>>>>> so no need to call it in here
> > >>>>>
> > >>>>> It does, but it reenables preemption right away so it is enabled by the
> > >>>>> time we call find_kallsyms_symbol_value(). I am getting the following
> > >>>>> lockdep splat while running module_fentry_shadow test from test_progs.
> > >>>>>
> > >>>>> [   12.017973][  T488] =============================
> > >>>>> [   12.018529][  T488] WARNING: suspicious RCU usage
> > >>>>> [   12.018987][  T488] 6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804 Tainted: G           OE
> > >>>>> [   12.019898][  T488] -----------------------------
> > >>>>> [   12.020391][  T488] kernel/module/kallsyms.c:448 suspicious rcu_dereference_check() usage!
> > >>>>> [   12.021335][  T488]
> > >>>>> [   12.021335][  T488] other info that might help us debug this:
> > >>>>> [   12.021335][  T488]
> > >>>>> [   12.022416][  T488]
> > >>>>> [   12.022416][  T488] rcu_scheduler_active = 2, debug_locks = 1
> > >>>>> [   12.023297][  T488] no locks held by test_progs/488.
> > >>>>> [   12.023854][  T488]
> > >>>>> [   12.023854][  T488] stack backtrace:
> > >>>>> [   12.024336][  T488] CPU: 0 PID: 488 Comm: test_progs Tainted: G           OE      6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804
> > >>>>> [   12.025290][  T488] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.1-2.fc37 04/01/2014
> > >>>>> [   12.026108][  T488] Call Trace:
> > >>>>> [   12.026381][  T488]  <TASK>
> > >>>>> [   12.026649][  T488]  dump_stack_lvl+0xb4/0x110
> > >>>>> [   12.027060][  T488]  lockdep_rcu_suspicious+0x158/0x1f0
> > >>>>> [   12.027541][  T488]  find_kallsyms_symbol_value+0xe8/0x110
> > >>>>> [   12.028028][  T488]  bpf_check_attach_target+0x838/0xa20
> > >>>>> [   12.028511][  T488]  check_attach_btf_id+0x144/0x3f0
> > >>>>> [   12.028957][  T488]  ? __pfx_cmp_subprogs+0x10/0x10
> > >>>>> [   12.029408][  T488]  bpf_check+0xeec/0x1850
> > >>>>> [   12.029799][  T488]  ? ktime_get_with_offset+0x124/0x1d0
> > >>>>> [   12.030247][  T488]  bpf_prog_load+0x87a/0xed0
> > >>>>> [   12.030627][  T488]  ? __lock_release+0x5f/0x160
> > >>>>> [   12.031010][  T488]  ? __might_fault+0x53/0xb0
> > >>>>> [   12.031394][  T488]  ? selinux_bpf+0x6c/0xa0
> > >>>>> [   12.031756][  T488]  __sys_bpf+0x53c/0x1240
> > >>>>> [   12.032115][  T488]  __x64_sys_bpf+0x27/0x40
> > >>>>> [   12.032476][  T488]  do_syscall_64+0x3e/0x90
> > >>>>> [   12.032835][  T488]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
> > >>>>
> > >>>> --- a/kernel/module/kallsyms.c
> > >>>> +++ b/kernel/module/kallsyms.c
> > Commit 91fb02f31505 ("module: Move kallsyms support into a separate file") hides
> > the answer. find_kallsyms_symbol_value() was originally a static function, and it
> > is only called by module_kallsyms_lookup_name() and is preemptive-protected.
> > 
> > Now that we've added a call to function find_kallsyms_symbol_value(), it seems like
> > we should do the same thing as function module_kallsyms_lookup_name().
> > 
> > Like this?
> > +				mod = btf_try_get_module(btf);
> > +				if (mod) {
> > +					preempt_disable();
> > +					addr = find_kallsyms_symbol_value(mod, tname);
> > +					preempt_enable();
> > +				} else
> > +					addr = 0;
> 
> yes, that's what I did above, but I was just curious about the strange
> RCU usage Alexei commented on earlier:
> 
> 	>>> +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
> 	>>> +{
> 	>>> +       unsigned long ret;
> 	>>> +
> 	>>> +       preempt_disable();
> 	>>> +       ret = __find_kallsyms_symbol_value(mod, name);
> 	>>> +       preempt_enable();
> 	>>> +       return ret;
> 	>>> +}
> 	>>
> 	>> That doesn't look right.
> 	>> I think the issue is misuse of rcu_dereference_sched in
> 	>> find_kallsyms_symbol_value.
> 	>
> 	> it seems to be using rcu pointer to keep symbols for module init time and
> 	> then core symbols for after init.. and switch between them when module is
> 	> loaded, hence the strange rcu usage I think

My understanding is that rcu is needed to prevent module from being freed.
It should be related to:

static void free_module(struct module *mod)
{
[...]
	/* Now we can delete it from the lists */
	mutex_lock(&module_mutex);
	/* Unlink carefully: kallsyms could be walking list. */
	list_del_rcu(&mod->list);
[...]
}

I am sorry for the late reply. I was busy and I thought that it was
related to the refactoring. I hoped that peopled doing the refactoring
would answer.

Best Regards,
Petr
Zhen Lei March 31, 2023, 9:15 a.m. UTC | #12
On 2023/3/31 16:31, Petr Mladek wrote:
> On Thu 2023-03-30 22:59:12, Jiri Olsa wrote:
>> On Thu, Mar 30, 2023 at 08:26:41PM +0800, Leizhen (ThunderTown) wrote:
>>>
>>>
>>> On 2023/3/30 15:29, Jiri Olsa wrote:
>>>> ping,
>>>>
>>>> Petr, Zhen, any comment on discussion below?
>>>>
>>>> thanks,
>>>> jirka
>>>>
>>>> On Thu, Mar 23, 2023 at 03:00:25PM +0100, Jiri Olsa wrote:
>>>>> On Wed, Mar 22, 2023 at 09:03:46AM -0700, Alexei Starovoitov wrote:
>>>>>> On Wed, Mar 22, 2023 at 5:14 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>>>>>>>
>>>>>>> On Wed, Mar 22, 2023 at 10:49:38AM +0100, Artem Savkov wrote:
>>>>>>>
>>>>>>> SNIP
>>>>>>>
>>>>>>>>>> Hm, do we even need to preempt_disable? IIUC, preempt_disable is used
>>>>>>>>>> in module kallsyms to prevent taking the module lock b/c kallsyms are
>>>>>>>>>> used in the oops path. That shouldn't be an issue here, is that correct?
>>>>>>>>>
>>>>>>>>> btf_try_get_module calls try_module_get which disables the preemption,
>>>>>>>>> so no need to call it in here
>>>>>>>>
>>>>>>>> It does, but it reenables preemption right away so it is enabled by the
>>>>>>>> time we call find_kallsyms_symbol_value(). I am getting the following
>>>>>>>> lockdep splat while running module_fentry_shadow test from test_progs.
>>>>>>>>
>>>>>>>> [   12.017973][  T488] =============================
>>>>>>>> [   12.018529][  T488] WARNING: suspicious RCU usage
>>>>>>>> [   12.018987][  T488] 6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804 Tainted: G           OE
>>>>>>>> [   12.019898][  T488] -----------------------------
>>>>>>>> [   12.020391][  T488] kernel/module/kallsyms.c:448 suspicious rcu_dereference_check() usage!
>>>>>>>> [   12.021335][  T488]
>>>>>>>> [   12.021335][  T488] other info that might help us debug this:
>>>>>>>> [   12.021335][  T488]
>>>>>>>> [   12.022416][  T488]
>>>>>>>> [   12.022416][  T488] rcu_scheduler_active = 2, debug_locks = 1
>>>>>>>> [   12.023297][  T488] no locks held by test_progs/488.
>>>>>>>> [   12.023854][  T488]
>>>>>>>> [   12.023854][  T488] stack backtrace:
>>>>>>>> [   12.024336][  T488] CPU: 0 PID: 488 Comm: test_progs Tainted: G           OE      6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804
>>>>>>>> [   12.025290][  T488] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.1-2.fc37 04/01/2014
>>>>>>>> [   12.026108][  T488] Call Trace:
>>>>>>>> [   12.026381][  T488]  <TASK>
>>>>>>>> [   12.026649][  T488]  dump_stack_lvl+0xb4/0x110
>>>>>>>> [   12.027060][  T488]  lockdep_rcu_suspicious+0x158/0x1f0
>>>>>>>> [   12.027541][  T488]  find_kallsyms_symbol_value+0xe8/0x110
>>>>>>>> [   12.028028][  T488]  bpf_check_attach_target+0x838/0xa20
>>>>>>>> [   12.028511][  T488]  check_attach_btf_id+0x144/0x3f0
>>>>>>>> [   12.028957][  T488]  ? __pfx_cmp_subprogs+0x10/0x10
>>>>>>>> [   12.029408][  T488]  bpf_check+0xeec/0x1850
>>>>>>>> [   12.029799][  T488]  ? ktime_get_with_offset+0x124/0x1d0
>>>>>>>> [   12.030247][  T488]  bpf_prog_load+0x87a/0xed0
>>>>>>>> [   12.030627][  T488]  ? __lock_release+0x5f/0x160
>>>>>>>> [   12.031010][  T488]  ? __might_fault+0x53/0xb0
>>>>>>>> [   12.031394][  T488]  ? selinux_bpf+0x6c/0xa0
>>>>>>>> [   12.031756][  T488]  __sys_bpf+0x53c/0x1240
>>>>>>>> [   12.032115][  T488]  __x64_sys_bpf+0x27/0x40
>>>>>>>> [   12.032476][  T488]  do_syscall_64+0x3e/0x90
>>>>>>>> [   12.032835][  T488]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
>>>>>>>
>>>>>>> --- a/kernel/module/kallsyms.c
>>>>>>> +++ b/kernel/module/kallsyms.c
>>> Commit 91fb02f31505 ("module: Move kallsyms support into a separate file") hides
>>> the answer. find_kallsyms_symbol_value() was originally a static function, and it
>>> is only called by module_kallsyms_lookup_name() and is preemptive-protected.
>>>
>>> Now that we've added a call to function find_kallsyms_symbol_value(), it seems like
>>> we should do the same thing as function module_kallsyms_lookup_name().
>>>
>>> Like this?
>>> +				mod = btf_try_get_module(btf);
>>> +				if (mod) {
>>> +					preempt_disable();
>>> +					addr = find_kallsyms_symbol_value(mod, tname);
>>> +					preempt_enable();
>>> +				} else
>>> +					addr = 0;
>>
>> yes, that's what I did above, but I was just curious about the strange
>> RCU usage Alexei commented on earlier:
>>
>> 	>>> +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
>> 	>>> +{
>> 	>>> +       unsigned long ret;
>> 	>>> +
>> 	>>> +       preempt_disable();
>> 	>>> +       ret = __find_kallsyms_symbol_value(mod, name);
>> 	>>> +       preempt_enable();
>> 	>>> +       return ret;
>> 	>>> +}
>> 	>>
>> 	>> That doesn't look right.
>> 	>> I think the issue is misuse of rcu_dereference_sched in
>> 	>> find_kallsyms_symbol_value.
>> 	>
>> 	> it seems to be using rcu pointer to keep symbols for module init time and
>> 	> then core symbols for after init.. and switch between them when module is
>> 	> loaded, hence the strange rcu usage I think

load_module
	post_relocation
		add_kallsyms
			mod->kallsyms = (void __rcu *)mod->init_layout.base + info->mod_kallsyms_init_off;   (1)
	do_init_module
		freeinit->module_init = mod->init_layout.base;
		rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);                                      (2)
		if (llist_add(&freeinit->node, &init_free_list))
			schedule_work(&init_free_wq);

do_free_init
	synchronize_rcu();
	module_memfree(initfree->module_init);

IIUC, the RCU can help synchronize_rcu() in do_free_init() to make sure that no one
is still using the first mod->kallsyms (1). If find_kallsyms_symbol_value() is executed
between (1) and (2).

> 
> My understanding is that rcu is needed to prevent module from being freed.
> It should be related to:
> 
> static void free_module(struct module *mod)
> {
> [...]
> 	/* Now we can delete it from the lists */
> 	mutex_lock(&module_mutex);
> 	/* Unlink carefully: kallsyms could be walking list. */
> 	list_del_rcu(&mod->list);
> [...]
> }
> 
> I am sorry for the late reply. I was busy and I thought that it was
> related to the refactoring. I hoped that peopled doing the refactoring
> would answer.
> 
> Best Regards,
> Petr
> .
>
Petr Mladek March 31, 2023, 11:08 a.m. UTC | #13
On Fri 2023-03-31 17:15:56, Leizhen (ThunderTown) wrote:
> 
> 
> On 2023/3/31 16:31, Petr Mladek wrote:
> > On Thu 2023-03-30 22:59:12, Jiri Olsa wrote:
> >> On Thu, Mar 30, 2023 at 08:26:41PM +0800, Leizhen (ThunderTown) wrote:
> >>>
> >>>
> >>> On 2023/3/30 15:29, Jiri Olsa wrote:
> >>>> ping,
> >>>>
> >>>> Petr, Zhen, any comment on discussion below?
> >>>>
> >>>> thanks,
> >>>> jirka
> >>>>
> >>>> On Thu, Mar 23, 2023 at 03:00:25PM +0100, Jiri Olsa wrote:
> >>>>> On Wed, Mar 22, 2023 at 09:03:46AM -0700, Alexei Starovoitov wrote:
> >>>>>> On Wed, Mar 22, 2023 at 5:14 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> >>>>>>>
> >>>>>>> On Wed, Mar 22, 2023 at 10:49:38AM +0100, Artem Savkov wrote:
> >>>>>>>
> >>>>>>> SNIP
> >>>>>>>
> >>>>>>>>>> Hm, do we even need to preempt_disable? IIUC, preempt_disable is used
> >>>>>>>>>> in module kallsyms to prevent taking the module lock b/c kallsyms are
> >>>>>>>>>> used in the oops path. That shouldn't be an issue here, is that correct?
> >>>>>>>>>
> >>>>>>>>> btf_try_get_module calls try_module_get which disables the preemption,
> >>>>>>>>> so no need to call it in here
> >>>>>>>>
> >>>>>>>> It does, but it reenables preemption right away so it is enabled by the
> >>>>>>>> time we call find_kallsyms_symbol_value(). I am getting the following
> >>>>>>>> lockdep splat while running module_fentry_shadow test from test_progs.
> >>>>>>>>
> >>>>>>>> [   12.017973][  T488] =============================
> >>>>>>>> [   12.018529][  T488] WARNING: suspicious RCU usage
> >>>>>>>> [   12.018987][  T488] 6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804 Tainted: G           OE
> >>>>>>>> [   12.019898][  T488] -----------------------------
> >>>>>>>> [   12.020391][  T488] kernel/module/kallsyms.c:448 suspicious rcu_dereference_check() usage!
> >>>>>>>> [   12.021335][  T488]
> >>>>>>>> [   12.021335][  T488] other info that might help us debug this:
> >>>>>>>> [   12.021335][  T488]
> >>>>>>>> [   12.022416][  T488]
> >>>>>>>> [   12.022416][  T488] rcu_scheduler_active = 2, debug_locks = 1
> >>>>>>>> [   12.023297][  T488] no locks held by test_progs/488.
> >>>>>>>> [   12.023854][  T488]
> >>>>>>>> [   12.023854][  T488] stack backtrace:
> >>>>>>>> [   12.024336][  T488] CPU: 0 PID: 488 Comm: test_progs Tainted: G           OE      6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804
> >>>>>>>> [   12.025290][  T488] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.1-2.fc37 04/01/2014
> >>>>>>>> [   12.026108][  T488] Call Trace:
> >>>>>>>> [   12.026381][  T488]  <TASK>
> >>>>>>>> [   12.026649][  T488]  dump_stack_lvl+0xb4/0x110
> >>>>>>>> [   12.027060][  T488]  lockdep_rcu_suspicious+0x158/0x1f0
> >>>>>>>> [   12.027541][  T488]  find_kallsyms_symbol_value+0xe8/0x110
> >>>>>>>> [   12.028028][  T488]  bpf_check_attach_target+0x838/0xa20
> >>>>>>>> [   12.028511][  T488]  check_attach_btf_id+0x144/0x3f0
> >>>>>>>> [   12.028957][  T488]  ? __pfx_cmp_subprogs+0x10/0x10
> >>>>>>>> [   12.029408][  T488]  bpf_check+0xeec/0x1850
> >>>>>>>> [   12.029799][  T488]  ? ktime_get_with_offset+0x124/0x1d0
> >>>>>>>> [   12.030247][  T488]  bpf_prog_load+0x87a/0xed0
> >>>>>>>> [   12.030627][  T488]  ? __lock_release+0x5f/0x160
> >>>>>>>> [   12.031010][  T488]  ? __might_fault+0x53/0xb0
> >>>>>>>> [   12.031394][  T488]  ? selinux_bpf+0x6c/0xa0
> >>>>>>>> [   12.031756][  T488]  __sys_bpf+0x53c/0x1240
> >>>>>>>> [   12.032115][  T488]  __x64_sys_bpf+0x27/0x40
> >>>>>>>> [   12.032476][  T488]  do_syscall_64+0x3e/0x90
> >>>>>>>> [   12.032835][  T488]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
> >>>>>>>
> >>>>>>> --- a/kernel/module/kallsyms.c
> >>>>>>> +++ b/kernel/module/kallsyms.c
> >>> Commit 91fb02f31505 ("module: Move kallsyms support into a separate file") hides
> >>> the answer. find_kallsyms_symbol_value() was originally a static function, and it
> >>> is only called by module_kallsyms_lookup_name() and is preemptive-protected.
> >>>
> >>> Now that we've added a call to function find_kallsyms_symbol_value(), it seems like
> >>> we should do the same thing as function module_kallsyms_lookup_name().
> >>>
> >>> Like this?
> >>> +				mod = btf_try_get_module(btf);
> >>> +				if (mod) {
> >>> +					preempt_disable();
> >>> +					addr = find_kallsyms_symbol_value(mod, tname);
> >>> +					preempt_enable();
> >>> +				} else
> >>> +					addr = 0;
> >>
> >> yes, that's what I did above, but I was just curious about the strange
> >> RCU usage Alexei commented on earlier:
> >>
> >> 	>>> +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
> >> 	>>> +{
> >> 	>>> +       unsigned long ret;
> >> 	>>> +
> >> 	>>> +       preempt_disable();
> >> 	>>> +       ret = __find_kallsyms_symbol_value(mod, name);
> >> 	>>> +       preempt_enable();
> >> 	>>> +       return ret;
> >> 	>>> +}
> >> 	>>
> >> 	>> That doesn't look right.
> >> 	>> I think the issue is misuse of rcu_dereference_sched in
> >> 	>> find_kallsyms_symbol_value.
> >> 	>
> >> 	> it seems to be using rcu pointer to keep symbols for module init time and
> >> 	> then core symbols for after init.. and switch between them when module is
> >> 	> loaded, hence the strange rcu usage I think
> 
> load_module
> 	post_relocation
> 		add_kallsyms
> 			mod->kallsyms = (void __rcu *)mod->init_layout.base + info->mod_kallsyms_init_off;   (1)
> 	do_init_module
> 		freeinit->module_init = mod->init_layout.base;
> 		rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);                                      (2)
> 		if (llist_add(&freeinit->node, &init_free_list))
> 			schedule_work(&init_free_wq);
> 
> do_free_init
> 	synchronize_rcu();
> 	module_memfree(initfree->module_init);
> 
> IIUC, the RCU can help synchronize_rcu() in do_free_init() to make sure that no one
> is still using the first mod->kallsyms (1). If find_kallsyms_symbol_value() is executed
> between (1) and (2).

Yes, this seems to be another scenario where the RCU synchronization/access
is needed.

Best Regards,
Petr
Jiri Olsa March 31, 2023, 9:25 p.m. UTC | #14
On Fri, Mar 31, 2023 at 01:08:42PM +0200, Petr Mladek wrote:
> On Fri 2023-03-31 17:15:56, Leizhen (ThunderTown) wrote:
> > 
> > 
> > On 2023/3/31 16:31, Petr Mladek wrote:
> > > On Thu 2023-03-30 22:59:12, Jiri Olsa wrote:
> > >> On Thu, Mar 30, 2023 at 08:26:41PM +0800, Leizhen (ThunderTown) wrote:
> > >>>
> > >>>
> > >>> On 2023/3/30 15:29, Jiri Olsa wrote:
> > >>>> ping,
> > >>>>
> > >>>> Petr, Zhen, any comment on discussion below?
> > >>>>
> > >>>> thanks,
> > >>>> jirka
> > >>>>
> > >>>> On Thu, Mar 23, 2023 at 03:00:25PM +0100, Jiri Olsa wrote:
> > >>>>> On Wed, Mar 22, 2023 at 09:03:46AM -0700, Alexei Starovoitov wrote:
> > >>>>>> On Wed, Mar 22, 2023 at 5:14 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> > >>>>>>>
> > >>>>>>> On Wed, Mar 22, 2023 at 10:49:38AM +0100, Artem Savkov wrote:
> > >>>>>>>
> > >>>>>>> SNIP
> > >>>>>>>
> > >>>>>>>>>> Hm, do we even need to preempt_disable? IIUC, preempt_disable is used
> > >>>>>>>>>> in module kallsyms to prevent taking the module lock b/c kallsyms are
> > >>>>>>>>>> used in the oops path. That shouldn't be an issue here, is that correct?
> > >>>>>>>>>
> > >>>>>>>>> btf_try_get_module calls try_module_get which disables the preemption,
> > >>>>>>>>> so no need to call it in here
> > >>>>>>>>
> > >>>>>>>> It does, but it reenables preemption right away so it is enabled by the
> > >>>>>>>> time we call find_kallsyms_symbol_value(). I am getting the following
> > >>>>>>>> lockdep splat while running module_fentry_shadow test from test_progs.
> > >>>>>>>>
> > >>>>>>>> [   12.017973][  T488] =============================
> > >>>>>>>> [   12.018529][  T488] WARNING: suspicious RCU usage
> > >>>>>>>> [   12.018987][  T488] 6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804 Tainted: G           OE
> > >>>>>>>> [   12.019898][  T488] -----------------------------
> > >>>>>>>> [   12.020391][  T488] kernel/module/kallsyms.c:448 suspicious rcu_dereference_check() usage!
> > >>>>>>>> [   12.021335][  T488]
> > >>>>>>>> [   12.021335][  T488] other info that might help us debug this:
> > >>>>>>>> [   12.021335][  T488]
> > >>>>>>>> [   12.022416][  T488]
> > >>>>>>>> [   12.022416][  T488] rcu_scheduler_active = 2, debug_locks = 1
> > >>>>>>>> [   12.023297][  T488] no locks held by test_progs/488.
> > >>>>>>>> [   12.023854][  T488]
> > >>>>>>>> [   12.023854][  T488] stack backtrace:
> > >>>>>>>> [   12.024336][  T488] CPU: 0 PID: 488 Comm: test_progs Tainted: G           OE      6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804
> > >>>>>>>> [   12.025290][  T488] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.1-2.fc37 04/01/2014
> > >>>>>>>> [   12.026108][  T488] Call Trace:
> > >>>>>>>> [   12.026381][  T488]  <TASK>
> > >>>>>>>> [   12.026649][  T488]  dump_stack_lvl+0xb4/0x110
> > >>>>>>>> [   12.027060][  T488]  lockdep_rcu_suspicious+0x158/0x1f0
> > >>>>>>>> [   12.027541][  T488]  find_kallsyms_symbol_value+0xe8/0x110
> > >>>>>>>> [   12.028028][  T488]  bpf_check_attach_target+0x838/0xa20
> > >>>>>>>> [   12.028511][  T488]  check_attach_btf_id+0x144/0x3f0
> > >>>>>>>> [   12.028957][  T488]  ? __pfx_cmp_subprogs+0x10/0x10
> > >>>>>>>> [   12.029408][  T488]  bpf_check+0xeec/0x1850
> > >>>>>>>> [   12.029799][  T488]  ? ktime_get_with_offset+0x124/0x1d0
> > >>>>>>>> [   12.030247][  T488]  bpf_prog_load+0x87a/0xed0
> > >>>>>>>> [   12.030627][  T488]  ? __lock_release+0x5f/0x160
> > >>>>>>>> [   12.031010][  T488]  ? __might_fault+0x53/0xb0
> > >>>>>>>> [   12.031394][  T488]  ? selinux_bpf+0x6c/0xa0
> > >>>>>>>> [   12.031756][  T488]  __sys_bpf+0x53c/0x1240
> > >>>>>>>> [   12.032115][  T488]  __x64_sys_bpf+0x27/0x40
> > >>>>>>>> [   12.032476][  T488]  do_syscall_64+0x3e/0x90
> > >>>>>>>> [   12.032835][  T488]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
> > >>>>>>>
> > >>>>>>> --- a/kernel/module/kallsyms.c
> > >>>>>>> +++ b/kernel/module/kallsyms.c
> > >>> Commit 91fb02f31505 ("module: Move kallsyms support into a separate file") hides
> > >>> the answer. find_kallsyms_symbol_value() was originally a static function, and it
> > >>> is only called by module_kallsyms_lookup_name() and is preemptive-protected.
> > >>>
> > >>> Now that we've added a call to function find_kallsyms_symbol_value(), it seems like
> > >>> we should do the same thing as function module_kallsyms_lookup_name().
> > >>>
> > >>> Like this?
> > >>> +				mod = btf_try_get_module(btf);
> > >>> +				if (mod) {
> > >>> +					preempt_disable();
> > >>> +					addr = find_kallsyms_symbol_value(mod, tname);
> > >>> +					preempt_enable();
> > >>> +				} else
> > >>> +					addr = 0;
> > >>
> > >> yes, that's what I did above, but I was just curious about the strange
> > >> RCU usage Alexei commented on earlier:
> > >>
> > >> 	>>> +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
> > >> 	>>> +{
> > >> 	>>> +       unsigned long ret;
> > >> 	>>> +
> > >> 	>>> +       preempt_disable();
> > >> 	>>> +       ret = __find_kallsyms_symbol_value(mod, name);
> > >> 	>>> +       preempt_enable();
> > >> 	>>> +       return ret;
> > >> 	>>> +}
> > >> 	>>
> > >> 	>> That doesn't look right.
> > >> 	>> I think the issue is misuse of rcu_dereference_sched in
> > >> 	>> find_kallsyms_symbol_value.
> > >> 	>
> > >> 	> it seems to be using rcu pointer to keep symbols for module init time and
> > >> 	> then core symbols for after init.. and switch between them when module is
> > >> 	> loaded, hence the strange rcu usage I think
> > 
> > load_module
> > 	post_relocation
> > 		add_kallsyms
> > 			mod->kallsyms = (void __rcu *)mod->init_layout.base + info->mod_kallsyms_init_off;   (1)
> > 	do_init_module
> > 		freeinit->module_init = mod->init_layout.base;
> > 		rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);                                      (2)
> > 		if (llist_add(&freeinit->node, &init_free_list))
> > 			schedule_work(&init_free_wq);
> > 
> > do_free_init
> > 	synchronize_rcu();
> > 	module_memfree(initfree->module_init);
> > 
> > IIUC, the RCU can help synchronize_rcu() in do_free_init() to make sure that no one
> > is still using the first mod->kallsyms (1). If find_kallsyms_symbol_value() is executed
> > between (1) and (2).
> 
> Yes, this seems to be another scenario where the RCU synchronization/access
> is needed.

thanks for the details

still curious.. confusing part for me is the use of rcu_dereference in
add_kallsyms IIUC there's no need for that because mod->kallsyms is not
exposed at that time? we could do without it like in patch below?

thanks,
jirka


---
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
index bdc911dbcde5..bc1e748a1357 100644
--- a/kernel/module/kallsyms.c
+++ b/kernel/module/kallsyms.c
@@ -170,20 +170,18 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
 	Elf_Sym *dst;
 	char *s;
 	Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
+	struct mod_kallsyms *kallsyms;
 	unsigned long strtab_size;
 
-	/* Set up to point into init section. */
-	mod->kallsyms = (void __rcu *)mod->init_layout.base +
-		info->mod_kallsyms_init_off;
+	kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off;
 
-	rcu_read_lock();
 	/* The following is safe since this pointer cannot change */
-	rcu_dereference(mod->kallsyms)->symtab = (void *)symsec->sh_addr;
-	rcu_dereference(mod->kallsyms)->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
+	kallsyms->symtab = (void *)symsec->sh_addr;
+	kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
 	/* Make sure we get permanent strtab: don't use info->strtab. */
-	rcu_dereference(mod->kallsyms)->strtab =
+	kallsyms->strtab =
 		(void *)info->sechdrs[info->index.str].sh_addr;
-	rcu_dereference(mod->kallsyms)->typetab = mod->init_layout.base + info->init_typeoffs;
+	kallsyms->typetab = mod->init_layout.base + info->init_typeoffs;
 
 	/*
 	 * Now populate the cut down core kallsyms for after init
@@ -193,20 +191,20 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
 	mod->core_kallsyms.strtab = s = mod->data_layout.base + info->stroffs;
 	mod->core_kallsyms.typetab = mod->data_layout.base + info->core_typeoffs;
 	strtab_size = info->core_typeoffs - info->stroffs;
-	src = rcu_dereference(mod->kallsyms)->symtab;
-	for (ndst = i = 0; i < rcu_dereference(mod->kallsyms)->num_symtab; i++) {
-		rcu_dereference(mod->kallsyms)->typetab[i] = elf_type(src + i, info);
+	src = kallsyms->symtab;
+	for (ndst = i = 0; i < kallsyms->num_symtab; i++) {
+		kallsyms->typetab[i] = elf_type(src + i, info);
 		if (i == 0 || is_livepatch_module(mod) ||
 		    is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum,
 				   info->index.pcpu)) {
 			ssize_t ret;
 
 			mod->core_kallsyms.typetab[ndst] =
-			    rcu_dereference(mod->kallsyms)->typetab[i];
+			    kallsyms->typetab[i];
 			dst[ndst] = src[i];
 			dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
 			ret = strscpy(s,
-				      &rcu_dereference(mod->kallsyms)->strtab[src[i].st_name],
+				      &kallsyms->strtab[src[i].st_name],
 				      strtab_size);
 			if (ret < 0)
 				break;
@@ -214,8 +212,10 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
 			strtab_size -= ret + 1;
 		}
 	}
-	rcu_read_unlock();
 	mod->core_kallsyms.num_symtab = ndst;
+
+	/* Set up to point into init section. */
+	rcu_assign_pointer(mod->kallsyms, kallsyms);
 }
 
 #if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
Zhen Lei April 3, 2023, 1:46 a.m. UTC | #15
On 2023/4/1 5:25, Jiri Olsa wrote:
> On Fri, Mar 31, 2023 at 01:08:42PM +0200, Petr Mladek wrote:
>> On Fri 2023-03-31 17:15:56, Leizhen (ThunderTown) wrote:
>>>
>>>
>>> On 2023/3/31 16:31, Petr Mladek wrote:
>>>> On Thu 2023-03-30 22:59:12, Jiri Olsa wrote:
>>>>> On Thu, Mar 30, 2023 at 08:26:41PM +0800, Leizhen (ThunderTown) wrote:
>>>>>>
>>>>>>
>>>>>> On 2023/3/30 15:29, Jiri Olsa wrote:
>>>>>>> ping,
>>>>>>>
>>>>>>> Petr, Zhen, any comment on discussion below?
>>>>>>>
>>>>>>> thanks,
>>>>>>> jirka
>>>>>>>
>>>>>>> On Thu, Mar 23, 2023 at 03:00:25PM +0100, Jiri Olsa wrote:
>>>>>>>> On Wed, Mar 22, 2023 at 09:03:46AM -0700, Alexei Starovoitov wrote:
>>>>>>>>> On Wed, Mar 22, 2023 at 5:14 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>>>>>>>>>>
>>>>>>>>>> On Wed, Mar 22, 2023 at 10:49:38AM +0100, Artem Savkov wrote:
>>>>>>>>>>
>>>>>>>>>> SNIP
>>>>>>>>>>
>>>>>>>>>>>>> Hm, do we even need to preempt_disable? IIUC, preempt_disable is used
>>>>>>>>>>>>> in module kallsyms to prevent taking the module lock b/c kallsyms are
>>>>>>>>>>>>> used in the oops path. That shouldn't be an issue here, is that correct?
>>>>>>>>>>>>
>>>>>>>>>>>> btf_try_get_module calls try_module_get which disables the preemption,
>>>>>>>>>>>> so no need to call it in here
>>>>>>>>>>>
>>>>>>>>>>> It does, but it reenables preemption right away so it is enabled by the
>>>>>>>>>>> time we call find_kallsyms_symbol_value(). I am getting the following
>>>>>>>>>>> lockdep splat while running module_fentry_shadow test from test_progs.
>>>>>>>>>>>
>>>>>>>>>>> [   12.017973][  T488] =============================
>>>>>>>>>>> [   12.018529][  T488] WARNING: suspicious RCU usage
>>>>>>>>>>> [   12.018987][  T488] 6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804 Tainted: G           OE
>>>>>>>>>>> [   12.019898][  T488] -----------------------------
>>>>>>>>>>> [   12.020391][  T488] kernel/module/kallsyms.c:448 suspicious rcu_dereference_check() usage!
>>>>>>>>>>> [   12.021335][  T488]
>>>>>>>>>>> [   12.021335][  T488] other info that might help us debug this:
>>>>>>>>>>> [   12.021335][  T488]
>>>>>>>>>>> [   12.022416][  T488]
>>>>>>>>>>> [   12.022416][  T488] rcu_scheduler_active = 2, debug_locks = 1
>>>>>>>>>>> [   12.023297][  T488] no locks held by test_progs/488.
>>>>>>>>>>> [   12.023854][  T488]
>>>>>>>>>>> [   12.023854][  T488] stack backtrace:
>>>>>>>>>>> [   12.024336][  T488] CPU: 0 PID: 488 Comm: test_progs Tainted: G           OE      6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804
>>>>>>>>>>> [   12.025290][  T488] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.1-2.fc37 04/01/2014
>>>>>>>>>>> [   12.026108][  T488] Call Trace:
>>>>>>>>>>> [   12.026381][  T488]  <TASK>
>>>>>>>>>>> [   12.026649][  T488]  dump_stack_lvl+0xb4/0x110
>>>>>>>>>>> [   12.027060][  T488]  lockdep_rcu_suspicious+0x158/0x1f0
>>>>>>>>>>> [   12.027541][  T488]  find_kallsyms_symbol_value+0xe8/0x110
>>>>>>>>>>> [   12.028028][  T488]  bpf_check_attach_target+0x838/0xa20
>>>>>>>>>>> [   12.028511][  T488]  check_attach_btf_id+0x144/0x3f0
>>>>>>>>>>> [   12.028957][  T488]  ? __pfx_cmp_subprogs+0x10/0x10
>>>>>>>>>>> [   12.029408][  T488]  bpf_check+0xeec/0x1850
>>>>>>>>>>> [   12.029799][  T488]  ? ktime_get_with_offset+0x124/0x1d0
>>>>>>>>>>> [   12.030247][  T488]  bpf_prog_load+0x87a/0xed0
>>>>>>>>>>> [   12.030627][  T488]  ? __lock_release+0x5f/0x160
>>>>>>>>>>> [   12.031010][  T488]  ? __might_fault+0x53/0xb0
>>>>>>>>>>> [   12.031394][  T488]  ? selinux_bpf+0x6c/0xa0
>>>>>>>>>>> [   12.031756][  T488]  __sys_bpf+0x53c/0x1240
>>>>>>>>>>> [   12.032115][  T488]  __x64_sys_bpf+0x27/0x40
>>>>>>>>>>> [   12.032476][  T488]  do_syscall_64+0x3e/0x90
>>>>>>>>>>> [   12.032835][  T488]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
>>>>>>>>>>
>>>>>>>>>> --- a/kernel/module/kallsyms.c
>>>>>>>>>> +++ b/kernel/module/kallsyms.c
>>>>>> Commit 91fb02f31505 ("module: Move kallsyms support into a separate file") hides
>>>>>> the answer. find_kallsyms_symbol_value() was originally a static function, and it
>>>>>> is only called by module_kallsyms_lookup_name() and is preemptive-protected.
>>>>>>
>>>>>> Now that we've added a call to function find_kallsyms_symbol_value(), it seems like
>>>>>> we should do the same thing as function module_kallsyms_lookup_name().
>>>>>>
>>>>>> Like this?
>>>>>> +				mod = btf_try_get_module(btf);
>>>>>> +				if (mod) {
>>>>>> +					preempt_disable();
>>>>>> +					addr = find_kallsyms_symbol_value(mod, tname);
>>>>>> +					preempt_enable();
>>>>>> +				} else
>>>>>> +					addr = 0;
>>>>>
>>>>> yes, that's what I did above, but I was just curious about the strange
>>>>> RCU usage Alexei commented on earlier:
>>>>>
>>>>> 	>>> +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
>>>>> 	>>> +{
>>>>> 	>>> +       unsigned long ret;
>>>>> 	>>> +
>>>>> 	>>> +       preempt_disable();
>>>>> 	>>> +       ret = __find_kallsyms_symbol_value(mod, name);
>>>>> 	>>> +       preempt_enable();
>>>>> 	>>> +       return ret;
>>>>> 	>>> +}
>>>>> 	>>
>>>>> 	>> That doesn't look right.
>>>>> 	>> I think the issue is misuse of rcu_dereference_sched in
>>>>> 	>> find_kallsyms_symbol_value.
>>>>> 	>
>>>>> 	> it seems to be using rcu pointer to keep symbols for module init time and
>>>>> 	> then core symbols for after init.. and switch between them when module is
>>>>> 	> loaded, hence the strange rcu usage I think
>>>
>>> load_module
>>> 	post_relocation
>>> 		add_kallsyms
>>> 			mod->kallsyms = (void __rcu *)mod->init_layout.base + info->mod_kallsyms_init_off;   (1)
>>> 	do_init_module
>>> 		freeinit->module_init = mod->init_layout.base;
>>> 		rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);                                      (2)
>>> 		if (llist_add(&freeinit->node, &init_free_list))
>>> 			schedule_work(&init_free_wq);
>>>
>>> do_free_init
>>> 	synchronize_rcu();
>>> 	module_memfree(initfree->module_init);
>>>
>>> IIUC, the RCU can help synchronize_rcu() in do_free_init() to make sure that no one
>>> is still using the first mod->kallsyms (1). If find_kallsyms_symbol_value() is executed
>>> between (1) and (2).
>>
>> Yes, this seems to be another scenario where the RCU synchronization/access
>> is needed.
> 
> thanks for the details
> 
> still curious.. confusing part for me is the use of rcu_dereference in
> add_kallsyms IIUC there's no need for that because mod->kallsyms is not
> exposed at that time? we could do without it like in patch below?

Looks good to me. Prepare the data first, and then pointer to it.
This is the standard operation procedure of the RCU. But there
may be special considerations that we don't know about.


> 
> thanks,
> jirka
> 
> 
> ---
> diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
> index bdc911dbcde5..bc1e748a1357 100644
> --- a/kernel/module/kallsyms.c
> +++ b/kernel/module/kallsyms.c
> @@ -170,20 +170,18 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
>  	Elf_Sym *dst;
>  	char *s;
>  	Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
> +	struct mod_kallsyms *kallsyms;
>  	unsigned long strtab_size;
>  
> -	/* Set up to point into init section. */
> -	mod->kallsyms = (void __rcu *)mod->init_layout.base +
> -		info->mod_kallsyms_init_off;
> +	kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off;
>  
> -	rcu_read_lock();
>  	/* The following is safe since this pointer cannot change */
> -	rcu_dereference(mod->kallsyms)->symtab = (void *)symsec->sh_addr;
> -	rcu_dereference(mod->kallsyms)->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
> +	kallsyms->symtab = (void *)symsec->sh_addr;
> +	kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
>  	/* Make sure we get permanent strtab: don't use info->strtab. */
> -	rcu_dereference(mod->kallsyms)->strtab =
> +	kallsyms->strtab =
>  		(void *)info->sechdrs[info->index.str].sh_addr;
> -	rcu_dereference(mod->kallsyms)->typetab = mod->init_layout.base + info->init_typeoffs;
> +	kallsyms->typetab = mod->init_layout.base + info->init_typeoffs;
>  
>  	/*
>  	 * Now populate the cut down core kallsyms for after init
> @@ -193,20 +191,20 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
>  	mod->core_kallsyms.strtab = s = mod->data_layout.base + info->stroffs;
>  	mod->core_kallsyms.typetab = mod->data_layout.base + info->core_typeoffs;
>  	strtab_size = info->core_typeoffs - info->stroffs;
> -	src = rcu_dereference(mod->kallsyms)->symtab;
> -	for (ndst = i = 0; i < rcu_dereference(mod->kallsyms)->num_symtab; i++) {
> -		rcu_dereference(mod->kallsyms)->typetab[i] = elf_type(src + i, info);
> +	src = kallsyms->symtab;
> +	for (ndst = i = 0; i < kallsyms->num_symtab; i++) {
> +		kallsyms->typetab[i] = elf_type(src + i, info);
>  		if (i == 0 || is_livepatch_module(mod) ||
>  		    is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum,
>  				   info->index.pcpu)) {
>  			ssize_t ret;
>  
>  			mod->core_kallsyms.typetab[ndst] =
> -			    rcu_dereference(mod->kallsyms)->typetab[i];
> +			    kallsyms->typetab[i];
>  			dst[ndst] = src[i];
>  			dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
>  			ret = strscpy(s,
> -				      &rcu_dereference(mod->kallsyms)->strtab[src[i].st_name],
> +				      &kallsyms->strtab[src[i].st_name],
>  				      strtab_size);
>  			if (ret < 0)
>  				break;
> @@ -214,8 +212,10 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
>  			strtab_size -= ret + 1;
>  		}
>  	}
> -	rcu_read_unlock();
>  	mod->core_kallsyms.num_symtab = ndst;
> +
> +	/* Set up to point into init section. */
> +	rcu_assign_pointer(mod->kallsyms, kallsyms);
>  }
>  
>  #if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
> .
>
Petr Mladek April 3, 2023, 8:46 a.m. UTC | #16
On Mon 2023-04-03 09:46:11, Leizhen (ThunderTown) wrote:
> 
> 
> On 2023/4/1 5:25, Jiri Olsa wrote:
> > On Fri, Mar 31, 2023 at 01:08:42PM +0200, Petr Mladek wrote:
> >> On Fri 2023-03-31 17:15:56, Leizhen (ThunderTown) wrote:
> >>>
> >>>
> >>> On 2023/3/31 16:31, Petr Mladek wrote:
> >>>> On Thu 2023-03-30 22:59:12, Jiri Olsa wrote:
> >>>>> On Thu, Mar 30, 2023 at 08:26:41PM +0800, Leizhen (ThunderTown) wrote:
> >>>>>>
> >>>>>>
> >>>>>> On 2023/3/30 15:29, Jiri Olsa wrote:
> >>>>>>> ping,
> >>>>>>>
> >>>>>>> Petr, Zhen, any comment on discussion below?
> >>>>>>>
> >>>>>>> thanks,
> >>>>>>> jirka
> >>>>>>>
> >>>>>>> On Thu, Mar 23, 2023 at 03:00:25PM +0100, Jiri Olsa wrote:
> >>>>>>>> On Wed, Mar 22, 2023 at 09:03:46AM -0700, Alexei Starovoitov wrote:
> >>>>>>>>> On Wed, Mar 22, 2023 at 5:14 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> >>>>>>>>>>
> >>>>>>>>>> On Wed, Mar 22, 2023 at 10:49:38AM +0100, Artem Savkov wrote:
> >>>>>>>>>>
> >>>>>>>>>> SNIP
> >>>>>>>>>>
> >>>>>>>>>>>>> Hm, do we even need to preempt_disable? IIUC, preempt_disable is used
> >>>>>>>>>>>>> in module kallsyms to prevent taking the module lock b/c kallsyms are
> >>>>>>>>>>>>> used in the oops path. That shouldn't be an issue here, is that correct?
> >>>>>>>>>>>>
> >>>>>>>>>>>> btf_try_get_module calls try_module_get which disables the preemption,
> >>>>>>>>>>>> so no need to call it in here
> >>>>>>>>>>>
> >>>>>>>>>>> It does, but it reenables preemption right away so it is enabled by the
> >>>>>>>>>>> time we call find_kallsyms_symbol_value(). I am getting the following
> >>>>>>>>>>> lockdep splat while running module_fentry_shadow test from test_progs.
> >>>>>>>>>>>
> >>>>>>>>>>> [   12.017973][  T488] =============================
> >>>>>>>>>>> [   12.018529][  T488] WARNING: suspicious RCU usage
> >>>>>>>>>>> [   12.018987][  T488] 6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804 Tainted: G           OE
> >>>>>>>>>>> [   12.019898][  T488] -----------------------------
> >>>>>>>>>>> [   12.020391][  T488] kernel/module/kallsyms.c:448 suspicious rcu_dereference_check() usage!
> >>>>>>>>>>> [   12.021335][  T488]
> >>>>>>>>>>> [   12.021335][  T488] other info that might help us debug this:
> >>>>>>>>>>> [   12.021335][  T488]
> >>>>>>>>>>> [   12.022416][  T488]
> >>>>>>>>>>> [   12.022416][  T488] rcu_scheduler_active = 2, debug_locks = 1
> >>>>>>>>>>> [   12.023297][  T488] no locks held by test_progs/488.
> >>>>>>>>>>> [   12.023854][  T488]
> >>>>>>>>>>> [   12.023854][  T488] stack backtrace:
> >>>>>>>>>>> [   12.024336][  T488] CPU: 0 PID: 488 Comm: test_progs Tainted: G           OE      6.2.0.bpf-test-13063-g6a9f5cdba3c5 #804
> >>>>>>>>>>> [   12.025290][  T488] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.1-2.fc37 04/01/2014
> >>>>>>>>>>> [   12.026108][  T488] Call Trace:
> >>>>>>>>>>> [   12.026381][  T488]  <TASK>
> >>>>>>>>>>> [   12.026649][  T488]  dump_stack_lvl+0xb4/0x110
> >>>>>>>>>>> [   12.027060][  T488]  lockdep_rcu_suspicious+0x158/0x1f0
> >>>>>>>>>>> [   12.027541][  T488]  find_kallsyms_symbol_value+0xe8/0x110
> >>>>>>>>>>> [   12.028028][  T488]  bpf_check_attach_target+0x838/0xa20
> >>>>>>>>>>> [   12.028511][  T488]  check_attach_btf_id+0x144/0x3f0
> >>>>>>>>>>> [   12.028957][  T488]  ? __pfx_cmp_subprogs+0x10/0x10
> >>>>>>>>>>> [   12.029408][  T488]  bpf_check+0xeec/0x1850
> >>>>>>>>>>> [   12.029799][  T488]  ? ktime_get_with_offset+0x124/0x1d0
> >>>>>>>>>>> [   12.030247][  T488]  bpf_prog_load+0x87a/0xed0
> >>>>>>>>>>> [   12.030627][  T488]  ? __lock_release+0x5f/0x160
> >>>>>>>>>>> [   12.031010][  T488]  ? __might_fault+0x53/0xb0
> >>>>>>>>>>> [   12.031394][  T488]  ? selinux_bpf+0x6c/0xa0
> >>>>>>>>>>> [   12.031756][  T488]  __sys_bpf+0x53c/0x1240
> >>>>>>>>>>> [   12.032115][  T488]  __x64_sys_bpf+0x27/0x40
> >>>>>>>>>>> [   12.032476][  T488]  do_syscall_64+0x3e/0x90
> >>>>>>>>>>> [   12.032835][  T488]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
> >>>>>>>>>>
> >>>>>>>>>> --- a/kernel/module/kallsyms.c
> >>>>>>>>>> +++ b/kernel/module/kallsyms.c
> >>>>>> Commit 91fb02f31505 ("module: Move kallsyms support into a separate file") hides
> >>>>>> the answer. find_kallsyms_symbol_value() was originally a static function, and it
> >>>>>> is only called by module_kallsyms_lookup_name() and is preemptive-protected.
> >>>>>>
> >>>>>> Now that we've added a call to function find_kallsyms_symbol_value(), it seems like
> >>>>>> we should do the same thing as function module_kallsyms_lookup_name().
> >>>>>>
> >>>>>> Like this?
> >>>>>> +				mod = btf_try_get_module(btf);
> >>>>>> +				if (mod) {
> >>>>>> +					preempt_disable();
> >>>>>> +					addr = find_kallsyms_symbol_value(mod, tname);
> >>>>>> +					preempt_enable();
> >>>>>> +				} else
> >>>>>> +					addr = 0;
> >>>>>
> >>>>> yes, that's what I did above, but I was just curious about the strange
> >>>>> RCU usage Alexei commented on earlier:
> >>>>>
> >>>>> 	>>> +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
> >>>>> 	>>> +{
> >>>>> 	>>> +       unsigned long ret;
> >>>>> 	>>> +
> >>>>> 	>>> +       preempt_disable();
> >>>>> 	>>> +       ret = __find_kallsyms_symbol_value(mod, name);
> >>>>> 	>>> +       preempt_enable();
> >>>>> 	>>> +       return ret;
> >>>>> 	>>> +}
> >>>>> 	>>
> >>>>> 	>> That doesn't look right.
> >>>>> 	>> I think the issue is misuse of rcu_dereference_sched in
> >>>>> 	>> find_kallsyms_symbol_value.
> >>>>> 	>
> >>>>> 	> it seems to be using rcu pointer to keep symbols for module init time and
> >>>>> 	> then core symbols for after init.. and switch between them when module is
> >>>>> 	> loaded, hence the strange rcu usage I think
> >>>
> >>> load_module
> >>> 	post_relocation
> >>> 		add_kallsyms
> >>> 			mod->kallsyms = (void __rcu *)mod->init_layout.base + info->mod_kallsyms_init_off;   (1)
> >>> 	do_init_module
> >>> 		freeinit->module_init = mod->init_layout.base;
> >>> 		rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);                                      (2)
> >>> 		if (llist_add(&freeinit->node, &init_free_list))
> >>> 			schedule_work(&init_free_wq);
> >>>
> >>> do_free_init
> >>> 	synchronize_rcu();
> >>> 	module_memfree(initfree->module_init);
> >>>
> >>> IIUC, the RCU can help synchronize_rcu() in do_free_init() to make sure that no one
> >>> is still using the first mod->kallsyms (1). If find_kallsyms_symbol_value() is executed
> >>> between (1) and (2).
> >>
> >> Yes, this seems to be another scenario where the RCU synchronization/access
> >> is needed.
> > 
> > thanks for the details
> > 
> > still curious.. confusing part for me is the use of rcu_dereference in
> > add_kallsyms IIUC there's no need for that because mod->kallsyms is not
> > exposed at that time? we could do without it like in patch below?
> 
> Looks good to me. Prepare the data first, and then pointer to it.
> This is the standard operation procedure of the RCU. But there
> may be special considerations that we don't know about.

I was curious and found the commit 8244062ef1e54502ef5 ("modules: fix
longstanding /proc/kallsyms vs module insertion race.").

So we need to be careful about races. But I would expect that
the proposed change could only make things better.

Best Reagrds,
Petr


> 
> > 
> > thanks,
> > jirka
> > 
> > 
> > ---
> > diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
> > index bdc911dbcde5..bc1e748a1357 100644
> > --- a/kernel/module/kallsyms.c
> > +++ b/kernel/module/kallsyms.c
> > @@ -170,20 +170,18 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
> >  	Elf_Sym *dst;
> >  	char *s;
> >  	Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
> > +	struct mod_kallsyms *kallsyms;
> >  	unsigned long strtab_size;
> >  
> > -	/* Set up to point into init section. */
> > -	mod->kallsyms = (void __rcu *)mod->init_layout.base +
> > -		info->mod_kallsyms_init_off;
> > +	kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off;
> >  
> > -	rcu_read_lock();
> >  	/* The following is safe since this pointer cannot change */
> > -	rcu_dereference(mod->kallsyms)->symtab = (void *)symsec->sh_addr;
> > -	rcu_dereference(mod->kallsyms)->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
> > +	kallsyms->symtab = (void *)symsec->sh_addr;
> > +	kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
> >  	/* Make sure we get permanent strtab: don't use info->strtab. */
> > -	rcu_dereference(mod->kallsyms)->strtab =
> > +	kallsyms->strtab =
> >  		(void *)info->sechdrs[info->index.str].sh_addr;
> > -	rcu_dereference(mod->kallsyms)->typetab = mod->init_layout.base + info->init_typeoffs;
> > +	kallsyms->typetab = mod->init_layout.base + info->init_typeoffs;
> >  
> >  	/*
> >  	 * Now populate the cut down core kallsyms for after init
> > @@ -193,20 +191,20 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
> >  	mod->core_kallsyms.strtab = s = mod->data_layout.base + info->stroffs;
> >  	mod->core_kallsyms.typetab = mod->data_layout.base + info->core_typeoffs;
> >  	strtab_size = info->core_typeoffs - info->stroffs;
> > -	src = rcu_dereference(mod->kallsyms)->symtab;
> > -	for (ndst = i = 0; i < rcu_dereference(mod->kallsyms)->num_symtab; i++) {
> > -		rcu_dereference(mod->kallsyms)->typetab[i] = elf_type(src + i, info);
> > +	src = kallsyms->symtab;
> > +	for (ndst = i = 0; i < kallsyms->num_symtab; i++) {
> > +		kallsyms->typetab[i] = elf_type(src + i, info);
> >  		if (i == 0 || is_livepatch_module(mod) ||
> >  		    is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum,
> >  				   info->index.pcpu)) {
> >  			ssize_t ret;
> >  
> >  			mod->core_kallsyms.typetab[ndst] =
> > -			    rcu_dereference(mod->kallsyms)->typetab[i];
> > +			    kallsyms->typetab[i];
> >  			dst[ndst] = src[i];
> >  			dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
> >  			ret = strscpy(s,
> > -				      &rcu_dereference(mod->kallsyms)->strtab[src[i].st_name],
> > +				      &kallsyms->strtab[src[i].st_name],
> >  				      strtab_size);
> >  			if (ret < 0)
> >  				break;
> > @@ -214,8 +212,10 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
> >  			strtab_size -= ret + 1;
> >  		}
> >  	}
> > -	rcu_read_unlock();
> >  	mod->core_kallsyms.num_symtab = ndst;
> > +
> > +	/* Set up to point into init section. */
> > +	rcu_assign_pointer(mod->kallsyms, kallsyms);
> >  }
> >  
> >  #if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
> > .
> > 
> 
> -- 
> Regards,
>   Zhen Lei
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4385418118f6..2aadc78fe3c1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1330,6 +1330,7 @@  struct bpf_prog_aux {
 	 * main prog always has linfo_idx == 0
 	 */
 	u32 linfo_idx;
+	struct module *mod;
 	u32 num_exentries;
 	struct exception_table_entry *extable;
 	union {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cda8d00f3762..5b8227e08182 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2064,6 +2064,8 @@  static void bpf_prog_put_deferred(struct work_struct *work)
 	prog = aux->prog;
 	perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
 	bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
+	if (aux->mod)
+		module_put(aux->mod);
 	bpf_prog_free_id(prog);
 	__bpf_prog_put_noref(prog, true);
 }
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index d0ed7d6f5eec..ebb20bf252c7 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -172,26 +172,6 @@  static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
 	return tr;
 }
 
-static int bpf_trampoline_module_get(struct bpf_trampoline *tr)
-{
-	struct module *mod;
-	int err = 0;
-
-	preempt_disable();
-	mod = __module_text_address((unsigned long) tr->func.addr);
-	if (mod && !try_module_get(mod))
-		err = -ENOENT;
-	preempt_enable();
-	tr->mod = mod;
-	return err;
-}
-
-static void bpf_trampoline_module_put(struct bpf_trampoline *tr)
-{
-	module_put(tr->mod);
-	tr->mod = NULL;
-}
-
 static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
 {
 	void *ip = tr->func.addr;
@@ -202,8 +182,6 @@  static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
 	else
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
 
-	if (!ret)
-		bpf_trampoline_module_put(tr);
 	return ret;
 }
 
@@ -238,9 +216,6 @@  static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
 		tr->func.ftrace_managed = true;
 	}
 
-	if (bpf_trampoline_module_get(tr))
-		return -ENOENT;
-
 	if (tr->func.ftrace_managed) {
 		ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
 		ret = register_ftrace_direct_multi(tr->fops, (long)new_addr);
@@ -248,8 +223,6 @@  static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
 	}
 
-	if (ret)
-		bpf_trampoline_module_put(tr);
 	return ret;
 }
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 388245e8826e..6a19bd450558 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -24,6 +24,7 @@ 
 #include <linux/bpf_lsm.h>
 #include <linux/btf_ids.h>
 #include <linux/poison.h>
+#include "../module/internal.h"
 
 #include "disasm.h"
 
@@ -16868,6 +16869,7 @@  int bpf_check_attach_target(struct bpf_verifier_log *log,
 	const char *tname;
 	struct btf *btf;
 	long addr = 0;
+	struct module *mod = NULL;
 
 	if (!btf_id) {
 		bpf_log(log, "Tracing programs must provide btf_id\n");
@@ -17041,7 +17043,17 @@  int bpf_check_attach_target(struct bpf_verifier_log *log,
 			else
 				addr = (long) tgt_prog->aux->func[subprog]->bpf_func;
 		} else {
-			addr = kallsyms_lookup_name(tname);
+			if (btf_is_module(btf)) {
+				preempt_disable();
+				mod = btf_try_get_module(btf);
+				if (mod)
+					addr = find_kallsyms_symbol_value(mod, tname);
+				else
+					addr = 0;
+				preempt_enable();
+			} else {
+				addr = kallsyms_lookup_name(tname);
+			}
 			if (!addr) {
 				bpf_log(log,
 					"The address of function %s cannot be found\n",
@@ -17105,6 +17117,12 @@  int bpf_check_attach_target(struct bpf_verifier_log *log,
 	tgt_info->tgt_addr = addr;
 	tgt_info->tgt_name = tname;
 	tgt_info->tgt_type = t;
+	if (mod) {
+		if (!prog->aux->mod)
+			prog->aux->mod = mod;
+		else
+			module_put(mod);
+	}
 	return 0;
 }
 
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 2e2bf236f558..5cb103a46018 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -256,6 +256,11 @@  static inline bool sect_empty(const Elf_Shdr *sect)
 static inline void init_build_id(struct module *mod, const struct load_info *info) { }
 static inline void layout_symtab(struct module *mod, struct load_info *info) { }
 static inline void add_kallsyms(struct module *mod, const struct load_info *info) { }
+static inline unsigned long find_kallsyms_symbol_value(struct module *mod
+						       const char *name)
+{
+	return 0;
+}
 #endif /* CONFIG_KALLSYMS */
 
 #ifdef CONFIG_SYSFS