diff mbox series

[RFC,v1,1/1] tracing/kprobe: Add multi-probe support for 'perf_kprobe' PMU

Message ID 20230816163517.112518-2-flaniel@linux.microsoft.com (mailing list archive)
State Superseded
Headers show
Series [RFC,v1,1/1] tracing/kprobe: Add multi-probe support for 'perf_kprobe' PMU | expand

Commit Message

Francis Laniel Aug. 16, 2023, 4:35 p.m. UTC
When using sysfs, it is possible to create kprobe for several kernel functions
sharing the same name, but of course with different addresses, by writing their
addresses in kprobe_events file.

When using PMU, if only the symbol name is given, the event will be created for
the first address which matches the symbol, as returned by
kallsyms_lookup_name().
The idea here is to search all kernel functions which match this symbol and
create a trace_kprobe for each of them.
All these trace_kprobes are linked together by sharing the same trace_probe.

Signed-off-by: Francis Laniel <flaniel@linux.microsoft.com>
---
 kernel/trace/trace_kprobe.c | 86 +++++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)

Comments

Steven Rostedt Aug. 16, 2023, 6:42 p.m. UTC | #1
On Wed, 16 Aug 2023 18:35:17 +0200
Francis Laniel <flaniel@linux.microsoft.com> wrote:

> When using sysfs, it is possible to create kprobe for several kernel functions
> sharing the same name, but of course with different addresses, by writing their
> addresses in kprobe_events file.

So this only happens if you write in the address?

> 
> When using PMU, if only the symbol name is given, the event will be created for
> the first address which matches the symbol, as returned by
> kallsyms_lookup_name().
> The idea here is to search all kernel functions which match this symbol and
> create a trace_kprobe for each of them.
> All these trace_kprobes are linked together by sharing the same trace_probe.
> 

So this makes the PMU version enable all by name, so there's still a
disconnect between how sysfs works and how PMU does.

Why can't you just pass in the address like sysfs does?

-- Steve


> Signed-off-by: Francis Laniel <flaniel@linux.microsoft.com>
> ---
>  kernel/trace/trace_kprobe.c | 86 +++++++++++++++++++++++++++++++++++++
>  1 file changed, 86 insertions(+)
> 
> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> index 1b3fa7b854aa..08580f1466c7 100644
> --- a/kernel/trace/trace_kprobe.c
> +++ b/kernel/trace/trace_kprobe.c
> @@ -1682,13 +1682,42 @@ static int unregister_kprobe_event(struct trace_kprobe *tk)
>  }
>  
>  #ifdef CONFIG_PERF_EVENTS
> +
> +struct address_array {
> +	unsigned long *addrs;
> +	size_t size;
> +};
> +
> +static int add_addr(void *data, unsigned long addr)
> +{
> +	struct address_array *array = data;
> +	unsigned long *p;
> +
> +	array->size++;
> +	p = krealloc(array->addrs,
> +				sizeof(*array->addrs) * array->size,
> +				GFP_KERNEL);
> +	if (!p) {
> +		kfree(array->addrs);
> +		return -ENOMEM;
> +	}
> +
> +	array->addrs = p;
> +	array->addrs[array->size - 1] = addr;
> +
> +	return 0;
> +}
> +
>  /* create a trace_kprobe, but don't add it to global lists */
>  struct trace_event_call *
>  create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
>  			  bool is_return)
>  {
>  	enum probe_print_type ptype;
> +	struct address_array array;
>  	struct trace_kprobe *tk;
> +	unsigned long func_addr;
> +	unsigned int i;
>  	int ret;
>  	char *event;
>  
> @@ -1722,7 +1751,64 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
>  	if (ret < 0)
>  		goto error;
>  
> +	array.addrs = NULL;
> +	array.size = 0;
> +	ret = kallsyms_on_each_match_symbol(add_addr, func, &array);
> +	if (ret)
> +		goto error_free;
> +
> +	if (array.size == 1)
> +		goto end;
> +
> +	/*
> +	 * Below loop allocates a trace_kprobe for each function with the same
> +	 * name in kernel source code.
> +	 * All this differente trace_kprobes will be linked together through
> +	 * append_trace_kprobe().
> +	 * NOTE append_trace_kprobe() is called in register_trace_kprobe() which
> +	 * is called when a kprobe is added through sysfs.
> +	 */
> +	func_addr = kallsyms_lookup_name(func);
> +	for (i = 0; i < array.size; i++) {
> +		struct trace_kprobe *tk_same_name;
> +		unsigned long address;
> +
> +		address = array.addrs[i];
> +		/* Skip the function address as we already registered it. */
> +		if (address == func_addr)
> +			continue;
> +
> +		/*
> +		 * alloc_trace_kprobe() first considers symbol name, so we set
> +		 * this to NULL to allocate this kprobe on the given address.
> +		 */
> +		tk_same_name = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event,
> +						  (void *)address, NULL, offs,
> +						  0 /* maxactive */,
> +						  0 /* nargs */, is_return);
> +
> +		if (IS_ERR(tk_same_name)) {
> +			ret = -ENOMEM;
> +			goto error_free;
> +		}
> +
> +		init_trace_event_call(tk_same_name);
> +
> +		if (traceprobe_set_print_fmt(&tk_same_name->tp, ptype) < 0) {
> +			ret = -ENOMEM;
> +			goto error_free;
> +		}
> +
> +		ret = append_trace_kprobe(tk_same_name, tk);
> +		if (ret)
> +			goto error_free;
> +	}
> +
> +end:
> +	kfree(array.addrs);
>  	return trace_probe_event_call(&tk->tp);
> +error_free:
> +	kfree(array.addrs);
>  error:
>  	free_trace_kprobe(tk);
>  	return ERR_PTR(ret);
Masami Hiramatsu (Google) Aug. 17, 2023, 7:50 a.m. UTC | #2
Hi,

On Wed, 16 Aug 2023 18:35:17 +0200
Francis Laniel <flaniel@linux.microsoft.com> wrote:

> When using sysfs, it is possible to create kprobe for several kernel functions
> sharing the same name, but of course with different addresses, by writing their
> addresses in kprobe_events file.
> 
> When using PMU, if only the symbol name is given, the event will be created for
> the first address which matches the symbol, as returned by
> kallsyms_lookup_name().

Do you mean probing the same name symbols? Yes, it is intended behavior, since
it is not always true that the same name function has the same prototype (it is
mostly true but is not ensured), it is better to leave user to decide which one
is what you want to probe.

Have you used 'perf probe' tool? It tries to find the appropriate function by
line number and creates the probe by 'text+OFFSET' style, not by symbol.
I think this is the correct way to do that, because user will not know which
'address' of the symbol is what the user want.

Thought?

Thank you,

> The idea here is to search all kernel functions which match this symbol and
> create a trace_kprobe for each of them.
> All these trace_kprobes are linked together by sharing the same trace_probe.
> 
> Signed-off-by: Francis Laniel <flaniel@linux.microsoft.com>
> ---
>  kernel/trace/trace_kprobe.c | 86 +++++++++++++++++++++++++++++++++++++
>  1 file changed, 86 insertions(+)
> 
> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> index 1b3fa7b854aa..08580f1466c7 100644
> --- a/kernel/trace/trace_kprobe.c
> +++ b/kernel/trace/trace_kprobe.c
> @@ -1682,13 +1682,42 @@ static int unregister_kprobe_event(struct trace_kprobe *tk)
>  }
>  
>  #ifdef CONFIG_PERF_EVENTS
> +
> +struct address_array {
> +	unsigned long *addrs;
> +	size_t size;
> +};
> +
> +static int add_addr(void *data, unsigned long addr)
> +{
> +	struct address_array *array = data;
> +	unsigned long *p;
> +
> +	array->size++;
> +	p = krealloc(array->addrs,
> +				sizeof(*array->addrs) * array->size,
> +				GFP_KERNEL);
> +	if (!p) {
> +		kfree(array->addrs);
> +		return -ENOMEM;
> +	}
> +
> +	array->addrs = p;
> +	array->addrs[array->size - 1] = addr;
> +
> +	return 0;
> +}
> +
>  /* create a trace_kprobe, but don't add it to global lists */
>  struct trace_event_call *
>  create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
>  			  bool is_return)
>  {
>  	enum probe_print_type ptype;
> +	struct address_array array;
>  	struct trace_kprobe *tk;
> +	unsigned long func_addr;
> +	unsigned int i;
>  	int ret;
>  	char *event;
>  
> @@ -1722,7 +1751,64 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
>  	if (ret < 0)
>  		goto error;
>  
> +	array.addrs = NULL;
> +	array.size = 0;
> +	ret = kallsyms_on_each_match_symbol(add_addr, func, &array);
> +	if (ret)
> +		goto error_free;
> +
> +	if (array.size == 1)
> +		goto end;
> +
> +	/*
> +	 * Below loop allocates a trace_kprobe for each function with the same
> +	 * name in kernel source code.
> +	 * All this differente trace_kprobes will be linked together through
> +	 * append_trace_kprobe().
> +	 * NOTE append_trace_kprobe() is called in register_trace_kprobe() which
> +	 * is called when a kprobe is added through sysfs.
> +	 */
> +	func_addr = kallsyms_lookup_name(func);
> +	for (i = 0; i < array.size; i++) {
> +		struct trace_kprobe *tk_same_name;
> +		unsigned long address;
> +
> +		address = array.addrs[i];
> +		/* Skip the function address as we already registered it. */
> +		if (address == func_addr)
> +			continue;
> +
> +		/*
> +		 * alloc_trace_kprobe() first considers symbol name, so we set
> +		 * this to NULL to allocate this kprobe on the given address.
> +		 */
> +		tk_same_name = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event,
> +						  (void *)address, NULL, offs,
> +						  0 /* maxactive */,
> +						  0 /* nargs */, is_return);
> +
> +		if (IS_ERR(tk_same_name)) {
> +			ret = -ENOMEM;
> +			goto error_free;
> +		}
> +
> +		init_trace_event_call(tk_same_name);
> +
> +		if (traceprobe_set_print_fmt(&tk_same_name->tp, ptype) < 0) {
> +			ret = -ENOMEM;
> +			goto error_free;
> +		}
> +
> +		ret = append_trace_kprobe(tk_same_name, tk);
> +		if (ret)
> +			goto error_free;
> +	}
> +
> +end:
> +	kfree(array.addrs);
>  	return trace_probe_event_call(&tk->tp);
> +error_free:
> +	kfree(array.addrs);
>  error:
>  	free_trace_kprobe(tk);
>  	return ERR_PTR(ret);
> -- 
> 2.34.1
>
Francis Laniel Aug. 17, 2023, 10:59 a.m. UTC | #3
Le mercredi 16 août 2023, 20:42:13 CEST Steven Rostedt a écrit :
> On Wed, 16 Aug 2023 18:35:17 +0200
> 
> Francis Laniel <flaniel@linux.microsoft.com> wrote:
> > When using sysfs, it is possible to create kprobe for several kernel
> > functions sharing the same name, but of course with different addresses,
> > by writing their addresses in kprobe_events file.
> 
> So this only happens if you write in the address?

From my understanding yes, but I will double check just in case.

> > When using PMU, if only the symbol name is given, the event will be
> > created for the first address which matches the symbol, as returned by
> > kallsyms_lookup_name().
> > The idea here is to search all kernel functions which match this symbol
> > and
> > create a trace_kprobe for each of them.
> > All these trace_kprobes are linked together by sharing the same
> > trace_probe.
> So this makes the PMU version enable all by name, so there's still a
> disconnect between how sysfs works and how PMU does.
> 
> Why can't you just pass in the address like sysfs does?

To get the addresses from /proc/kallsyms, you need to either have CAP_SYSLOG 
or even CAP_SYS_ADMIN.
But to call perf_event_open(), you only need CAP_PERFMON.
This way, by giving only the name you can trace function with less privileges 
(i.e. without CAP_SYS_ADMIN).
Please correct me if I am wrong as I am not an expert in knowing the minimal 
set of capabilities you need to trace.

> -- Steve
> 
> > Signed-off-by: Francis Laniel <flaniel@linux.microsoft.com>
> > ---
> > 
> >  kernel/trace/trace_kprobe.c | 86 +++++++++++++++++++++++++++++++++++++
> >  1 file changed, 86 insertions(+)
> > 
> > diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> > index 1b3fa7b854aa..08580f1466c7 100644
> > --- a/kernel/trace/trace_kprobe.c
> > +++ b/kernel/trace/trace_kprobe.c
> > @@ -1682,13 +1682,42 @@ static int unregister_kprobe_event(struct
> > trace_kprobe *tk)> 
> >  }
> >  
> >  #ifdef CONFIG_PERF_EVENTS
> > 
> > +
> > +struct address_array {
> > +	unsigned long *addrs;
> > +	size_t size;
> > +};
> > +
> > +static int add_addr(void *data, unsigned long addr)
> > +{
> > +	struct address_array *array = data;
> > +	unsigned long *p;
> > +
> > +	array->size++;
> > +	p = krealloc(array->addrs,
> > +				sizeof(*array->addrs) * array->size,
> > +				GFP_KERNEL);
> > +	if (!p) {
> > +		kfree(array->addrs);
> > +		return -ENOMEM;
> > +	}
> > +
> > +	array->addrs = p;
> > +	array->addrs[array->size - 1] = addr;
> > +
> > +	return 0;
> > +}
> > +
> > 
> >  /* create a trace_kprobe, but don't add it to global lists */
> >  struct trace_event_call *
> >  create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
> >  
> >  			  bool is_return)
> >  
> >  {
> >  
> >  	enum probe_print_type ptype;
> > 
> > +	struct address_array array;
> > 
> >  	struct trace_kprobe *tk;
> > 
> > +	unsigned long func_addr;
> > +	unsigned int i;
> > 
> >  	int ret;
> >  	char *event;
> > 
> > @@ -1722,7 +1751,64 @@ create_local_trace_kprobe(char *func, void *addr,
> > unsigned long offs,> 
> >  	if (ret < 0)
> >  	
> >  		goto error;
> > 
> > +	array.addrs = NULL;
> > +	array.size = 0;
> > +	ret = kallsyms_on_each_match_symbol(add_addr, func, &array);
> > +	if (ret)
> > +		goto error_free;
> > +
> > +	if (array.size == 1)
> > +		goto end;
> > +
> > +	/*
> > +	 * Below loop allocates a trace_kprobe for each function with the same
> > +	 * name in kernel source code.
> > +	 * All this differente trace_kprobes will be linked together through
> > +	 * append_trace_kprobe().
> > +	 * NOTE append_trace_kprobe() is called in register_trace_kprobe() 
which
> > +	 * is called when a kprobe is added through sysfs.
> > +	 */
> > +	func_addr = kallsyms_lookup_name(func);
> > +	for (i = 0; i < array.size; i++) {
> > +		struct trace_kprobe *tk_same_name;
> > +		unsigned long address;
> > +
> > +		address = array.addrs[i];
> > +		/* Skip the function address as we already registered it. */
> > +		if (address == func_addr)
> > +			continue;
> > +
> > +		/*
> > +		 * alloc_trace_kprobe() first considers symbol name, so we set
> > +		 * this to NULL to allocate this kprobe on the given address.
> > +		 */
> > +		tk_same_name = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event,
> > +						  (void *)address, NULL, offs,
> > +						  0 /* maxactive */,
> > +						  0 /* nargs */, is_return);
> > +
> > +		if (IS_ERR(tk_same_name)) {
> > +			ret = -ENOMEM;
> > +			goto error_free;
> > +		}
> > +
> > +		init_trace_event_call(tk_same_name);
> > +
> > +		if (traceprobe_set_print_fmt(&tk_same_name->tp, ptype) < 0) {
> > +			ret = -ENOMEM;
> > +			goto error_free;
> > +		}
> > +
> > +		ret = append_trace_kprobe(tk_same_name, tk);
> > +		if (ret)
> > +			goto error_free;
> > +	}
> > +
> > +end:
> > +	kfree(array.addrs);
> > 
> >  	return trace_probe_event_call(&tk->tp);
> > 
> > +error_free:
> > +	kfree(array.addrs);
> > 
> >  error:
> >  	free_trace_kprobe(tk);
> >  	return ERR_PTR(ret);
Francis Laniel Aug. 17, 2023, 11:06 a.m. UTC | #4
Hi.

Le jeudi 17 août 2023, 09:50:57 CEST Masami Hiramatsu a écrit :
> Hi,
> 
> On Wed, 16 Aug 2023 18:35:17 +0200
> 
> Francis Laniel <flaniel@linux.microsoft.com> wrote:
> > When using sysfs, it is possible to create kprobe for several kernel
> > functions sharing the same name, but of course with different addresses,
> > by writing their addresses in kprobe_events file.
> > 
> > When using PMU, if only the symbol name is given, the event will be
> > created for the first address which matches the symbol, as returned by
> > kallsyms_lookup_name().
> 
> Do you mean probing the same name symbols? Yes, it is intended behavior,
> since it is not always true that the same name function has the same
> prototype (it is mostly true but is not ensured), it is better to leave
> user to decide which one is what you want to probe.

This is what I meant.
I also share your mind regarding leaving the users deciding which one they 
want to probe but in my case (which I agree is a bit a corner one) it leaded 
me to misunderstanding as the PMU kprobe was only added to the first 
ntfs_file_write_iter() which is not the one for ntfs3.

> Have you used 'perf probe' tool? It tries to find the appropriate function
> by line number and creates the probe by 'text+OFFSET' style, not by symbol.
> I think this is the correct way to do that, because user will not know
> which 'address' of the symbol is what the user want.

'perf probe' perfectly does the trick, as it would find all the kernel 
addresses which correspond to the symbol name and create as many probes as 
corresponding symbols [1]:
root@vm-amd64:~# perf probe --add ntfs_file_write_iter

Added new events:
  probe:ntfs_file_write_iter (on ntfs_file_write_iter)
  probe:ntfs_file_write_iter (on ntfs_file_write_iter)

You can now use it in all perf tools, such as:

        perf record -e probe:ntfs_file_write_iter -aR sleep 1
root@vm-amd64:~# cat /sys/kernel/tracing/kprobe_events
p:probe/ntfs_file_write_iter _text+5088544
p:probe/ntfs_file_write_iter _text+5278560

> Thought?

This contribution is basically here to sort of mimic what perf does but with 
PMU kprobes, as this is not possible to write in a sysfs file with this type of 
probe.

> 
> Thank you,
> 
> > The idea here is to search all kernel functions which match this symbol
> > and
> > create a trace_kprobe for each of them.
> > All these trace_kprobes are linked together by sharing the same
> > trace_probe.
> > 
> > Signed-off-by: Francis Laniel <flaniel@linux.microsoft.com>
> > ---
> > 
> >  kernel/trace/trace_kprobe.c | 86 +++++++++++++++++++++++++++++++++++++
> >  1 file changed, 86 insertions(+)
> > 
> > diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> > index 1b3fa7b854aa..08580f1466c7 100644
> > --- a/kernel/trace/trace_kprobe.c
> > +++ b/kernel/trace/trace_kprobe.c
> > @@ -1682,13 +1682,42 @@ static int unregister_kprobe_event(struct
> > trace_kprobe *tk)> 
> >  }
> >  
> >  #ifdef CONFIG_PERF_EVENTS
> > 
> > +
> > +struct address_array {
> > +	unsigned long *addrs;
> > +	size_t size;
> > +};
> > +
> > +static int add_addr(void *data, unsigned long addr)
> > +{
> > +	struct address_array *array = data;
> > +	unsigned long *p;
> > +
> > +	array->size++;
> > +	p = krealloc(array->addrs,
> > +				sizeof(*array->addrs) * array->size,
> > +				GFP_KERNEL);
> > +	if (!p) {
> > +		kfree(array->addrs);
> > +		return -ENOMEM;
> > +	}
> > +
> > +	array->addrs = p;
> > +	array->addrs[array->size - 1] = addr;
> > +
> > +	return 0;
> > +}
> > +
> > 
> >  /* create a trace_kprobe, but don't add it to global lists */
> >  struct trace_event_call *
> >  create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
> >  
> >  			  bool is_return)
> >  
> >  {
> >  
> >  	enum probe_print_type ptype;
> > 
> > +	struct address_array array;
> > 
> >  	struct trace_kprobe *tk;
> > 
> > +	unsigned long func_addr;
> > +	unsigned int i;
> > 
> >  	int ret;
> >  	char *event;
> > 
> > @@ -1722,7 +1751,64 @@ create_local_trace_kprobe(char *func, void *addr,
> > unsigned long offs,> 
> >  	if (ret < 0)
> >  	
> >  		goto error;
> > 
> > +	array.addrs = NULL;
> > +	array.size = 0;
> > +	ret = kallsyms_on_each_match_symbol(add_addr, func, &array);
> > +	if (ret)
> > +		goto error_free;
> > +
> > +	if (array.size == 1)
> > +		goto end;
> > +
> > +	/*
> > +	 * Below loop allocates a trace_kprobe for each function with the same
> > +	 * name in kernel source code.
> > +	 * All this differente trace_kprobes will be linked together through
> > +	 * append_trace_kprobe().
> > +	 * NOTE append_trace_kprobe() is called in register_trace_kprobe() 
which
> > +	 * is called when a kprobe is added through sysfs.
> > +	 */
> > +	func_addr = kallsyms_lookup_name(func);
> > +	for (i = 0; i < array.size; i++) {
> > +		struct trace_kprobe *tk_same_name;
> > +		unsigned long address;
> > +
> > +		address = array.addrs[i];
> > +		/* Skip the function address as we already registered it. */
> > +		if (address == func_addr)
> > +			continue;
> > +
> > +		/*
> > +		 * alloc_trace_kprobe() first considers symbol name, so we set
> > +		 * this to NULL to allocate this kprobe on the given address.
> > +		 */
> > +		tk_same_name = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event,
> > +						  (void *)address, NULL, offs,
> > +						  0 /* maxactive */,
> > +						  0 /* nargs */, is_return);
> > +
> > +		if (IS_ERR(tk_same_name)) {
> > +			ret = -ENOMEM;
> > +			goto error_free;
> > +		}
> > +
> > +		init_trace_event_call(tk_same_name);
> > +
> > +		if (traceprobe_set_print_fmt(&tk_same_name->tp, ptype) < 0) {
> > +			ret = -ENOMEM;
> > +			goto error_free;
> > +		}
> > +
> > +		ret = append_trace_kprobe(tk_same_name, tk);
> > +		if (ret)
> > +			goto error_free;
> > +	}
> > +
> > +end:
> > +	kfree(array.addrs);
> > 
> >  	return trace_probe_event_call(&tk->tp);
> > 
> > +error_free:
> > +	kfree(array.addrs);
> > 
> >  error:
> >  	free_trace_kprobe(tk);
> >  	return ERR_PTR(ret);

---
[1]: https://github.com/torvalds/linux/blob/
57012c57536f8814dec92e74197ee96c3498d24e/tools/perf/util/probe-event.c#L2989-
L2993
Steven Rostedt Aug. 17, 2023, 3:13 p.m. UTC | #5
On Thu, 17 Aug 2023 12:59:30 +0200
Francis Laniel <flaniel@linux.microsoft.com> wrote:

> > Why can't you just pass in the address like sysfs does?  
> 
> To get the addresses from /proc/kallsyms, you need to either have CAP_SYSLOG 
> or even CAP_SYS_ADMIN.
> But to call perf_event_open(), you only need CAP_PERFMON.
> This way, by giving only the name you can trace function with less privileges 
> (i.e. without CAP_SYS_ADMIN).
> Please correct me if I am wrong as I am not an expert in knowing the minimal 
> set of capabilities you need to trace.

I wonder if we should add an option to put in the non-relocated address?
One that can be acquired by debuginfo in the vmlinux. I'm assuming that the
kernel has access to the added offset (I haven't looked). If it does, then
we could allow users to just add something like "+@0xffffffffdeadbeef" and
add the relocation offset to get to the mapped address of the function.

This would allow those without kallsym privileges to pass in kernel address
for tracing.

-- Steve
Francis Laniel Aug. 18, 2023, 9:01 a.m. UTC | #6
Hi.

Le jeudi 17 août 2023, 17:13:03 CEST Steven Rostedt a écrit :
> On Thu, 17 Aug 2023 12:59:30 +0200
> 
> Francis Laniel <flaniel@linux.microsoft.com> wrote:
> > > Why can't you just pass in the address like sysfs does?
> > 
> > To get the addresses from /proc/kallsyms, you need to either have
> > CAP_SYSLOG or even CAP_SYS_ADMIN.
> > But to call perf_event_open(), you only need CAP_PERFMON.
> > This way, by giving only the name you can trace function with less
> > privileges (i.e. without CAP_SYS_ADMIN).
> > Please correct me if I am wrong as I am not an expert in knowing the
> > minimal set of capabilities you need to trace.
> 
> I wonder if we should add an option to put in the non-relocated address?
> One that can be acquired by debuginfo in the vmlinux. I'm assuming that the
> kernel has access to the added offset (I haven't looked). If it does, then
> we could allow users to just add something like "+@0xffffffffdeadbeef" and
> add the relocation offset to get to the mapped address of the function.
> 
> This would allow those without kallsym privileges to pass in kernel address
> for tracing.

This seems interesting but I am wondering about this when using KASLR.
Would it be possible to compute the final address as:
final_address = debuginfo_address + relocation_offset + kaslr_offset?
I will check regarding both the relocation offset and how KASLR works (I only 
know what it does, not how it does it).

Moreover, regarding accessing vmlinux, I can only think to access it through 
vmlinuz which is in /boot.
Sadly, you cannot read /boot/vmlinuz without being root on a several 
distributions.
Note that, the same occurs for /boot/System.map*.

> -- Steve

Best regards.
Masami Hiramatsu (Google) Aug. 18, 2023, 12:37 p.m. UTC | #7
On Thu, 17 Aug 2023 11:13:03 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Thu, 17 Aug 2023 12:59:30 +0200
> Francis Laniel <flaniel@linux.microsoft.com> wrote:
> 
> > > Why can't you just pass in the address like sysfs does?  
> > 
> > To get the addresses from /proc/kallsyms, you need to either have CAP_SYSLOG 
> > or even CAP_SYS_ADMIN.
> > But to call perf_event_open(), you only need CAP_PERFMON.
> > This way, by giving only the name you can trace function with less privileges 
> > (i.e. without CAP_SYS_ADMIN).
> > Please correct me if I am wrong as I am not an expert in knowing the minimal 
> > set of capabilities you need to trace.
> 
> I wonder if we should add an option to put in the non-relocated address?
> One that can be acquired by debuginfo in the vmlinux. I'm assuming that the
> kernel has access to the added offset (I haven't looked). If it does, then
> we could allow users to just add something like "+@0xffffffffdeadbeef" and
> add the relocation offset to get to the mapped address of the function.

That's why perf probe uses the offset from '_text'. Normal KASLR will just
moves all symbols. (Finer one will move all symbols randomely)
This should not need to access /proc/kallsyms but vmlinux or SystemMap.

Thank you,

> 
> This would allow those without kallsym privileges to pass in kernel address
> for tracing.
> 
> -- Steve
Masami Hiramatsu (Google) Aug. 18, 2023, 1:05 p.m. UTC | #8
On Thu, 17 Aug 2023 13:06:20 +0200
Francis Laniel <flaniel@linux.microsoft.com> wrote:

> Hi.
> 
> Le jeudi 17 août 2023, 09:50:57 CEST Masami Hiramatsu a écrit :
> > Hi,
> > 
> > On Wed, 16 Aug 2023 18:35:17 +0200
> > 
> > Francis Laniel <flaniel@linux.microsoft.com> wrote:
> > > When using sysfs, it is possible to create kprobe for several kernel
> > > functions sharing the same name, but of course with different addresses,
> > > by writing their addresses in kprobe_events file.
> > > 
> > > When using PMU, if only the symbol name is given, the event will be
> > > created for the first address which matches the symbol, as returned by
> > > kallsyms_lookup_name().
> > 
> > Do you mean probing the same name symbols? Yes, it is intended behavior,
> > since it is not always true that the same name function has the same
> > prototype (it is mostly true but is not ensured), it is better to leave
> > user to decide which one is what you want to probe.
> 
> This is what I meant.
> I also share your mind regarding leaving the users deciding which one they 
> want to probe but in my case (which I agree is a bit a corner one) it leaded 
> me to misunderstanding as the PMU kprobe was only added to the first 
> ntfs_file_write_iter() which is not the one for ntfs3.

Hmm, OK. I think in that case (multiple same-name symbols exist) the default
behavior is rejecting with error message. And optionally, it will probe all
or them like your patch.

> 
> > Have you used 'perf probe' tool? It tries to find the appropriate function
> > by line number and creates the probe by 'text+OFFSET' style, not by symbol.
> > I think this is the correct way to do that, because user will not know
> > which 'address' of the symbol is what the user want.
> 
> 'perf probe' perfectly does the trick, as it would find all the kernel 
> addresses which correspond to the symbol name and create as many probes as 
> corresponding symbols [1]:
> root@vm-amd64:~# perf probe --add ntfs_file_write_iter

If you can specify the (last part of) file path as below,

perf probe --add ntfs_file_write_iter@ntfs3/file.c

Then it will choose correct one. :)

> 
> Added new events:
>   probe:ntfs_file_write_iter (on ntfs_file_write_iter)
>   probe:ntfs_file_write_iter (on ntfs_file_write_iter)
> 
> You can now use it in all perf tools, such as:
> 
>         perf record -e probe:ntfs_file_write_iter -aR sleep 1
> root@vm-amd64:~# cat /sys/kernel/tracing/kprobe_events
> p:probe/ntfs_file_write_iter _text+5088544
> p:probe/ntfs_file_write_iter _text+5278560
> 
> > Thought?
> 
> This contribution is basically here to sort of mimic what perf does but with 
> PMU kprobes, as this is not possible to write in a sysfs file with this type of 
> probe.

OK, I see it is for BPF only. Maybe BPF program can filter correct one
to access the argument etc. 

Thank you,

> 
> > 
> > Thank you,
> > 
> > > The idea here is to search all kernel functions which match this symbol
> > > and
> > > create a trace_kprobe for each of them.
> > > All these trace_kprobes are linked together by sharing the same
> > > trace_probe.
> > > 
> > > Signed-off-by: Francis Laniel <flaniel@linux.microsoft.com>
> > > ---
> > > 
> > >  kernel/trace/trace_kprobe.c | 86 +++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 86 insertions(+)
> > > 
> > > diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> > > index 1b3fa7b854aa..08580f1466c7 100644
> > > --- a/kernel/trace/trace_kprobe.c
> > > +++ b/kernel/trace/trace_kprobe.c
> > > @@ -1682,13 +1682,42 @@ static int unregister_kprobe_event(struct
> > > trace_kprobe *tk)> 
> > >  }
> > >  
> > >  #ifdef CONFIG_PERF_EVENTS
> > > 
> > > +
> > > +struct address_array {
> > > +	unsigned long *addrs;
> > > +	size_t size;
> > > +};
> > > +
> > > +static int add_addr(void *data, unsigned long addr)
> > > +{
> > > +	struct address_array *array = data;
> > > +	unsigned long *p;
> > > +
> > > +	array->size++;
> > > +	p = krealloc(array->addrs,
> > > +				sizeof(*array->addrs) * array->size,
> > > +				GFP_KERNEL);
> > > +	if (!p) {
> > > +		kfree(array->addrs);
> > > +		return -ENOMEM;
> > > +	}
> > > +
> > > +	array->addrs = p;
> > > +	array->addrs[array->size - 1] = addr;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > 
> > >  /* create a trace_kprobe, but don't add it to global lists */
> > >  struct trace_event_call *
> > >  create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
> > >  
> > >  			  bool is_return)
> > >  
> > >  {
> > >  
> > >  	enum probe_print_type ptype;
> > > 
> > > +	struct address_array array;
> > > 
> > >  	struct trace_kprobe *tk;
> > > 
> > > +	unsigned long func_addr;
> > > +	unsigned int i;
> > > 
> > >  	int ret;
> > >  	char *event;
> > > 
> > > @@ -1722,7 +1751,64 @@ create_local_trace_kprobe(char *func, void *addr,
> > > unsigned long offs,> 
> > >  	if (ret < 0)
> > >  	
> > >  		goto error;
> > > 
> > > +	array.addrs = NULL;
> > > +	array.size = 0;
> > > +	ret = kallsyms_on_each_match_symbol(add_addr, func, &array);
> > > +	if (ret)
> > > +		goto error_free;
> > > +
> > > +	if (array.size == 1)
> > > +		goto end;
> > > +
> > > +	/*
> > > +	 * Below loop allocates a trace_kprobe for each function with the same
> > > +	 * name in kernel source code.
> > > +	 * All this differente trace_kprobes will be linked together through
> > > +	 * append_trace_kprobe().
> > > +	 * NOTE append_trace_kprobe() is called in register_trace_kprobe() 
> which
> > > +	 * is called when a kprobe is added through sysfs.
> > > +	 */
> > > +	func_addr = kallsyms_lookup_name(func);
> > > +	for (i = 0; i < array.size; i++) {
> > > +		struct trace_kprobe *tk_same_name;
> > > +		unsigned long address;
> > > +
> > > +		address = array.addrs[i];
> > > +		/* Skip the function address as we already registered it. */
> > > +		if (address == func_addr)
> > > +			continue;
> > > +
> > > +		/*
> > > +		 * alloc_trace_kprobe() first considers symbol name, so we set
> > > +		 * this to NULL to allocate this kprobe on the given address.
> > > +		 */
> > > +		tk_same_name = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event,
> > > +						  (void *)address, NULL, offs,
> > > +						  0 /* maxactive */,
> > > +						  0 /* nargs */, is_return);
> > > +
> > > +		if (IS_ERR(tk_same_name)) {
> > > +			ret = -ENOMEM;
> > > +			goto error_free;
> > > +		}
> > > +
> > > +		init_trace_event_call(tk_same_name);
> > > +
> > > +		if (traceprobe_set_print_fmt(&tk_same_name->tp, ptype) < 0) {
> > > +			ret = -ENOMEM;
> > > +			goto error_free;
> > > +		}
> > > +
> > > +		ret = append_trace_kprobe(tk_same_name, tk);
> > > +		if (ret)
> > > +			goto error_free;
> > > +	}
> > > +
> > > +end:
> > > +	kfree(array.addrs);
> > > 
> > >  	return trace_probe_event_call(&tk->tp);
> > > 
> > > +error_free:
> > > +	kfree(array.addrs);
> > > 
> > >  error:
> > >  	free_trace_kprobe(tk);
> > >  	return ERR_PTR(ret);
> 
> ---
> [1]: https://github.com/torvalds/linux/blob/
> 57012c57536f8814dec92e74197ee96c3498d24e/tools/perf/util/probe-event.c#L2989-
> L2993
> 
>
Steven Rostedt Aug. 18, 2023, 3:41 p.m. UTC | #9
On Fri, 18 Aug 2023 21:37:05 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> That's why perf probe uses the offset from '_text'. Normal KASLR will just
> moves all symbols. (Finer one will move all symbols randomely)
> This should not need to access /proc/kallsyms but vmlinux or SystemMap.

We could just pass in: "_text+offset" too.

-- Steve
Francis Laniel Aug. 18, 2023, 6:12 p.m. UTC | #10
Hi.

Le vendredi 18 août 2023, 15:05:37 CEST Masami Hiramatsu a écrit :
> On Thu, 17 Aug 2023 13:06:20 +0200
> 
> Francis Laniel <flaniel@linux.microsoft.com> wrote:
> > Hi.
> > 
> > Le jeudi 17 août 2023, 09:50:57 CEST Masami Hiramatsu a écrit :
> > > Hi,
> > > 
> > > On Wed, 16 Aug 2023 18:35:17 +0200
> > > 
> > > Francis Laniel <flaniel@linux.microsoft.com> wrote:
> > > > When using sysfs, it is possible to create kprobe for several kernel
> > > > functions sharing the same name, but of course with different
> > > > addresses,
> > > > by writing their addresses in kprobe_events file.
> > > > 
> > > > When using PMU, if only the symbol name is given, the event will be
> > > > created for the first address which matches the symbol, as returned by
> > > > kallsyms_lookup_name().
> > > 
> > > Do you mean probing the same name symbols? Yes, it is intended behavior,
> > > since it is not always true that the same name function has the same
> > > prototype (it is mostly true but is not ensured), it is better to leave
> > > user to decide which one is what you want to probe.
> > 
> > This is what I meant.
> > I also share your mind regarding leaving the users deciding which one they
> > want to probe but in my case (which I agree is a bit a corner one) it
> > leaded me to misunderstanding as the PMU kprobe was only added to the
> > first ntfs_file_write_iter() which is not the one for ntfs3.
> 
> Hmm, OK. I think in that case (multiple same-name symbols exist) the default
> behavior is rejecting with error message. And optionally, it will probe all
> or them like your patch.

I am not sure to understand.
Can you please precise the default behavior of which software component?

> > > Have you used 'perf probe' tool? It tries to find the appropriate
> > > function
> > > by line number and creates the probe by 'text+OFFSET' style, not by
> > > symbol.
> > > I think this is the correct way to do that, because user will not know
> > > which 'address' of the symbol is what the user want.
> > 
> > 'perf probe' perfectly does the trick, as it would find all the kernel
> > addresses which correspond to the symbol name and create as many probes as
> > corresponding symbols [1]:
> > root@vm-amd64:~# perf probe --add ntfs_file_write_iter
> 
> If you can specify the (last part of) file path as below,
> 
> perf probe --add ntfs_file_write_iter@ntfs3/file.c
> 
> Then it will choose correct one. :)

Nice! TIL thank you! perf is really powerful!

> > Added new events:
> >   probe:ntfs_file_write_iter (on ntfs_file_write_iter)
> >   probe:ntfs_file_write_iter (on ntfs_file_write_iter)
> > 
> > You can now use it in all perf tools, such as:
> >         perf record -e probe:ntfs_file_write_iter -aR sleep 1
> > 
> > root@vm-amd64:~# cat /sys/kernel/tracing/kprobe_events
> > p:probe/ntfs_file_write_iter _text+5088544
> > p:probe/ntfs_file_write_iter _text+5278560
> > 
> > > Thought?
> > 
> > This contribution is basically here to sort of mimic what perf does but
> > with PMU kprobes, as this is not possible to write in a sysfs file with
> > this type of probe.
> 
> OK, I see it is for BPF only. Maybe BPF program can filter correct one
> to access the argument etc.

I am not sure I understand, can you please precise?
The eBPF program will be run when the kprobe will be triggered, so if the 
kprobe is armed for the function (e.g. old ntfs_file_write_iter()), the eBPF 
program will never be called.

> 
> Thank you,
> 
> > > Thank you,
> > > 
> > > > The idea here is to search all kernel functions which match this
> > > > symbol
> > > > and
> > > > create a trace_kprobe for each of them.
> > > > All these trace_kprobes are linked together by sharing the same
> > > > trace_probe.
> > > > 
> > > > Signed-off-by: Francis Laniel <flaniel@linux.microsoft.com>
> > > > ---
> > > > 
> > > >  kernel/trace/trace_kprobe.c | 86
> > > >  +++++++++++++++++++++++++++++++++++++
> > > >  1 file changed, 86 insertions(+)
> > > > 
> > > > diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> > > > index 1b3fa7b854aa..08580f1466c7 100644
> > > > --- a/kernel/trace/trace_kprobe.c
> > > > +++ b/kernel/trace/trace_kprobe.c
> > > > @@ -1682,13 +1682,42 @@ static int unregister_kprobe_event(struct
> > > > trace_kprobe *tk)>
> > > > 
> > > >  }
> > > >  
> > > >  #ifdef CONFIG_PERF_EVENTS
> > > > 
> > > > +
> > > > +struct address_array {
> > > > +	unsigned long *addrs;
> > > > +	size_t size;
> > > > +};
> > > > +
> > > > +static int add_addr(void *data, unsigned long addr)
> > > > +{
> > > > +	struct address_array *array = data;
> > > > +	unsigned long *p;
> > > > +
> > > > +	array->size++;
> > > > +	p = krealloc(array->addrs,
> > > > +				sizeof(*array->addrs) * array->size,
> > > > +				GFP_KERNEL);
> > > > +	if (!p) {
> > > > +		kfree(array->addrs);
> > > > +		return -ENOMEM;
> > > > +	}
> > > > +
> > > > +	array->addrs = p;
> > > > +	array->addrs[array->size - 1] = addr;
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > 
> > > >  /* create a trace_kprobe, but don't add it to global lists */
> > > >  struct trace_event_call *
> > > >  create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
> > > >  
> > > >  			  bool is_return)
> > > >  
> > > >  {
> > > >  
> > > >  	enum probe_print_type ptype;
> > > > 
> > > > +	struct address_array array;
> > > > 
> > > >  	struct trace_kprobe *tk;
> > > > 
> > > > +	unsigned long func_addr;
> > > > +	unsigned int i;
> > > > 
> > > >  	int ret;
> > > >  	char *event;
> > > > 
> > > > @@ -1722,7 +1751,64 @@ create_local_trace_kprobe(char *func, void
> > > > *addr,
> > > > unsigned long offs,>
> > > > 
> > > >  	if (ret < 0)
> > > >  	
> > > >  		goto error;
> > > > 
> > > > +	array.addrs = NULL;
> > > > +	array.size = 0;
> > > > +	ret = kallsyms_on_each_match_symbol(add_addr, func, &array);
> > > > +	if (ret)
> > > > +		goto error_free;
> > > > +
> > > > +	if (array.size == 1)
> > > > +		goto end;
> > > > +
> > > > +	/*
> > > > +	 * Below loop allocates a trace_kprobe for each function with the
> > > > same
> > > > +	 * name in kernel source code.
> > > > +	 * All this differente trace_kprobes will be linked together through
> > > > +	 * append_trace_kprobe().
> > > > +	 * NOTE append_trace_kprobe() is called in register_trace_kprobe()
> > 
> > which
> > 
> > > > +	 * is called when a kprobe is added through sysfs.
> > > > +	 */
> > > > +	func_addr = kallsyms_lookup_name(func);
> > > > +	for (i = 0; i < array.size; i++) {
> > > > +		struct trace_kprobe *tk_same_name;
> > > > +		unsigned long address;
> > > > +
> > > > +		address = array.addrs[i];
> > > > +		/* Skip the function address as we already registered it. */
> > > > +		if (address == func_addr)
> > > > +			continue;
> > > > +
> > > > +		/*
> > > > +		 * alloc_trace_kprobe() first considers symbol name, so we set
> > > > +		 * this to NULL to allocate this kprobe on the given address.
> > > > +		 */
> > > > +		tk_same_name = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event,
> > > > +						  (void *)address, NULL, offs,
> > > > +						  0 /* maxactive */,
> > > > +						  0 /* nargs */, is_return);
> > > > +
> > > > +		if (IS_ERR(tk_same_name)) {
> > > > +			ret = -ENOMEM;
> > > > +			goto error_free;
> > > > +		}
> > > > +
> > > > +		init_trace_event_call(tk_same_name);
> > > > +
> > > > +		if (traceprobe_set_print_fmt(&tk_same_name->tp, ptype) < 0) {
> > > > +			ret = -ENOMEM;
> > > > +			goto error_free;
> > > > +		}
> > > > +
> > > > +		ret = append_trace_kprobe(tk_same_name, tk);
> > > > +		if (ret)
> > > > +			goto error_free;
> > > > +	}
> > > > +
> > > > +end:
> > > > +	kfree(array.addrs);
> > > > 
> > > >  	return trace_probe_event_call(&tk->tp);
> > > > 
> > > > +error_free:
> > > > +	kfree(array.addrs);
> > > > 
> > > >  error:
> > > >  	free_trace_kprobe(tk);
> > > >  	return ERR_PTR(ret);
> > 
> > ---
> > [1]: https://github.com/torvalds/linux/blob/
> > 57012c57536f8814dec92e74197ee96c3498d24e/tools/perf/util/probe-event.c#L29
> > 89- L2993
Francis Laniel Aug. 18, 2023, 6:13 p.m. UTC | #11
Hi.

Le vendredi 18 août 2023, 17:41:41 CEST Steven Rostedt a écrit :
> On Fri, 18 Aug 2023 21:37:05 +0900
> 
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> > That's why perf probe uses the offset from '_text'. Normal KASLR will just
> > moves all symbols. (Finer one will move all symbols randomely)
> > This should not need to access /proc/kallsyms but vmlinux or SystemMap.
> 
> We could just pass in: "_text+offset" too.

So, the idea would be to change the existing create_local_trace_kprobe() and 
above functions to indicate the user's offset is to be used against _text and 
not address?

> -- Steve

Best regards.
Steven Rostedt Aug. 18, 2023, 6:20 p.m. UTC | #12
On Fri, 18 Aug 2023 20:13:43 +0200
Francis Laniel <flaniel@linux.microsoft.com> wrote:

> Hi.
> 
> Le vendredi 18 août 2023, 17:41:41 CEST Steven Rostedt a écrit :
> > On Fri, 18 Aug 2023 21:37:05 +0900
> > 
> > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:  
> > > That's why perf probe uses the offset from '_text'. Normal KASLR will just
> > > moves all symbols. (Finer one will move all symbols randomely)
> > > This should not need to access /proc/kallsyms but vmlinux or SystemMap.  
> > 
> > We could just pass in: "_text+offset" too.  
> 
> So, the idea would be to change the existing create_local_trace_kprobe() and 
> above functions to indicate the user's offset is to be used against _text and 
> not address?

No, not to modify that function, but if you know the offset from _text (via
the vmlinux), you can easily calculate it for that function.

I mentioned having a way to pass in the vmlinux debug info address and
subtract the kaslr_offset from it. But that's actually unnecessary. If you
have the address of the function you want, and the address of _text, both
from the debug info of vmlinux, you can simply pass in "_text+offset", and
then use kallsyms to give you _text, and add the offset to give you the
address for create_local_trace_kprobe().

-- Steve
Masami Hiramatsu (Google) Aug. 19, 2023, 1:11 a.m. UTC | #13
Hi Francis,
(Cc: Song Liu and BPF ML)

On Fri, 18 Aug 2023 20:12:11 +0200
Francis Laniel <flaniel@linux.microsoft.com> wrote:

> Hi.
> 
> Le vendredi 18 août 2023, 15:05:37 CEST Masami Hiramatsu a écrit :
> > On Thu, 17 Aug 2023 13:06:20 +0200
> > 
> > Francis Laniel <flaniel@linux.microsoft.com> wrote:
> > > Hi.
> > > 
> > > Le jeudi 17 août 2023, 09:50:57 CEST Masami Hiramatsu a écrit :
> > > > Hi,
> > > > 
> > > > On Wed, 16 Aug 2023 18:35:17 +0200
> > > > 
> > > > Francis Laniel <flaniel@linux.microsoft.com> wrote:
> > > > > When using sysfs, it is possible to create kprobe for several kernel
> > > > > functions sharing the same name, but of course with different
> > > > > addresses,
> > > > > by writing their addresses in kprobe_events file.
> > > > > 
> > > > > When using PMU, if only the symbol name is given, the event will be
> > > > > created for the first address which matches the symbol, as returned by
> > > > > kallsyms_lookup_name().
> > > > 
> > > > Do you mean probing the same name symbols? Yes, it is intended behavior,
> > > > since it is not always true that the same name function has the same
> > > > prototype (it is mostly true but is not ensured), it is better to leave
> > > > user to decide which one is what you want to probe.
> > > 
> > > This is what I meant.
> > > I also share your mind regarding leaving the users deciding which one they
> > > want to probe but in my case (which I agree is a bit a corner one) it
> > > leaded me to misunderstanding as the PMU kprobe was only added to the
> > > first ntfs_file_write_iter() which is not the one for ntfs3.
> > 
> > Hmm, OK. I think in that case (multiple same-name symbols exist) the default
> > behavior is rejecting with error message. And optionally, it will probe all
> > or them like your patch.
> 
> I am not sure to understand.
> Can you please precise the default behavior of which software component?

I meant that the behavior of the kprobe-events via /sys/kernel/tracing.
But your patch is for the other interface for perf as kprobe-event PMU.
In that case, I think we should CC to other users like BPF because
this may change the expected behavior.

> 
> > > > Have you used 'perf probe' tool? It tries to find the appropriate
> > > > function
> > > > by line number and creates the probe by 'text+OFFSET' style, not by
> > > > symbol.
> > > > I think this is the correct way to do that, because user will not know
> > > > which 'address' of the symbol is what the user want.
> > > 
> > > 'perf probe' perfectly does the trick, as it would find all the kernel
> > > addresses which correspond to the symbol name and create as many probes as
> > > corresponding symbols [1]:
> > > root@vm-amd64:~# perf probe --add ntfs_file_write_iter
> > 
> > If you can specify the (last part of) file path as below,
> > 
> > perf probe --add ntfs_file_write_iter@ntfs3/file.c
> > 
> > Then it will choose correct one. :)
> 
> Nice! TIL thank you! perf is really powerful!

Yeah, but note that the perf-probe is a tool to setup a 'visible' tracepoint
event. After making a new tracepoint event, the perf tool can use such
"[Tracepoint event]" instead of PMU.

Unfortunately, kprobe-event 'PMU' version doesn't support this
because it has been introduced for BPF. See the original series;

https://lore.kernel.org/lkml/20171206224518.3598254-1-songliubraving@fb.com/

So, the "local_kprobe_event" is making a kprobe PMU which is a event for
local session, that is designed for using such event from BPF (if I
understand correctly). Of course BPF tool can setup its local
event with a unique symbol + offset (not just a symbol) in a BPF tool with
perf-probe but it doesn't.

Could you tell me how do you use this feature, for what perpose?

If you just need to trace/profile a specific function which has the same
name symbols, you might be better to use `perf probe` + `/sys/kernel/tracing`
or `perf record -e EVENT`.

Or if you need to run it with CAP_PERFMON, without CAP_SYS_ADMIN,
we need to change a userspace tool to find the correct address and
pass it to the perf_event_open().

> 
> > > Added new events:
> > >   probe:ntfs_file_write_iter (on ntfs_file_write_iter)
> > >   probe:ntfs_file_write_iter (on ntfs_file_write_iter)
> > > 
> > > You can now use it in all perf tools, such as:
> > >         perf record -e probe:ntfs_file_write_iter -aR sleep 1
> > > 
> > > root@vm-amd64:~# cat /sys/kernel/tracing/kprobe_events
> > > p:probe/ntfs_file_write_iter _text+5088544
> > > p:probe/ntfs_file_write_iter _text+5278560
> > > 
> > > > Thought?
> > > 
> > > This contribution is basically here to sort of mimic what perf does but
> > > with PMU kprobes, as this is not possible to write in a sysfs file with
> > > this type of probe.
> > 
> > OK, I see it is for BPF only. Maybe BPF program can filter correct one
> > to access the argument etc.
> 
> I am not sure I understand, can you please precise?
> The eBPF program will be run when the kprobe will be triggered, so if the 
> kprobe is armed for the function (e.g. old ntfs_file_write_iter()), the eBPF 
> program will never be called.

As I said above, it is userspace BPF loader issue, because it has to specify
the correct address via unique symbol + offset, instead of attaching all of them.
I think that will be more side-effects.

But anyway, thanks for pointing this issue. I should fix kprobe event to reject
the symbols which is not unique. That should be pointed by other unique symbols. 

Thank you,

> 
> > 
> > Thank you,
> > 
> > > > Thank you,
> > > > 
> > > > > The idea here is to search all kernel functions which match this
> > > > > symbol
> > > > > and
> > > > > create a trace_kprobe for each of them.
> > > > > All these trace_kprobes are linked together by sharing the same
> > > > > trace_probe.
> > > > > 
> > > > > Signed-off-by: Francis Laniel <flaniel@linux.microsoft.com>
> > > > > ---
> > > > > 
> > > > >  kernel/trace/trace_kprobe.c | 86
> > > > >  +++++++++++++++++++++++++++++++++++++
> > > > >  1 file changed, 86 insertions(+)
> > > > > 
> > > > > diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> > > > > index 1b3fa7b854aa..08580f1466c7 100644
> > > > > --- a/kernel/trace/trace_kprobe.c
> > > > > +++ b/kernel/trace/trace_kprobe.c
> > > > > @@ -1682,13 +1682,42 @@ static int unregister_kprobe_event(struct
> > > > > trace_kprobe *tk)>
> > > > > 
> > > > >  }
> > > > >  
> > > > >  #ifdef CONFIG_PERF_EVENTS
> > > > > 
> > > > > +
> > > > > +struct address_array {
> > > > > +	unsigned long *addrs;
> > > > > +	size_t size;
> > > > > +};
> > > > > +
> > > > > +static int add_addr(void *data, unsigned long addr)
> > > > > +{
> > > > > +	struct address_array *array = data;
> > > > > +	unsigned long *p;
> > > > > +
> > > > > +	array->size++;
> > > > > +	p = krealloc(array->addrs,
> > > > > +				sizeof(*array->addrs) * array->size,
> > > > > +				GFP_KERNEL);
> > > > > +	if (!p) {
> > > > > +		kfree(array->addrs);
> > > > > +		return -ENOMEM;
> > > > > +	}
> > > > > +
> > > > > +	array->addrs = p;
> > > > > +	array->addrs[array->size - 1] = addr;
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > 
> > > > >  /* create a trace_kprobe, but don't add it to global lists */
> > > > >  struct trace_event_call *
> > > > >  create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
> > > > >  
> > > > >  			  bool is_return)
> > > > >  
> > > > >  {
> > > > >  
> > > > >  	enum probe_print_type ptype;
> > > > > 
> > > > > +	struct address_array array;
> > > > > 
> > > > >  	struct trace_kprobe *tk;
> > > > > 
> > > > > +	unsigned long func_addr;
> > > > > +	unsigned int i;
> > > > > 
> > > > >  	int ret;
> > > > >  	char *event;
> > > > > 
> > > > > @@ -1722,7 +1751,64 @@ create_local_trace_kprobe(char *func, void
> > > > > *addr,
> > > > > unsigned long offs,>
> > > > > 
> > > > >  	if (ret < 0)
> > > > >  	
> > > > >  		goto error;
> > > > > 
> > > > > +	array.addrs = NULL;
> > > > > +	array.size = 0;
> > > > > +	ret = kallsyms_on_each_match_symbol(add_addr, func, &array);
> > > > > +	if (ret)
> > > > > +		goto error_free;
> > > > > +
> > > > > +	if (array.size == 1)
> > > > > +		goto end;
> > > > > +
> > > > > +	/*
> > > > > +	 * Below loop allocates a trace_kprobe for each function with the
> > > > > same
> > > > > +	 * name in kernel source code.
> > > > > +	 * All this differente trace_kprobes will be linked together through
> > > > > +	 * append_trace_kprobe().
> > > > > +	 * NOTE append_trace_kprobe() is called in register_trace_kprobe()
> > > 
> > > which
> > > 
> > > > > +	 * is called when a kprobe is added through sysfs.
> > > > > +	 */
> > > > > +	func_addr = kallsyms_lookup_name(func);
> > > > > +	for (i = 0; i < array.size; i++) {
> > > > > +		struct trace_kprobe *tk_same_name;
> > > > > +		unsigned long address;
> > > > > +
> > > > > +		address = array.addrs[i];
> > > > > +		/* Skip the function address as we already registered it. */
> > > > > +		if (address == func_addr)
> > > > > +			continue;
> > > > > +
> > > > > +		/*
> > > > > +		 * alloc_trace_kprobe() first considers symbol name, so we set
> > > > > +		 * this to NULL to allocate this kprobe on the given address.
> > > > > +		 */
> > > > > +		tk_same_name = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event,
> > > > > +						  (void *)address, NULL, offs,
> > > > > +						  0 /* maxactive */,
> > > > > +						  0 /* nargs */, is_return);
> > > > > +
> > > > > +		if (IS_ERR(tk_same_name)) {
> > > > > +			ret = -ENOMEM;
> > > > > +			goto error_free;
> > > > > +		}
> > > > > +
> > > > > +		init_trace_event_call(tk_same_name);
> > > > > +
> > > > > +		if (traceprobe_set_print_fmt(&tk_same_name->tp, ptype) < 0) {
> > > > > +			ret = -ENOMEM;
> > > > > +			goto error_free;
> > > > > +		}
> > > > > +
> > > > > +		ret = append_trace_kprobe(tk_same_name, tk);
> > > > > +		if (ret)
> > > > > +			goto error_free;
> > > > > +	}
> > > > > +
> > > > > +end:
> > > > > +	kfree(array.addrs);
> > > > > 
> > > > >  	return trace_probe_event_call(&tk->tp);
> > > > > 
> > > > > +error_free:
> > > > > +	kfree(array.addrs);
> > > > > 
> > > > >  error:
> > > > >  	free_trace_kprobe(tk);
> > > > >  	return ERR_PTR(ret);
> > > 
> > > ---
> > > [1]: https://github.com/torvalds/linux/blob/
> > > 57012c57536f8814dec92e74197ee96c3498d24e/tools/perf/util/probe-event.c#L29
> > > 89- L2993
> 
> 
> 
>
Masami Hiramatsu (Google) Aug. 19, 2023, 1:15 a.m. UTC | #14
On Fri, 18 Aug 2023 14:20:33 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Fri, 18 Aug 2023 20:13:43 +0200
> Francis Laniel <flaniel@linux.microsoft.com> wrote:
> 
> > Hi.
> > 
> > Le vendredi 18 août 2023, 17:41:41 CEST Steven Rostedt a écrit :
> > > On Fri, 18 Aug 2023 21:37:05 +0900
> > > 
> > > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:  
> > > > That's why perf probe uses the offset from '_text'. Normal KASLR will just
> > > > moves all symbols. (Finer one will move all symbols randomely)
> > > > This should not need to access /proc/kallsyms but vmlinux or SystemMap.  
> > > 
> > > We could just pass in: "_text+offset" too.  
> > 
> > So, the idea would be to change the existing create_local_trace_kprobe() and 
> > above functions to indicate the user's offset is to be used against _text and 
> > not address?
> 
> No, not to modify that function, but if you know the offset from _text (via
> the vmlinux), you can easily calculate it for that function.

Note that the kprobe-event PMU interface itself allows you to specify
FUNC+OFFSET style;

https://lore.kernel.org/lkml/20171206224518.3598254-5-songliubraving@fb.com/

perf_event_attr::kprobe_func = "_text";
perf_event_attr::probe_offset = OFFSET;

Then, it should be able to specify the correct one. Of course you can use
other unique symbols around the target symbol.

Thank you,

> 
> I mentioned having a way to pass in the vmlinux debug info address and
> subtract the kaslr_offset from it. But that's actually unnecessary. If you
> have the address of the function you want, and the address of _text, both
> from the debug info of vmlinux, you can simply pass in "_text+offset", and
> then use kallsyms to give you _text, and add the offset to give you the
> address for create_local_trace_kprobe().
> 
> -- Steve
Song Liu Aug. 19, 2023, 3:22 p.m. UTC | #15
On Fri, Aug 18, 2023 at 6:16 PM Masami Hiramatsu <mhiramat@kernel.org> wrote:
>
> On Fri, 18 Aug 2023 14:20:33 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
> > On Fri, 18 Aug 2023 20:13:43 +0200
> > Francis Laniel <flaniel@linux.microsoft.com> wrote:
> >
> > > Hi.
> > >
> > > Le vendredi 18 août 2023, 17:41:41 CEST Steven Rostedt a écrit :
> > > > On Fri, 18 Aug 2023 21:37:05 +0900
> > > >
> > > > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> > > > > That's why perf probe uses the offset from '_text'. Normal KASLR will just
> > > > > moves all symbols. (Finer one will move all symbols randomely)
> > > > > This should not need to access /proc/kallsyms but vmlinux or SystemMap.
> > > >
> > > > We could just pass in: "_text+offset" too.
> > >
> > > So, the idea would be to change the existing create_local_trace_kprobe() and
> > > above functions to indicate the user's offset is to be used against _text and
> > > not address?
> >
> > No, not to modify that function, but if you know the offset from _text (via
> > the vmlinux), you can easily calculate it for that function.
>
> Note that the kprobe-event PMU interface itself allows you to specify
> FUNC+OFFSET style;
>
> https://lore.kernel.org/lkml/20171206224518.3598254-5-songliubraving@fb.com/
>
> perf_event_attr::kprobe_func = "_text";
> perf_event_attr::probe_offset = OFFSET;
>
> Then, it should be able to specify the correct one. Of course you can use
> other unique symbols around the target symbol.

Trying to catch up with the thread.

Besides the CAP_* issue, we can do this with

perf_event_attr::kprobe_func = NULL;
perf_event_attr::kprobe_addr = address;

Then for the CAP_*, I think we should give CAP_PERFMON access to
/proc/kallsyms. Would this work?

Thanks,
Song
Masami Hiramatsu (Google) Aug. 20, 2023, 9:32 a.m. UTC | #16
On Sat, 19 Aug 2023 08:22:50 -0700
Song Liu <song@kernel.org> wrote:

> On Fri, Aug 18, 2023 at 6:16 PM Masami Hiramatsu <mhiramat@kernel.org> wrote:
> >
> > On Fri, 18 Aug 2023 14:20:33 -0400
> > Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > > On Fri, 18 Aug 2023 20:13:43 +0200
> > > Francis Laniel <flaniel@linux.microsoft.com> wrote:
> > >
> > > > Hi.
> > > >
> > > > Le vendredi 18 août 2023, 17:41:41 CEST Steven Rostedt a écrit :
> > > > > On Fri, 18 Aug 2023 21:37:05 +0900
> > > > >
> > > > > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> > > > > > That's why perf probe uses the offset from '_text'. Normal KASLR will just
> > > > > > moves all symbols. (Finer one will move all symbols randomely)
> > > > > > This should not need to access /proc/kallsyms but vmlinux or SystemMap.
> > > > >
> > > > > We could just pass in: "_text+offset" too.
> > > >
> > > > So, the idea would be to change the existing create_local_trace_kprobe() and
> > > > above functions to indicate the user's offset is to be used against _text and
> > > > not address?
> > >
> > > No, not to modify that function, but if you know the offset from _text (via
> > > the vmlinux), you can easily calculate it for that function.
> >
> > Note that the kprobe-event PMU interface itself allows you to specify
> > FUNC+OFFSET style;
> >
> > https://lore.kernel.org/lkml/20171206224518.3598254-5-songliubraving@fb.com/
> >
> > perf_event_attr::kprobe_func = "_text";
> > perf_event_attr::probe_offset = OFFSET;
> >
> > Then, it should be able to specify the correct one. Of course you can use
> > other unique symbols around the target symbol.
> 
> Trying to catch up with the thread.

Thanks for your reply :)

> 
> Besides the CAP_* issue, we can do this with
> 
> perf_event_attr::kprobe_func = NULL;
> perf_event_attr::kprobe_addr = address;

As I pointed, you don't need actual address, instead, you can specify the
probe point via "unique symbol" + offset. 

> 
> Then for the CAP_*, I think we should give CAP_PERFMON access to
> /proc/kallsyms. Would this work?

For the "unique symbol" + offset, you don't need the kallsyms, but need to
access the System.map or vmlinux image. In this case, we don't need to expand
the CAP_PERFMON capabilities.

Thank you,

> 
> Thanks,
> Song
Song Liu Aug. 20, 2023, 10:02 a.m. UTC | #17
On Sun, Aug 20, 2023 at 2:32 AM Masami Hiramatsu <mhiramat@kernel.org> wrote:
>
[...]
> > >
> > > perf_event_attr::kprobe_func = "_text";
> > > perf_event_attr::probe_offset = OFFSET;
> > >
> > > Then, it should be able to specify the correct one. Of course you can use
> > > other unique symbols around the target symbol.
> >
> > Trying to catch up with the thread.
>
> Thanks for your reply :)
>
> >
> > Besides the CAP_* issue, we can do this with
> >
> > perf_event_attr::kprobe_func = NULL;
> > perf_event_attr::kprobe_addr = address;
>
> As I pointed, you don't need actual address, instead, you can specify the
> probe point via "unique symbol" + offset.

Technically, this works. But it is weird to me.

> >
> > Then for the CAP_*, I think we should give CAP_PERFMON access to
> > /proc/kallsyms. Would this work?
>
> For the "unique symbol" + offset, you don't need the kallsyms, but need to
> access the System.map or vmlinux image. In this case, we don't need to expand
> the CAP_PERFMON capabilities.

I agree this is not needed in this case. But I wonder whether it makes sense
to give CAP_PERFMON access to /proc/kallsyms. Will this change make
CAP_PERFMON less secure?

Thanks,
Song
Masami Hiramatsu (Google) Aug. 20, 2023, 1:16 p.m. UTC | #18
On Sun, 20 Aug 2023 03:02:18 -0700
Song Liu <song@kernel.org> wrote:

> On Sun, Aug 20, 2023 at 2:32 AM Masami Hiramatsu <mhiramat@kernel.org> wrote:
> >
> [...]
> > > >
> > > > perf_event_attr::kprobe_func = "_text";
> > > > perf_event_attr::probe_offset = OFFSET;
> > > >
> > > > Then, it should be able to specify the correct one. Of course you can use
> > > > other unique symbols around the target symbol.
> > >
> > > Trying to catch up with the thread.
> >
> > Thanks for your reply :)
> >
> > >
> > > Besides the CAP_* issue, we can do this with
> > >
> > > perf_event_attr::kprobe_func = NULL;
> > > perf_event_attr::kprobe_addr = address;
> >
> > As I pointed, you don't need actual address, instead, you can specify the
> > probe point via "unique symbol" + offset.
> 
> Technically, this works. But it is weird to me.

It's not so weired because it is a relative address, e.g. from _text,
this means "the address in the text section". And perf probe already
uses it a while.

> > >
> > > Then for the CAP_*, I think we should give CAP_PERFMON access to
> > > /proc/kallsyms. Would this work?
> >
> > For the "unique symbol" + offset, you don't need the kallsyms, but need to
> > access the System.map or vmlinux image. In this case, we don't need to expand
> > the CAP_PERFMON capabilities.
> 
> I agree this is not needed in this case. But I wonder whether it makes sense
> to give CAP_PERFMON access to /proc/kallsyms. Will this change make
> CAP_PERFMON less secure?

Yes, because /proc/kallsyms will expose the real address of the all
symbols, which makes KASLR useless. But on the other hand, it maybe
already useless because BPF program can read any real address, right?
Hmm, from this point of view, is the CAP_PERFMON meaningful?
(maybe it can avoid loading modules etc.)

Thank you,

> 
> Thanks,
> Song
Jiri Olsa Aug. 20, 2023, 8:23 p.m. UTC | #19
On Sat, Aug 19, 2023 at 10:11:05AM +0900, Masami Hiramatsu wrote:
> Hi Francis,
> (Cc: Song Liu and BPF ML)
> 
> On Fri, 18 Aug 2023 20:12:11 +0200
> Francis Laniel <flaniel@linux.microsoft.com> wrote:
> 
> > Hi.
> > 
> > Le vendredi 18 août 2023, 15:05:37 CEST Masami Hiramatsu a écrit :
> > > On Thu, 17 Aug 2023 13:06:20 +0200
> > > 
> > > Francis Laniel <flaniel@linux.microsoft.com> wrote:
> > > > Hi.
> > > > 
> > > > Le jeudi 17 août 2023, 09:50:57 CEST Masami Hiramatsu a écrit :
> > > > > Hi,
> > > > > 
> > > > > On Wed, 16 Aug 2023 18:35:17 +0200
> > > > > 
> > > > > Francis Laniel <flaniel@linux.microsoft.com> wrote:
> > > > > > When using sysfs, it is possible to create kprobe for several kernel
> > > > > > functions sharing the same name, but of course with different
> > > > > > addresses,
> > > > > > by writing their addresses in kprobe_events file.
> > > > > > 
> > > > > > When using PMU, if only the symbol name is given, the event will be
> > > > > > created for the first address which matches the symbol, as returned by
> > > > > > kallsyms_lookup_name().
> > > > > 
> > > > > Do you mean probing the same name symbols? Yes, it is intended behavior,
> > > > > since it is not always true that the same name function has the same
> > > > > prototype (it is mostly true but is not ensured), it is better to leave
> > > > > user to decide which one is what you want to probe.
> > > > 
> > > > This is what I meant.
> > > > I also share your mind regarding leaving the users deciding which one they
> > > > want to probe but in my case (which I agree is a bit a corner one) it
> > > > leaded me to misunderstanding as the PMU kprobe was only added to the
> > > > first ntfs_file_write_iter() which is not the one for ntfs3.
> > > 
> > > Hmm, OK. I think in that case (multiple same-name symbols exist) the default
> > > behavior is rejecting with error message. And optionally, it will probe all
> > > or them like your patch.
> > 
> > I am not sure to understand.
> > Can you please precise the default behavior of which software component?
> 
> I meant that the behavior of the kprobe-events via /sys/kernel/tracing.
> But your patch is for the other interface for perf as kprobe-event PMU.
> In that case, I think we should CC to other users like BPF because
> this may change the expected behavior.

it does not break bpf tests, but of course we don't have such use case, but I
think should make this optional not to potentionaly break existing users,
because you get more probes than you currently ask for

would be great to have some kind of tests for this as well

SNIP

> > > > > > +		/*
> > > > > > +		 * alloc_trace_kprobe() first considers symbol name, so we set
> > > > > > +		 * this to NULL to allocate this kprobe on the given address.
> > > > > > +		 */
> > > > > > +		tk_same_name = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event,
> > > > > > +						  (void *)address, NULL, offs,
> > > > > > +						  0 /* maxactive */,
> > > > > > +						  0 /* nargs */, is_return);
> > > > > > +
> > > > > > +		if (IS_ERR(tk_same_name)) {
> > > > > > +			ret = -ENOMEM;
> > > > > > +			goto error_free;
> > > > > > +		}
> > > > > > +
> > > > > > +		init_trace_event_call(tk_same_name);
> > > > > > +
> > > > > > +		if (traceprobe_set_print_fmt(&tk_same_name->tp, ptype) < 0) {
> > > > > > +			ret = -ENOMEM;
> > > > > > +			goto error_free;
> > > > > > +		}
> > > > > > +
> > > > > > +		ret = append_trace_kprobe(tk_same_name, tk);
> > > > > > +		if (ret)
> > > > > > +			goto error_free;

this seems tricky if offs is specified, because IIUC that will most
likely fail in the __register_trace_kprobe/register_kprobe call inside
the append_trace_kprobe ... should we allow this just for offs == 0 ?

jirka
Jiri Olsa Aug. 20, 2023, 8:34 p.m. UTC | #20
On Sat, Aug 19, 2023 at 10:11:05AM +0900, Masami Hiramatsu wrote:

SNIP

> > > > > > +	func_addr = kallsyms_lookup_name(func);
> > > > > > +	for (i = 0; i < array.size; i++) {
> > > > > > +		struct trace_kprobe *tk_same_name;
> > > > > > +		unsigned long address;
> > > > > > +
> > > > > > +		address = array.addrs[i];
> > > > > > +		/* Skip the function address as we already registered it. */
> > > > > > +		if (address == func_addr)
> > > > > > +			continue;
> > > > > > +
> > > > > > +		/*
> > > > > > +		 * alloc_trace_kprobe() first considers symbol name, so we set
> > > > > > +		 * this to NULL to allocate this kprobe on the given address.
> > > > > > +		 */
> > > > > > +		tk_same_name = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event,
> > > > > > +						  (void *)address, NULL, offs,
> > > > > > +						  0 /* maxactive */,
> > > > > > +						  0 /* nargs */, is_return);
> > > > > > +
> > > > > > +		if (IS_ERR(tk_same_name)) {
> > > > > > +			ret = -ENOMEM;
> > > > > > +			goto error_free;
> > > > > > +		}
> > > > > > +
> > > > > > +		init_trace_event_call(tk_same_name);
> > > > > > +
> > > > > > +		if (traceprobe_set_print_fmt(&tk_same_name->tp, ptype) < 0) {
> > > > > > +			ret = -ENOMEM;

also are we leaking tk_same_name in here?


> > > > > > +			goto error_free;
> > > > > > +		}
> > > > > > +
> > > > > > +		ret = append_trace_kprobe(tk_same_name, tk);
> > > > > > +		if (ret)

and here?

jirka

> > > > > > +			goto error_free;
> > > > > > +	}
> > > > > > +
> > > > > > +end:
> > > > > > +	kfree(array.addrs);
> > > > > > 
> > > > > >  	return trace_probe_event_call(&tk->tp);
> > > > > > 
> > > > > > +error_free:
> > > > > > +	kfree(array.addrs);
> > > > > > 
> > > > > >  error:
> > > > > >  	free_trace_kprobe(tk);
> > > > > >  	return ERR_PTR(ret);
> > > > 
> > > > ---
> > > > [1]: https://github.com/torvalds/linux/blob/
> > > > 57012c57536f8814dec92e74197ee96c3498d24e/tools/perf/util/probe-event.c#L29
> > > > 89- L2993
> > 
> > 
> > 
> > 
> 
> 
> -- 
> Masami Hiramatsu (Google) <mhiramat@kernel.org>
>
Song Liu Aug. 21, 2023, 6:09 a.m. UTC | #21
On Sun, Aug 20, 2023 at 6:16 AM Masami Hiramatsu <mhiramat@kernel.org> wrote:
>
> On Sun, 20 Aug 2023 03:02:18 -0700
> Song Liu <song@kernel.org> wrote:
>
> > On Sun, Aug 20, 2023 at 2:32 AM Masami Hiramatsu <mhiramat@kernel.org> wrote:
> > >
> > [...]
> > > > >
> > > > > perf_event_attr::kprobe_func = "_text";
> > > > > perf_event_attr::probe_offset = OFFSET;
> > > > >
> > > > > Then, it should be able to specify the correct one. Of course you can use
> > > > > other unique symbols around the target symbol.
> > > >
> > > > Trying to catch up with the thread.
> > >
> > > Thanks for your reply :)
> > >
> > > >
> > > > Besides the CAP_* issue, we can do this with
> > > >
> > > > perf_event_attr::kprobe_func = NULL;
> > > > perf_event_attr::kprobe_addr = address;
> > >
> > > As I pointed, you don't need actual address, instead, you can specify the
> > > probe point via "unique symbol" + offset.
> >
> > Technically, this works. But it is weird to me.
>
> It's not so weired because it is a relative address, e.g. from _text,
> this means "the address in the text section". And perf probe already
> uses it a while.
>
> > > >
> > > > Then for the CAP_*, I think we should give CAP_PERFMON access to
> > > > /proc/kallsyms. Would this work?
> > >
> > > For the "unique symbol" + offset, you don't need the kallsyms, but need to
> > > access the System.map or vmlinux image. In this case, we don't need to expand
> > > the CAP_PERFMON capabilities.
> >
> > I agree this is not needed in this case. But I wonder whether it makes sense
> > to give CAP_PERFMON access to /proc/kallsyms. Will this change make
> > CAP_PERFMON less secure?
>
> Yes, because /proc/kallsyms will expose the real address of the all
> symbols, which makes KASLR useless. But on the other hand, it maybe
> already useless because BPF program can read any real address, right?
> Hmm, from this point of view, is the CAP_PERFMON meaningful?
> (maybe it can avoid loading modules etc.)

kprobe BPF program has access to pt_regs, so it can read ip of the
attached function. Can we do the same with regular kprobe (no bpf)?

Thanks,
Song

>
> Thank you,
>
> >
> > Thanks,
> > Song
>
>
> --
> Masami Hiramatsu (Google) <mhiramat@kernel.org>
Masami Hiramatsu (Google) Aug. 21, 2023, 10:01 a.m. UTC | #22
On Sun, 20 Aug 2023 23:09:22 -0700
Song Liu <song@kernel.org> wrote:

> On Sun, Aug 20, 2023 at 6:16 AM Masami Hiramatsu <mhiramat@kernel.org> wrote:
> >
> > On Sun, 20 Aug 2023 03:02:18 -0700
> > Song Liu <song@kernel.org> wrote:
> >
> > > On Sun, Aug 20, 2023 at 2:32 AM Masami Hiramatsu <mhiramat@kernel.org> wrote:
> > > >
> > > [...]
> > > > > >
> > > > > > perf_event_attr::kprobe_func = "_text";
> > > > > > perf_event_attr::probe_offset = OFFSET;
> > > > > >
> > > > > > Then, it should be able to specify the correct one. Of course you can use
> > > > > > other unique symbols around the target symbol.
> > > > >
> > > > > Trying to catch up with the thread.
> > > >
> > > > Thanks for your reply :)
> > > >
> > > > >
> > > > > Besides the CAP_* issue, we can do this with
> > > > >
> > > > > perf_event_attr::kprobe_func = NULL;
> > > > > perf_event_attr::kprobe_addr = address;
> > > >
> > > > As I pointed, you don't need actual address, instead, you can specify the
> > > > probe point via "unique symbol" + offset.
> > >
> > > Technically, this works. But it is weird to me.
> >
> > It's not so weired because it is a relative address, e.g. from _text,
> > this means "the address in the text section". And perf probe already
> > uses it a while.
> >
> > > > >
> > > > > Then for the CAP_*, I think we should give CAP_PERFMON access to
> > > > > /proc/kallsyms. Would this work?
> > > >
> > > > For the "unique symbol" + offset, you don't need the kallsyms, but need to
> > > > access the System.map or vmlinux image. In this case, we don't need to expand
> > > > the CAP_PERFMON capabilities.
> > >
> > > I agree this is not needed in this case. But I wonder whether it makes sense
> > > to give CAP_PERFMON access to /proc/kallsyms. Will this change make
> > > CAP_PERFMON less secure?
> >
> > Yes, because /proc/kallsyms will expose the real address of the all
> > symbols, which makes KASLR useless. But on the other hand, it maybe
> > already useless because BPF program can read any real address, right?
> > Hmm, from this point of view, is the CAP_PERFMON meaningful?
> > (maybe it can avoid loading modules etc.)
> 
> kprobe BPF program has access to pt_regs, so it can read ip of the
> attached function. Can we do the same with regular kprobe (no bpf)?

Yes, it can. So I think it is OK to expand CAP_PERFMON to access kallsyms.
But this means CAP_PERMON itself is not safe in some case.

Thank you,

> 
> Thanks,
> Song
> 
> >
> > Thank you,
> >
> > >
> > > Thanks,
> > > Song
> >
> >
> > --
> > Masami Hiramatsu (Google) <mhiramat@kernel.org>
Francis Laniel Aug. 21, 2023, 12:22 p.m. UTC | #23
Hi.

Le dimanche 20 août 2023, 22:23:55 CEST Jiri Olsa a écrit :
> On Sat, Aug 19, 2023 at 10:11:05AM +0900, Masami Hiramatsu wrote:
> > Hi Francis,
> > (Cc: Song Liu and BPF ML)
> > 
> > On Fri, 18 Aug 2023 20:12:11 +0200
> > 
> > Francis Laniel <flaniel@linux.microsoft.com> wrote:
> > > Hi.
> > > 
> > > Le vendredi 18 août 2023, 15:05:37 CEST Masami Hiramatsu a écrit :
> > > > On Thu, 17 Aug 2023 13:06:20 +0200
> > > > 
> > > > Francis Laniel <flaniel@linux.microsoft.com> wrote:
> > > > > Hi.
> > > > > 
> > > > > Le jeudi 17 août 2023, 09:50:57 CEST Masami Hiramatsu a écrit :
> > > > > > Hi,
> > > > > > 
> > > > > > On Wed, 16 Aug 2023 18:35:17 +0200
> > > > > > 
> > > > > > Francis Laniel <flaniel@linux.microsoft.com> wrote:
> > > > > > > When using sysfs, it is possible to create kprobe for several
> > > > > > > kernel
> > > > > > > functions sharing the same name, but of course with different
> > > > > > > addresses,
> > > > > > > by writing their addresses in kprobe_events file.
> > > > > > > 
> > > > > > > When using PMU, if only the symbol name is given, the event will
> > > > > > > be
> > > > > > > created for the first address which matches the symbol, as
> > > > > > > returned by
> > > > > > > kallsyms_lookup_name().
> > > > > > 
> > > > > > Do you mean probing the same name symbols? Yes, it is intended
> > > > > > behavior,
> > > > > > since it is not always true that the same name function has the
> > > > > > same
> > > > > > prototype (it is mostly true but is not ensured), it is better to
> > > > > > leave
> > > > > > user to decide which one is what you want to probe.
> > > > > 
> > > > > This is what I meant.
> > > > > I also share your mind regarding leaving the users deciding which
> > > > > one they
> > > > > want to probe but in my case (which I agree is a bit a corner one)
> > > > > it
> > > > > leaded me to misunderstanding as the PMU kprobe was only added to
> > > > > the
> > > > > first ntfs_file_write_iter() which is not the one for ntfs3.
> > > > 
> > > > Hmm, OK. I think in that case (multiple same-name symbols exist) the
> > > > default behavior is rejecting with error message. And optionally, it
> > > > will probe all or them like your patch.
> > > 
> > > I am not sure to understand.
> > > Can you please precise the default behavior of which software component?
> > 
> > I meant that the behavior of the kprobe-events via /sys/kernel/tracing.
> > But your patch is for the other interface for perf as kprobe-event PMU.
> > In that case, I think we should CC to other users like BPF because
> > this may change the expected behavior.
> 
> it does not break bpf tests, but of course we don't have such use case, but
> I think should make this optional not to potentionaly break existing users,
> because you get more probes than you currently ask for
> 
> would be great to have some kind of tests for this as well

If we decide to go further with this contribution, I will add some kind of 
test (even though I do not really see how to test it at the moment).

> SNIP
> 
> > > > > > > +		/*
> > > > > > > +		 * alloc_trace_kprobe() first considers symbol name, 
so we
> > > > > > > set
> > > > > > > +		 * this to NULL to allocate this kprobe on the given 
address.
> > > > > > > +		 */
> > > > > > > +		tk_same_name = 
alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event,
> > > > > > > +						  (void *)address, NULL, 
offs,
> > > > > > > +						  0 /* maxactive */,
> > > > > > > +						  0 /* nargs */, 
is_return);
> > > > > > > +
> > > > > > > +		if (IS_ERR(tk_same_name)) {
> > > > > > > +			ret = -ENOMEM;
> > > > > > > +			goto error_free;
> > > > > > > +		}
> > > > > > > +
> > > > > > > +		init_trace_event_call(tk_same_name);
> > > > > > > +
> > > > > > > +		if (traceprobe_set_print_fmt(&tk_same_name->tp, 
ptype) < 0) {
> > > > > > > +			ret = -ENOMEM;
> > > > > > > +			goto error_free;
> > > > > > > +		}
> > > > > > > +
> > > > > > > +		ret = append_trace_kprobe(tk_same_name, tk);
> > > > > > > +		if (ret)
> > > > > > > +			goto error_free;
> 
> this seems tricky if offs is specified, because IIUC that will most
> likely fail in the __register_trace_kprobe/register_kprobe call inside
> the append_trace_kprobe ... should we allow this just for offs == 0 ?

Excellent catch!
I will correct it for v2 if I send one!

> jirka
Francis Laniel Aug. 21, 2023, 12:24 p.m. UTC | #24
Hi.

Le dimanche 20 août 2023, 22:34:40 CEST Jiri Olsa a écrit :
> On Sat, Aug 19, 2023 at 10:11:05AM +0900, Masami Hiramatsu wrote:
> 
> SNIP
> 
> > > > > > > +	func_addr = kallsyms_lookup_name(func);
> > > > > > > +	for (i = 0; i < array.size; i++) {
> > > > > > > +		struct trace_kprobe *tk_same_name;
> > > > > > > +		unsigned long address;
> > > > > > > +
> > > > > > > +		address = array.addrs[i];
> > > > > > > +		/* Skip the function address as we already 
registered it. */
> > > > > > > +		if (address == func_addr)
> > > > > > > +			continue;
> > > > > > > +
> > > > > > > +		/*
> > > > > > > +		 * alloc_trace_kprobe() first considers symbol name, 
so we
> > > > > > > set
> > > > > > > +		 * this to NULL to allocate this kprobe on the given 
address.
> > > > > > > +		 */
> > > > > > > +		tk_same_name = 
alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event,
> > > > > > > +						  (void *)address, NULL, 
offs,
> > > > > > > +						  0 /* maxactive */,
> > > > > > > +						  0 /* nargs */, 
is_return);
> > > > > > > +
> > > > > > > +		if (IS_ERR(tk_same_name)) {
> > > > > > > +			ret = -ENOMEM;
> > > > > > > +			goto error_free;
> > > > > > > +		}
> > > > > > > +
> > > > > > > +		init_trace_event_call(tk_same_name);
> > > > > > > +
> > > > > > > +		if (traceprobe_set_print_fmt(&tk_same_name->tp, 
ptype) < 0) {
> > > > > > > +			ret = -ENOMEM;
> 
> also are we leaking tk_same_name in here?
> 
> > > > > > > +			goto error_free;
> > > > > > > +		}
> > > > > > > +
> > > > > > > +		ret = append_trace_kprobe(tk_same_name, tk);
> > > > > > > +		if (ret)
> 
> and here?

Good catch!
Do you know if the appended probes are automatically freed? If so, can you 
please indicate which function handles this?

> jirka
> 
> > > > > > > +			goto error_free;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +end:
> > > > > > > +	kfree(array.addrs);
> > > > > > > 
> > > > > > >  	return trace_probe_event_call(&tk->tp);
> > > > > > > 
> > > > > > > +error_free:
> > > > > > > +	kfree(array.addrs);
> > > > > > > 
> > > > > > >  error:
> > > > > > >  	free_trace_kprobe(tk);
> > > > > > >  	return ERR_PTR(ret);
> > > > > 
> > > > > ---
> > > > > [1]: https://github.com/torvalds/linux/blob/
> > > > > 57012c57536f8814dec92e74197ee96c3498d24e/tools/perf/util/probe-event
> > > > > .c#L29
> > > > > 89- L2993

Best regards.
Francis Laniel Aug. 21, 2023, 12:55 p.m. UTC | #25
Hi.

Le samedi 19 août 2023, 03:11:05 CEST Masami Hiramatsu a écrit :
> Hi Francis,
> (Cc: Song Liu and BPF ML)
> 
> On Fri, 18 Aug 2023 20:12:11 +0200
> 
> Francis Laniel <flaniel@linux.microsoft.com> wrote:
> > Hi.
> > 
> > Le vendredi 18 août 2023, 15:05:37 CEST Masami Hiramatsu a écrit :
> > > On Thu, 17 Aug 2023 13:06:20 +0200
> > > 
> > > Francis Laniel <flaniel@linux.microsoft.com> wrote:
> > > > Hi.
> > > > 
> > > > Le jeudi 17 août 2023, 09:50:57 CEST Masami Hiramatsu a écrit :
> > > > > Hi,
> > > > > 
> > > > > On Wed, 16 Aug 2023 18:35:17 +0200
> > > > > 
> > > > > Francis Laniel <flaniel@linux.microsoft.com> wrote:
> > > > > > When using sysfs, it is possible to create kprobe for several
> > > > > > kernel
> > > > > > functions sharing the same name, but of course with different
> > > > > > addresses,
> > > > > > by writing their addresses in kprobe_events file.
> > > > > > 
> > > > > > When using PMU, if only the symbol name is given, the event will
> > > > > > be
> > > > > > created for the first address which matches the symbol, as
> > > > > > returned by
> > > > > > kallsyms_lookup_name().
> > > > > 
> > > > > Do you mean probing the same name symbols? Yes, it is intended
> > > > > behavior,
> > > > > since it is not always true that the same name function has the same
> > > > > prototype (it is mostly true but is not ensured), it is better to
> > > > > leave
> > > > > user to decide which one is what you want to probe.
> > > > 
> > > > This is what I meant.
> > > > I also share your mind regarding leaving the users deciding which one
> > > > they
> > > > want to probe but in my case (which I agree is a bit a corner one) it
> > > > leaded me to misunderstanding as the PMU kprobe was only added to the
> > > > first ntfs_file_write_iter() which is not the one for ntfs3.
> > > 
> > > Hmm, OK. I think in that case (multiple same-name symbols exist) the
> > > default behavior is rejecting with error message. And optionally, it
> > > will probe all or them like your patch.
> > 
> > I am not sure to understand.
> > Can you please precise the default behavior of which software component?
> 
> I meant that the behavior of the kprobe-events via /sys/kernel/tracing.
> But your patch is for the other interface for perf as kprobe-event PMU.
> In that case, I think we should CC to other users like BPF because
> this may change the expected behavior.
> 
> > > > > Have you used 'perf probe' tool? It tries to find the appropriate
> > > > > function
> > > > > by line number and creates the probe by 'text+OFFSET' style, not by
> > > > > symbol.
> > > > > I think this is the correct way to do that, because user will not
> > > > > know
> > > > > which 'address' of the symbol is what the user want.
> > > > 
> > > > 'perf probe' perfectly does the trick, as it would find all the kernel
> > > > addresses which correspond to the symbol name and create as many
> > > > probes as
> > > > corresponding symbols [1]:
> > > > root@vm-amd64:~# perf probe --add ntfs_file_write_iter
> > > 
> > > If you can specify the (last part of) file path as below,
> > > 
> > > perf probe --add ntfs_file_write_iter@ntfs3/file.c
> > > 
> > > Then it will choose correct one. :)
> > 
> > Nice! TIL thank you! perf is really powerful!
> 
> Yeah, but note that the perf-probe is a tool to setup a 'visible' tracepoint
> event. After making a new tracepoint event, the perf tool can use such
> "[Tracepoint event]" instead of PMU.
> 
> Unfortunately, kprobe-event 'PMU' version doesn't support this
> because it has been introduced for BPF. See the original series;
> 
> https://lore.kernel.org/lkml/20171206224518.3598254-1-songliubraving@fb.com/
> 
> So, the "local_kprobe_event" is making a kprobe PMU which is a event for
> local session, that is designed for using such event from BPF (if I
> understand correctly). Of course BPF tool can setup its local
> event with a unique symbol + offset (not just a symbol) in a BPF tool with
> perf-probe but it doesn't.
> 
> Could you tell me how do you use this feature, for what perpose?

Sure (I think I detailed this in the cover letter but I only sent it to the 
"main" mailing list and not the tracing one, sorry for this inconvenience)!

Basically, I was adding NTFS tracing to an existing tool which monitors slow 
I/Os using BPF [1].
To test the tool, I compiled a kernel with both NTFS module built-in and 
figured out the write operations when done on ntfs3 were missing from the 
output of the tool.
The problem comes from the library I use in the tool which does not handle 
well when it exists different symbols with the same name.
Contrary to perf, which only handles kprobes through sysfs, the library 
handles it in both way (sysfs and PMU) with a preference for PMU when 
available [2].

After some discussion, I thought there could be a way to handle this 
automatically in the kernel when using PMU kprobes, hence this patch.
I totally understand the case I described above is really a corner one, but I 
thought this feature could be useful for other people.
In the case of the library itself, we could indeed find the address in /proc/
kallsyms but it would mean having CAP_SYS_ADMIN which is not forcefully 
something we want to enforce.
Also, if we need to read /boot/vmlinuz or /boot/System.map we also need to be 
root as these files often belong to root and cannot be read by others.
So, this patch solves the above problem while not needing specific capabilities 
as the kernel will solve it for us.

> If you just need to trace/profile a specific function which has the same
> name symbols, you might be better to use `perf probe` +
> `/sys/kernel/tracing` or `perf record -e EVENT`.
> 
> Or if you need to run it with CAP_PERFMON, without CAP_SYS_ADMIN,
> we need to change a userspace tool to find the correct address and
> pass it to the perf_event_open().
> 
> > > > Added new events:
> > > >   probe:ntfs_file_write_iter (on ntfs_file_write_iter)
> > > >   probe:ntfs_file_write_iter (on ntfs_file_write_iter)
> > > > 
> > > > You can now use it in all perf tools, such as:
> > > >         perf record -e probe:ntfs_file_write_iter -aR sleep 1
> > > > 
> > > > root@vm-amd64:~# cat /sys/kernel/tracing/kprobe_events
> > > > p:probe/ntfs_file_write_iter _text+5088544
> > > > p:probe/ntfs_file_write_iter _text+5278560
> > > > 
> > > > > Thought?
> > > > 
> > > > This contribution is basically here to sort of mimic what perf does
> > > > but
> > > > with PMU kprobes, as this is not possible to write in a sysfs file
> > > > with
> > > > this type of probe.
> > > 
> > > OK, I see it is for BPF only. Maybe BPF program can filter correct one
> > > to access the argument etc.
> > 
> > I am not sure I understand, can you please precise?
> > The eBPF program will be run when the kprobe will be triggered, so if the
> > kprobe is armed for the function (e.g. old ntfs_file_write_iter()), the
> > eBPF program will never be called.
> 
> As I said above, it is userspace BPF loader issue, because it has to specify
> the correct address via unique symbol + offset, instead of attaching all of
> them. I think that will be more side-effects.
> 
> But anyway, thanks for pointing this issue. I should fix kprobe event to
> reject the symbols which is not unique. That should be pointed by other
> unique symbols.

You are welcome and I thank you for the discussion.
Can you please precise more what you think about "reject the symbols which is 
not unique"?
Basically something like this:
struct trace_event_call *
create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
			  bool is_return)
{
	...
	if (!addr && func) {
		array.addrs = NULL;
		array.size = 0;
		ret = kallsyms_on_each_match_symbol(add_addr, func, &array);
		if (ret)
			goto error_free;

		if (array.size != 1) {
			/* 
			 * Function name corresponding to several symbols must
			 * be passed by address only.
			 */
			return -EINVAL;
		}
	}
	...
}
?
If my understanding is correct, I think I can write a patch to achieve this.

> Thank you,
> 
> > > Thank you,
> > > 
> > > > > Thank you,
> > > > > 
> > > > > > The idea here is to search all kernel functions which match this
> > > > > > symbol
> > > > > > and
> > > > > > create a trace_kprobe for each of them.
> > > > > > All these trace_kprobes are linked together by sharing the same
> > > > > > trace_probe.
> > > > > > 
> > > > > > Signed-off-by: Francis Laniel <flaniel@linux.microsoft.com>
> > > > > > ---
> > > > > > 
> > > > > >  kernel/trace/trace_kprobe.c | 86
> > > > > >  +++++++++++++++++++++++++++++++++++++
> > > > > >  1 file changed, 86 insertions(+)
> > > > > > 
> > > > > > diff --git a/kernel/trace/trace_kprobe.c
> > > > > > b/kernel/trace/trace_kprobe.c
> > > > > > index 1b3fa7b854aa..08580f1466c7 100644
> > > > > > --- a/kernel/trace/trace_kprobe.c
> > > > > > +++ b/kernel/trace/trace_kprobe.c
> > > > > > @@ -1682,13 +1682,42 @@ static int unregister_kprobe_event(struct
> > > > > > trace_kprobe *tk)>
> > > > > > 
> > > > > >  }
> > > > > >  
> > > > > >  #ifdef CONFIG_PERF_EVENTS
> > > > > > 
> > > > > > +
> > > > > > +struct address_array {
> > > > > > +	unsigned long *addrs;
> > > > > > +	size_t size;
> > > > > > +};
> > > > > > +
> > > > > > +static int add_addr(void *data, unsigned long addr)
> > > > > > +{
> > > > > > +	struct address_array *array = data;
> > > > > > +	unsigned long *p;
> > > > > > +
> > > > > > +	array->size++;
> > > > > > +	p = krealloc(array->addrs,
> > > > > > +				sizeof(*array->addrs) * array->size,
> > > > > > +				GFP_KERNEL);
> > > > > > +	if (!p) {
> > > > > > +		kfree(array->addrs);
> > > > > > +		return -ENOMEM;
> > > > > > +	}
> > > > > > +
> > > > > > +	array->addrs = p;
> > > > > > +	array->addrs[array->size - 1] = addr;
> > > > > > +
> > > > > > +	return 0;
> > > > > > +}
> > > > > > +
> > > > > > 
> > > > > >  /* create a trace_kprobe, but don't add it to global lists */
> > > > > >  struct trace_event_call *
> > > > > >  create_local_trace_kprobe(char *func, void *addr, unsigned long
> > > > > >  offs,
> > > > > >  
> > > > > >  			  bool is_return)
> > > > > >  
> > > > > >  {
> > > > > >  
> > > > > >  	enum probe_print_type ptype;
> > > > > > 
> > > > > > +	struct address_array array;
> > > > > > 
> > > > > >  	struct trace_kprobe *tk;
> > > > > > 
> > > > > > +	unsigned long func_addr;
> > > > > > +	unsigned int i;
> > > > > > 
> > > > > >  	int ret;
> > > > > >  	char *event;
> > > > > > 
> > > > > > @@ -1722,7 +1751,64 @@ create_local_trace_kprobe(char *func, void
> > > > > > *addr,
> > > > > > unsigned long offs,>
> > > > > > 
> > > > > >  	if (ret < 0)
> > > > > >  	
> > > > > >  		goto error;
> > > > > > 
> > > > > > +	array.addrs = NULL;
> > > > > > +	array.size = 0;
> > > > > > +	ret = kallsyms_on_each_match_symbol(add_addr, func, &array);
> > > > > > +	if (ret)
> > > > > > +		goto error_free;
> > > > > > +
> > > > > > +	if (array.size == 1)
> > > > > > +		goto end;
> > > > > > +
> > > > > > +	/*
> > > > > > +	 * Below loop allocates a trace_kprobe for each function 
with
> > > > > > the
> > > > > > same
> > > > > > +	 * name in kernel source code.
> > > > > > +	 * All this differente trace_kprobes will be linked together
> > > > > > through
> > > > > > +	 * append_trace_kprobe().
> > > > > > +	 * NOTE append_trace_kprobe() is called in
> > > > > > register_trace_kprobe()
> > > > 
> > > > which
> > > > 
> > > > > > +	 * is called when a kprobe is added through sysfs.
> > > > > > +	 */
> > > > > > +	func_addr = kallsyms_lookup_name(func);
> > > > > > +	for (i = 0; i < array.size; i++) {
> > > > > > +		struct trace_kprobe *tk_same_name;
> > > > > > +		unsigned long address;
> > > > > > +
> > > > > > +		address = array.addrs[i];
> > > > > > +		/* Skip the function address as we already registered 
it. */
> > > > > > +		if (address == func_addr)
> > > > > > +			continue;
> > > > > > +
> > > > > > +		/*
> > > > > > +		 * alloc_trace_kprobe() first considers symbol name, 
so we set
> > > > > > +		 * this to NULL to allocate this kprobe on the given 
address.
> > > > > > +		 */
> > > > > > +		tk_same_name = 
alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event,
> > > > > > +						  (void *)address, NULL, 
offs,
> > > > > > +						  0 /* maxactive */,
> > > > > > +						  0 /* nargs */, 
is_return);
> > > > > > +
> > > > > > +		if (IS_ERR(tk_same_name)) {
> > > > > > +			ret = -ENOMEM;
> > > > > > +			goto error_free;
> > > > > > +		}
> > > > > > +
> > > > > > +		init_trace_event_call(tk_same_name);
> > > > > > +
> > > > > > +		if (traceprobe_set_print_fmt(&tk_same_name->tp, 
ptype) < 0) {
> > > > > > +			ret = -ENOMEM;
> > > > > > +			goto error_free;
> > > > > > +		}
> > > > > > +
> > > > > > +		ret = append_trace_kprobe(tk_same_name, tk);
> > > > > > +		if (ret)
> > > > > > +			goto error_free;
> > > > > > +	}
> > > > > > +
> > > > > > +end:
> > > > > > +	kfree(array.addrs);
> > > > > > 
> > > > > >  	return trace_probe_event_call(&tk->tp);
> > > > > > 
> > > > > > +error_free:
> > > > > > +	kfree(array.addrs);
> > > > > > 
> > > > > >  error:
> > > > > >  	free_trace_kprobe(tk);
> > > > > >  	return ERR_PTR(ret);
> > > > 
> > > > ---
> > > > [1]: https://github.com/torvalds/linux/blob/
> > > > 57012c57536f8814dec92e74197ee96c3498d24e/tools/perf/util/probe-event.c
> > > > #L29
> > > > 89- L2993

Best regards.
---
[1]: https://github.com/inspektor-gadget/inspektor-gadget/pull/1879
[2]: https://github.com/cilium/ebpf/blob/
270c859894bd38cdd0c7783317b16343409e4df8/link/kprobe.go#L165-L191
Steven Rostedt Aug. 21, 2023, 2:29 p.m. UTC | #26
On Sun, 20 Aug 2023 03:02:18 -0700
Song Liu <song@kernel.org> wrote:

> > For the "unique symbol" + offset, you don't need the kallsyms, but need to
> > access the System.map or vmlinux image. In this case, we don't need to expand
> > the CAP_PERFMON capabilities.  
> 
> I agree this is not needed in this case. But I wonder whether it makes sense
> to give CAP_PERFMON access to /proc/kallsyms. Will this change make
> CAP_PERFMON less secure?

I guess the question is, does CAP_PERFMON allow seeing where the kernel
mapped itself via some other means? If not, then no, I would nack this as
being a security hole.

-- Steve
Steven Rostedt Aug. 21, 2023, 2:45 p.m. UTC | #27
On Mon, 21 Aug 2023 19:01:52 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> > kprobe BPF program has access to pt_regs, so it can read ip of the
> > attached function. Can we do the same with regular kprobe (no bpf)?  
> 
> Yes, it can. So I think it is OK to expand CAP_PERFMON to access kallsyms.
> But this means CAP_PERMON itself is not safe in some case.

What are the privileges that CAP_PERFMON gives. I can see why Kees told me
to avoid capabilities when looking at what has access to tracefs. Because
it becomes very difficult to know what the privileges you are giving when
you give out a capability. I just stick to normal ACL (file permissions)
and everything is much easier and simpler to know what has access to what.

-- Steve
Masami Hiramatsu (Google) Aug. 21, 2023, 3:19 p.m. UTC | #28
On Mon, 21 Aug 2023 10:29:43 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Sun, 20 Aug 2023 03:02:18 -0700
> Song Liu <song@kernel.org> wrote:
> 
> > > For the "unique symbol" + offset, you don't need the kallsyms, but need to
> > > access the System.map or vmlinux image. In this case, we don't need to expand
> > > the CAP_PERFMON capabilities.  
> > 
> > I agree this is not needed in this case. But I wonder whether it makes sense
> > to give CAP_PERFMON access to /proc/kallsyms. Will this change make
> > CAP_PERFMON less secure?
> 
> I guess the question is, does CAP_PERFMON allow seeing where the kernel
> mapped itself via some other means? If not, then no, I would nack this as
> being a security hole.

As Song said, CAP_PERFMON allows user to put a kprobe anywhere user specified,
and it can run BPF program on it. This means it can access raw address
information via registers. if so, CAP_PERFMON allows seeing the kernel map or
equivalent information at this point.

> 
> -- Steve
Steven Rostedt Aug. 21, 2023, 3:28 p.m. UTC | #29
On Tue, 22 Aug 2023 00:19:55 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> > I guess the question is, does CAP_PERFMON allow seeing where the kernel
> > mapped itself via some other means? If not, then no, I would nack this as
> > being a security hole.  
> 
> As Song said, CAP_PERFMON allows user to put a kprobe anywhere user specified,
> and it can run BPF program on it. This means it can access raw address
> information via registers. if so, CAP_PERFMON allows seeing the kernel map or
> equivalent information at this point.

I still don't like just giving it full kallsyms access.

I really do hate capabilities.

-- Steve
Kees Cook Aug. 21, 2023, 6:07 p.m. UTC | #30
On Mon, Aug 21, 2023 at 10:45:50AM -0400, Steven Rostedt wrote:
> On Mon, 21 Aug 2023 19:01:52 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> 
> > > kprobe BPF program has access to pt_regs, so it can read ip of the
> > > attached function. Can we do the same with regular kprobe (no bpf)?  
> > 
> > Yes, it can. So I think it is OK to expand CAP_PERFMON to access kallsyms.
> > But this means CAP_PERMON itself is not safe in some case.
> 
> What are the privileges that CAP_PERFMON gives. I can see why Kees told me
> to avoid capabilities when looking at what has access to tracefs. Because
> it becomes very difficult to know what the privileges you are giving when
> you give out a capability. I just stick to normal ACL (file permissions)
> and everything is much easier and simpler to know what has access to what.

At the very least, having a fd-based "handle" for access work. But yeah,
capabilities get ugly quickly.

Anyway... what does CAP_PERFMON have access to right now? If it is
allowed to read arbitrary kernel memory, then resolving symbols is fine.
If it doesn't, then no, it shouldn't: it becomes a oracle for probing
symbol locations.
Jiri Olsa Aug. 22, 2023, 1:13 p.m. UTC | #31
On Mon, Aug 21, 2023 at 02:24:06PM +0200, Francis Laniel wrote:
> Hi.
> 
> Le dimanche 20 août 2023, 22:34:40 CEST Jiri Olsa a écrit :
> > On Sat, Aug 19, 2023 at 10:11:05AM +0900, Masami Hiramatsu wrote:
> > 
> > SNIP
> > 
> > > > > > > > +	func_addr = kallsyms_lookup_name(func);
> > > > > > > > +	for (i = 0; i < array.size; i++) {
> > > > > > > > +		struct trace_kprobe *tk_same_name;
> > > > > > > > +		unsigned long address;
> > > > > > > > +
> > > > > > > > +		address = array.addrs[i];
> > > > > > > > +		/* Skip the function address as we already 
> registered it. */
> > > > > > > > +		if (address == func_addr)
> > > > > > > > +			continue;
> > > > > > > > +
> > > > > > > > +		/*
> > > > > > > > +		 * alloc_trace_kprobe() first considers symbol name, 
> so we
> > > > > > > > set
> > > > > > > > +		 * this to NULL to allocate this kprobe on the given 
> address.
> > > > > > > > +		 */
> > > > > > > > +		tk_same_name = 
> alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event,
> > > > > > > > +						  (void *)address, NULL, 
> offs,
> > > > > > > > +						  0 /* maxactive */,
> > > > > > > > +						  0 /* nargs */, 
> is_return);
> > > > > > > > +
> > > > > > > > +		if (IS_ERR(tk_same_name)) {
> > > > > > > > +			ret = -ENOMEM;
> > > > > > > > +			goto error_free;
> > > > > > > > +		}
> > > > > > > > +
> > > > > > > > +		init_trace_event_call(tk_same_name);
> > > > > > > > +
> > > > > > > > +		if (traceprobe_set_print_fmt(&tk_same_name->tp, 
> ptype) < 0) {
> > > > > > > > +			ret = -ENOMEM;
> > 
> > also are we leaking tk_same_name in here?
> > 
> > > > > > > > +			goto error_free;
> > > > > > > > +		}
> > > > > > > > +
> > > > > > > > +		ret = append_trace_kprobe(tk_same_name, tk);
> > > > > > > > +		if (ret)
> > 
> > and here?
> 
> Good catch!
> Do you know if the appended probes are automatically freed? If so, can you 
> please indicate which function handles this?

hum, I don't see the release code going through the list of appended probes,
so I wonder you need to somehow do that in destroy_local_trace_kprobe

jirka
Masami Hiramatsu (Google) Aug. 23, 2023, 12:36 a.m. UTC | #32
Hi,

On Mon, 21 Aug 2023 14:55:14 +0200
Francis Laniel <flaniel@linux.microsoft.com> wrote:

> > Could you tell me how do you use this feature, for what perpose?
> 
> Sure (I think I detailed this in the cover letter but I only sent it to the 
> "main" mailing list and not the tracing one, sorry for this inconvenience)!
> 
> Basically, I was adding NTFS tracing to an existing tool which monitors slow 
> I/Os using BPF [1].
> To test the tool, I compiled a kernel with both NTFS module built-in and 
> figured out the write operations when done on ntfs3 were missing from the 
> output of the tool.
> The problem comes from the library I use in the tool which does not handle 
> well when it exists different symbols with the same name.
> Contrary to perf, which only handles kprobes through sysfs, the library 
> handles it in both way (sysfs and PMU) with a preference for PMU when 
> available [2].
> 
> After some discussion, I thought there could be a way to handle this 
> automatically in the kernel when using PMU kprobes, hence this patch.
> I totally understand the case I described above is really a corner one, but I 
> thought this feature could be useful for other people.
> In the case of the library itself, we could indeed find the address in /proc/
> kallsyms but it would mean having CAP_SYS_ADMIN which is not forcefully 
> something we want to enforce.
> Also, if we need to read /boot/vmlinuz or /boot/System.map we also need to be 
> root as these files often belong to root and cannot be read by others.
> So, this patch solves the above problem while not needing specific capabilities 
> as the kernel will solve it for us.

Thanks for the explanation. I got the background, and still have some questions.

- Is the analysis tool really necessary to be used by users other than
  CAP_SYS_ADMIN? Even if it is useful, I still doubt CAP_PERFMON is safer
  than CAP_SYS_ADMIN, because BPF program can access any kernel register.

- My concern about this solution (enabling kprobe PMU on all symbols which
  have the same name) makes it hard to run the same BPF program on it.
  This is because symbols with the same name do not necessarily have the
  same arguments (theoretically). Also, the BPF will trace unwanted symbols
  at unwanted timing.

- Can you expand that library to handle the same name symbols differently?
  I think this should be done in the user space, or in the kallsyms like
  storing symbols with source line information.

I understand this demand, but solving that with probing *all* symbols seems
like a brute force solution and may cause another problem later.

But this is a good discussion item. Last month Alessandro sent a script which
makes such symbols unique. Current problem is that the numbering is not enough
to identify which one is from which source code.

https://lore.kernel.org/all/20230714150326.1152359-1-alessandro.carminati@gmail.com/ 

> 
> > If you just need to trace/profile a specific function which has the same
> > name symbols, you might be better to use `perf probe` +
> > `/sys/kernel/tracing` or `perf record -e EVENT`.
> > 
> > Or if you need to run it with CAP_PERFMON, without CAP_SYS_ADMIN,
> > we need to change a userspace tool to find the correct address and
> > pass it to the perf_event_open().
> > 
> > > > > Added new events:
> > > > >   probe:ntfs_file_write_iter (on ntfs_file_write_iter)
> > > > >   probe:ntfs_file_write_iter (on ntfs_file_write_iter)
> > > > > 
> > > > > You can now use it in all perf tools, such as:
> > > > >         perf record -e probe:ntfs_file_write_iter -aR sleep 1
> > > > > 
> > > > > root@vm-amd64:~# cat /sys/kernel/tracing/kprobe_events
> > > > > p:probe/ntfs_file_write_iter _text+5088544
> > > > > p:probe/ntfs_file_write_iter _text+5278560
> > > > > 
> > > > > > Thought?
> > > > > 
> > > > > This contribution is basically here to sort of mimic what perf does
> > > > > but
> > > > > with PMU kprobes, as this is not possible to write in a sysfs file
> > > > > with
> > > > > this type of probe.
> > > > 
> > > > OK, I see it is for BPF only. Maybe BPF program can filter correct one
> > > > to access the argument etc.
> > > 
> > > I am not sure I understand, can you please precise?
> > > The eBPF program will be run when the kprobe will be triggered, so if the
> > > kprobe is armed for the function (e.g. old ntfs_file_write_iter()), the
> > > eBPF program will never be called.
> > 
> > As I said above, it is userspace BPF loader issue, because it has to specify
> > the correct address via unique symbol + offset, instead of attaching all of
> > them. I think that will be more side-effects.
> > 
> > But anyway, thanks for pointing this issue. I should fix kprobe event to
> > reject the symbols which is not unique. That should be pointed by other
> > unique symbols.
> 
> You are welcome and I thank you for the discussion.
> Can you please precise more what you think about "reject the symbols which is 
> not unique"?
> Basically something like this:

Yes, that's what I said.

> struct trace_event_call *
> create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
> 			  bool is_return)
> {
> 	...
> 	if (!addr && func) {

if (func) {  /* because anyway if user specify "func" we have to solve
 the symbol address */

> 		array.addrs = NULL;
> 		array.size = 0;
> 		ret = kallsyms_on_each_match_symbol(add_addr, func, &array);
> 		if (ret)
> 			goto error_free;
> 
> 		if (array.size != 1) {
> 			/* 
> 			 * Function name corresponding to several symbols must
> 			 * be passed by address only.
> 			 */
> 			return -EINVAL;

This case may return a unique error code so that the caller can notice
the problem.

Thank you,

> 		}
> 	}



> 	...
> }
> ?
> If my understanding is correct, I think I can write a patch to achieve this.
> 

> 
> Best regards.
> ---
> [1]: https://github.com/inspektor-gadget/inspektor-gadget/pull/1879
> [2]: https://github.com/cilium/ebpf/blob/
> 270c859894bd38cdd0c7783317b16343409e4df8/link/kprobe.go#L165-L191
> 
>
Francis Laniel Aug. 23, 2023, 9:54 a.m. UTC | #33
Hi.

Le mercredi 23 août 2023, 02:36:14 CEST Masami Hiramatsu a écrit :
> Hi,
> 
> On Mon, 21 Aug 2023 14:55:14 +0200
> 
> Francis Laniel <flaniel@linux.microsoft.com> wrote:
> > > Could you tell me how do you use this feature, for what perpose?
> > 
> > Sure (I think I detailed this in the cover letter but I only sent it to
> > the
> > "main" mailing list and not the tracing one, sorry for this
> > inconvenience)!
> > 
> > Basically, I was adding NTFS tracing to an existing tool which monitors
> > slow I/Os using BPF [1].
> > To test the tool, I compiled a kernel with both NTFS module built-in and
> > figured out the write operations when done on ntfs3 were missing from the
> > output of the tool.
> > The problem comes from the library I use in the tool which does not handle
> > well when it exists different symbols with the same name.
> > Contrary to perf, which only handles kprobes through sysfs, the library
> > handles it in both way (sysfs and PMU) with a preference for PMU when
> > available [2].
> > 
> > After some discussion, I thought there could be a way to handle this
> > automatically in the kernel when using PMU kprobes, hence this patch.
> > I totally understand the case I described above is really a corner one,
> > but I thought this feature could be useful for other people.
> > In the case of the library itself, we could indeed find the address in
> > /proc/ kallsyms but it would mean having CAP_SYS_ADMIN which is not
> > forcefully something we want to enforce.
> > Also, if we need to read /boot/vmlinuz or /boot/System.map we also need to
> > be root as these files often belong to root and cannot be read by others.
> > So, this patch solves the above problem while not needing specific
> > capabilities as the kernel will solve it for us.
> 
> Thanks for the explanation. I got the background, and still have some
> questions.
> 
> - Is the analysis tool really necessary to be used by users other than
>   CAP_SYS_ADMIN? Even if it is useful, I still doubt CAP_PERFMON is safer
>   than CAP_SYS_ADMIN, because BPF program can access any kernel register.

For the tool itself, this is indeed not a problem as we rely on CAP_SYS_ADMIN.
But this one for the library, as they do not want to enforce CAP_SYS_ADMIN to 
use the library.

> - My concern about this solution (enabling kprobe PMU on all symbols which
>   have the same name) makes it hard to run the same BPF program on it.
>   This is because symbols with the same name do not necessarily have the
>   same arguments (theoretically). Also, the BPF will trace unwanted symbols
>   at unwanted timing.

Good point for the same name but different arguments!
I was too focused on my case (ntfs_file_write_iter()) and forgot about this.

> - Can you expand that library to handle the same name symbols differently?
>   I think this should be done in the user space, or in the kallsyms like
>   storing symbols with source line information.

I think we can find a way to handle this in user space by potentially 
abstracting the several PMU probe under one.
Or we can simply explode if a name correspond to several symbols and ask the 
user to use addr + offset to precise the symbol in this case.

> I understand this demand, but solving that with probing *all* symbols seems
> like a brute force solution and may cause another problem later.
> 
> But this is a good discussion item. Last month Alessandro sent a script
> which makes such symbols unique. Current problem is that the numbering is
> not enough to identify which one is from which source code.

Definitely, I wrote this specifically to create a discussion and gather some 
comments, hence the RFC tag.

> https://lore.kernel.org/all/20230714150326.1152359-1-alessandro.carminati@gm
> ail.com/

I will definitely take a look at this contribution! Thank you for sharing the 
link!

> > > If you just need to trace/profile a specific function which has the same
> > > name symbols, you might be better to use `perf probe` +
> > > `/sys/kernel/tracing` or `perf record -e EVENT`.
> > > 
> > > Or if you need to run it with CAP_PERFMON, without CAP_SYS_ADMIN,
> > > we need to change a userspace tool to find the correct address and
> > > pass it to the perf_event_open().
> > > 
> > > > > > Added new events:
> > > > > >   probe:ntfs_file_write_iter (on ntfs_file_write_iter)
> > > > > >   probe:ntfs_file_write_iter (on ntfs_file_write_iter)
> > > > > > 
> > > > > > You can now use it in all perf tools, such as:
> > > > > >         perf record -e probe:ntfs_file_write_iter -aR sleep 1
> > > > > > 
> > > > > > root@vm-amd64:~# cat /sys/kernel/tracing/kprobe_events
> > > > > > p:probe/ntfs_file_write_iter _text+5088544
> > > > > > p:probe/ntfs_file_write_iter _text+5278560
> > > > > > 
> > > > > > > Thought?
> > > > > > 
> > > > > > This contribution is basically here to sort of mimic what perf
> > > > > > does
> > > > > > but
> > > > > > with PMU kprobes, as this is not possible to write in a sysfs file
> > > > > > with
> > > > > > this type of probe.
> > > > > 
> > > > > OK, I see it is for BPF only. Maybe BPF program can filter correct
> > > > > one
> > > > > to access the argument etc.
> > > > 
> > > > I am not sure I understand, can you please precise?
> > > > The eBPF program will be run when the kprobe will be triggered, so if
> > > > the
> > > > kprobe is armed for the function (e.g. old ntfs_file_write_iter()),
> > > > the
> > > > eBPF program will never be called.
> > > 
> > > As I said above, it is userspace BPF loader issue, because it has to
> > > specify the correct address via unique symbol + offset, instead of
> > > attaching all of them. I think that will be more side-effects.
> > > 
> > > But anyway, thanks for pointing this issue. I should fix kprobe event to
> > > reject the symbols which is not unique. That should be pointed by other
> > > unique symbols.
> > 
> > You are welcome and I thank you for the discussion.
> > Can you please precise more what you think about "reject the symbols which
> > is not unique"?
> 
> > Basically something like this:
> Yes, that's what I said.

OK, I will write something and send it as an RFC before end of the week then.

> > struct trace_event_call *
> > create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
> > 
> > 			  bool is_return)
> > 
> > {
> > 
> > 	...
> > 	if (!addr && func) {
> 
> if (func) {  /* because anyway if user specify "func" we have to solve
>  the symbol address */
> 
> > 		array.addrs = NULL;
> > 		array.size = 0;
> > 		ret = kallsyms_on_each_match_symbol(add_addr, func, &array);
> > 		if (ret)
> > 		
> > 			goto error_free;
> > 		
> > 		if (array.size != 1) {
> > 		
> > 			/*
> > 			
> > 			 * Function name corresponding to several symbols must
> > 			 * be passed by address only.
> > 			 */
> > 			
> > 			return -EINVAL;
> 
> This case may return a unique error code so that the caller can notice
> the problem.

Is it OK to add a dedicated error code for such a case?

> Thank you,
> 
> > 		}
> > 	
> > 	}
> > 	
> > 	
> > 	
> > 	...
> > 
> > }
> > ?
> > If my understanding is correct, I think I can write a patch to achieve
> > this.
> > 
> > 
> > 
> > Best regards.
> > ---
> > [1]: https://github.com/inspektor-gadget/inspektor-gadget/pull/1879
> > [2]: https://github.com/cilium/ebpf/blob/
> > 270c859894bd38cdd0c7783317b16343409e4df8/link/kprobe.go#L165-L191

Best regards.
Masami Hiramatsu (Google) Aug. 23, 2023, 1:45 p.m. UTC | #34
On Wed, 23 Aug 2023 11:54:48 +0200
Francis Laniel <flaniel@linux.microsoft.com> wrote:

> Hi.
> 
> Le mercredi 23 août 2023, 02:36:14 CEST Masami Hiramatsu a écrit :
> > Hi,
> > 
> > On Mon, 21 Aug 2023 14:55:14 +0200
> > 
> > Francis Laniel <flaniel@linux.microsoft.com> wrote:
> > > > Could you tell me how do you use this feature, for what perpose?
> > > 
> > > Sure (I think I detailed this in the cover letter but I only sent it to
> > > the
> > > "main" mailing list and not the tracing one, sorry for this
> > > inconvenience)!
> > > 
> > > Basically, I was adding NTFS tracing to an existing tool which monitors
> > > slow I/Os using BPF [1].
> > > To test the tool, I compiled a kernel with both NTFS module built-in and
> > > figured out the write operations when done on ntfs3 were missing from the
> > > output of the tool.
> > > The problem comes from the library I use in the tool which does not handle
> > > well when it exists different symbols with the same name.
> > > Contrary to perf, which only handles kprobes through sysfs, the library
> > > handles it in both way (sysfs and PMU) with a preference for PMU when
> > > available [2].
> > > 
> > > After some discussion, I thought there could be a way to handle this
> > > automatically in the kernel when using PMU kprobes, hence this patch.
> > > I totally understand the case I described above is really a corner one,
> > > but I thought this feature could be useful for other people.
> > > In the case of the library itself, we could indeed find the address in
> > > /proc/ kallsyms but it would mean having CAP_SYS_ADMIN which is not
> > > forcefully something we want to enforce.
> > > Also, if we need to read /boot/vmlinuz or /boot/System.map we also need to
> > > be root as these files often belong to root and cannot be read by others.
> > > So, this patch solves the above problem while not needing specific
> > > capabilities as the kernel will solve it for us.
> > 
> > Thanks for the explanation. I got the background, and still have some
> > questions.
> > 
> > - Is the analysis tool really necessary to be used by users other than
> >   CAP_SYS_ADMIN? Even if it is useful, I still doubt CAP_PERFMON is safer
> >   than CAP_SYS_ADMIN, because BPF program can access any kernel register.
> 
> For the tool itself, this is indeed not a problem as we rely on CAP_SYS_ADMIN.
> But this one for the library, as they do not want to enforce CAP_SYS_ADMIN to 
> use the library.

Hmm, however, that means the library also does not consider (or support) the
same-name symbols, because there is no way to identify the address without
accessing vmlinux. Since the address is still not clear whether one of
the same name symbols is what the user wants to trace, it needs to access
the vmlinux including debuginfo. It also need to use tools like 'eu-addr2line'
or 'readelf' etc. to identify the actual symbol and address.
The vmlinux is usually not installed to the system, because it is for the
kernel development, it may not need to access CAP_SYS_ADMIN.

> 
> > - My concern about this solution (enabling kprobe PMU on all symbols which
> >   have the same name) makes it hard to run the same BPF program on it.
> >   This is because symbols with the same name do not necessarily have the
> >   same arguments (theoretically). Also, the BPF will trace unwanted symbols
> >   at unwanted timing.
> 
> Good point for the same name but different arguments!
> I was too focused on my case (ntfs_file_write_iter()) and forgot about this.

Yeah, and the same reason, I think I should check the symbol is unique too
on kprobe event. I also found that fprobe event has similar issue.

> 
> > - Can you expand that library to handle the same name symbols differently?
> >   I think this should be done in the user space, or in the kallsyms like
> >   storing symbols with source line information.
> 
> I think we can find a way to handle this in user space by potentially 
> abstracting the several PMU probe under one.
> Or we can simply explode if a name correspond to several symbols and ask the 
> user to use addr + offset to precise the symbol in this case.

Yeah, I think that is a better way to handle this situation.

> 
> > I understand this demand, but solving that with probing *all* symbols seems
> > like a brute force solution and may cause another problem later.
> > 
> > But this is a good discussion item. Last month Alessandro sent a script
> > which makes such symbols unique. Current problem is that the numbering is
> > not enough to identify which one is from which source code.
> 
> Definitely, I wrote this specifically to create a discussion and gather some 
> comments, hence the RFC tag.
> 
> > https://lore.kernel.org/all/20230714150326.1152359-1-alessandro.carminati@gm
> > ail.com/
> 
> I will definitely take a look at this contribution! Thank you for sharing the 
> link!

You're welcome!

> 
> > > > If you just need to trace/profile a specific function which has the same
> > > > name symbols, you might be better to use `perf probe` +
> > > > `/sys/kernel/tracing` or `perf record -e EVENT`.
> > > > 
> > > > Or if you need to run it with CAP_PERFMON, without CAP_SYS_ADMIN,
> > > > we need to change a userspace tool to find the correct address and
> > > > pass it to the perf_event_open().
> > > > 
> > > > > > > Added new events:
> > > > > > >   probe:ntfs_file_write_iter (on ntfs_file_write_iter)
> > > > > > >   probe:ntfs_file_write_iter (on ntfs_file_write_iter)
> > > > > > > 
> > > > > > > You can now use it in all perf tools, such as:
> > > > > > >         perf record -e probe:ntfs_file_write_iter -aR sleep 1
> > > > > > > 
> > > > > > > root@vm-amd64:~# cat /sys/kernel/tracing/kprobe_events
> > > > > > > p:probe/ntfs_file_write_iter _text+5088544
> > > > > > > p:probe/ntfs_file_write_iter _text+5278560
> > > > > > > 
> > > > > > > > Thought?
> > > > > > > 
> > > > > > > This contribution is basically here to sort of mimic what perf
> > > > > > > does
> > > > > > > but
> > > > > > > with PMU kprobes, as this is not possible to write in a sysfs file
> > > > > > > with
> > > > > > > this type of probe.
> > > > > > 
> > > > > > OK, I see it is for BPF only. Maybe BPF program can filter correct
> > > > > > one
> > > > > > to access the argument etc.
> > > > > 
> > > > > I am not sure I understand, can you please precise?
> > > > > The eBPF program will be run when the kprobe will be triggered, so if
> > > > > the
> > > > > kprobe is armed for the function (e.g. old ntfs_file_write_iter()),
> > > > > the
> > > > > eBPF program will never be called.
> > > > 
> > > > As I said above, it is userspace BPF loader issue, because it has to
> > > > specify the correct address via unique symbol + offset, instead of
> > > > attaching all of them. I think that will be more side-effects.
> > > > 
> > > > But anyway, thanks for pointing this issue. I should fix kprobe event to
> > > > reject the symbols which is not unique. That should be pointed by other
> > > > unique symbols.
> > > 
> > > You are welcome and I thank you for the discussion.
> > > Can you please precise more what you think about "reject the symbols which
> > > is not unique"?
> > 
> > > Basically something like this:
> > Yes, that's what I said.
> 
> OK, I will write something and send it as an RFC before end of the week then.

Thank you!

> 
> > > struct trace_event_call *
> > > create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
> > > 
> > > 			  bool is_return)
> > > 
> > > {
> > > 
> > > 	...
> > > 	if (!addr && func) {
> > 
> > if (func) {  /* because anyway if user specify "func" we have to solve
> >  the symbol address */
> > 
> > > 		array.addrs = NULL;
> > > 		array.size = 0;
> > > 		ret = kallsyms_on_each_match_symbol(add_addr, func, &array);
> > > 		if (ret)
> > > 		
> > > 			goto error_free;
> > > 		
> > > 		if (array.size != 1) {
> > > 		
> > > 			/*
> > > 			
> > > 			 * Function name corresponding to several symbols must
> > > 			 * be passed by address only.
> > > 			 */
> > > 			
> > > 			return -EINVAL;
> > 
> > This case may return a unique error code so that the caller can notice
> > the problem.
> 
> Is it OK to add a dedicated error code for such a case?
> 
> > Thank you,
> > 
> > > 		}
> > > 	
> > > 	}
> > > 	
> > > 	
> > > 	
> > > 	...
> > > 
> > > }
> > > ?
> > > If my understanding is correct, I think I can write a patch to achieve
> > > this.
> > > 
> > > 
> > > 
> > > Best regards.
> > > ---
> > > [1]: https://github.com/inspektor-gadget/inspektor-gadget/pull/1879
> > > [2]: https://github.com/cilium/ebpf/blob/
> > > 270c859894bd38cdd0c7783317b16343409e4df8/link/kprobe.go#L165-L191
> 
> Best regards.
> 
>
diff mbox series

Patch

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1b3fa7b854aa..08580f1466c7 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1682,13 +1682,42 @@  static int unregister_kprobe_event(struct trace_kprobe *tk)
 }
 
 #ifdef CONFIG_PERF_EVENTS
+
+struct address_array {
+	unsigned long *addrs;
+	size_t size;
+};
+
+static int add_addr(void *data, unsigned long addr)
+{
+	struct address_array *array = data;
+	unsigned long *p;
+
+	array->size++;
+	p = krealloc(array->addrs,
+				sizeof(*array->addrs) * array->size,
+				GFP_KERNEL);
+	if (!p) {
+		kfree(array->addrs);
+		return -ENOMEM;
+	}
+
+	array->addrs = p;
+	array->addrs[array->size - 1] = addr;
+
+	return 0;
+}
+
 /* create a trace_kprobe, but don't add it to global lists */
 struct trace_event_call *
 create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
 			  bool is_return)
 {
 	enum probe_print_type ptype;
+	struct address_array array;
 	struct trace_kprobe *tk;
+	unsigned long func_addr;
+	unsigned int i;
 	int ret;
 	char *event;
 
@@ -1722,7 +1751,64 @@  create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
 	if (ret < 0)
 		goto error;
 
+	array.addrs = NULL;
+	array.size = 0;
+	ret = kallsyms_on_each_match_symbol(add_addr, func, &array);
+	if (ret)
+		goto error_free;
+
+	if (array.size == 1)
+		goto end;
+
+	/*
+	 * Below loop allocates a trace_kprobe for each function with the same
+	 * name in kernel source code.
+	 * All this differente trace_kprobes will be linked together through
+	 * append_trace_kprobe().
+	 * NOTE append_trace_kprobe() is called in register_trace_kprobe() which
+	 * is called when a kprobe is added through sysfs.
+	 */
+	func_addr = kallsyms_lookup_name(func);
+	for (i = 0; i < array.size; i++) {
+		struct trace_kprobe *tk_same_name;
+		unsigned long address;
+
+		address = array.addrs[i];
+		/* Skip the function address as we already registered it. */
+		if (address == func_addr)
+			continue;
+
+		/*
+		 * alloc_trace_kprobe() first considers symbol name, so we set
+		 * this to NULL to allocate this kprobe on the given address.
+		 */
+		tk_same_name = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event,
+						  (void *)address, NULL, offs,
+						  0 /* maxactive */,
+						  0 /* nargs */, is_return);
+
+		if (IS_ERR(tk_same_name)) {
+			ret = -ENOMEM;
+			goto error_free;
+		}
+
+		init_trace_event_call(tk_same_name);
+
+		if (traceprobe_set_print_fmt(&tk_same_name->tp, ptype) < 0) {
+			ret = -ENOMEM;
+			goto error_free;
+		}
+
+		ret = append_trace_kprobe(tk_same_name, tk);
+		if (ret)
+			goto error_free;
+	}
+
+end:
+	kfree(array.addrs);
 	return trace_probe_event_call(&tk->tp);
+error_free:
+	kfree(array.addrs);
 error:
 	free_trace_kprobe(tk);
 	return ERR_PTR(ret);