
[RFC] ftrace: Show all functions with addresses in available_filter_functions_addrs

Message ID 20230608212613.424070-1-jolsa@kernel.org (mailing list archive)
State Superseded
Series [RFC] ftrace: Show all functions with addresses in available_filter_functions_addrs

Checks

Context Check Description
netdev/tree_selection success Not a local patch

Commit Message

Jiri Olsa June 8, 2023, 9:26 p.m. UTC
hi,
when using ftrace based tracers, we need to cross check
available_filter_functions with /proc/kallsyms. For example, for the
kprobe_multi bpf link (based on fprobe) we need to make sure that a symbol
regex resolves to traceable symbols and that we get the proper addresses
for them.

It looks like at the last LSF/MM/BPF there was an agreement to add a new
file that will have the available_filter_functions symbols plus their
addresses.

This RFC is meant to kick off the discussion; I'm not sure whether Steven
wants to do that differently ;-)

thanks,
jirka


---
Add a new available_filter_functions_addrs file that shows all available
functions (same as available_filter_functions) together with their
addresses, like:

  # cat available_filter_functions_addrs | head
  ffffffff81000770 __traceiter_initcall_level
  ffffffff810007c0 __traceiter_initcall_start
  ffffffff81000810 __traceiter_initcall_finish
  ffffffff81000860 trace_initcall_finish_cb
  ...

It's useful to have the address available for traceable symbols, so we
don't need to always cross check kallsyms with available_filter_functions
(or the other way around), and we have all the data in a single file.
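
Purely for illustration (not part of this patch): a minimal user-space
sketch of consuming the proposed format, assuming the new file sits next
to available_filter_functions in tracefs:

	#include <stdio.h>

	int main(void)
	{
		const char *path = "/sys/kernel/tracing/available_filter_functions_addrs";
		char line[512], sym[256], mod[64];
		unsigned long addr;
		FILE *f = fopen(path, "r");

		if (!f)
			return 1;
		while (fgets(line, sizeof(line), f)) {
			mod[0] = '\0';
			/* each line is "<hex addr> <symbol>", optionally followed by " [module]" */
			if (sscanf(line, "%lx %255s [%63[^]]]", &addr, sym, mod) >= 2)
				printf("%s%s%s -> %#lx\n", sym, mod[0] ? " " : "", mod, addr);
		}
		fclose(f);
		return 0;
	}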

For backwards compatibility reasons we can't change the existing
available_filter_functions file output, so we add a new file instead.

Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 include/linux/ftrace.h |  1 +
 kernel/trace/ftrace.c  | 52 ++++++++++++++++++++++++++++++++++++++----
 2 files changed, 48 insertions(+), 5 deletions(-)

Comments

Andrii Nakryiko June 8, 2023, 10:43 p.m. UTC | #1
On Thu, Jun 8, 2023 at 2:26 PM Jiri Olsa <jolsa@kernel.org> wrote:
>
>
> hi,
> when ftrace based tracers we need to cross check available_filter_functions
> with /proc/kallsyms. For example for kprobe_multi bpf link (based on fprobe)
> we need to make sure that symbol regex resolves to traceable symbols and
> that we get proper addresses for them.
>
> Looks like on the last last LSF/MM/BPF there was an agreement to add new
> file that will have available_filter_functions symbols plus addresses.
>
> This RFC is to kick off the discussion, I'm not sure Steven wants to do
> that differently ;-)
>
> thanks,
> jirka
>
>
> ---
> Adding new available_filter_functions_addrs file that shows all available
> functions (same as available_filter_functions) together with addresses,
> like:
>
>   # cat available_filter_functions_addrs | head

nit: can we have some more succinct name, like "traceable_funcs" or
something? And btw, does this have to be part of tracefs/debugfs
(I never knew the difference, sorry)? E.g., can it instead be exposed
through sysfs?

Other than these minor things, yep, I think this is something that
would be extremely useful. Thanks, Jiri, for taking a stab at it!

>   ffffffff81000770 __traceiter_initcall_level
>   ffffffff810007c0 __traceiter_initcall_start
>   ffffffff81000810 __traceiter_initcall_finish
>   ffffffff81000860 trace_initcall_finish_cb
>   ...
>
> It's useful to have address avilable for traceable symbols, so we don't
> need to allways cross check kallsyms with available_filter_functions
> (or the other way around) and have all the data in single file.
>
> For backwards compatibility reasons we can't change the existing
> available_filter_functions file output, but we need to add new file.
>
> Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> Suggested-by: Andrii Nakryiko <andrii@kernel.org>
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  include/linux/ftrace.h |  1 +
>  kernel/trace/ftrace.c  | 52 ++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 48 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> index b23bdd414394..6e372575a8e9 100644
> --- a/include/linux/ftrace.h
> +++ b/include/linux/ftrace.h
> @@ -633,6 +633,7 @@ enum {
>         FTRACE_ITER_MOD         = (1 << 5),
>         FTRACE_ITER_ENABLED     = (1 << 6),
>         FTRACE_ITER_TOUCHED     = (1 << 7),
> +       FTRACE_ITER_ADDRS       = (1 << 8),
>  };
>
>  void arch_ftrace_update_code(int command);
> diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
> index 764668467155..1f33e1f04834 100644
> --- a/kernel/trace/ftrace.c
> +++ b/kernel/trace/ftrace.c
> @@ -3804,7 +3804,7 @@ static int __init ftrace_check_sync(void)
>  late_initcall_sync(ftrace_check_sync);
>  subsys_initcall(ftrace_check_for_weak_functions);
>
> -static int print_rec(struct seq_file *m, unsigned long ip)
> +static int print_rec(struct seq_file *m, unsigned long ip, bool print_addr)
>  {
>         unsigned long offset;
>         char str[KSYM_SYMBOL_LEN];
> @@ -3819,7 +3819,11 @@ static int print_rec(struct seq_file *m, unsigned long ip)
>                 ret = NULL;
>         }
>
> -       seq_puts(m, str);
> +       if (print_addr)
> +               seq_printf(m, "%lx %s", ip, str);
> +       else
> +               seq_puts(m, str);
> +
>         if (modname)
>                 seq_printf(m, " [%s]", modname);
>         return ret == NULL ? -1 : 0;
> @@ -3830,9 +3834,13 @@ static inline int test_for_valid_rec(struct dyn_ftrace *rec)
>         return 1;
>  }
>
> -static inline int print_rec(struct seq_file *m, unsigned long ip)
> +static inline int print_rec(struct seq_file *m, unsigned long ip, bool print_addr)
>  {
> -       seq_printf(m, "%ps", (void *)ip);
> +       if (print_addr)
> +               seq_printf(m, "%lx %ps", ip, (void *)ip);
> +       else
> +               seq_printf(m, "%ps", (void *)ip);
> +
>         return 0;
>  }
>  #endif
> @@ -3861,7 +3869,7 @@ static int t_show(struct seq_file *m, void *v)
>         if (!rec)
>                 return 0;
>
> -       if (print_rec(m, rec->ip)) {
> +       if (print_rec(m, rec->ip, iter->flags & FTRACE_ITER_ADDRS)) {
>                 /* This should only happen when a rec is disabled */
>                 WARN_ON_ONCE(!(rec->flags & FTRACE_FL_DISABLED));
>                 seq_putc(m, '\n');
> @@ -3996,6 +4004,30 @@ ftrace_touched_open(struct inode *inode, struct file *file)
>         return 0;
>  }
>
> +static int
> +ftrace_avail_addrs_open(struct inode *inode, struct file *file)
> +{
> +       struct ftrace_iterator *iter;
> +       int ret;
> +
> +       ret = security_locked_down(LOCKDOWN_TRACEFS);
> +       if (ret)
> +               return ret;
> +
> +       if (unlikely(ftrace_disabled))
> +               return -ENODEV;
> +
> +       iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
> +       if (!iter)
> +               return -ENOMEM;
> +
> +       iter->pg = ftrace_pages_start;
> +       iter->flags = FTRACE_ITER_ADDRS;
> +       iter->ops = &global_ops;
> +
> +       return 0;
> +}
> +
>  /**
>   * ftrace_regex_open - initialize function tracer filter files
>   * @ops: The ftrace_ops that hold the hash filters
> @@ -5916,6 +5948,13 @@ static const struct file_operations ftrace_touched_fops = {
>         .release = seq_release_private,
>  };
>
> +static const struct file_operations ftrace_avail_addrs_fops = {
> +       .open = ftrace_avail_addrs_open,
> +       .read = seq_read,
> +       .llseek = seq_lseek,
> +       .release = seq_release_private,
> +};
> +
>  static const struct file_operations ftrace_filter_fops = {
>         .open = ftrace_filter_open,
>         .read = seq_read,
> @@ -6377,6 +6416,9 @@ static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
>         trace_create_file("available_filter_functions", TRACE_MODE_READ,
>                         d_tracer, NULL, &ftrace_avail_fops);
>
> +       trace_create_file("available_filter_functions_addrs", TRACE_MODE_READ,
> +                       d_tracer, NULL, &ftrace_avail_addrs_fops);
> +
>         trace_create_file("enabled_functions", TRACE_MODE_READ,
>                         d_tracer, NULL, &ftrace_enabled_fops);
>
> --
> 2.40.1
>
Steven Rostedt June 8, 2023, 11:27 p.m. UTC | #2
On Thu, 8 Jun 2023 15:43:03 -0700
Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:

> On Thu, Jun 8, 2023 at 2:26 PM Jiri Olsa <jolsa@kernel.org> wrote:
> >
> >
> > hi,
> > when ftrace based tracers we need to cross check available_filter_functions
> > with /proc/kallsyms. For example for kprobe_multi bpf link (based on fprobe)
> > we need to make sure that symbol regex resolves to traceable symbols and
> > that we get proper addresses for them.

I forgot, what was the problem with doing the above?

> >
> > Looks like on the last last LSF/MM/BPF there was an agreement to add new
> > file that will have available_filter_functions symbols plus addresses.
> >
> > This RFC is to kick off the discussion, I'm not sure Steven wants to do
> > that differently ;-)

I'm not totally against this, but I'd like to know the full issue it's
solving. Perhaps I need to know more about what is being done, and what is
needed too.

> >
> > thanks,
> > jirka
> >
> >
> > ---
> > Adding new available_filter_functions_addrs file that shows all available
> > functions (same as available_filter_functions) together with addresses,
> > like:
> >
> >   # cat available_filter_functions_addrs | head  
> 
> nit: can we have some more succinct name, like "traceable_funcs" or


It's to match available_filter_functions

Another way is to add a tracing option to make the address show up in the
available_filter_functions file. That would be my preferred choice.

  echo 1 > options/available_filter_addrs

Or something like that.



> something? And btw, does this have to be part of tracefs/debugfs

Because it's part of ftrace, and that belongs in tracefs.

> (never knew the difference, sorry). E.g., can it be instead exposed
> through sysfs?

tracefs is not debugfs, as debugfs includes all things debuggy (and
considered not secure). tracefs is its own file system dedicated to the
tracing code in the kernel. It exists even with CONFIG_DEBUG_FS not defined, and
lives in /sys/kernel/tracing. The only reason /sys/kernel/debug/tracing
(which is a duplicate mount point) exists is for backward compatibility for
before tracefs existed. But that path really should be deprecated.

> 
> Either than these minor things, yep, I think this is something that
> would be extremely useful, thanks, Jiri, for taking a stab at it!
> 
> >   ffffffff81000770 __traceiter_initcall_level
> >   ffffffff810007c0 __traceiter_initcall_start
> >   ffffffff81000810 __traceiter_initcall_finish
> >   ffffffff81000860 trace_initcall_finish_cb
> >   ...
> >
> > It's useful to have address avilable for traceable symbols, so we don't
> > need to allways cross check kallsyms with available_filter_functions
> > (or the other way around) and have all the data in single file.

Is it really that big of an issue? Again, I'm not against this change, but
I'm just wondering how much of a burden it is relieving.

> >
> > For backwards compatibility reasons we can't change the existing
> > available_filter_functions file output, but we need to add new file.

Or we could add an option to change it ;-)

> >
> > Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> > Suggested-by: Andrii Nakryiko <andrii@kernel.org>
> > Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> > ---
> >  include/linux/ftrace.h |  1 +
> >  kernel/trace/ftrace.c  | 52 ++++++++++++++++++++++++++++++++++++++----
> >  2 files changed, 48 insertions(+), 5 deletions(-)
> >
> > diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> > index b23bdd414394..6e372575a8e9 100644
> > --- a/include/linux/ftrace.h
> > +++ b/include/linux/ftrace.h
> > @@ -633,6 +633,7 @@ enum {
> >         FTRACE_ITER_MOD         = (1 << 5),
> >         FTRACE_ITER_ENABLED     = (1 << 6),
> >         FTRACE_ITER_TOUCHED     = (1 << 7),
> > +       FTRACE_ITER_ADDRS       = (1 << 8),
> >  };
> >
> >  void arch_ftrace_update_code(int command);
> > diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
> > index 764668467155..1f33e1f04834 100644
> > --- a/kernel/trace/ftrace.c
> > +++ b/kernel/trace/ftrace.c
> > @@ -3804,7 +3804,7 @@ static int __init ftrace_check_sync(void)
> >  late_initcall_sync(ftrace_check_sync);
> >  subsys_initcall(ftrace_check_for_weak_functions);
> >
> > -static int print_rec(struct seq_file *m, unsigned long ip)
> > +static int print_rec(struct seq_file *m, unsigned long ip, bool print_addr)
> >  {
> >         unsigned long offset;
> >         char str[KSYM_SYMBOL_LEN];
> > @@ -3819,7 +3819,11 @@ static int print_rec(struct seq_file *m, unsigned long ip)
> >                 ret = NULL;
> >         }
> >
> > -       seq_puts(m, str);
> > +       if (print_addr)
> > +               seq_printf(m, "%lx %s", ip, str);
> > +       else
> > +               seq_puts(m, str);
> > +
> >         if (modname)
> >                 seq_printf(m, " [%s]", modname);
> >         return ret == NULL ? -1 : 0;
> > @@ -3830,9 +3834,13 @@ static inline int test_for_valid_rec(struct dyn_ftrace *rec)
> >         return 1;
> >  }
> >
> > -static inline int print_rec(struct seq_file *m, unsigned long ip)
> > +static inline int print_rec(struct seq_file *m, unsigned long ip, bool print_addr)
> >  {
> > -       seq_printf(m, "%ps", (void *)ip);
> > +       if (print_addr)
> > +               seq_printf(m, "%lx %ps", ip, (void *)ip);
> > +       else
> > +               seq_printf(m, "%ps", (void *)ip);
> > +
> >         return 0;
> >  }
> >  #endif
> > @@ -3861,7 +3869,7 @@ static int t_show(struct seq_file *m, void *v)
> >         if (!rec)
> >                 return 0;
> >

Hmm, why not add the print here?

	if (iter->flags & FTRACE_ITER_ADDRS)
		seq_printf(m, "%lx ", rec->ip);

and not touch print_rec().
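
For concreteness, a sketch of how that could look in t_show()
(illustrative only, derived from the hunk quoted below; not a posted
patch):

	if (!rec)
		return 0;

	/* print the address prefix here and keep print_rec() untouched */
	if (iter->flags & FTRACE_ITER_ADDRS)
		seq_printf(m, "%lx ", rec->ip);

	if (print_rec(m, rec->ip)) {
		/* This should only happen when a rec is disabled */
		WARN_ON_ONCE(!(rec->flags & FTRACE_FL_DISABLED));
		seq_putc(m, '\n');
		return 0;
	}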

> > -       if (print_rec(m, rec->ip)) {
> > +       if (print_rec(m, rec->ip, iter->flags & FTRACE_ITER_ADDRS)) {
> >                 /* This should only happen when a rec is disabled */
> >                 WARN_ON_ONCE(!(rec->flags & FTRACE_FL_DISABLED));
> >                 seq_putc(m, '\n');
> > @@ -3996,6 +4004,30 @@ ftrace_touched_open(struct inode *inode, struct file *file)
> >         return 0;
> >  }
> >

-- Steve
Andrii Nakryiko June 8, 2023, 11:55 p.m. UTC | #3
On Thu, Jun 8, 2023 at 4:27 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Thu, 8 Jun 2023 15:43:03 -0700
> Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
>
> > On Thu, Jun 8, 2023 at 2:26 PM Jiri Olsa <jolsa@kernel.org> wrote:
> > >
> > >
> > > hi,
> > > when ftrace based tracers we need to cross check available_filter_functions
> > > with /proc/kallsyms. For example for kprobe_multi bpf link (based on fprobe)
> > > we need to make sure that symbol regex resolves to traceable symbols and
> > > that we get proper addresses for them.
>
> I forgot, what was the problem with doing the above?

More code, more memory, more CPU to parse all the text files. Parsing
kallsyms is quite expensive, so avoiding this would be great.

>
> > >
> > > Looks like on the last last LSF/MM/BPF there was an agreement to add new
> > > file that will have available_filter_functions symbols plus addresses.
> > >
> > > This RFC is to kick off the discussion, I'm not sure Steven wants to do
> > > that differently ;-)
>
> I'm not totally against this, but I'd like to know the full issue its
> solving. Perhaps I need to know more about what is being done, and what is
> needed too.

There are BPF tools that allow a user to specify a regex/glob of kernel
functions to attach to. This regex/glob is checked against
available_filter_functions to determine which functions are traceable. All
good. But then it's also important to have the corresponding memory
addresses for the selected functions (for many reasons, e.g., to have
non-ambiguous and fast attachment by address instead of by name, or
for some post-processing based on captured IP addresses, etc.). And
that means that now we also need to parse /proc/kallsyms and
cross-join it with the data fetched from available_filter_functions.

All this is unnecessary if available_filter_functions just provided the
function address in the first place. It's a huge simplification, and it
saves memory and CPU.
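
As a rough sketch of how a tool could then attach by the addresses it just
read (assuming libbpf's kprobe_multi opts API; attach_by_addrs is a made-up
helper name):

	#include <bpf/libbpf.h>

	/* hypothetical helper: attach a BPF program to a set of already
	 * resolved function addresses via the kprobe_multi link */
	static struct bpf_link *attach_by_addrs(struct bpf_program *prog,
						const unsigned long *addrs,
						size_t cnt)
	{
		LIBBPF_OPTS(bpf_kprobe_multi_opts, opts,
			.addrs = addrs,
			.cnt = cnt,
		);

		/* NULL pattern: attach purely by address, no symbol lookup */
		return bpf_program__attach_kprobe_multi_opts(prog, NULL, &opts);
	}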

>
> > >
> > > thanks,
> > > jirka
> > >
> > >
> > > ---
> > > Adding new available_filter_functions_addrs file that shows all available
> > > functions (same as available_filter_functions) together with addresses,
> > > like:
> > >
> > >   # cat available_filter_functions_addrs | head
> >
> > nit: can we have some more succinct name, like "traceable_funcs" or
>
>
> It's to match avaliable_filter_functions

it's minor, I'm fine with whatever name, I'm searching for it in my
history every single time anyways :)

>
> Another way is to add a tracing option to make the address show up in the
> available_filter_functions file. That would be my preferred choice.
>
>   echo 1 > options/available_filter_addrs
>
> Or something like that.

This would modify the behavior for the entire system, right? I think that
is very bad. Just because one application is aware of this option and
wants to turn it on doesn't mean that all other applications that
might also use available_filter_functions should immediately break on
that machine.

Please, let's have a separate file. There is no downside to that.

>
>
>
> > something? And btw, does this have to be part of tracefs/debugfs
>
> Because it's part of ftrace, and that belongs in tracefs.

I can use ftrace (through BPF) without mounting tracefs, right? So it
would be good to have a list of attachable kprobes without having to
worry whether tracefs was mounted or not. It's no big deal, I was just
curious if there has to be a tie to tracefs.

>
> > (never knew the difference, sorry). E.g., can it be instead exposed
> > through sysfs?
>
> tracefs is not debugfs, as debugfs includes all things debuggy (and
> considered not secure). tracefs is its own file system dedicated to the
> tracing code in the kernel. It exists with CONFIG_DEBUG not defined, and
> lives in /sys/kernel/tracing. The only reason /sys/kernel/debug/tracing
> (which is a duplicate mount point) exists is for backward compatibility for
> before tracefs existed. But that path really should be deprecated.

cool, thanks for explaining!

>
> >
> > Either than these minor things, yep, I think this is something that
> > would be extremely useful, thanks, Jiri, for taking a stab at it!
> >
> > >   ffffffff81000770 __traceiter_initcall_level
> > >   ffffffff810007c0 __traceiter_initcall_start
> > >   ffffffff81000810 __traceiter_initcall_finish
> > >   ffffffff81000860 trace_initcall_finish_cb
> > >   ...
> > >
> > > It's useful to have address avilable for traceable symbols, so we don't
> > > need to allways cross check kallsyms with available_filter_functions
> > > (or the other way around) and have all the data in single file.
>
> Is it really that big of an issue? Again, I'm not against this change, but
> I'm just wondering how much of a burden is it relieving?

Quite a lot, especially when you have to do all that in pure C.

>
> > >
> > > For backwards compatibility reasons we can't change the existing
> > > available_filter_functions file output, but we need to add new file.
>
> Or we could add an option to change it ;-)
>
> > >
> > > Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> > > Suggested-by: Andrii Nakryiko <andrii@kernel.org>
> > > Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> > > ---
> > >  include/linux/ftrace.h |  1 +
> > >  kernel/trace/ftrace.c  | 52 ++++++++++++++++++++++++++++++++++++++----
> > >  2 files changed, 48 insertions(+), 5 deletions(-)
> > >

[...]
Jiri Olsa June 9, 2023, 12:13 a.m. UTC | #4
On Thu, Jun 08, 2023 at 04:55:40PM -0700, Andrii Nakryiko wrote:
> On Thu, Jun 8, 2023 at 4:27 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > On Thu, 8 Jun 2023 15:43:03 -0700
> > Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> >
> > > On Thu, Jun 8, 2023 at 2:26 PM Jiri Olsa <jolsa@kernel.org> wrote:
> > > >
> > > >
> > > > hi,
> > > > when ftrace based tracers we need to cross check available_filter_functions
> > > > with /proc/kallsyms. For example for kprobe_multi bpf link (based on fprobe)
> > > > we need to make sure that symbol regex resolves to traceable symbols and
> > > > that we get proper addresses for them.
> >
> > I forgot, what was the problem with doing the above?
> 
> More code, more memory, more CPU to parse all the text files. Parsing
> kallsyms is quite expensive, so avoiding this would be great.

yes, parsing both kallsyms and available_filter_functions often
shows up in perf profiles

> 
> >
> > > >
> > > > Looks like on the last last LSF/MM/BPF there was an agreement to add new
> > > > file that will have available_filter_functions symbols plus addresses.
> > > >
> > > > This RFC is to kick off the discussion, I'm not sure Steven wants to do
> > > > that differently ;-)
> >
> > I'm not totally against this, but I'd like to know the full issue its
> > solving. Perhaps I need to know more about what is being done, and what is
> > needed too.
> 
> There are BPF tools that allow user to specify regex/glob of kernel
> functions to attach to. This regex/glob is checked against
> available_filter_functions to check which functions are traceable. All
> good. But then also it's important to have corresponding memory
> addresses for selected functions (for many reasons, e.g., to have
> non-ambiguous and fast attachment by address instead of by name, or
> for some post-processing based on captured IP addresses, etc). And
> that means that now we need to also parse /proc/kallsyms and
> cross-join it with data fetched from available_filter_functions.
> 
> All this is unnecessary if avalable_filter_functions would just
> provide function address in the first place. It's a huge
> simplification. And saves memory and CPU.
> 
> >
> > > >
> > > > thanks,
> > > > jirka
> > > >
> > > >
> > > > ---
> > > > Adding new available_filter_functions_addrs file that shows all available
> > > > functions (same as available_filter_functions) together with addresses,
> > > > like:
> > > >
> > > >   # cat available_filter_functions_addrs | head
> > >
> > > nit: can we have some more succinct name, like "traceable_funcs" or
> >
> >
> > It's to match avaliable_filter_functions
> 
> it's minor, I'm fine with whatever name, I'm searching for it in my
> history every single time anyways :)
> 
> >
> > Another way is to add a tracing option to make the address show up in the
> > available_filter_functions file. That would be my preferred choice.
> >
> >   echo 1 > options/available_filter_addrs
> >
> > Or something like that.
> 
> This would modify behavior for entire system, right? I think this is
> very bad. Just because one application is aware of this option and
> wants to turn this on, doesn't mean that all other applications that
> might also use available_filter_functions should immediately break on
> that machine.
> 
> Please, let's have a separate file. There is no downside to that.

+1 for a separate file; AFAIU the option would change that globally, so we
could race with another app and break each other

jirka
Mark Rutland June 9, 2023, 8:24 a.m. UTC | #5
On Thu, Jun 08, 2023 at 04:55:40PM -0700, Andrii Nakryiko wrote:
> On Thu, Jun 8, 2023 at 4:27 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> > On Thu, 8 Jun 2023 15:43:03 -0700 Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> > > On Thu, Jun 8, 2023 at 2:26 PM Jiri Olsa <jolsa@kernel.org> wrote:
 
> There are BPF tools that allow user to specify regex/glob of kernel
> functions to attach to. This regex/glob is checked against
> available_filter_functions to check which functions are traceable. All
> good. But then also it's important to have corresponding memory
> addresses for selected functions (for many reasons, e.g., to have
> non-ambiguous and fast attachment by address instead of by name, or
> for some post-processing based on captured IP addresses, etc). And
> that means that now we need to also parse /proc/kallsyms and
> cross-join it with data fetched from available_filter_functions.
> 
> All this is unnecessary if avalable_filter_functions would just
> provide function address in the first place. It's a huge
> simplification. And saves memory and CPU.

Do you need the address of the function entry-point or the address of the
patch-site within the function? Those can differ, and the rec->ip address won't
necessarily equal the address in /proc/kallsyms, so the pointer in
/proc/kallsyms won't (always) match the address we could print for the ftrace site.

On arm64, today we can have offsets of +0, +4, and +8, and within a single
kernel image different functions can have different offsets. I suspect in
future that we may have more potential offsets (e.g. due to changes for HW/SW
CFI).
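
For illustration only (not something the posted patch adds), a kernel-side
sketch of computing how far a given record ip sits into its kallsyms
symbol:

	#include <linux/kallsyms.h>
	#include <linux/printk.h>

	/* illustrative only: offset of a patch site within its kallsyms symbol */
	static void show_patch_offset(unsigned long rec_ip)
	{
		unsigned long size, offset;

		if (kallsyms_lookup_size_offset(rec_ip, &size, &offset))
			pr_info("patch site %lx is at +%lu within its symbol\n",
				rec_ip, offset);
	}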

Thanks,
Mark.
Jiri Olsa June 9, 2023, 4:44 p.m. UTC | #6
On Fri, Jun 09, 2023 at 09:24:10AM +0100, Mark Rutland wrote:
> On Thu, Jun 08, 2023 at 04:55:40PM -0700, Andrii Nakryiko wrote:
> > On Thu, Jun 8, 2023 at 4:27 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> > > On Thu, 8 Jun 2023 15:43:03 -0700 Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> > > > On Thu, Jun 8, 2023 at 2:26 PM Jiri Olsa <jolsa@kernel.org> wrote:
>  
> > There are BPF tools that allow user to specify regex/glob of kernel
> > functions to attach to. This regex/glob is checked against
> > available_filter_functions to check which functions are traceable. All
> > good. But then also it's important to have corresponding memory
> > addresses for selected functions (for many reasons, e.g., to have
> > non-ambiguous and fast attachment by address instead of by name, or
> > for some post-processing based on captured IP addresses, etc). And
> > that means that now we need to also parse /proc/kallsyms and
> > cross-join it with data fetched from available_filter_functions.
> > 
> > All this is unnecessary if avalable_filter_functions would just
> > provide function address in the first place. It's a huge
> > simplification. And saves memory and CPU.
> 
> Do you need the address of the function entry-point or the address of the
> patch-site within the function? Those can differ, and the rec->ip address won't
> necessarily equal the address in /proc/kallsyms, so the pointer in
> /proc/kallsyms won't (always) match the address we could print for the ftrace site.
> 
> On arm64, today we can have offsets of +0, +4, and +8, and within a single
> kernel image different functions can have different offsets. I suspect in
> future that we may have more potential offsets (e.g. due to changes for HW/SW
> CFI).

so we need this for the kprobe_multi bpf link, which is based on fprobe
and uses ftrace_set_filter_ips to set up the ftrace_ops filter

and ftrace_set_filter_ips works fine with the ip address being the address
of the patched instruction (it's matched in ftrace_location)

but right, I did not realize this.. it might cause confusion if people
don't know these are patch-site addresses..  not sure if there's an easy
way to get the real function address out of rec->ip, but it will also get
more complicated on x86 when IBT is enabled, will check

or we could just use patch-site addresses and reflect that in the file's
name, like 'available_filter_functions_patch_addrs' .. it's already a long
name ;-)

jirka
Steven Rostedt June 9, 2023, 5:12 p.m. UTC | #7
On Fri, 9 Jun 2023 09:44:36 -0700
Jiri Olsa <olsajiri@gmail.com> wrote:

> On Fri, Jun 09, 2023 at 09:24:10AM +0100, Mark Rutland wrote:
> > 
> > Do you need the address of the function entry-point or the address of the
> > patch-site within the function? Those can differ, and the rec->ip address won't
> > necessarily equal the address in /proc/kallsyms, so the pointer in
> > /proc/kallsyms won't (always) match the address we could print for the ftrace site.
> > 
> > On arm64, today we can have offsets of +0, +4, and +8, and within a single
> > kernel image different functions can have different offsets. I suspect in
> > future that we may have more potential offsets (e.g. due to changes for HW/SW
> > CFI).  
> 
> so we need that for kprobe_multi bpf link, which is based on fprobe,
> and that uses ftrace_set_filter_ips to setup the ftrace_ops filter
> 
> and ftrace_set_filter_ips works fine with ip address being the address
> of the patched instruction (it's matched in ftrace_location)

Yes, exactly. And it's off with the old "mcount" way of doing things too.

> 
> but right, I did not realize this.. it might cause confusion if people
> don't know it's patch-side addresses..  not sure if there's easy way to
> get real function address out of rec->ip, but it will also get more
> complicated on x86 when IBT is enabled, will check
> 
> or we could just use patch-side addresses and reflect that in the file's
> name like 'available_filter_functions_patch_addrs' .. it's already long
> name ;-)

No!  "available_filter_function_addrs" is enough to know that it's not
kallsyms. It's the filtered function address, which is enough description.
If people don't RTFM, then too bad ;-)

You can use ftrace_location(), which takes an instruction pointer and will
return the rec->ip of that function as long as it lands in between the
function's kallsyms start and end values.
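
A kernel-side sketch of that lookup (illustrative only; the symbol name is
just an example):

	#include <linux/ftrace.h>
	#include <linux/kallsyms.h>
	#include <linux/printk.h>

	/* illustrative only: map a function's kallsyms address to the
	 * corresponding ftrace patch-site address */
	static void show_ftrace_site(const char *name)
	{
		unsigned long sym = kallsyms_lookup_name(name);
		unsigned long ip = sym ? ftrace_location(sym) : 0;

		if (ip)
			pr_info("%s: entry %lx, ftrace site %lx (+%lu)\n",
				name, sym, ip, ip - sym);
	}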

-- Steve
Andrii Nakryiko June 9, 2023, 6:29 p.m. UTC | #8
On Fri, Jun 9, 2023 at 9:44 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Fri, Jun 09, 2023 at 09:24:10AM +0100, Mark Rutland wrote:
> > On Thu, Jun 08, 2023 at 04:55:40PM -0700, Andrii Nakryiko wrote:
> > > On Thu, Jun 8, 2023 at 4:27 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> > > > On Thu, 8 Jun 2023 15:43:03 -0700 Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> > > > > On Thu, Jun 8, 2023 at 2:26 PM Jiri Olsa <jolsa@kernel.org> wrote:
> >
> > > There are BPF tools that allow user to specify regex/glob of kernel
> > > functions to attach to. This regex/glob is checked against
> > > available_filter_functions to check which functions are traceable. All
> > > good. But then also it's important to have corresponding memory
> > > addresses for selected functions (for many reasons, e.g., to have
> > > non-ambiguous and fast attachment by address instead of by name, or
> > > for some post-processing based on captured IP addresses, etc). And
> > > that means that now we need to also parse /proc/kallsyms and
> > > cross-join it with data fetched from available_filter_functions.
> > >
> > > All this is unnecessary if avalable_filter_functions would just
> > > provide function address in the first place. It's a huge
> > > simplification. And saves memory and CPU.
> >
> > Do you need the address of the function entry-point or the address of the
> > patch-site within the function? Those can differ, and the rec->ip address won't
> > necessarily equal the address in /proc/kallsyms, so the pointer in
> > /proc/kallsyms won't (always) match the address we could print for the ftrace site.
> >
> > On arm64, today we can have offsets of +0, +4, and +8, and within a single
> > kernel image different functions can have different offsets. I suspect in
> > future that we may have more potential offsets (e.g. due to changes for HW/SW
> > CFI).
>
> so we need that for kprobe_multi bpf link, which is based on fprobe,
> and that uses ftrace_set_filter_ips to setup the ftrace_ops filter
>
> and ftrace_set_filter_ips works fine with ip address being the address
> of the patched instruction (it's matched in ftrace_location)
>
> but right, I did not realize this.. it might cause confusion if people
> don't know it's patch-side addresses..  not sure if there's easy way to
> get real function address out of rec->ip, but it will also get more
> complicated on x86 when IBT is enabled, will check

ok, sorry, I'm confused. Two questions:

1. when attaching kprobe_multi, does the bpf() syscall expect the function
address or the (func + offset_of_mcount) address? I hope it's the former,
just the function's address?

2. If rec->ip is not the function's address, can we somehow adjust the
value to be the function's address before printing it?

In short, I think it's confusing to have addresses with +0 or +4 or +8
offsets. It would be great if we can just keep it +0 at the interface
level (when attach and in available_filter_functions_addrs).

>
> or we could just use patch-side addresses and reflect that in the file's
> name like 'available_filter_functions_patch_addrs' .. it's already long
> name ;-)
>
> jirka
Jiri Olsa June 9, 2023, 8:37 p.m. UTC | #9
On Fri, Jun 09, 2023 at 11:29:59AM -0700, Andrii Nakryiko wrote:
> On Fri, Jun 9, 2023 at 9:44 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> >
> > On Fri, Jun 09, 2023 at 09:24:10AM +0100, Mark Rutland wrote:
> > > On Thu, Jun 08, 2023 at 04:55:40PM -0700, Andrii Nakryiko wrote:
> > > > On Thu, Jun 8, 2023 at 4:27 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> > > > > On Thu, 8 Jun 2023 15:43:03 -0700 Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> > > > > > On Thu, Jun 8, 2023 at 2:26 PM Jiri Olsa <jolsa@kernel.org> wrote:
> > >
> > > > There are BPF tools that allow user to specify regex/glob of kernel
> > > > functions to attach to. This regex/glob is checked against
> > > > available_filter_functions to check which functions are traceable. All
> > > > good. But then also it's important to have corresponding memory
> > > > addresses for selected functions (for many reasons, e.g., to have
> > > > non-ambiguous and fast attachment by address instead of by name, or
> > > > for some post-processing based on captured IP addresses, etc). And
> > > > that means that now we need to also parse /proc/kallsyms and
> > > > cross-join it with data fetched from available_filter_functions.
> > > >
> > > > All this is unnecessary if avalable_filter_functions would just
> > > > provide function address in the first place. It's a huge
> > > > simplification. And saves memory and CPU.
> > >
> > > Do you need the address of the function entry-point or the address of the
> > > patch-site within the function? Those can differ, and the rec->ip address won't
> > > necessarily equal the address in /proc/kallsyms, so the pointer in
> > > /proc/kallsyms won't (always) match the address we could print for the ftrace site.
> > >
> > > On arm64, today we can have offsets of +0, +4, and +8, and within a single
> > > kernel image different functions can have different offsets. I suspect in
> > > future that we may have more potential offsets (e.g. due to changes for HW/SW
> > > CFI).
> >
> > so we need that for kprobe_multi bpf link, which is based on fprobe,
> > and that uses ftrace_set_filter_ips to setup the ftrace_ops filter
> >
> > and ftrace_set_filter_ips works fine with ip address being the address
> > of the patched instruction (it's matched in ftrace_location)
> >
> > but right, I did not realize this.. it might cause confusion if people
> > don't know it's patch-side addresses..  not sure if there's easy way to
> > get real function address out of rec->ip, but it will also get more
> > complicated on x86 when IBT is enabled, will check
> 
> ok, sorry, I'm confused. Two questions:
> 
> 1. when attaching kprobe_multi, does bpf() syscall expect function
> address or (func+offset_of_mcount) address? I hope it's the former,
> just function's address?

it can be both; the fprobe/ftrace filter setup will take care of looking
up and translating the provided address to the patch-site address

> 
> 2. If rec->ip is not function's address, can we somehow adjust the
> value to be a function address before printing it?

that's tricky because on x86 with IBT we would need to read the first
instruction and check if it's endbr to get the real address, like we
do in get_entry_ip:

	#ifdef CONFIG_X86_KERNEL_IBT
	static unsigned long get_entry_ip(unsigned long fentry_ip)
	{
		u32 instr;

		/* Being extra safe in here in case entry ip is on the page-edge. */
		if (get_kernel_nofault(instr, (u32 *) fentry_ip - 1))
			return fentry_ip;
		if (is_endbr(instr))
			fentry_ip -= ENDBR_INSN_SIZE;
		return fentry_ip;
	}
	#else
	#define get_entry_ip(fentry_ip) fentry_ip
	#endif


I'm not familiar with the arm implementation, so I'm not sure if there's
a way to do this

but in any case Steven wants to keep the patch-site address:
  https://lore.kernel.org/bpf/CAEf4BzbgsLOoLKyscq6S95QeehVoAzOnQ=xmsFz8dfEUAnhObw@mail.gmail.com/T/#m19a97bbc8f8236ad869c9f8ad0cc7dbce0722d92

jirka

> 
> In short, I think it's confusing to have addresses with +0 or +4 or +8
> offsets. It would be great if we can just keep it +0 at the interface
> level (when attach and in available_filter_functions_addrs).
> 
> >
> > or we could just use patch-side addresses and reflect that in the file's
> > name like 'available_filter_functions_patch_addrs' .. it's already long
> > name ;-)
> >
> > jirka

Patch

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index b23bdd414394..6e372575a8e9 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -633,6 +633,7 @@  enum {
 	FTRACE_ITER_MOD		= (1 << 5),
 	FTRACE_ITER_ENABLED	= (1 << 6),
 	FTRACE_ITER_TOUCHED	= (1 << 7),
+	FTRACE_ITER_ADDRS	= (1 << 8),
 };
 
 void arch_ftrace_update_code(int command);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 764668467155..1f33e1f04834 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3804,7 +3804,7 @@  static int __init ftrace_check_sync(void)
 late_initcall_sync(ftrace_check_sync);
 subsys_initcall(ftrace_check_for_weak_functions);
 
-static int print_rec(struct seq_file *m, unsigned long ip)
+static int print_rec(struct seq_file *m, unsigned long ip, bool print_addr)
 {
 	unsigned long offset;
 	char str[KSYM_SYMBOL_LEN];
@@ -3819,7 +3819,11 @@  static int print_rec(struct seq_file *m, unsigned long ip)
 		ret = NULL;
 	}
 
-	seq_puts(m, str);
+	if (print_addr)
+		seq_printf(m, "%lx %s", ip, str);
+	else
+		seq_puts(m, str);
+
 	if (modname)
 		seq_printf(m, " [%s]", modname);
 	return ret == NULL ? -1 : 0;
@@ -3830,9 +3834,13 @@  static inline int test_for_valid_rec(struct dyn_ftrace *rec)
 	return 1;
 }
 
-static inline int print_rec(struct seq_file *m, unsigned long ip)
+static inline int print_rec(struct seq_file *m, unsigned long ip, bool print_addr)
 {
-	seq_printf(m, "%ps", (void *)ip);
+	if (print_addr)
+		seq_printf(m, "%lx %ps", ip, (void *)ip);
+	else
+		seq_printf(m, "%ps", (void *)ip);
+
 	return 0;
 }
 #endif
@@ -3861,7 +3869,7 @@  static int t_show(struct seq_file *m, void *v)
 	if (!rec)
 		return 0;
 
-	if (print_rec(m, rec->ip)) {
+	if (print_rec(m, rec->ip, iter->flags & FTRACE_ITER_ADDRS)) {
 		/* This should only happen when a rec is disabled */
 		WARN_ON_ONCE(!(rec->flags & FTRACE_FL_DISABLED));
 		seq_putc(m, '\n');
@@ -3996,6 +4004,30 @@  ftrace_touched_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static int
+ftrace_avail_addrs_open(struct inode *inode, struct file *file)
+{
+	struct ftrace_iterator *iter;
+	int ret;
+
+	ret = security_locked_down(LOCKDOWN_TRACEFS);
+	if (ret)
+		return ret;
+
+	if (unlikely(ftrace_disabled))
+		return -ENODEV;
+
+	iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
+	if (!iter)
+		return -ENOMEM;
+
+	iter->pg = ftrace_pages_start;
+	iter->flags = FTRACE_ITER_ADDRS;
+	iter->ops = &global_ops;
+
+	return 0;
+}
+
 /**
  * ftrace_regex_open - initialize function tracer filter files
  * @ops: The ftrace_ops that hold the hash filters
@@ -5916,6 +5948,13 @@  static const struct file_operations ftrace_touched_fops = {
 	.release = seq_release_private,
 };
 
+static const struct file_operations ftrace_avail_addrs_fops = {
+	.open = ftrace_avail_addrs_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release_private,
+};
+
 static const struct file_operations ftrace_filter_fops = {
 	.open = ftrace_filter_open,
 	.read = seq_read,
@@ -6377,6 +6416,9 @@  static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
 	trace_create_file("available_filter_functions", TRACE_MODE_READ,
 			d_tracer, NULL, &ftrace_avail_fops);
 
+	trace_create_file("available_filter_functions_addrs", TRACE_MODE_READ,
+			d_tracer, NULL, &ftrace_avail_addrs_fops);
+
 	trace_create_file("enabled_functions", TRACE_MODE_READ,
 			d_tracer, NULL, &ftrace_enabled_fops);