
[v4,3/9] bpf/btf: Add a function to search a member of a struct/union

Message ID 169078863449.173706.2322042687021909241.stgit@devnote2 (mailing list archive)
State Superseded
Series tracing: Improve BTF support on probe events

Commit Message

Masami Hiramatsu (Google) July 31, 2023, 7:30 a.m. UTC
From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Add btf_find_struct_member() API to search for a member of a given data
structure or union by the member's name.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Reviewed-by: Alan Maguire <alan.maguire@oracle.com>
---
 Changes in v3:
  - Remove simple input check.
  - Fix unneeded IS_ERR_OR_NULL() check for btf_type_by_id().
  - Move the code next to btf_get_func_param().
  - Use for_each_member() macro instead of for-loop.
  - Use btf_type_skip_modifiers() instead of btf_type_by_id().
 Changes in v4:
  - Use a stack for searching in anonymous members instead of nested call.
---
 include/linux/btf.h |    3 +++
 kernel/bpf/btf.c    |   40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)
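
For reference, a hypothetical caller of the new helper might look like the
sketch below (illustrative only; 'btf' and 'type_id' are assumed to be already
resolved, and the error handling follows the comment in the patch: NULL means
"not found", ERR_PTR(-EINVAL) means an invalid parameter):

const struct btf_type *type;
const struct btf_member *member;

/* look up the struct/union to search, e.g. by its BTF type id */
type = btf_type_by_id(btf, type_id);
member = btf_find_struct_member(btf, type, "field_name");
if (IS_ERR(member))
        return PTR_ERR(member);         /* invalid parameter */
if (!member)
        return -ENOENT;                 /* no such member */
/* use member->type / member->offset here */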

Comments

Alexei Starovoitov July 31, 2023, 9:59 p.m. UTC | #1
On Mon, Jul 31, 2023 at 12:30 AM Masami Hiramatsu (Google)
<mhiramat@kernel.org> wrote:
>
> From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
>
> Add btf_find_struct_member() API to search a member of a given data structure
> or union from the member's name.
>
> Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> Reviewed-by: Alan Maguire <alan.maguire@oracle.com>
> ---
>  Changes in v3:
>   - Remove simple input check.
>   - Fix unneeded IS_ERR_OR_NULL() check for btf_type_by_id().
>   - Move the code next to btf_get_func_param().
>   - Use for_each_member() macro instead of for-loop.
>   - Use btf_type_skip_modifiers() instead of btf_type_by_id().
>  Changes in v4:
>   - Use a stack for searching in anonymous members instead of nested call.
> ---
>  include/linux/btf.h |    3 +++
>  kernel/bpf/btf.c    |   40 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 43 insertions(+)
>
> diff --git a/include/linux/btf.h b/include/linux/btf.h
> index 20e3a07eef8f..4b10d57ceee0 100644
> --- a/include/linux/btf.h
> +++ b/include/linux/btf.h
> @@ -226,6 +226,9 @@ const struct btf_type *btf_find_func_proto(const char *func_name,
>                                            struct btf **btf_p);
>  const struct btf_param *btf_get_func_param(const struct btf_type *func_proto,
>                                            s32 *nr);
> +const struct btf_member *btf_find_struct_member(struct btf *btf,
> +                                               const struct btf_type *type,
> +                                               const char *member_name);
>
>  #define for_each_member(i, struct_type, member)                        \
>         for (i = 0, member = btf_type_member(struct_type);      \
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index f7b25c615269..8d81a4ffa67b 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -958,6 +958,46 @@ const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, s3
>                 return NULL;
>  }
>
> +#define BTF_ANON_STACK_MAX     16
> +
> +/*
> + * Find a member of data structure/union by name and return it.
> + * Return NULL if not found, or -EINVAL if parameter is invalid.
> + */
> +const struct btf_member *btf_find_struct_member(struct btf *btf,
> +                                               const struct btf_type *type,
> +                                               const char *member_name)
> +{
> +       const struct btf_type *anon_stack[BTF_ANON_STACK_MAX];
> +       const struct btf_member *member;
> +       const char *name;
> +       int i, top = 0;
> +
> +retry:
> +       if (!btf_type_is_struct(type))
> +               return ERR_PTR(-EINVAL);
> +
> +       for_each_member(i, type, member) {
> +               if (!member->name_off) {
> +                       /* Anonymous union/struct: push it for later use */
> +                       type = btf_type_skip_modifiers(btf, member->type, NULL);
> +                       if (type && top < BTF_ANON_STACK_MAX)
> +                               anon_stack[top++] = type;
> +               } else {
> +                       name = btf_name_by_offset(btf, member->name_off);
> +                       if (name && !strcmp(member_name, name))
> +                               return member;
> +               }
> +       }
> +       if (top > 0) {
> +               /* Pop from the anonymous stack and retry */
> +               type = anon_stack[--top];
> +               goto retry;
> +       }

Looks good, but I don't see a test case for this.
The logic is a bit tricky. I'd like to have a selftest that covers it.
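
(For illustration, a layout like the following - not taken from the patch or
from any existing selftest - is the kind of case the anonymous-member stack
has to handle: looking up "s_field" requires descending through two levels of
unnamed members.)

struct outer {
        int a;
        union {                         /* anonymous: name_off == 0, gets pushed */
                long b;
                struct {                /* anonymous again, pushed on a later pass */
                        short s_field;  /* the member being searched for */
                };
        };
};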

You probably need to drop Alan's reviewed-by, since the patch is quite
different from the time he reviewed it.

Assuming that is addressed. How do we merge the series?
The first 3 patches have serious conflicts with bpf trees.

Maybe send the first 3 with extra selftest for above recursion
targeting bpf-next then we can have a merge commit that Steven can pull
into tracing?

Or if we can have acks for patches 4-9 we can pull the whole set into bpf-next.
Masami Hiramatsu (Google) July 31, 2023, 11:57 p.m. UTC | #2
On Mon, 31 Jul 2023 14:59:47 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Mon, Jul 31, 2023 at 12:30 AM Masami Hiramatsu (Google)
> <mhiramat@kernel.org> wrote:
> >
> > From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> >
> > Add btf_find_struct_member() API to search a member of a given data structure
> > or union from the member's name.
> >
> > Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> > Reviewed-by: Alan Maguire <alan.maguire@oracle.com>
> > ---
> >  Changes in v3:
> >   - Remove simple input check.
> >   - Fix unneeded IS_ERR_OR_NULL() check for btf_type_by_id().
> >   - Move the code next to btf_get_func_param().
> >   - Use for_each_member() macro instead of for-loop.
> >   - Use btf_type_skip_modifiers() instead of btf_type_by_id().
> >  Changes in v4:
> >   - Use a stack for searching in anonymous members instead of nested call.
> > ---
> >  include/linux/btf.h |    3 +++
> >  kernel/bpf/btf.c    |   40 ++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 43 insertions(+)
> >
> > diff --git a/include/linux/btf.h b/include/linux/btf.h
> > index 20e3a07eef8f..4b10d57ceee0 100644
> > --- a/include/linux/btf.h
> > +++ b/include/linux/btf.h
> > @@ -226,6 +226,9 @@ const struct btf_type *btf_find_func_proto(const char *func_name,
> >                                            struct btf **btf_p);
> >  const struct btf_param *btf_get_func_param(const struct btf_type *func_proto,
> >                                            s32 *nr);
> > +const struct btf_member *btf_find_struct_member(struct btf *btf,
> > +                                               const struct btf_type *type,
> > +                                               const char *member_name);
> >
> >  #define for_each_member(i, struct_type, member)                        \
> >         for (i = 0, member = btf_type_member(struct_type);      \
> > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> > index f7b25c615269..8d81a4ffa67b 100644
> > --- a/kernel/bpf/btf.c
> > +++ b/kernel/bpf/btf.c
> > @@ -958,6 +958,46 @@ const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, s3
> >                 return NULL;
> >  }
> >
> > +#define BTF_ANON_STACK_MAX     16
> > +
> > +/*
> > + * Find a member of data structure/union by name and return it.
> > + * Return NULL if not found, or -EINVAL if parameter is invalid.
> > + */
> > +const struct btf_member *btf_find_struct_member(struct btf *btf,
> > +                                               const struct btf_type *type,
> > +                                               const char *member_name)
> > +{
> > +       const struct btf_type *anon_stack[BTF_ANON_STACK_MAX];
> > +       const struct btf_member *member;
> > +       const char *name;
> > +       int i, top = 0;
> > +
> > +retry:
> > +       if (!btf_type_is_struct(type))
> > +               return ERR_PTR(-EINVAL);
> > +
> > +       for_each_member(i, type, member) {
> > +               if (!member->name_off) {
> > +                       /* Anonymous union/struct: push it for later use */
> > +                       type = btf_type_skip_modifiers(btf, member->type, NULL);
> > +                       if (type && top < BTF_ANON_STACK_MAX)
> > +                               anon_stack[top++] = type;
> > +               } else {
> > +                       name = btf_name_by_offset(btf, member->name_off);
> > +                       if (name && !strcmp(member_name, name))
> > +                               return member;
> > +               }
> > +       }
> > +       if (top > 0) {
> > +               /* Pop from the anonymous stack and retry */
> > +               type = anon_stack[--top];
> > +               goto retry;
> > +       }
> 
> Looks good, but I don't see a test case for this.
> The logic is a bit tricky. I'd like to have a selftest that covers it.

Thanks, and I agree about selftest.

> 
> You probably need to drop Alan's reviewed-by, since the patch is quite
> different from the time he reviewed it.

OK. BTW, I found a problem with this function. I guess member->offset will
be the offset from the intermediate anonymous union, which is usually 0, but
I need the offset from the given structure. Thus the interface design must
be changed: pass a 'u32 *offset' and set the correct offset in it. If
there are nested intermediate anonymous unions, that offset must also be pushed.
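
Something along these lines, perhaps (a rough sketch of the revised interface,
not an actual patch; the names are illustrative, bitfield/kind_flag handling is
glossed over, and the reported offset is in the same units as member->offset,
i.e. the BTF bit offset):

const struct btf_member *btf_find_struct_member(struct btf *btf,
                                                const struct btf_type *type,
                                                const char *member_name,
                                                u32 *offset)
{
        struct {
                const struct btf_type *type;
                u32 offset;
        } anon_stack[BTF_ANON_STACK_MAX];
        const struct btf_member *member;
        u32 base = 0;
        const char *name;
        int i, top = 0;

retry:
        if (!btf_type_is_struct(type))
                return ERR_PTR(-EINVAL);

        for_each_member(i, type, member) {
                if (!member->name_off) {
                        /* Anonymous member: remember it together with its
                         * offset from the outermost structure. */
                        const struct btf_type *t;

                        t = btf_type_skip_modifiers(btf, member->type, NULL);
                        if (t && top < BTF_ANON_STACK_MAX) {
                                anon_stack[top].type = t;
                                anon_stack[top].offset = base + member->offset;
                                top++;
                        }
                } else {
                        name = btf_name_by_offset(btf, member->name_off);
                        if (name && !strcmp(member_name, name)) {
                                *offset = base + member->offset;
                                return member;
                        }
                }
        }
        if (top > 0) {
                /* Pop one anonymous member and search inside it,
                 * carrying its accumulated offset. */
                --top;
                type = anon_stack[top].type;
                base = anon_stack[top].offset;
                goto retry;
        }
        return NULL;
}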

> 
> Assuming that is addressed. How do we merge the series?
> The first 3 patches have serious conflicts with bpf trees.
> 
> Maybe send the first 3 with extra selftest for above recursion
> targeting bpf-next then we can have a merge commit that Steven can pull
> into tracing?
> 
> Or if we can have acks for patches 4-9 we can pull the whole set into bpf-next.

That's a good question. I don't like splitting the whole series across 2 -next
branches, so I can send this to bpf-next.
I need to work on another series(*) on fprobes which will not have conflicts with
this series. (*Replacing pt_regs with ftrace_regs on fprobe, which will take a
longer time, and needs to be adjusted with eBPF.)

Thank you,
Alexei Starovoitov Aug. 1, 2023, 12:29 a.m. UTC | #3
On Mon, Jul 31, 2023 at 4:57 PM Masami Hiramatsu <mhiramat@kernel.org> wrote:
>
> On Mon, 31 Jul 2023 14:59:47 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>
> > On Mon, Jul 31, 2023 at 12:30 AM Masami Hiramatsu (Google)
> > <mhiramat@kernel.org> wrote:
> > >
> > > From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> > >
> > > Add btf_find_struct_member() API to search a member of a given data structure
> > > or union from the member's name.
> > >
> > > Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> > > Reviewed-by: Alan Maguire <alan.maguire@oracle.com>
> > > ---
> > >  Changes in v3:
> > >   - Remove simple input check.
> > >   - Fix unneeded IS_ERR_OR_NULL() check for btf_type_by_id().
> > >   - Move the code next to btf_get_func_param().
> > >   - Use for_each_member() macro instead of for-loop.
> > >   - Use btf_type_skip_modifiers() instead of btf_type_by_id().
> > >  Changes in v4:
> > >   - Use a stack for searching in anonymous members instead of nested call.
> > > ---
> > >  include/linux/btf.h |    3 +++
> > >  kernel/bpf/btf.c    |   40 ++++++++++++++++++++++++++++++++++++++++
> > >  2 files changed, 43 insertions(+)
> > >
> > > diff --git a/include/linux/btf.h b/include/linux/btf.h
> > > index 20e3a07eef8f..4b10d57ceee0 100644
> > > --- a/include/linux/btf.h
> > > +++ b/include/linux/btf.h
> > > @@ -226,6 +226,9 @@ const struct btf_type *btf_find_func_proto(const char *func_name,
> > >                                            struct btf **btf_p);
> > >  const struct btf_param *btf_get_func_param(const struct btf_type *func_proto,
> > >                                            s32 *nr);
> > > +const struct btf_member *btf_find_struct_member(struct btf *btf,
> > > +                                               const struct btf_type *type,
> > > +                                               const char *member_name);
> > >
> > >  #define for_each_member(i, struct_type, member)                        \
> > >         for (i = 0, member = btf_type_member(struct_type);      \
> > > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> > > index f7b25c615269..8d81a4ffa67b 100644
> > > --- a/kernel/bpf/btf.c
> > > +++ b/kernel/bpf/btf.c
> > > @@ -958,6 +958,46 @@ const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, s3
> > >                 return NULL;
> > >  }
> > >
> > > +#define BTF_ANON_STACK_MAX     16
> > > +
> > > +/*
> > > + * Find a member of data structure/union by name and return it.
> > > + * Return NULL if not found, or -EINVAL if parameter is invalid.
> > > + */
> > > +const struct btf_member *btf_find_struct_member(struct btf *btf,
> > > +                                               const struct btf_type *type,
> > > +                                               const char *member_name)
> > > +{
> > > +       const struct btf_type *anon_stack[BTF_ANON_STACK_MAX];
> > > +       const struct btf_member *member;
> > > +       const char *name;
> > > +       int i, top = 0;
> > > +
> > > +retry:
> > > +       if (!btf_type_is_struct(type))
> > > +               return ERR_PTR(-EINVAL);
> > > +
> > > +       for_each_member(i, type, member) {
> > > +               if (!member->name_off) {
> > > +                       /* Anonymous union/struct: push it for later use */
> > > +                       type = btf_type_skip_modifiers(btf, member->type, NULL);
> > > +                       if (type && top < BTF_ANON_STACK_MAX)
> > > +                               anon_stack[top++] = type;
> > > +               } else {
> > > +                       name = btf_name_by_offset(btf, member->name_off);
> > > +                       if (name && !strcmp(member_name, name))
> > > +                               return member;
> > > +               }
> > > +       }
> > > +       if (top > 0) {
> > > +               /* Pop from the anonymous stack and retry */
> > > +               type = anon_stack[--top];
> > > +               goto retry;
> > > +       }
> >
> > Looks good, but I don't see a test case for this.
> > The logic is a bit tricky. I'd like to have a selftest that covers it.
>
> Thanks, and I agree about selftest.
>
> >
> > You probably need to drop Alan's reviewed-by, since the patch is quite
> > different from the time he reviewed it.
>
> OK. BTW, I found a problem on this function. I guess the member->offset will
> be the offset from the intermediate anonymous union, it is usually 0, but
> I need the offset from the given structure. Thus the interface design must
> be changed. Passing a 'u32 *offset' and set the correct offset in it. If
> it has nested intermediate anonymous unions, that offset must also be pushed.

With all that piling up, have you considered reusing btf_struct_walk()?
It's doing the opposite, off -> btf_id, while you need name -> btf_id.
But it will give an idea of the overall complexity if you want to solve it
for nested arrays and struct/union.

> >
> > Assuming that is addressed. How do we merge the series?
> > The first 3 patches have serious conflicts with bpf trees.
> >
> > Maybe send the first 3 with extra selftest for above recursion
> > targeting bpf-next then we can have a merge commit that Steven can pull
> > into tracing?
> >
> > Or if we can have acks for patches 4-9 we can pull the whole set into bpf-next.
>
> That's a good question. I don't like splitting the whole series in 2 -next
> branches. So I can send this to the bpf-next.

Works for me.

> I need to work on another series(*) on fprobes which will not have conflicts with
> this series. (*Replacing pt_regs with ftrace_regs on fprobe, which will take longer
> time, and need to adjust with eBPF).

ftrace_regs?
Ouch. For bpf we rely on pt_regs being an argument.
fprobe should be 100% compatible replacement of kprobe-at-the-func-start.
If it diverges from that it's a big issue for bpf.
We'd have to remove all of fprobe usage.
I could be missing something, of course.
Steven Rostedt Aug. 1, 2023, 1:15 a.m. UTC | #4
On Mon, 31 Jul 2023 14:59:47 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> Assuming that is addressed. How do we merge the series?
> The first 3 patches have serious conflicts with bpf trees.
> 
> Maybe send the first 3 with extra selftest for above recursion
> targeting bpf-next then we can have a merge commit that Steven can pull
> into tracing?

Would it be possible to do this by basing it off of one of Linus's tags,
and doing the merge and conflict resolution in your tree before it gets to
Linus?

That way we can pull in that clean branch without having to pull in
anything else from BPF. I believe Linus prefers this over the tracing tree
having extra changes from BPF that are not yet in his tree. We only need
these particular changes, we shouldn't be pulling in anything specific for
BPF, as I believe that will cause issues on Linus's side.

-- Steve


> 
> Or if we can have acks for patches 4-9 we can pull the whole set into bpf-next.
Alexei Starovoitov Aug. 1, 2023, 2:24 a.m. UTC | #5
On Mon, Jul 31, 2023 at 6:15 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Mon, 31 Jul 2023 14:59:47 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>
> > Assuming that is addressed. How do we merge the series?
> > The first 3 patches have serious conflicts with bpf trees.
> >
> > Maybe send the first 3 with extra selftest for above recursion
> > targeting bpf-next then we can have a merge commit that Steven can pull
> > into tracing?
>
> Would it be possible to do this by basing it off of one of Linus's tags,
> and doing the merge and conflict resolution in your tree before it gets to
> Linus?
>
> That way we can pull in that clean branch without having to pull in
> anything else from BPF. I believe Linus prefers this over having tracing
> having extra changes from BPF that are not yet in his tree. We only need
> these particular changes, we shouldn't be pulling in anything specific for
> BPF, as I believe that will cause issues on Linus's side.

We can try, but I suspect git tricks won't do it.
Masami's changes depend on patches for kernel/bpf/btf.c that
are already in bpf-next, so git would have to follow all commits
that touch this file. I don't think git is smart enough to
thread the needle and split the commit into files. If one commit touches
btf.c and something else that whole commit becomes a dependency
that pulls another commit with all files touched by
the previous commit and so on.
tbh for this set, the easiest for everyone, is to land the whole thing
through bpf-next, since there are no conflicts on fprobe side.
Steven Rostedt Aug. 1, 2023, 1:35 p.m. UTC | #6
On Mon, 31 Jul 2023 19:24:25 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Mon, Jul 31, 2023 at 6:15 PM Steven Rostedt <rostedt@goodmis.org>
> wrote:
> >
> > On Mon, 31 Jul 2023 14:59:47 -0700
> > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> >  
> > > Assuming that is addressed. How do we merge the series?
> > > The first 3 patches have serious conflicts with bpf trees.
> > >
> > > Maybe send the first 3 with extra selftest for above recursion
> > > targeting bpf-next then we can have a merge commit that Steven can
> > > pull into tracing?  
> >
> > Would it be possible to do this by basing it off of one of Linus's tags,
> > and doing the merge and conflict resolution in your tree before it gets
> > to Linus?
> >
> > That way we can pull in that clean branch without having to pull in
> > anything else from BPF. I believe Linus prefers this over having tracing
> > having extra changes from BPF that are not yet in his tree. We only need
> > these particular changes, we shouldn't be pulling in anything specific
> > for BPF, as I believe that will cause issues on Linus's side.  
> 
> We can try, but I suspect git tricks won't do it.
> Masami's changes depend on patches for kernel/bpf/btf.c that
> are already in bpf-next, so git would have to follow all commits

You mean other patches that Masami has sent are in the bpf tree already and
these are on top of them?

-- Steve

> that touch this file. I don't think git is smart enough to
> thread the needle and split the commit into files. If one commit touches
> btf.c and something else that whole commit becomes a dependency
> that pulls another commit with all files touched by
> the previous commit and so on.
> tbh for this set, the easiest for everyone, is to land the whole thing
> through bpf-next, since there are no conflicts on fprobe side.
Masami Hiramatsu (Google) Aug. 1, 2023, 3:02 p.m. UTC | #7
On Mon, 31 Jul 2023 17:29:49 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Mon, Jul 31, 2023 at 4:57 PM Masami Hiramatsu <mhiramat@kernel.org> wrote:
> >
> > On Mon, 31 Jul 2023 14:59:47 -0700
> > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> >
> > > On Mon, Jul 31, 2023 at 12:30 AM Masami Hiramatsu (Google)
> > > <mhiramat@kernel.org> wrote:
> > > >
> > > > From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> > > >
> > > > Add btf_find_struct_member() API to search a member of a given data structure
> > > > or union from the member's name.
> > > >
> > > > Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> > > > Reviewed-by: Alan Maguire <alan.maguire@oracle.com>
> > > > ---
> > > >  Changes in v3:
> > > >   - Remove simple input check.
> > > >   - Fix unneeded IS_ERR_OR_NULL() check for btf_type_by_id().
> > > >   - Move the code next to btf_get_func_param().
> > > >   - Use for_each_member() macro instead of for-loop.
> > > >   - Use btf_type_skip_modifiers() instead of btf_type_by_id().
> > > >  Changes in v4:
> > > >   - Use a stack for searching in anonymous members instead of nested call.
> > > > ---
> > > >  include/linux/btf.h |    3 +++
> > > >  kernel/bpf/btf.c    |   40 ++++++++++++++++++++++++++++++++++++++++
> > > >  2 files changed, 43 insertions(+)
> > > >
> > > > diff --git a/include/linux/btf.h b/include/linux/btf.h
> > > > index 20e3a07eef8f..4b10d57ceee0 100644
> > > > --- a/include/linux/btf.h
> > > > +++ b/include/linux/btf.h
> > > > @@ -226,6 +226,9 @@ const struct btf_type *btf_find_func_proto(const char *func_name,
> > > >                                            struct btf **btf_p);
> > > >  const struct btf_param *btf_get_func_param(const struct btf_type *func_proto,
> > > >                                            s32 *nr);
> > > > +const struct btf_member *btf_find_struct_member(struct btf *btf,
> > > > +                                               const struct btf_type *type,
> > > > +                                               const char *member_name);
> > > >
> > > >  #define for_each_member(i, struct_type, member)                        \
> > > >         for (i = 0, member = btf_type_member(struct_type);      \
> > > > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> > > > index f7b25c615269..8d81a4ffa67b 100644
> > > > --- a/kernel/bpf/btf.c
> > > > +++ b/kernel/bpf/btf.c
> > > > @@ -958,6 +958,46 @@ const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, s3
> > > >                 return NULL;
> > > >  }
> > > >
> > > > +#define BTF_ANON_STACK_MAX     16
> > > > +
> > > > +/*
> > > > + * Find a member of data structure/union by name and return it.
> > > > + * Return NULL if not found, or -EINVAL if parameter is invalid.
> > > > + */
> > > > +const struct btf_member *btf_find_struct_member(struct btf *btf,
> > > > +                                               const struct btf_type *type,
> > > > +                                               const char *member_name)
> > > > +{
> > > > +       const struct btf_type *anon_stack[BTF_ANON_STACK_MAX];
> > > > +       const struct btf_member *member;
> > > > +       const char *name;
> > > > +       int i, top = 0;
> > > > +
> > > > +retry:
> > > > +       if (!btf_type_is_struct(type))
> > > > +               return ERR_PTR(-EINVAL);
> > > > +
> > > > +       for_each_member(i, type, member) {
> > > > +               if (!member->name_off) {
> > > > +                       /* Anonymous union/struct: push it for later use */
> > > > +                       type = btf_type_skip_modifiers(btf, member->type, NULL);
> > > > +                       if (type && top < BTF_ANON_STACK_MAX)
> > > > +                               anon_stack[top++] = type;
> > > > +               } else {
> > > > +                       name = btf_name_by_offset(btf, member->name_off);
> > > > +                       if (name && !strcmp(member_name, name))
> > > > +                               return member;
> > > > +               }
> > > > +       }
> > > > +       if (top > 0) {
> > > > +               /* Pop from the anonymous stack and retry */
> > > > +               type = anon_stack[--top];
> > > > +               goto retry;
> > > > +       }
> > >
> > > Looks good, but I don't see a test case for this.
> > > The logic is a bit tricky. I'd like to have a selftest that covers it.
> >
> > Thanks, and I agree about selftest.
> >
> > >
> > > You probably need to drop Alan's reviewed-by, since the patch is quite
> > > different from the time he reviewed it.
> >
> > OK. BTW, I found a problem on this function. I guess the member->offset will
> > be the offset from the intermediate anonymous union, it is usually 0, but
> > I need the offset from the given structure. Thus the interface design must
> > be changed. Passing a 'u32 *offset' and set the correct offset in it. If
> > it has nested intermediate anonymous unions, that offset must also be pushed.
> 
> With all that piling up have you considering reusing btf_struct_walk() ?
> It's doing the opposite off -> btf_id while you need name -> btf_id.
> But it will give an idea of overall complexity if you want to solve it
> for nested arrays and struct/union.

No, it seems a bit different (and it may not return the name correctly for
an anonymous struct/union). Of course it looks like interesting work. What I found
is that returning the btf_member is not enough, because a btf_member in a nested
union will have the offset from the nested structure. I have to accumulate the
offsets. It is easy to fix (just stack (tid, offset) pairs instead of type*) :)

> 
> > >
> > > Assuming that is addressed. How do we merge the series?
> > > The first 3 patches have serious conflicts with bpf trees.
> > >
> > > Maybe send the first 3 with extra selftest for above recursion
> > > targeting bpf-next then we can have a merge commit that Steven can pull
> > > into tracing?
> > >
> > > Or if we can have acks for patches 4-9 we can pull the whole set into bpf-next.
> >
> > That's a good question. I don't like splitting the whole series in 2 -next
> > branches. So I can send this to the bpf-next.
> 
> Works for me.

Or, yet another option is keeping the new btf APIs in trace/trace_probe.c in this
series, and moving all of them to btf.c in the next series.
This will not create any merge problem between trees, but it needs 2 series
in different releases (since we cannot send the second one until the first
one is merged).

> 
> > I need to work on another series(*) on fprobes which will not have conflicts with
> > this series. (*Replacing pt_regs with ftrace_regs on fprobe, which will take longer
> > time, and need to adjust with eBPF).
> 
> ftrace_regs?
> Ouch. For bpf we rely on pt_regs being an argument.

Yeah, that's a problem.

> fprobe should be 100% compatible replacement of kprobe-at-the-func-start.

No, fprobe is not such a feature. It must provide a more generic interface
because it is a probe version of ftrace, not of kprobe.

> If it diverges from that it's a big issue for bpf.
> We'd have to remove all of fprobe usage.
> I could be missing something, of course.

Yes, so that's the discussion point. At first, I will disable fprobe for BPF
if ftrace_regs is not compatible with pt_regs, but eventually it should be
handled in order to support arm64. I believe BPF can do it, since ftrace can.

Thank you,
Masami Hiramatsu (Google) Aug. 1, 2023, 3:18 p.m. UTC | #8
On Mon, 31 Jul 2023 19:24:25 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Mon, Jul 31, 2023 at 6:15 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > On Mon, 31 Jul 2023 14:59:47 -0700
> > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> >
> > > Assuming that is addressed. How do we merge the series?
> > > The first 3 patches have serious conflicts with bpf trees.
> > >
> > > Maybe send the first 3 with extra selftest for above recursion
> > > targeting bpf-next then we can have a merge commit that Steven can pull
> > > into tracing?
> >
> > Would it be possible to do this by basing it off of one of Linus's tags,
> > and doing the merge and conflict resolution in your tree before it gets to
> > Linus?
> >
> > That way we can pull in that clean branch without having to pull in
> > anything else from BPF. I believe Linus prefers this over having tracing
> > having extra changes from BPF that are not yet in his tree. We only need
> > these particular changes, we shouldn't be pulling in anything specific for
> > BPF, as I believe that will cause issues on Linus's side.
> 
> We can try, but I suspect git tricks won't do it.
> Masami's changes depend on patches for kernel/bpf/btf.c that
> are already in bpf-next, so git would have to follow all commits
> that touch this file. 

This point is strange. I'm working on probe/fixes which is based on
v6.5-rc3, so any bpf-next change should not be involved. Can you recheck
this point?

> I don't think git is smart enough to
> thread the needle and split the commit into files. If one commit touches
> btf.c and something else that whole commit becomes a dependency
> that pulls another commit with all files touched by
> the previous commit and so on.

As far as I understand Steve's method, we will have an intermediate branch
on bpf or probe tree, like 

linus(some common commit) ---- probes/btf-find-api

and merge it to both bpf-next and probes/for-next branch

          +----------------------bpf-next --- (merge bpf patches)
         /                       / merge
common -/\ probes/btf-find-api -/-\
          \                        \ merge
           +----------------------probes/for-next --- (merge probe patches)

Thus, we can merge both for-next branches at the next merge window without
any issue. (But, yes, this is not simple, and needs maximum care)

Thank you,

> tbh for this set, the easiest for everyone, is to land the whole thing
> through bpf-next, since there are no conflicts on fprobe side.
Steven Rostedt Aug. 1, 2023, 3:20 p.m. UTC | #9
On Wed, 2 Aug 2023 00:02:28 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> > If it diverges from that it's a big issue for bpf.
> > We'd have to remove all of fprobe usage.
> > I could be missing something, of course.  
> 
> Yes, so that's the discussion point. At first, I will disable fprobe on BPF
> if ftrace_regs is not compatible with pt_regs, but eventually it should be
> handled to support arm64. I believe BPF can do it since ftrace can do.

Note, for FYI let me give you a little history of where ftrace_regs came
from. When I realized that all function tracing had to save all the
registers that represent the arguments of a function as well as the stack
pointer, I wanted to change the non FTRACE_WITH_REGS to be able to have
access to those registers. This is where FTRACE_WITH_ARGS came from.

My first attempt was to pass a pt_regs that was partially filled, with only
the registers required for the arguments. But the x86 maintainers NACK'd
that. They refused to allow a partially filled pt_regs as that could cause
bugs in the future when a user may assume that the pt_regs is filled but is
not.

The solution was to come up with ftrace_regs, which just means it has all
the registers to extract the arguments of a function and nothing more. Most
implementations just have a partially filled pt_regs within it, but an API
needs to be used to get to the argument values.

When you say BPF uses pt_regs, is the pt_regs full or does it get passed a
partially filled structure?

For fast function entry, ftrace_regs is what should be used if the pt_regs
is not filled. As it is only for use for function entry. It supplies all
regs and stack pointer to get to all the arguments.
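
For example, a function-entry callback that only needs the arguments could look
roughly like the sketch below, assuming the generic ftrace_regs accessors from
include/linux/ftrace.h (ftrace_regs_get_argument(), ftrace_regs_get_stack_pointer());
the callback name is made up:

static void my_entry_callback(unsigned long ip, unsigned long parent_ip,
                              struct ftrace_ops *op, struct ftrace_regs *fregs)
{
        unsigned long arg0 = ftrace_regs_get_argument(fregs, 0);
        unsigned long sp = ftrace_regs_get_stack_pointer(fregs);

        /* inspect the first argument and the stack without a full pt_regs */
        trace_printk("%pS: arg0=%lx sp=%lx\n", (void *)ip, arg0, sp);
}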

-- Steve
Steven Rostedt Aug. 1, 2023, 3:32 p.m. UTC | #10
On Tue, 1 Aug 2023 11:20:36 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> The solution was to come up with ftrace_regs, which just means it has all
> the registers to extract the arguments of a function and nothing more. Most

This isn't 100% true. The ftrace_regs may hold a fully filled pt_regs. As
the FTRACE_WITH_REGS callbacks still get passed a ftrace_regs pointer. They
will do:

	void callback(..., struct ftrace_regs *fregs) {
		struct pt_regs *regs = ftrace_get_regs(fregs);


Where ftrace_get_regs() will return the pt_regs only if it is fully filled.
If it is not, then it returns NULL. This was what the x86 maintainers
agreed with.

-- Steve


> implementations just have a partially filled pt_regs within it, but an API
> needs to be used to get to the argument values.
Alexei Starovoitov Aug. 1, 2023, 10:18 p.m. UTC | #11
On Tue, Aug 1, 2023 at 8:32 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Tue, 1 Aug 2023 11:20:36 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
> > The solution was to come up with ftrace_regs, which just means it has all
> > the registers to extract the arguments of a function and nothing more. Most
>
> This isn't 100% true. The ftrace_regs may hold a fully filled pt_regs. As
> the FTRACE_WITH_REGS callbacks still get passed a ftrace_regs pointer. They
> will do:
>
>         void callback(..., struct ftrace_regs *fregs) {
>                 struct pt_regs *regs = ftrace_get_regs(fregs);
>
>
> Where ftrace_get_regs() will return the pt_regs only if it is fully filled.
> If it is not, then it returns NULL. This was what the x86 maintainers
> agreed with.

arch/arm64/include/asm/ftrace.h:#define arch_ftrace_get_regs(regs) NULL

Ouch. That's very bad.
We care a lot about bpf running well on arm64.

If you guys decide to convert fprobe to ftrace_regs please
make it depend on kconfig or something.
bpf side needs full pt_regs.
It's not about access to args.
pt_regs is passed from bpf prog further into all kinds of perf event
functions including stack walking.
I think ORC unwinder might depend on availability of all registers.
Other perf helpers might need it too. Like perf_event_output.
bpf progs need to access arguments, no doubt about that.
If ftrace_regs have them exactly in the same offsets as in pt_regs
that might work transparently for bpf progs, but, I'm afraid,
it's not the case on all archs.
So we need full pt_regs to make sure all paths are still working.

Adding Jiri and others.
Alexei Starovoitov Aug. 1, 2023, 10:21 p.m. UTC | #12
On Tue, Aug 1, 2023 at 8:18 AM Masami Hiramatsu <mhiramat@kernel.org> wrote:
>
> On Mon, 31 Jul 2023 19:24:25 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>
> > On Mon, Jul 31, 2023 at 6:15 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> > >
> > > On Mon, 31 Jul 2023 14:59:47 -0700
> > > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > >
> > > > Assuming that is addressed. How do we merge the series?
> > > > The first 3 patches have serious conflicts with bpf trees.
> > > >
> > > > Maybe send the first 3 with extra selftest for above recursion
> > > > targeting bpf-next then we can have a merge commit that Steven can pull
> > > > into tracing?
> > >
> > > Would it be possible to do this by basing it off of one of Linus's tags,
> > > and doing the merge and conflict resolution in your tree before it gets to
> > > Linus?
> > >
> > > That way we can pull in that clean branch without having to pull in
> > > anything else from BPF. I believe Linus prefers this over having tracing
> > > having extra changes from BPF that are not yet in his tree. We only need
> > > these particular changes, we shouldn't be pulling in anything specific for
> > > BPF, as I believe that will cause issues on Linus's side.
> >
> > We can try, but I suspect git tricks won't do it.
> > Masami's changes depend on patches for kernel/bpf/btf.c that
> > are already in bpf-next, so git would have to follow all commits
> > that touch this file.
>
> This point is strange. I'm working on probe/fixes which is based on
> v6.5-rc3, so any bpf-next change should not be involved. Can you recheck
> this point?
>
> > I don't think git is smart enough to
> > thread the needle and split the commit into files. If one commit touches
> > btf.c and something else that whole commit becomes a dependency
> > that pulls another commit with all files touched by
> > the previous commit and so on.
>
> As far as I understand Steve's method, we will have an intermediate branch
> on bpf or probe tree, like
>
> linus(some common commit) ---- probes/btf-find-api
>
> and merge it to both bpf-next and probes/for-next branch
>
>           +----------------------bpf-next --- (merge bpf patches)
>          /                       / merge
> common -/\ probes/btf-find-api -/-\
>           \                        \ merge
>            +----------------------probes/for-next --- (merge probe patches)
>
> Thus, we can merge both for-next branches at next merge window without
> any issue. (But, yes, this is not simple, and needs maxium care)

Sounds like the path of least resistance is to keep the changes
in kernel/trace and consolidate with kernel/bpf/btf.c after the next
merge window.
Steven Rostedt Aug. 1, 2023, 11:09 p.m. UTC | #13
On Tue, 1 Aug 2023 15:18:56 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Tue, Aug 1, 2023 at 8:32 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > On Tue, 1 Aug 2023 11:20:36 -0400
> > Steven Rostedt <rostedt@goodmis.org> wrote:
> >  
> > > The solution was to come up with ftrace_regs, which just means it has all
> > > the registers to extract the arguments of a function and nothing more. Most  
> >
> > This isn't 100% true. The ftrace_regs may hold a fully filled pt_regs. As
> > the FTRACE_WITH_REGS callbacks still get passed a ftrace_regs pointer. They
> > will do:
> >
> >         void callback(..., struct ftrace_regs *fregs) {
> >                 struct pt_regs *regs = ftrace_get_regs(fregs);
> >
> >
> > Where ftrace_get_regs() will return the pt_regs only if it is fully filled.
> > If it is not, then it returns NULL. This was what the x86 maintainers
> > agreed with.  
> 
> arch/arm64/include/asm/ftrace.h:#define arch_ftrace_get_regs(regs) NULL
> 
> Ouch. That's very bad.
> We care a lot about bpf running well on arm64.

[ Adding Mark and Florent ]

That's because arm64 doesn't support FTRACE_WITH_REGS anymore. Their
function handlers only care about the arguments. If you want full regs at
function entry, then you need to take a breakpoint hit for a full kprobe.

In fact, fprobes isn't even supported on arm64 because it doesn't have
DYNAMIC_FTRACE_WITH_REGS. I believe that was the reason Masami was trying
to get it to work with ftrace_regs. To get it to work on arm64.

Again, ftrace_get_regs(fregs) is only supposed to return something if the
pt_regs is fully supplied. If they are not, then it must not be used. Are
you not using a fully filled pt_regs? Because that's what both Thomas and
Peter (also added) told me not to do!

Otherwise, ftrace_regs() has support on arm64 for getting to the argument
registers and the stack. Even live kernel patching now uses ftrace_regs().

> 
> If you guys decide to convert fprobe to ftrace_regs please
> make it depend on kconfig or something.
> bpf side needs full pt_regs.

Then use kprobes. When I asked Masami what the difference between fprobes
and kprobes was, he told me that it would be that it would no longer rely
on the slower FTRACE_WITH_REGS. But currently, it still does.

The reason I started the FTRACE_WITH_ARGS (which gave us ftrace_regs) in
the first place, was because of the overhead you reported to me with
ftrace_regs_caller and why you wanted to go the direct trampoline approach.
That's when I realized I could use a subset because those registers were
already being saved. The only reason FTRACE_WITH_REGS was created was it
had to supply full pt_regs (including flags) and emulate a breakpoint for
the kprobes interface. But in reality, nothing really needs all that.

> It's not about access to args.
> pt_regs is passed from bpf prog further into all kinds of perf event
> functions including stack walking.

ftrace_regs gives you the stack pointer. Basically, it gives you access to
anything that is required to be saved to do a function call from fentry.

> I think ORC unwinder might depend on availability of all registers.
> Other perf helpers might need it too. Like perf_event_output.
> bpf progs need to access arguments, no doubt about that.
> If ftrace_regs have them exactly in the same offsets as in pt_regs
> that might work transparently for bpf progs, but, I'm afraid,
> it's not the case on all archs.
> So we need full pt_regs to make sure all paths are still working.
> 
> Adding Jiri and others.

Then I recommend that you give up using fprobes and just stick with kprobes
as that's guaranteed to give you full pt_regs (at the overhead of doing
things like filing in flags and such). And currently for arm64, fprobes can
only work with ftrace_regs, without the full pt_regs.

-- Steve
Masami Hiramatsu (Google) Aug. 1, 2023, 11:17 p.m. UTC | #14
On Tue, 1 Aug 2023 15:21:59 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Tue, Aug 1, 2023 at 8:18 AM Masami Hiramatsu <mhiramat@kernel.org> wrote:
> >
> > On Mon, 31 Jul 2023 19:24:25 -0700
> > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> >
> > > On Mon, Jul 31, 2023 at 6:15 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> > > >
> > > > On Mon, 31 Jul 2023 14:59:47 -0700
> > > > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > > Assuming that is addressed. How do we merge the series?
> > > > > The first 3 patches have serious conflicts with bpf trees.
> > > > >
> > > > > Maybe send the first 3 with extra selftest for above recursion
> > > > > targeting bpf-next then we can have a merge commit that Steven can pull
> > > > > into tracing?
> > > >
> > > > Would it be possible to do this by basing it off of one of Linus's tags,
> > > > and doing the merge and conflict resolution in your tree before it gets to
> > > > Linus?
> > > >
> > > > That way we can pull in that clean branch without having to pull in
> > > > anything else from BPF. I believe Linus prefers this over having tracing
> > > > having extra changes from BPF that are not yet in his tree. We only need
> > > > these particular changes, we shouldn't be pulling in anything specific for
> > > > BPF, as I believe that will cause issues on Linus's side.
> > >
> > > We can try, but I suspect git tricks won't do it.
> > > Masami's changes depend on patches for kernel/bpf/btf.c that
> > > are already in bpf-next, so git would have to follow all commits
> > > that touch this file.
> >
> > This point is strange. I'm working on probe/fixes which is based on
> > v6.5-rc3, so any bpf-next change should not be involved. Can you recheck
> > this point?
> >
> > > I don't think git is smart enough to
> > > thread the needle and split the commit into files. If one commit touches
> > > btf.c and something else that whole commit becomes a dependency
> > > that pulls another commit with all files touched by
> > > the previous commit and so on.
> >
> > As far as I understand Steve's method, we will have an intermediate branch
> > on bpf or probe tree, like
> >
> > linus(some common commit) ---- probes/btf-find-api
> >
> > and merge it to both bpf-next and probes/for-next branch
> >
> >           +----------------------bpf-next --- (merge bpf patches)
> >          /                       / merge
> > common -/\ probes/btf-find-api -/-\
> >           \                        \ merge
> >            +----------------------probes/for-next --- (merge probe patches)
> >
> > Thus, we can merge both for-next branches at next merge window without
> > any issue. (But, yes, this is not simple, and needs maxium care)
> 
> Sounds like the path of least resistance is to keep the changes
> in kernel/trace and consolidate with kernel/bpf/btf.c after the next
> merge window.

OK, sounds good to me. I will only expose the bpf_find_btf_id() then.

Thank you,
Alexei Starovoitov Aug. 1, 2023, 11:44 p.m. UTC | #15
On Tue, Aug 1, 2023 at 4:09 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> Then I recommend that you give up using fprobes and just stick with kprobes
> as that's guaranteed to give you full pt_regs (at the overhead of doing
> things like filing in flags and such). And currently for arm64, fprobes can
> only work with ftrace_regs, without the full pt_regs.

bpf doesn't attach to fprobes directly. That was never requested.
But Jiri's work to support multi attach
https://lore.kernel.org/bpf/20220316122419.933957-1-jolsa@kernel.org/
was a joint effort with Masami that relied on fprobe multi attach api.
register_fprobe_ips() in particular, because the promise you guys
give us that callback will get pt_regs as
described in Documentation/trace/fprobe.rst.
From bpf side we don't care that such pt_regs is 100% filled in or
only partial as long as this pt_regs pointer is valid for perf_event_output
and stack walking that consume pt_regs.
I believe that was and still is the case for both x86 and arm64.

The way I understood Masami's intent is to change that promise so that the
fprobe callback will receive a ftrace_regs that is incompatible with
pt_regs, and that's obviously bad.
What you're suggesting "give up on using fprobe" is not up to us.
We're not using them. We care about register_fprobe_ips() and what
callback receives. Whatever internal changes to fprobe you're doing
are ok as long as the callback receives valid pt_regs (even partially filled).
Masami Hiramatsu (Google) Aug. 2, 2023, 12:21 a.m. UTC | #16
On Tue, 1 Aug 2023 19:09:20 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Tue, 1 Aug 2023 15:18:56 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> > On Tue, Aug 1, 2023 at 8:32 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> > >
> > > On Tue, 1 Aug 2023 11:20:36 -0400
> > > Steven Rostedt <rostedt@goodmis.org> wrote:
> > >  
> > > > The solution was to come up with ftrace_regs, which just means it has all
> > > > the registers to extract the arguments of a function and nothing more. Most  
> > >
> > > This isn't 100% true. The ftrace_regs may hold a fully filled pt_regs. As
> > > the FTRACE_WITH_REGS callbacks still get passed a ftrace_regs pointer. They
> > > will do:
> > >
> > >         void callback(..., struct ftrace_regs *fregs) {
> > >                 struct pt_regs *regs = ftrace_get_regs(fregs);
> > >
> > >
> > > Where ftrace_get_regs() will return the pt_regs only if it is fully filled.
> > > If it is not, then it returns NULL. This was what the x86 maintainers
> > > agreed with.  
> > 
> > arch/arm64/include/asm/ftrace.h:#define arch_ftrace_get_regs(regs) NULL
> > 
> > Ouch. That's very bad.
> > We care a lot about bpf running well on arm64.
> 
> [ Adding Mark and Florent ]
> 
> That's because arm64 doesn't support FTRACE_WITH_REGS anymore. Their
> function handlers only care about the arguments. If you want full regs at
> function entry, then you need to take a breakpoint hit for a full kprobe.
> 
> In fact, fprobes isn't even supported on arm64 because it it doesn't have
> DYNAMIC_FTRACE_WITH_REGS. I believe that was the reason Masami was trying
> to get it to work with ftrace_regs. To get it to work on arm64.

That's right. And I think (agree) pt_regs is too heavy for function entry/exit,
because most users only need to access the function arguments or return value.
kprobes is a bit different because it is an instruction-level inspection
tool.

> 
> Again, ftrace_get_regs(fregs) is only suppose to return something if the
> pt_regs is fully supplied. If they are not, then it must not be used. Are
> you not using a fully filled pt_regs? Because that's what both Thomas and
> Peter (also added) told me not to do!

I guess that the user-land BPF tools (compilers etc.) currently only generate
bytecode that accesses registers through pt_regs for kernel probes.
This is why you are using "kprobes" as a name. But I think you can be
more flexible and generate code that accesses registers in ftrace_regs
(because it's just a difference in the offset value).

> 
> Otherwise, ftrace_regs() has support on arm64 for getting to the argument
> registers and the stack. Even live kernel patching now uses ftrace_regs().
> 
> > 
> > If you guys decide to convert fprobe to ftrace_regs please
> > make it depend on kconfig or something.
> > bpf side needs full pt_regs.
> 
> Then use kprobes. When I asked Masami what the difference between fprobes
> and kprobes was, he told me that it would be that it would no longer rely
> on the slower FTRACE_WITH_REGS. But currently, it still does.

kprobes needs to keep using pt_regs because the software-breakpoint exception
handler gets that. And fprobe is used for the bpf multi-kprobe interface,
but I think that can be optional.

So until the user-land tools support ftrace_regs, you can just disable the
use of fprobes if CONFIG_DYNAMIC_FTRACE_WITH_REGS=n.

Then you can safely use 

struct pt_regs *regs = ftrace_get_regs(fregs);

I think we can just replace the CONFIG_FPROBE ifdefs with
CONFIG_DYNAMIC_FTRACE_WITH_REGS in kernel/trace/bpf_trace.c
And that will be the first version of using ftrace_regs in fprobe.
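
A minimal sketch of that first step (the handler name and signature here are
only illustrative, not the current fprobe callback API):

#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
static void fprobe_entry_sketch(struct fprobe *fp, unsigned long ip,
                                struct ftrace_regs *fregs)
{
        /* With DYNAMIC_FTRACE_WITH_REGS this is a fully populated pt_regs;
         * otherwise it would be NULL and there is nothing BPF-safe to pass on. */
        struct pt_regs *regs = ftrace_get_regs(fregs);

        if (!regs)
                return;

        /* hand regs to the existing pt_regs-based BPF path here */
}
#endif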

> 
> The reason I started the FTRACE_WITH_ARGS (which gave us ftrace_regs) in
> the first place, was because of the overhead you reported to me with
> ftrace_regs_caller and why you wanted to go the direct trampoline approach.
> That's when I realized I could use a subset because those registers were
> already being saved. The only reason FTRACE_WITH_REGS was created was it
> had to supply full pt_regs (including flags) and emulate a breakpoint for
> the kprobes interface. But in reality, nothing really needs all that.
> 
> > It's not about access to args.
> > pt_regs is passed from bpf prog further into all kinds of perf event
> > functions including stack walking.
> 
> ftrace_regs gives you the stack pointer. Basically, it gives you access to
> anything that is required to be saved to do a function call from fentry.

Yeah, for stack walking, we usually need the stack pointer and the instruction
pointer or frame pointer. But Alexei made a good point: linux/stacktrace.h provides
a pt_regs interface because pt_regs is a generic (arch-independent) data
structure (see arch_stack_walk()). We need a new interface for it.

> 
> > I think ORC unwinder might depend on availability of all registers.

This is not correct. ORC uses only a limited set of registers (r10, r13, bp, sp,
di, dx) on x86. Anyway, since ftrace can produce a stacktrace, it should be
possible to use ORC with ftrace_regs.

> > Other perf helpers might need it too. Like perf_event_output.
> > bpf progs need to access arguments, no doubt about that.
> > If ftrace_regs have them exactly in the same offsets as in pt_regs
> > that might work transparently for bpf progs, but, I'm afraid,
> > it's not the case on all archs.
> > So we need full pt_regs to make sure all paths are still working.
> > 
> > Adding Jiri and others.
> 
> Then I recommend that you give up using fprobes and just stick with kprobes
> as that's guaranteed to give you full pt_regs (at the overhead of doing
> things like filing in flags and such). And currently for arm64, fprobes can
> only work with ftrace_regs, without the full pt_regs.

I think we can continue to limit usage of fprobe(kprobe_multi) with
CONFIG_DYNAMIC_FTRACE_WITH_REGS, which can be configured on x86. That will
not change anything from the BPF point of view.

Thank you,

> 
> -- Steve
Steven Rostedt Aug. 2, 2023, 12:40 a.m. UTC | #17
On Wed, 2 Aug 2023 09:21:46 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> > Then use kprobes. When I asked Masami what the difference between fprobes
> > and kprobes was, he told me that it would be that it would no longer rely
> > on the slower FTRACE_WITH_REGS. But currently, it still does.  
> 
> kprobes needs to keep using pt_regs because software-breakpoint exception
> handler gets that. And fprobe is used for bpf multi-kprobe interface,
> but I think it can be optional.
> 
> So until user-land tool supports the ftrace_regs, you can just disable
> using fprobes if CONFIG_DYNAMIC_FTRACE_WITH_REGS=n

I'm confused. I asked about the difference between kprobes on ftrace
and fprobes, and you said it was to get rid of the requirement of
FTRACE_WITH_REGS.

 https://lore.kernel.org/all/20230120205535.98998636329ca4d5f8325bc3@kernel.org/

> 
> Then you can safely use 
> 
> struct pt_regs *regs = ftrace_get_regs(fregs);
> 
> I think we can just replace the CONFIG_FPROBE ifdefs with
> CONFIG_DYNAMIC_FTRACE_WITH_REGS in kernel/trace/bpf_trace.c
> And that will be the first version of using ftrace_regs in fprobe.

But it is still slow. The FTRACE_WITH_REGS gives us the full pt_regs
and saves all registers including flags, which is a very slow operation
(and noticeable in profilers).

And this still doesn't work on arm64.

Maybe we can add a ftrace_partial_regs(fregs) that returns a
partially filled pt_regs, and the caller that uses this obviously knows
it's partial (as it's in the name). But this doesn't quite help out arm64
because unlike x86, struct ftrace_regs is not address-compatible with the
pt_regs fields. It would need to do a copy.

 ftrace_partial_regs(fregs, &regs) ?

-- Steve
Steven Rostedt Aug. 2, 2023, 12:44 a.m. UTC | #18
On Tue, 1 Aug 2023 20:40:54 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> Maybe we can add a ftrace_partial_regs(fregs) that returns a
> partially filled pt_regs, and the caller that uses this obviously knows
> its partial (as it's in the name). But this doesn't quite help out arm64
> because unlike x86, struct ftrace_regs does not contain an address
> compatibility with pt_regs fields. It would need to do a copy.
> 
>  ftrace_partial_regs(fregs, &regs) ?

Well, both would be pointers so you wouldn't need the "&", but it was
to stress that it would be copying one to the other.

  void ftrace_partial_regs(const struct ftrace_regs *fregs, struct pt_regs *regs);
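
(For illustration only, such a helper on an arch whose ftrace_regs is not
layout-compatible with pt_regs might do something like the sketch below; the
field names assume the arm64 ftrace_regs layout of x0-x8, fp, lr, sp, pc, and
this is not a proposed patch:)

static inline void
ftrace_partial_regs(const struct ftrace_regs *fregs, struct pt_regs *regs)
{
        /* copy only what the entry code saved: x0-x8, fp, lr, sp, pc */
        memcpy(regs->regs, fregs->regs, sizeof(fregs->regs));
        regs->regs[29] = fregs->fp;
        regs->regs[30] = fregs->lr;
        regs->sp = fregs->sp;
        regs->pc = fregs->pc;
}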

-- Steve
Alexei Starovoitov Aug. 2, 2023, 2:22 a.m. UTC | #19
On Tue, Aug 1, 2023 at 5:44 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Tue, 1 Aug 2023 20:40:54 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
> > Maybe we can add a ftrace_partial_regs(fregs) that returns a
> > partially filled pt_regs, and the caller that uses this obviously knows
> > its partial (as it's in the name). But this doesn't quite help out arm64
> > because unlike x86, struct ftrace_regs does not contain an address
> > compatibility with pt_regs fields. It would need to do a copy.
> >
> >  ftrace_partial_regs(fregs, &regs) ?
>
> Well, both would be pointers so you wouldn't need the "&", but it was
> to stress that it would be copying one to the other.
>
>   void ftrace_partial_regs(const struct ftrace_regs *fregs, struct pt_regs regs);

Copy works, but why did you pick a different layout?
Why not use pt_regs? If saving the flags is slow, just skip that part
and whatever else is slow. You don't even need to zero out
unsaved fields. Just ask the caller to zero out pt_regs beforehand.
Most users have a per-cpu pt_regs that is being reused.
So there will be one zero-out in the beginning and every partial
save of regs will be fast.
Then there won't be any need for a copy-converter from ftrace_regs to pt_regs.
Maybe too much churn at this point. Copy is fine.
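
(A sketch of the reuse pattern described above; the names are illustrative and
ftrace_partial_regs() is the hypothetical helper discussed earlier:)

static DEFINE_PER_CPU(struct pt_regs, bpf_probe_regs);  /* zeroed at boot */

static void on_entry(struct ftrace_regs *fregs)
{
        struct pt_regs *regs = this_cpu_ptr(&bpf_probe_regs);

        /* only the partially saved registers are (re)written; fields that
         * are never saved keep their zeroed values */
        ftrace_partial_regs(fregs, regs);
        /* pass regs to perf_event_output() / stack walking as before */
}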
Steven Rostedt Aug. 2, 2023, 2:32 a.m. UTC | #20
On Tue, 1 Aug 2023 19:22:01 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> >   void ftrace_partial_regs(const struct ftrace_regs *fregs, struct pt_regs regs);  
> 
> Copy works, but why did you pick a different layout?

I didn't. That code was written by the arm64 maintainers.

-- Steve
Masami Hiramatsu (Google) Aug. 2, 2023, 1:56 p.m. UTC | #21
On Tue, 1 Aug 2023 20:40:54 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Wed, 2 Aug 2023 09:21:46 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> 
> > > Then use kprobes. When I asked Masami what the difference between fprobes
> > > and kprobes was, he told me that it would be that it would no longer rely
> > > on the slower FTRACE_WITH_REGS. But currently, it still does.  
> > 
> > kprobes needs to keep using pt_regs because software-breakpoint exception
> > handler gets that. And fprobe is used for bpf multi-kprobe interface,
> > but I think it can be optional.
> > 
> > So until user-land tool supports the ftrace_regs, you can just disable
> > using fprobes if CONFIG_DYNAMIC_FTRACE_WITH_REGS=n
> 
> I'm confused. I asked about the difference between kprobes on ftrace
> and fprobes, and you said it was to get rid of the requirement of
> FTRACE_WITH_REGS.
> 
>  https://lore.kernel.org/all/20230120205535.98998636329ca4d5f8325bc3@kernel.org/

Yes, it is for enabling fprobe (and fprobe-event) on more architectures.
I don't think it's possible to change everything at once. So, it will be
changed step by step. As the first step, I will replace pt_regs with
ftrace_regs, and make bpf_trace.c and fprobe_event depend on
FTRACE_WITH_REGS.

At this point, we can split the problem into two: how to move bpf onto
ftrace_regs and how to move fprobe-event onto ftrace_regs. The fprobe-event
change is not hard because it is contained within the kernel and I can do it.
But for BPF, I need to ask BPF user-land tools to support ftrace_regs.

> 
> > 
> > Then you can safely use 
> > 
> > struct pt_regs *regs = ftrace_get_regs(fregs);
> > 
> > I think we can just replace the CONFIG_FPROBE ifdefs with
> > CONFIG_DYNAMIC_FTRACE_WITH_REGS in kernel/trace/bpf_trace.c
> > And that will be the first version of using ftrace_regs in fprobe.
> 
> But it is still slow. The FTRACE_WITH_REGS gives us the full pt_regs
> and saves all registers including flags, which is a very slow operation
> (and noticeable in profilers).

Yes, to solve this part, we need to work with BPF user-land people.
I guess BPF is accessing registers from pt_regs with fixed offsets
which are calculated from the pt_regs layout in user space.

> 
> And this still doesn't work on arm64.

Yes, and this adds more motivation to move to ftrace_regs.

Thank you,
Masami Hiramatsu (Google) Aug. 2, 2023, 2:07 p.m. UTC | #22
On Tue, 1 Aug 2023 19:22:01 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Tue, Aug 1, 2023 at 5:44 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > On Tue, 1 Aug 2023 20:40:54 -0400
> > Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > > Maybe we can add a ftrace_partial_regs(fregs) that returns a
> > > partially filled pt_regs, and the caller that uses this obviously knows
> > > its partial (as it's in the name). But this doesn't quite help out arm64
> > > because unlike x86, struct ftrace_regs does not contain an address
> > > compatibility with pt_regs fields. It would need to do a copy.
> > >
> > >  ftrace_partial_regs(fregs, &regs) ?
> >
> > Well, both would be pointers so you wouldn't need the "&", but it was
> > to stress that it would be copying one to the other.
> >
> >   void ftrace_partial_regs(const struct ftrace_regs *fregs, struct pt_regs regs);
> 
> Copy works, but why did you pick a different layout?

I think it is to minimize stack consumption. pt_regs on arm64 will
consume 42*u64 = 336 bytes; on the other hand, ftrace_regs will use
14*unsigned long = 112 bytes. And most of the registers in pt_regs are not
usually accessed. (As you may know, RISC processors usually have many
registers - and x86 will too if we use APX in the kernel. So pt_regs is big.)
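
For reference, the arm64 layout being compared here looks roughly like this
(from memory of arch/arm64/include/asm/ftrace.h in this timeframe; check the
tree you are on):

  struct ftrace_regs {
          /* x0 - x8 */
          unsigned long regs[9];
          unsigned long __unused;
          unsigned long fp;
          unsigned long lr;
          unsigned long sp;
          unsigned long pc;
  };      /* 14 * 8 = 112 bytes, vs. the 336 bytes quoted above for pt_regs */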

> Why not to use pt_regs ? if save of flags is slow, just skip that part
> and whatever else that is slow. You don't even need to zero out
> unsaved fields. Just ask the caller to zero out pt_regs before hand.
> Most users have per-cpu pt_regs that is being reused.
> So there will be one zero-out in the beginning and every partial
> save of regs will be fast.
> Then there won't be any need for copy-converter from ftrace_regs to pt_regs.
> Maybe too much churn at this point. copy is fine.

If there is no nested call, yeah, per-cpu pt_regs will work.

Thank you,
Florent Revest Aug. 2, 2023, 2:44 p.m. UTC | #23
On Wed, Aug 2, 2023 at 1:09 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Tue, 1 Aug 2023 15:18:56 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>
> > On Tue, Aug 1, 2023 at 8:32 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> > >
> > > On Tue, 1 Aug 2023 11:20:36 -0400
> > > Steven Rostedt <rostedt@goodmis.org> wrote:
> > >
> > > > The solution was to come up with ftrace_regs, which just means it has all
> > > > the registers to extract the arguments of a function and nothing more. Most
> > >
> > > This isn't 100% true. The ftrace_regs may hold a fully filled pt_regs. As
> > > the FTRACE_WITH_REGS callbacks still get passed a ftrace_regs pointer. They
> > > will do:
> > >
> > >         void callback(..., struct ftrace_regs *fregs) {
> > >                 struct pt_regs *regs = ftrace_get_regs(fregs);
> > >
> > >
> > > Where ftrace_get_regs() will return the pt_regs only if it is fully filled.
> > > If it is not, then it returns NULL. This was what the x86 maintainers
> > > agreed with.
> >
> > arch/arm64/include/asm/ftrace.h:#define arch_ftrace_get_regs(regs) NULL
> >
> > Ouch. That's very bad.
> > We care a lot about bpf running well on arm64.
>
> [ Adding Mark and Florent ]

Ah, thanks Steve! That's my favorite can of worms :) I actually
consider sending a talk proposal to the tracing MC at LPC "pt_regs -
the good the bad and the ugly" on this very topic because I care about
unblocking BPF "multi_kprobe" (which really is fprobe) on arm64, maybe
it would be interesting.

> That's because arm64 doesn't support FTRACE_WITH_REGS anymore. Their
> function handlers only care about the arguments. If you want full regs at
> function entry, then you need to take a breakpoint hit for a full kprobe.

The main reason why arm64 dropped FTRACE_WITH_REGS is because some
registers (like pstate) can not be saved outside of an exception entry
(they are just wrong), so trampolines either have to make a pstate up
or not populate it.

The other reasons are: simplicity (for architectural reasons, it's a
lot easier to have only one type of ftrace trampoline on arm64, the
"with_args" one) and performance (as you said, why bother saving a
pt_regs when most ftrace users don't need it anyway). If you need an
actual full pt_regs, then your use case is debugging rather than
tracing and you should be able to deal with the slowness and go
through an exception (a kprobe).

> In fact, fprobes isn't even supported on arm64 because it doesn't have
> DYNAMIC_FTRACE_WITH_REGS. I believe that was the reason Masami was trying
> to get it to work with ftrace_regs. To get it to work on arm64.
>
> Again, ftrace_get_regs(fregs) is only suppose to return something if the
> pt_regs is fully supplied. If they are not, then it must not be used. Are
> you not using a fully filled pt_regs? Because that's what both Thomas and
> Peter (also added) told me not to do!

Funnily enough, there's another use of sparse pt_regs in the kernel, in Perf:
https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git/tree/arch/arm64/include/asm/perf_event.h#n20
Notice how Perf on arm64 implicitly expects the "pstate" register to
be set (the very register which we try so hard not to fake in
ftrace_regs) because Perf happens to call the "user_mode()" macro
somewhere which reads this field:
https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git/tree/arch/arm64/include/asm/ptrace.h#n227

I pointed this out in
https://lore.kernel.org/all/CABRcYm+esb8J2O1v6=C+h+HSa5NxraPUgo63w7-iZj0CXbpusg@mail.gmail.com/#t
when Masami proposed adding calls from fprobe to perf. If every
subsystem makes different assumptions about "how sparse" their pt_regs
is and they call into one another, this could lead to... interesting
bugs. (eg: currently, we don't populate a fake pstate in ftrace_regs.
so we'd need to fake it when creating a sparse pt_regs _for Perf_,
knowing that Perf specifically expects this reg to be set. this would
require a struct copy anyway and some knowledge about how the data
will be consumed, in an arch- and subsystem- specific way)
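
(For reference, the check in question is roughly the following, paraphrased
from arm64's ptrace.h, so treat it as an approximation:)

  /* arch/arm64/include/asm/ptrace.h, roughly */
  #define user_mode(regs) \
          (((regs)->pstate & PSR_MODE_MASK) == PSR_MODE_EL0t)

So a sparse pt_regs whose pstate was never written could make perf
misclassify a kernel sample as a user-space one.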

On the other hand, untangling all code paths that come from
trampolines (with a light regs structure) from those that come from an
exception (with a pt_regs) could lead to a lot of duplicated code, and
converting between each subsystem's idea of a light regs structure
(what if perf introduces a perf_regs now ?) would be tedious and slow
(lots of copies ?).

> Otherwise, ftrace_regs() has support on arm64 for getting to the argument
> registers and the stack. Even live kernel patching now uses ftrace_regs().
>
> >
> > If you guys decide to convert fprobe to ftrace_regs please
> > make it depend on kconfig or something.
> > bpf side needs full pt_regs.

Some wild ideas that I brought up once in a BPF office hour: BPF
"multi_kprobe" could provide a fake pt_regs (either by constructing a
sparse one on the stack or by JIT-ing different offset accesses and/or
by having the verifier deny access to unpopulated fields) or break the
current API (is it conceivable to phase out BPF "multi_kprobe"
programs in favor of BPF "fprobe" programs that don't lie about their
API and guarantees and just provide a ftrace_regs ?)

> Then use kprobes. When I asked Masami what the difference between fprobes
> and kprobes was, he told me that it would be that it would no longer rely
> on the slower FTRACE_WITH_REGS. But currently, it still does.

Actually... Moving fprobe to ftrace_regs should get even more spicy!
:) Fprobe also wraps "rethook" which is basically the same thing as
kretprobe: a return trampoline that saves a pt_regs, to the point that
on x86 kretprobe's trampoline got dropped in favor of rethook's
trampoline. But for the same reasons that we don't want ftrace to save
pt_regs on arm64, rethook should probably also just save a ftrace_regs
? (also, to keep the fprobe callback signatures consistent between
pre- and post- handlers). But if we want fprobe "post" callbacks to
save a ftrace_regs now, either we need to re-introduce the kretprobe
trampoline or also change the API of kretprobe (and break its symmetry
with kprobe and we'd have the same problem all over again with BPF
kretprobe program types...). All of this is "beautifully" entangled...
:)

> The reason I started the FTRACE_WITH_ARGS (which gave us ftrace_regs) in
> the first place, was because of the overhead you reported to me with
> ftrace_regs_caller and why you wanted to go the direct trampoline approach.
> That's when I realized I could use a subset because those registers were
> already being saved. The only reason FTRACE_WITH_REGS was created was it
> had to supply full pt_regs (including flags) and emulate a breakpoint for
> the kprobes interface. But in reality, nothing really needs all that.
>
> > It's not about access to args.
> > pt_regs is passed from bpf prog further into all kinds of perf event
> > functions including stack walking.

If all accesses are done in BPF bytecode, we could (theoretically)
have the verifier and JIT work together to deny accesses to
unpopulated fields, or relocate pt_regs accesses to ftrace_regs
accesses to keep backward compatibility with existing multi_kprobe BPF
programs.

Is there a risk that a "multi_kprobe" program could call into a BPF
helper or kfunc that reads this pt_regs pointer and expect certain
fields to be set ? I suppose we could also deny giving that "pt_regs"
pointer to a helper... :/

> ftrace_regs gives you the stack pointer. Basically, it gives you access to
> anything that is required to be saved to do a function call from fentry.
>
> > I think ORC unwinder might depend on availability of all registers.
> > Other perf helpers might need it too. Like perf_event_output.
> > bpf progs need to access arguments, no doubt about that.
> > If ftrace_regs have them exactly in the same offsets as in pt_regs
> > that might work transparently for bpf progs, but, I'm afraid,
> > it's not the case on all archs.
> > So we need full pt_regs to make sure all paths are still working.
> >
> > Adding Jiri and others.
>
> Then I recommend that you give up using fprobes and just stick with kprobes
> as that's guaranteed to give you full pt_regs (at the overhead of doing
> things like filing in flags and such). And currently for arm64, fprobes can
> only work with ftrace_regs, without the full pt_regs.
Florent Revest Aug. 2, 2023, 2:48 p.m. UTC | #24
On Wed, Aug 2, 2023 at 3:56 PM Masami Hiramatsu <mhiramat@kernel.org> wrote:
>
> On Tue, 1 Aug 2023 20:40:54 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
> > On Wed, 2 Aug 2023 09:21:46 +0900
> > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> >
> > > > Then use kprobes. When I asked Masami what the difference between fprobes
> > > > and kprobes was, he told me that it would be that it would no longer rely
> > > > on the slower FTRACE_WITH_REGS. But currently, it still does.
> > >
> > > kprobes needs to keep using pt_regs because software-breakpoint exception
> > > handler gets that. And fprobe is used for bpf multi-kprobe interface,
> > > but I think it can be optional.
> > >
> > > So until user-land tool supports the ftrace_regs, you can just disable
> > > using fprobes if CONFIG_DYNAMIC_FTRACE_WITH_REGS=n
> >
> > I'm confused. I asked about the difference between kprobes on ftrace
> > and fprobes, and you said it was to get rid of the requirement of
> > FTRACE_WITH_REGS.
> >
> >  https://lore.kernel.org/all/20230120205535.98998636329ca4d5f8325bc3@kernel.org/
>
> Yes, it is for enabling fprobe (and fprobe-event) on more architectures.
> I don't think it's possible to change everything at once. So, it will be
> changed step by step. At the first step, I will replace pt_regs with
> ftrace_regs, and make bpf_trace.c and fprobe_event depends on
> FTRACE_WITH_REGS.

Just a small note that, strictly speaking,
CONFIG_DYNAMIC_FTRACE_WITH_REGS=y is not enough. fprobe_init() would
also need a way to set FTRACE_OPS_FL_SAVE_REGS conditionally. (you
could be on an arch that supports saving either regs or args and if
you don't set FTRACE_OPS_FL_SAVE_REGS you'd go through the args
trampoline and get a ftrace_regs that doesn't hold a pt_regs)
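
The change hinted at here would be on the order of the following sketch
(untested, and assuming fprobe_init()/fprobe_handler() in
kernel/trace/fprobe.c keep their current shape; the condition is just an
illustration):

  /* sketch: only request full pt_regs where the architecture provides it */
  static void fprobe_init(struct fprobe *fp)
  {
          fp->ops.func = fprobe_handler;
          if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS))
                  fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS;
  }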
Florent Revest Aug. 2, 2023, 3:08 p.m. UTC | #25
On Wed, Aug 2, 2023 at 4:07 PM Masami Hiramatsu <mhiramat@kernel.org> wrote:
>
> On Tue, 1 Aug 2023 19:22:01 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>
> > On Tue, Aug 1, 2023 at 5:44 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> > >
> > > On Tue, 1 Aug 2023 20:40:54 -0400
> > > Steven Rostedt <rostedt@goodmis.org> wrote:
> > >
> > > > Maybe we can add a ftrace_partial_regs(fregs) that returns a
> > > > partially filled pt_regs, and the caller that uses this obviously knows
> > > > its partial (as it's in the name). But this doesn't quite help out arm64
> > > > because unlike x86, struct ftrace_regs does not contain an address
> > > > compatibility with pt_regs fields. It would need to do a copy.
> > > >
> > > >  ftrace_partial_regs(fregs, &regs) ?
> > >
> > > Well, both would be pointers so you wouldn't need the "&", but it was
> > > to stress that it would be copying one to the other.
> > >
> > >   void ftrace_partial_regs(const struct ftrace_regs *fregs, struct pt_regs regs);
> >
> > Copy works, but why did you pick a different layout?
>
> I think it is for minimize the stack consumption. pt_regs on arm64 will
> consume 42*u64 = 336 bytes, on the other hand ftrace_regs will use
> 14*unsigned long = 112 bytes. And most of the registers in pt_regs are not
> accessed usually. (as you may know RISC processors usually have many
> registers - and x86 will be if we use APX in kernel. So pt_regs is big.)
>
> > Why not to use pt_regs ? if save of flags is slow, just skip that part
> > and whatever else that is slow. You don't even need to zero out
> > unsaved fields. Just ask the caller to zero out pt_regs before hand.
> > Most users have per-cpu pt_regs that is being reused.
> > So there will be one zero-out in the beginning and every partial
> > save of regs will be fast.
> > Then there won't be any need for copy-converter from ftrace_regs to pt_regs.
> > Maybe too much churn at this point. copy is fine.
>
> If there is no nested call, yeah, per-cpu pt_regs will work.

BPF "multi_kprobe" programs (ugh, it's pretty awkward we called them
that way given that kprobe is out of the picture and fprobe is subject
to completely different constraints than kprobe) can't be nested, as
checked here: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git/tree/kernel/trace/bpf_trace.c?id=4c9fbff54297471d4e2bbfe9c27e80067c722eae#n2642
(this is probably the place where we'd be calling a
"ftrace_partical_regs" anyway so that's cool)
Florent Revest Aug. 2, 2023, 3:47 p.m. UTC | #26
On Wed, Aug 2, 2023 at 3:56 PM Masami Hiramatsu <mhiramat@kernel.org> wrote:
>
> On Tue, 1 Aug 2023 20:40:54 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
> > On Wed, 2 Aug 2023 09:21:46 +0900
> > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> >
> > > > Then use kprobes. When I asked Masami what the difference between fprobes
> > > > and kprobes was, he told me that it would be that it would no longer rely
> > > > on the slower FTRACE_WITH_REGS. But currently, it still does.
> > >
> > > kprobes needs to keep using pt_regs because software-breakpoint exception
> > > handler gets that. And fprobe is used for bpf multi-kprobe interface,
> > > but I think it can be optional.
> > >
> > > So until user-land tool supports the ftrace_regs, you can just disable
> > > using fprobes if CONFIG_DYNAMIC_FTRACE_WITH_REGS=n
> >
> > I'm confused. I asked about the difference between kprobes on ftrace
> > and fprobes, and you said it was to get rid of the requirement of
> > FTRACE_WITH_REGS.
> >
> >  https://lore.kernel.org/all/20230120205535.98998636329ca4d5f8325bc3@kernel.org/
>
> Yes, it is for enabling fprobe (and fprobe-event) on more architectures.
> I don't think it's possible to change everything at once. So, it will be
> changed step by step. At the first step, I will replace pt_regs with
> ftrace_regs, and make bpf_trace.c and fprobe_event depends on
> FTRACE_WITH_REGS.
>
> At this point, we can split the problem into two, how to move bpf on
> ftrace_regs and how to move fprobe-event on ftrace_regs. fprobe-event
> change is not hard because it is closing in the kernel and I can do it.
> But for BPF, I need to ask BPF user-land tools to support ftrace_regs.

Ah! I finally found the branch where I had pushed my proof of concept
of fprobe with ftrace_regs... it's a few months old and I didn't get
it in a state such that it could be sent to the list but maybe this
can save you a little bit of lead time Masami :) (especially the bpf
and arm64 specific bits)

https://github.com/FlorentRevest/linux/commits/bpf-arm-complete

08afb628c6e1 ("ftrace: Add a macro to forge an incomplete pt_regs from
a ftrace_regs")
203e96fe1790 ("fprobe, rethook: Use struct ftrace_regs instead of
struct pt_regs")
1a9e280b9b16 ("arm64,rethook,kprobes: Replace kretprobe with rethook on arm64")
7751c6db9f9d ("bpf: Fix bpf get_func_ip() on arm64 multi-kprobe programs")
a10c49c0d717 ("selftests/bpf: Update the tests deny list on aarch64")
Steven Rostedt Aug. 2, 2023, 4:11 p.m. UTC | #27
On Wed, 2 Aug 2023 16:44:09 +0200
Florent Revest <revest@chromium.org> wrote:

> > [ Adding Mark and Florent ]  
> 
> Ah, thanks Steve! That's my favorite can of worms :) I actually
> consider sending a talk proposal to the tracing MC at LPC "pt_regs -
> the good the bad and the ugly" on this very topic because I care about
> unblocking BPF "multi_kprobe" (which really is fprobe) on arm64, maybe
> it would be interesting.

You bring up some excellent points, and the CFP for the Tracing MC is still
open. Which reminds me, I need to write my blog post!

-- Steve
Alexei Starovoitov Aug. 2, 2023, 6:24 p.m. UTC | #28
On Wed, Aug 2, 2023 at 6:56 AM Masami Hiramatsu <mhiramat@kernel.org> wrote:
>
> On Tue, 1 Aug 2023 20:40:54 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
> > On Wed, 2 Aug 2023 09:21:46 +0900
> > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> >
> > > > Then use kprobes. When I asked Masami what the difference between fprobes
> > > > and kprobes was, he told me that it would be that it would no longer rely
> > > > on the slower FTRACE_WITH_REGS. But currently, it still does.
> > >
> > > kprobes needs to keep using pt_regs because software-breakpoint exception
> > > handler gets that. And fprobe is used for bpf multi-kprobe interface,
> > > but I think it can be optional.
> > >
> > > So until user-land tool supports the ftrace_regs, you can just disable
> > > using fprobes if CONFIG_DYNAMIC_FTRACE_WITH_REGS=n
> >
> > I'm confused. I asked about the difference between kprobes on ftrace
> > and fprobes, and you said it was to get rid of the requirement of
> > FTRACE_WITH_REGS.
> >
> >  https://lore.kernel.org/all/20230120205535.98998636329ca4d5f8325bc3@kernel.org/
>
> Yes, it is for enabling fprobe (and fprobe-event) on more architectures.
> I don't think it's possible to change everything at once. So, it will be
> changed step by step. At the first step, I will replace pt_regs with
> ftrace_regs, and make bpf_trace.c and fprobe_event depends on
> FTRACE_WITH_REGS.
>
> At this point, we can split the problem into two, how to move bpf on
> ftrace_regs and how to move fprobe-event on ftrace_regs. fprobe-event
> change is not hard because it is closing in the kernel and I can do it.
> But for BPF, I need to ask BPF user-land tools to support ftrace_regs.
>
> >
> > >
> > > Then you can safely use
> > >
> > > struct pt_regs *regs = ftrace_get_regs(fregs);
> > >
> > > I think we can just replace the CONFIG_FPROBE ifdefs with
> > > CONFIG_DYNAMIC_FTRACE_WITH_REGS in kernel/trace/bpf_trace.c
> > > And that will be the first version of using ftrace_regs in fprobe.
> >
> > But it is still slow. The FTRACE_WITH_REGS gives us the full pt_regs
> > and saves all registers including flags, which is a very slow operation
> > (and noticeable in profilers).
>
> Yes, to solve this part, we need to work with BPF user-land people.
> I guess the BPF is accessing registers from pt_regs with fixed offset
> which is calculated from pt_regs layout in the user-space.

This is a non starter.
bpf progs expect arch dependent 'struct pt_regs *' and we cannot change that.
Steven Rostedt Aug. 2, 2023, 6:38 p.m. UTC | #29
On Wed, 2 Aug 2023 11:24:12 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> This is a non starter.
> bpf progs expect arch dependent 'struct pt_regs *' and we cannot change that.

If the progs are compiled into native code, aren't there optimizations that
could be done? That is, if ftrace_regs is available, and the bpf program is
just using the subset of pt_regs, is it possible that it could be compiled
to use ftrace_regs?

Forgive my ignorance on how BPF programs turn into executables when running
in the kernel.

-- Steve
Alexei Starovoitov Aug. 2, 2023, 7:48 p.m. UTC | #30
On Wed, Aug 2, 2023 at 11:38 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Wed, 2 Aug 2023 11:24:12 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>
> > This is a non starter.
> > bpf progs expect arch dependent 'struct pt_regs *' and we cannot change that.
>
> If the progs are compiled into native code, isn't there optimizations that
> could be done? That is, if ftrace_regs is available, and the bpf program is
> just using the subset of pt_regs, is it possible that it could be compiled
> to use ftrace_regs?
>
> Forgive my ignorance on how BPF programs turn into executables when running
> in the kernel.

Right. It's possible for the verifier to do an offset rewrite,
forbid certain access, always return 0 on load from certain offset,
and so on.
It's all a non-trivial amount of work.
ftrace_partial_regs() from ftrace_regs into pt_regs is so much simpler.
Steven Rostedt Aug. 2, 2023, 8:12 p.m. UTC | #31
On Wed, 2 Aug 2023 12:48:14 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Wed, Aug 2, 2023 at 11:38 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > On Wed, 2 Aug 2023 11:24:12 -0700
> > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> >  
> > > This is a non starter.
> > > bpf progs expect arch dependent 'struct pt_regs *' and we cannot change that.  
> >
> > If the progs are compiled into native code, isn't there optimizations that
> > could be done? That is, if ftrace_regs is available, and the bpf program is
> > just using the subset of pt_regs, is it possible that it could be compiled
> > to use ftrace_regs?
> >
> > Forgive my ignorance on how BPF programs turn into executables when running
> > in the kernel.  
> 
> Right. It's possible for the verifier to do an offset rewrite,
> forbid certain access, always return 0 on load from certain offset,
> and so on.
> It's all non trivial amount of work.
> ftrace_partial_regs() from ftrace_regs into pt_regs is so much simpler.

Sure, and the copy could be the solution we have in the near future, but if
we could optimize it in the future, then perhaps it would be worth doing it.

Also, how are the bpf programs referencing the pt_regs? Could a ftrace_regs
API be added too? If the verifier sees that the program is using
ftrace_regs, it could then use the lighter weight fprobes for access,
otherwise it falls back to the kprobe version.

-- Steve
Alexei Starovoitov Aug. 2, 2023, 9:28 p.m. UTC | #32
On Wed, Aug 2, 2023 at 1:12 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Wed, 2 Aug 2023 12:48:14 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>
> > On Wed, Aug 2, 2023 at 11:38 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> > >
> > > On Wed, 2 Aug 2023 11:24:12 -0700
> > > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > >
> > > > This is a non starter.
> > > > bpf progs expect arch dependent 'struct pt_regs *' and we cannot change that.
> > >
> > > If the progs are compiled into native code, isn't there optimizations that
> > > could be done? That is, if ftrace_regs is available, and the bpf program is
> > > just using the subset of pt_regs, is it possible that it could be compiled
> > > to use ftrace_regs?
> > >
> > > Forgive my ignorance on how BPF programs turn into executables when running
> > > in the kernel.
> >
> > Right. It's possible for the verifier to do an offset rewrite,
> > forbid certain access, always return 0 on load from certain offset,
> > and so on.
> > It's all non trivial amount of work.
> > ftrace_partial_regs() from ftrace_regs into pt_regs is so much simpler.
>
> Sure, and the copy could be the solution we have in the near future, but if
> we could optimize it in the future, then perhaps it would be worth doing it.
>
> Also, how are the bpf programs referencing the pt_regs?

Typically through macros that abstract arch differences away in
tools/lib/bpf/bpf_tracing.h
PT_REGS_PARM1
PT_REGS_PARM1_CORE
PT_REGS_PARM1_SYSCALL

pt_regs at syscall entry is special, since syscall calling convention
is different from the rest of the kernel.
ftrace_regs cannot help with that either.
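
For context, a typical user-land consumer of those macros looks roughly like
the illustrative program below (not one from this thread), which is why the
pt_regs layout is effectively baked into existing BPF objects:

  /* illustrative BPF program; PT_REGS_PARM1 expands to an arch-specific
   * pt_regs field, so the program carries the pt_regs layout with it */
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  SEC("kprobe/vfs_read")
  int trace_vfs_read(struct pt_regs *ctx)
  {
          unsigned long file = PT_REGS_PARM1(ctx);

          bpf_printk("vfs_read file=%lx", file);
          return 0;
  }

  char LICENSE[] SEC("license") = "GPL";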

> Could a ftrace_regs
> API be added too?

Potentially yes, but I don't see the value.
bpf users are slowly migrating to fentry/fexit, which has accurate
args and return values and is much faster.
kprobes are still heavily used, of course.
multi-kprobe (with fprobe_ips underneath) is a new addition that is
also very important to some users.

> If the verifier sees that the program is using
> ftrace_regs, it could then use the lighter weight fprobes for access,
> otherwise it falls back to the kprobe version.
>
> -- Steve
Masami Hiramatsu (Google) Aug. 3, 2023, 1:55 a.m. UTC | #33
On Wed, 2 Aug 2023 17:47:03 +0200
Florent Revest <revest@chromium.org> wrote:

> On Wed, Aug 2, 2023 at 3:56 PM Masami Hiramatsu <mhiramat@kernel.org> wrote:
> >
> > On Tue, 1 Aug 2023 20:40:54 -0400
> > Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > > On Wed, 2 Aug 2023 09:21:46 +0900
> > > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> > >
> > > > > Then use kprobes. When I asked Masami what the difference between fprobes
> > > > > and kprobes was, he told me that it would be that it would no longer rely
> > > > > on the slower FTRACE_WITH_REGS. But currently, it still does.
> > > >
> > > > kprobes needs to keep using pt_regs because software-breakpoint exception
> > > > handler gets that. And fprobe is used for bpf multi-kprobe interface,
> > > > but I think it can be optional.
> > > >
> > > > So until user-land tool supports the ftrace_regs, you can just disable
> > > > using fprobes if CONFIG_DYNAMIC_FTRACE_WITH_REGS=n
> > >
> > > I'm confused. I asked about the difference between kprobes on ftrace
> > > and fprobes, and you said it was to get rid of the requirement of
> > > FTRACE_WITH_REGS.
> > >
> > >  https://lore.kernel.org/all/20230120205535.98998636329ca4d5f8325bc3@kernel.org/
> >
> > Yes, it is for enabling fprobe (and fprobe-event) on more architectures.
> > I don't think it's possible to change everything at once. So, it will be
> > changed step by step. At the first step, I will replace pt_regs with
> > ftrace_regs, and make bpf_trace.c and fprobe_event depends on
> > FTRACE_WITH_REGS.
> >
> > At this point, we can split the problem into two, how to move bpf on
> > ftrace_regs and how to move fprobe-event on ftrace_regs. fprobe-event
> > change is not hard because it is closing in the kernel and I can do it.
> > But for BPF, I need to ask BPF user-land tools to support ftrace_regs.
> 
> Ah! I finally found the branch where I had pushed my proof of concept
> of fprobe with ftrace_regs... it's a few months old and I didn't get
> it in a state such that it could be sent to the list but maybe this
> can save you a little bit of lead time Masami :) (especially the bpf
> and arm64 specific bits)
> 
> https://github.com/FlorentRevest/linux/commits/bpf-arm-complete
> 
> 08afb628c6e1 ("ftrace: Add a macro to forge an incomplete pt_regs from
> a ftrace_regs")
> 203e96fe1790 ("fprobe, rethook: Use struct ftrace_regs instead of
> struct pt_regs")
> 1a9e280b9b16 ("arm64,rethook,kprobes: Replace kretprobe with rethook on arm64")
> 7751c6db9f9d ("bpf: Fix bpf get_func_ip() on arm64 multi-kprobe programs")
> a10c49c0d717 ("selftests/bpf: Update the tests deny list on aarch64")

Thanks for the work! I also pushed my patches on 

https://kernel.googlesource.com/pub/scm/linux/kernel/git/mhiramat/linux/+/refs/heads/topic/fprobe-ftrace-regs

628e6c19d7dc ("tracing/fprobe: Enable fprobe events with CONFIG_DYNAMIC_FTRACE_WITH_ARGS")
311c98c29cfd ("fprobe: Use fprobe_regs in fprobe entry handler")

This doesn't cover arm64 and rethook, but provides ftrace_regs-optimized
fprobe-event code, which uses the correct APIs for ftrace_regs.

For the rethook we still need to provide 2 versions, for kretprobe(pt_regs)
and fprobe(ftrace_regs).
I think eventually we should replace kretprobe with fprobe, but the
current rethook is tightly coupled with kretprobe and kretprobe
needs pt_regs. So, I would like to keep the arm64 kretprobe impl, and add
a new rethook with ftrace_regs.

Or, maybe we need these 2 configs as an intermediate step.
CONFIG_RETHOOK_WITH_REGS - in this case, kretprobe uses rethook
CONFIG_RETHOOK_WITH_ARGS - in this case, kretprobe uses its own stack

The problem is ftrace_regs only depends on CONFIG_DYNAMIC_FTRACE_WITH_*.

Thank you,
Masami Hiramatsu (Google) Aug. 3, 2023, 3:42 p.m. UTC | #34
On Wed, 2 Aug 2023 16:44:09 +0200
Florent Revest <revest@chromium.org> wrote:

> On Wed, Aug 2, 2023 at 1:09 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > On Tue, 1 Aug 2023 15:18:56 -0700
> > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> >
> > > On Tue, Aug 1, 2023 at 8:32 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> > > >
> > > > On Tue, 1 Aug 2023 11:20:36 -0400
> > > > Steven Rostedt <rostedt@goodmis.org> wrote:
> > > >
> > > > > The solution was to come up with ftrace_regs, which just means it has all
> > > > > the registers to extract the arguments of a function and nothing more. Most
> > > >
> > > > This isn't 100% true. The ftrace_regs may hold a fully filled pt_regs. As
> > > > the FTRACE_WITH_REGS callbacks still get passed a ftrace_regs pointer. They
> > > > will do:
> > > >
> > > >         void callback(..., struct ftrace_regs *fregs) {
> > > >                 struct pt_regs *regs = ftrace_get_regs(fregs);
> > > >
> > > >
> > > > Where ftrace_get_regs() will return the pt_regs only if it is fully filled.
> > > > If it is not, then it returns NULL. This was what the x86 maintainers
> > > > agreed with.
> > >
> > > arch/arm64/include/asm/ftrace.h:#define arch_ftrace_get_regs(regs) NULL
> > >
> > > Ouch. That's very bad.
> > > We care a lot about bpf running well on arm64.
> >
> > [ Adding Mark and Florent ]
> 
> Ah, thanks Steve! That's my favorite can of worms :) I actually
> consider sending a talk proposal to the tracing MC at LPC "pt_regs -
> the good the bad and the ugly" on this very topic because I care about
> unblocking BPF "multi_kprobe" (which really is fprobe) on arm64, maybe
> it would be interesting.

Ah, it is almost same as my talk :)

> 
> > That's because arm64 doesn't support FTRACE_WITH_REGS anymore. Their
> > function handlers only care about the arguments. If you want full regs at
> > function entry, then you need to take a breakpoint hit for a full kprobe.
> 
> The main reason why arm64 dropped FTRACE_WITH_REGS is because some
> registers (like pstate) can not be saved outside of an exception entry
> (they are just wrong), so trampolines either have to make a pstate up
> or not populate it.
> 
> The other reasons are: simplicity (for architectural reasons, it's a
> lot easier to have only one type of ftrace trampoline on arm64, the
> "with_args" one) and performance (as you said, why bother saving a
> pt_regs when most ftrace users don't need it anyway). If you need an
> actual full pt_regs, then your use case is debugging rather than
> tracing and you should be able to deal with the slowness and go
> through an exception (a kprobe).

Agreed. Both reasons are reasonable. Especially for the function entry and
exit tracing API, we don't need full pt_regs because there is an
established ABI.

> 
> > In fact, fprobes isn't even supported on arm64 because it doesn't have
> > DYNAMIC_FTRACE_WITH_REGS. I believe that was the reason Masami was trying
> > to get it to work with ftrace_regs. To get it to work on arm64.
> >
> > Again, ftrace_get_regs(fregs) is only suppose to return something if the
> > pt_regs is fully supplied. If they are not, then it must not be used. Are
> > you not using a fully filled pt_regs? Because that's what both Thomas and
> > Peter (also added) told me not to do!
> 
> Funnily enough, there's another use of sparse pt_regs in the kernel, in Perf:
> https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git/tree/arch/arm64/include/asm/perf_event.h#n20
> Notice how Perf on arm64 implicitly expects the "pstate" register to
> be set (the very register which we try so hard not to fake in
> ftrace_regs) because Perf happens to call the "user_mode()" macro
> somewhere which reads this field:
> https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git/tree/arch/arm64/include/asm/ptrace.h#n227

I think interrupt/exception-based APIs like kprobes and perf (PMU) may need
to use pt_regs.

> 
> I pointed this out in
> https://lore.kernel.org/all/CABRcYm+esb8J2O1v6=C+h+HSa5NxraPUgo63w7-iZj0CXbpusg@mail.gmail.com/#t
> when Masami proposed adding calls from fprobe to perf. If every
> subsystem makes different assumptions about "how sparse" their pt_regs
> is and they call into one another, this could lead to... interesting
> bugs. (eg: currently, we don't populate a fake pstate in ftrace_regs.
> so we'd need to fake it when creating a sparse pt_regs _for Perf_,
> knowing that Perf specifically expects this reg to be set. this would
> require a struct copy anyway and some knowledge about how the data
> will be consumed, in an arch- and subsystem- specific way)

yeah, sorry I missed that point. I should remove it until we can fix it.

I think we can add another, kernel-event-only perf_trace_buf_submit()
which doesn't have the user_mode() check.

> 
> On the other hand, untangling all code paths that come from
> trampolines (with a light regs structure) from those that come from an
> exception (with a pt_regs) could lead to a lot of duplicated code, and
> converting between each subsystem's idea of a light regs structure
> (what if perf introduces a perf_regs now ?) would be tedious and slow
> (lots of copies ?).

This is one discussion point I think. Actually, using pt_regs in kretprobe
(and rethook) is a historical accident. Originally, it put a kprobe on
the function return trampoline to hook it. So to keep the API compatibility
I made the hand-assembled code to save the pt_regs on the stack.

My other question is: if we have fprobe to trace (hook) the function
return, why do we still need kretprobe itself? I think we can remove
kretprobe and use the fprobe exit handler, because "function" probing will
be done by fprobe, not kprobe. And then, we can simplify the kprobe
interface and clarify what it is -- "kprobe is a wrapper of software
breakpoint". And we don't need to think about duplicated code anymore :)

>  
> > Otherwise, ftrace_regs() has support on arm64 for getting to the argument
> > registers and the stack. Even live kernel patching now uses ftrace_regs().
> >
> > >
> > > If you guys decide to convert fprobe to ftrace_regs please
> > > make it depend on kconfig or something.
> > > bpf side needs full pt_regs.
> 
> Some wild ideas that I brought up once in a BPF office hour: BPF
> "multi_kprobe" could provide a fake pt_regs (either by constructing a
> sparse one on the stack or by JIT-ing different offset accesses and/or
> by having the verifier deny access to unpopulated fields) or break the
> current API (is it conceivable to phase out BPF "multi_kprobe"
> programs in favor of BPF "fprobe" programs that don't lie about their
> API and guarantees and just provide a ftrace_regs ?)

+1 :)

> 
> > Then use kprobes. When I asked Masami what the difference between fprobes
> > and kprobes was, he told me that it would be that it would no longer rely
> > on the slower FTRACE_WITH_REGS. But currently, it still does.
> 
> Actually... Moving fprobe to ftrace_regs should get even more spicy!
> :) Fprobe also wraps "rethook" which is basically the same thing as
> kretprobe: a return trampoline that saves a pt_regs, to the point that
> on x86 kretprobe's trampoline got dropped in favor of rethook's
> trampoline. But for the same reasons that we don't want ftrace to save
> pt_regs on arm64, rethook should probably also just save a ftrace_regs
> ? (also, to keep the fprobe callback signatures consistent between
> pre- and post- handlers). But if we want fprobe "post" callbacks to
> save a ftrace_regs now, either we need to re-introduce the kretprobe
> trampoline or also change the API of kretprobe (and break its symmetry
> with kprobe and we'd have the same problem all over again with BPF
> kretprobe program types...). All of this is "beautifully" entangled...
> :)

As I said, I would like to phase out the kretprobe itself because it
provides the same feature as fprobe, which is confusing. jprobe was
removed a while ago, and now it is kretprobe's turn. But we cannot phase
it out at once. So I think we will keep the current kretprobe trampoline on
arm64 and just add a new ftrace_regs-based rethook. Then remove the
API in the next release (after all users, including systemtap, are moved).

> 
> > The reason I started the FTRACE_WITH_ARGS (which gave us ftrace_regs) in
> > the first place, was because of the overhead you reported to me with
> > ftrace_regs_caller and why you wanted to go the direct trampoline approach.
> > That's when I realized I could use a subset because those registers were
> > already being saved. The only reason FTRACE_WITH_REGS was created was it
> > had to supply full pt_regs (including flags) and emulate a breakpoint for
> > the kprobes interface. But in reality, nothing really needs all that.
> >
> > > It's not about access to args.
> > > pt_regs is passed from bpf prog further into all kinds of perf event
> > > functions including stack walking.
> 
> If all accesses are done in BPF bytecode, we could (theoretically)
> have the verifier and JIT work together to deny accesses to
> unpopulated fields, or relocate pt_regs accesses to ftrace_regs
> accesses to keep backward compatibility with existing multi_kprobe BPF
> programs.

Yeah, that is what I would like to suggest, and what my patch does.
(let me update rethook too; it'll be a bit tricky since I don't want to
break anything)

Thanks,

> 
> Is there a risk that a "multi_kprobe" program could call into a BPF
> helper or kfunc that reads this pt_regs pointer and expect certain
> fields to be set ? I suppose we could also deny giving that "pt_regs"
> pointer to a helper... :/
> 
> > ftrace_regs gives you the stack pointer. Basically, it gives you access to
> > anything that is required to be saved to do a function call from fentry.
> >
> > > I think ORC unwinder might depend on availability of all registers.
> > > Other perf helpers might need it too. Like perf_event_output.
> > > bpf progs need to access arguments, no doubt about that.
> > > If ftrace_regs have them exactly in the same offsets as in pt_regs
> > > that might work transparently for bpf progs, but, I'm afraid,
> > > it's not the case on all archs.
> > > So we need full pt_regs to make sure all paths are still working.
> > >
> > > Adding Jiri and others.
> >
> > Then I recommend that you give up using fprobes and just stick with kprobes
> > as that's guaranteed to give you full pt_regs (at the overhead of doing
> > things like filing in flags and such). And currently for arm64, fprobes can
> > only work with ftrace_regs, without the full pt_regs.
Florent Revest Aug. 3, 2023, 4:37 p.m. UTC | #35
On Thu, Aug 3, 2023 at 5:42 PM Masami Hiramatsu <mhiramat@kernel.org> wrote:
>
> On Wed, 2 Aug 2023 16:44:09 +0200
> Florent Revest <revest@chromium.org> wrote:
>
> > On Wed, Aug 2, 2023 at 1:09 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> > >
> > > On Tue, 1 Aug 2023 15:18:56 -0700
> > > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > >
> > > > On Tue, Aug 1, 2023 at 8:32 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> > > > >
> > > > > On Tue, 1 Aug 2023 11:20:36 -0400
> > > > > Steven Rostedt <rostedt@goodmis.org> wrote:
> > > > >
> > > > > > The solution was to come up with ftrace_regs, which just means it has all
> > > > > > the registers to extract the arguments of a function and nothing more. Most
> > > > >
> > > > > This isn't 100% true. The ftrace_regs may hold a fully filled pt_regs. As
> > > > > the FTRACE_WITH_REGS callbacks still get passed a ftrace_regs pointer. They
> > > > > will do:
> > > > >
> > > > >         void callback(..., struct ftrace_regs *fregs) {
> > > > >                 struct pt_regs *regs = ftrace_get_regs(fregs);
> > > > >
> > > > >
> > > > > Where ftrace_get_regs() will return the pt_regs only if it is fully filled.
> > > > > If it is not, then it returns NULL. This was what the x86 maintainers
> > > > > agreed with.
> > > >
> > > > arch/arm64/include/asm/ftrace.h:#define arch_ftrace_get_regs(regs) NULL
> > > >
> > > > Ouch. That's very bad.
> > > > We care a lot about bpf running well on arm64.
> > >
> > > [ Adding Mark and Florent ]
> >
> > Ah, thanks Steve! That's my favorite can of worms :) I actually
> > consider sending a talk proposal to the tracing MC at LPC "pt_regs -
> > the good the bad and the ugly" on this very topic because I care about
> > unblocking BPF "multi_kprobe" (which really is fprobe) on arm64, maybe
> > it would be interesting.
>
> Ah, it is almost same as my talk :)

Oh, I didn't know! I submitted a proposal today but if the talks have
a lot of overlap maybe it's best that only you give your talk, since
you're the actual maintainer :) or we could co-present if there's
something I could add but I think you have all the background anyway

> > I pointed this out in
> > https://lore.kernel.org/all/CABRcYm+esb8J2O1v6=C+h+HSa5NxraPUgo63w7-iZj0CXbpusg@mail.gmail.com/#t
> > when Masami proposed adding calls from fprobe to perf. If every
> > subsystem makes different assumptions about "how sparse" their pt_regs
> > is and they call into one another, this could lead to... interesting
> > bugs. (eg: currently, we don't populate a fake pstate in ftrace_regs.
> > so we'd need to fake it when creating a sparse pt_regs _for Perf_,
> > knowing that Perf specifically expects this reg to be set. this would
> > require a struct copy anyway and some knowledge about how the data
> > will be consumed, in an arch- and subsystem- specific way)
>
> yeah, sorry I missed that point. I should remove it until we can fix it.

Uh, I shouldn't have buried my important comments so far down the
email :/ I wasn't sure whether you had missed the paragraph.

> > On the other hand, untangling all code paths that come from
> > trampolines (with a light regs structure) from those that come from an
> > exception (with a pt_regs) could lead to a lot of duplicated code, and
> > converting between each subsystem's idea of a light regs structure
> > (what if perf introduces a perf_regs now ?) would be tedious and slow
> > (lots of copies ?).
>
> This is one discussion point I think. Actually, using pt_regs in kretprobe
> (and rethook) is histrical accident. Originally, it had put a kprobe on
> the function return trampoline to hook it. So keep the API compatiblity
> I made the hand assembled code to save the pt_regs on the stack.
>
> My another question is if we have the fprobe to trace (hook) the function
> return, why we still need the kretprobe itself. I think we can remove
> kretprobe and use fprobe exit handler, because "function" probing will
> be done by fprobe, not kprobe. And then, we can simplify the kprobe
> interface and clarify what it is -- "kprobe is a wrapper of software
> breakpoint". And we don't need to think about duplicated code anymore :)

That sounds reasonable to me

> As I said, I would like to phase out the kretprobe itself because it
> provides the same feature of fprobe, which is confusing. jprobe was
> removed a while ago, and now kretprobe is. But we can not phase out
> it at once. So I think we will keep current kretprobe trampoline on
> arm64 and just add new ftrace_regs based rethook. Then remove the
> API next release. (after all users including systemtap is moved)

Heads up to BPF folks though since they also have BPF "kretprobe"
program types which would break in a similar fashion as multi_kprobe
(even though BPF kretprobe programs have also been discouraged for a
while in favor of BPF fexit programs)

> > > The reason I started the FTRACE_WITH_ARGS (which gave us ftrace_regs) in
> > > the first place, was because of the overhead you reported to me with
> > > ftrace_regs_caller and why you wanted to go the direct trampoline approach.
> > > That's when I realized I could use a subset because those registers were
> > > already being saved. The only reason FTRACE_WITH_REGS was created was it
> > > had to supply full pt_regs (including flags) and emulate a breakpoint for
> > > the kprobes interface. But in reality, nothing really needs all that.
> > >
> > > > It's not about access to args.
> > > > pt_regs is passed from bpf prog further into all kinds of perf event
> > > > functions including stack walking.
> >
> > If all accesses are done in BPF bytecode, we could (theoretically)
> > have the verifier and JIT work together to deny accesses to
> > unpopulated fields, or relocate pt_regs accesses to ftrace_regs
> > accesses to keep backward compatibility with existing multi_kprobe BPF
> > programs.
>
> Yeah, that is what I would like to suggest, and what my patch does.
> (let me update rethook too, it'll be a bit tricky since I don't want
> break anything)

I agree with Alexei that this is an unnecessary amount of complexity
in the verifier just to avoid a struct copy though. It's good to know
that we _could_ do it if we really need to someday but then again, if
a user chooses an interface that gets a pt_regs they shouldn't expect
high performance. Therefore, I think it's ok for BPF multi_kprobe to
copy fields from a ftrace_regs to a pt_regs on stack, especially if it
avoids so much additional complexity in the verifier.
Jiri Olsa Aug. 7, 2023, 8:48 p.m. UTC | #36
On Fri, Aug 04, 2023 at 12:42:06AM +0900, Masami Hiramatsu wrote:

SNIP

> > 
> > On the other hand, untangling all code paths that come from
> > trampolines (with a light regs structure) from those that come from an
> > exception (with a pt_regs) could lead to a lot of duplicated code, and
> > converting between each subsystem's idea of a light regs structure
> > (what if perf introduces a perf_regs now ?) would be tedious and slow
> > (lots of copies ?).
> 
> This is one discussion point I think. Actually, using pt_regs in kretprobe
> (and rethook) is histrical accident. Originally, it had put a kprobe on
> the function return trampoline to hook it. So keep the API compatiblity
> I made the hand assembled code to save the pt_regs on the stack.
> 
> My another question is if we have the fprobe to trace (hook) the function
> return, why we still need the kretprobe itself. I think we can remove
> kretprobe and use fprobe exit handler, because "function" probing will
> be done by fprobe, not kprobe. And then, we can simplify the kprobe
> interface and clarify what it is -- "kprobe is a wrapper of software
> breakpoint". And we don't need to think about duplicated code anymore :)

1+ sounds like good idea

> 
> >  
> > > Otherwise, ftrace_regs() has support on arm64 for getting to the argument
> > > registers and the stack. Even live kernel patching now uses ftrace_regs().
> > >
> > > >
> > > > If you guys decide to convert fprobe to ftrace_regs please
> > > > make it depend on kconfig or something.
> > > > bpf side needs full pt_regs.
> > 
> > Some wild ideas that I brought up once in a BPF office hour: BPF
> > "multi_kprobe" could provide a fake pt_regs (either by constructing a
> > sparse one on the stack or by JIT-ing different offset accesses and/or
> > by having the verifier deny access to unpopulated fields) or break the
> > current API (is it conceivable to phase out BPF "multi_kprobe"
> > programs in favor of BPF "fprobe" programs that don't lie about their
> > API and guarantees and just provide a ftrace_regs ?)
> 
> +1 :)

so the multi_kprobe link was created to allow fast attach of BPF kprobe-type
programs to multiple functions.. I don't think there's a need for a new fprobe
program

> 
> > 
> > > Then use kprobes. When I asked Masami what the difference between fprobes
> > > and kprobes was, he told me that it would be that it would no longer rely
> > > on the slower FTRACE_WITH_REGS. But currently, it still does.
> > 
> > Actually... Moving fprobe to ftrace_regs should get even more spicy!
> > :) Fprobe also wraps "rethook" which is basically the same thing as
> > kretprobe: a return trampoline that saves a pt_regs, to the point that
> > on x86 kretprobe's trampoline got dropped in favor of rethook's
> > trampoline. But for the same reasons that we don't want ftrace to save
> > pt_regs on arm64, rethook should probably also just save a ftrace_regs
> > ? (also, to keep the fprobe callback signatures consistent between
> > pre- and post- handlers). But if we want fprobe "post" callbacks to
> > save a ftrace_regs now, either we need to re-introduce the kretprobe
> > trampoline or also change the API of kretprobe (and break its symmetry
> > with kprobe and we'd have the same problem all over again with BPF
> > kretprobe program types...). All of this is "beautifully" entangled...
> > :)
> 
> As I said, I would like to phase out the kretprobe itself because it
> provides the same feature of fprobe, which is confusing. jprobe was
> removed a while ago, and now kretprobe is. But we can not phase out
> it at once. So I think we will keep current kretprobe trampoline on
> arm64 and just add new ftrace_regs based rethook. Then remove the
> API next release. (after all users including systemtap is moved) 
> 
> > 
> > > The reason I started the FTRACE_WITH_ARGS (which gave us ftrace_regs) in
> > > the first place, was because of the overhead you reported to me with
> > > ftrace_regs_caller and why you wanted to go the direct trampoline approach.
> > > That's when I realized I could use a subset because those registers were
> > > already being saved. The only reason FTRACE_WITH_REGS was created was it
> > > had to supply full pt_regs (including flags) and emulate a breakpoint for
> > > the kprobes interface. But in reality, nothing really needs all that.
> > >
> > > > It's not about access to args.
> > > > pt_regs is passed from bpf prog further into all kinds of perf event
> > > > functions including stack walking.
> > 
> > If all accesses are done in BPF bytecode, we could (theoretically)
> > have the verifier and JIT work together to deny accesses to
> > unpopulated fields, or relocate pt_regs accesses to ftrace_regs
> > accesses to keep backward compatibility with existing multi_kprobe BPF
> > programs.
> 
> Yeah, that is what I would like to suggest, and what my patch does.
> (let me update rethook too, it'll be a bit tricky since I don't want
> break anything) 
> 
> Thanks,
> 
> > 
> > Is there a risk that a "multi_kprobe" program could call into a BPF
> > helper or kfunc that reads this pt_regs pointer and expect certain
> > fields to be set ? I suppose we could also deny giving that "pt_regs"
> > pointer to a helper... :/

I think Alexei answered this earlier in the thread:

 >From bpf side we don't care that such pt_regs is 100% filled in or
 >only partial as long as this pt_regs pointer is valid for perf_event_output
 >and stack walking that consume pt_regs.
 >I believe that was and still is the case for both x86 and arm64.


jirka
Masami Hiramatsu (Google) Aug. 8, 2023, 2:32 p.m. UTC | #37
On Mon, 7 Aug 2023 22:48:29 +0200
Jiri Olsa <olsajiri@gmail.com> wrote:

> On Fri, Aug 04, 2023 at 12:42:06AM +0900, Masami Hiramatsu wrote:
> 
> SNIP
> 
> > > 
> > > On the other hand, untangling all code paths that come from
> > > trampolines (with a light regs structure) from those that come from an
> > > exception (with a pt_regs) could lead to a lot of duplicated code, and
> > > converting between each subsystem's idea of a light regs structure
> > > (what if perf introduces a perf_regs now ?) would be tedious and slow
> > > (lots of copies ?).
> > 
> > This is one discussion point I think. Actually, using pt_regs in kretprobe
> > (and rethook) is histrical accident. Originally, it had put a kprobe on
> > the function return trampoline to hook it. So keep the API compatiblity
> > I made the hand assembled code to save the pt_regs on the stack.
> > 
> > My another question is if we have the fprobe to trace (hook) the function
> > return, why we still need the kretprobe itself. I think we can remove
> > kretprobe and use fprobe exit handler, because "function" probing will
> > be done by fprobe, not kprobe. And then, we can simplify the kprobe
> > interface and clarify what it is -- "kprobe is a wrapper of software
> > breakpoint". And we don't need to think about duplicated code anymore :)
> 
> 1+ sounds like good idea

Thanks! The downside will be that it requires enabling CONFIG_FPROBE
instead of CONFIG_KPROBES, but I think it is natural that a user who
wants to trace function boundaries enables CONFIG_FUNCTION_TRACER.

> > > > Otherwise, ftrace_regs() has support on arm64 for getting to the argument
> > > > registers and the stack. Even live kernel patching now uses ftrace_regs().
> > > >
> > > > >
> > > > > If you guys decide to convert fprobe to ftrace_regs please
> > > > > make it depend on kconfig or something.
> > > > > bpf side needs full pt_regs.
> > > 
> > > Some wild ideas that I brought up once in a BPF office hour: BPF
> > > "multi_kprobe" could provide a fake pt_regs (either by constructing a
> > > sparse one on the stack or by JIT-ing different offset accesses and/or
> > > by having the verifier deny access to unpopulated fields) or break the
> > > current API (is it conceivable to phase out BPF "multi_kprobe"
> > > programs in favor of BPF "fprobe" programs that don't lie about their
> > > API and guarantees and just provide a ftrace_regs ?)
> > 
> > +1 :)
> 
> so the multi_kprobe link was created to allow fast attach of BPF kprobe-type
> programs to multiple functions. I don't think there's a need for a new fprobe
> program.

Ah, OK. So the key point is shortening registration time.

> 
> > 
> > > 
> > > > Then use kprobes. When I asked Masami what the difference between fprobes
> > > > and kprobes was, he told me that the difference would be that fprobe no
> > > > longer relies on the slower FTRACE_WITH_REGS. But currently, it still does.
> > > 
> > > Actually... Moving fprobe to ftrace_regs should get even more spicy!
> > > :) Fprobe also wraps "rethook" which is basically the same thing as
> > > kretprobe: a return trampoline that saves a pt_regs, to the point that
> > > on x86 kretprobe's trampoline got dropped in favor of rethook's
> > > trampoline. But for the same reasons that we don't want ftrace to save
> > > pt_regs on arm64, rethook should probably also just save a ftrace_regs
> > > ? (also, to keep the fprobe callback signatures consistent between
> > > pre- and post- handlers). But if we want fprobe "post" callbacks to
> > > save a ftrace_regs now, either we need to re-introduce the kretprobe
> > > trampoline or also change the API of kretprobe (and break its symmetry
> > > with kprobe and we'd have the same problem all over again with BPF
> > > kretprobe program types...). All of this is "beautifully" entangled...
> > > :)
> > 
> > As I said, I would like to phase out kretprobe itself because it
> > provides the same feature as fprobe, which is confusing. jprobe was
> > removed a while ago, and now kretprobe can be too. But we cannot phase
> > it out at once, so I think we will keep the current kretprobe trampoline on
> > arm64 and just add a new ftrace_regs-based rethook, then remove the
> > old API in a later release (after all users, including systemtap, have moved).
> > 
> > > 
> > > > The reason I started the FTRACE_WITH_ARGS (which gave us ftrace_regs) in
> > > > the first place, was because of the overhead you reported to me with
> > > > ftrace_regs_caller and why you wanted to go the direct trampoline approach.
> > > > That's when I realized I could use a subset because those registers were
> > > > already being saved. The only reason FTRACE_WITH_REGS was created was it
> > > > had to supply full pt_regs (including flags) and emulate a breakpoint for
> > > > the kprobes interface. But in reality, nothing really needs all that.
> > > >
> > > > > It's not about access to args.
> > > > > pt_regs is passed from bpf prog further into all kinds of perf event
> > > > > functions including stack walking.
> > > 
> > > If all accesses are done in BPF bytecode, we could (theoretically)
> > > have the verifier and JIT work together to deny accesses to
> > > unpopulated fields, or relocate pt_regs accesses to ftrace_regs
> > > accesses to keep backward compatibility with existing multi_kprobe BPF
> > > programs.
> > 
> > Yeah, that is what I would like to suggest, and what my patch does.
> > (let me update rethook too, it'll be a bit tricky since I don't want
> > to break anything)
> > 
> > Thanks,
> > 
> > > 
> > > Is there a risk that a "multi_kprobe" program could call into a BPF
> > > helper or kfunc that reads this pt_regs pointer and expect certain
> > > fields to be set ? I suppose we could also deny giving that "pt_regs"
> > > pointer to a helper... :/
> 
> I think Alexei answered this earlier in the thread:
> 
>  >From bpf side we don't care that such pt_regs is 100% filled in or
>  >only partial as long as this pt_regs pointer is valid for perf_event_output
>  >and stack walking that consume pt_regs.
>  >I believe that was and still is the case for both x86 and arm64.

OK, so I have now implemented ftrace_partial_regs() according to that idea.
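
For reference, a minimal sketch of what such a helper could look like on
arm64, assuming the ftrace_regs layout that saves x0-x8, fp, lr, sp and pc
(an illustrative sketch only; the actual implementation may differ):

/*
 * Sketch: copy only the registers that arm64 ftrace_regs actually saves
 * (x0-x8, fp, lr, sp, pc) into a caller-provided pt_regs and leave the
 * other fields untouched.  Assumes the arm64 struct ftrace_regs and
 * struct pt_regs definitions from <asm/ftrace.h> and <asm/ptrace.h>.
 */
static __always_inline struct pt_regs *
ftrace_partial_regs(const struct ftrace_regs *fregs, struct pt_regs *regs)
{
	memcpy(regs->regs, fregs->regs, sizeof(regs->regs[0]) * 9); /* x0 - x8 */
	regs->regs[29] = fregs->fp;
	regs->regs[30] = fregs->lr;
	regs->sp = fregs->sp;
	regs->pc = fregs->pc;
	return regs;
}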

Thanks,

> 
> 
> jirka
diff mbox series

Patch

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 20e3a07eef8f..4b10d57ceee0 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -226,6 +226,9 @@  const struct btf_type *btf_find_func_proto(const char *func_name,
 					   struct btf **btf_p);
 const struct btf_param *btf_get_func_param(const struct btf_type *func_proto,
 					   s32 *nr);
+const struct btf_member *btf_find_struct_member(struct btf *btf,
+						const struct btf_type *type,
+						const char *member_name);
 
 #define for_each_member(i, struct_type, member)			\
 	for (i = 0, member = btf_type_member(struct_type);	\
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index f7b25c615269..8d81a4ffa67b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -958,6 +958,46 @@  const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, s3
 		return NULL;
 }
 
+#define BTF_ANON_STACK_MAX	16
+
+/*
+ * Find a member of data structure/union by name and return it.
+ * Return NULL if not found, or -EINVAL if parameter is invalid.
+ */
+const struct btf_member *btf_find_struct_member(struct btf *btf,
+						const struct btf_type *type,
+						const char *member_name)
+{
+	const struct btf_type *anon_stack[BTF_ANON_STACK_MAX];
+	const struct btf_member *member;
+	const char *name;
+	int i, top = 0;
+
+retry:
+	if (!btf_type_is_struct(type))
+		return ERR_PTR(-EINVAL);
+
+	for_each_member(i, type, member) {
+		if (!member->name_off) {
+			/* Anonymous union/struct: push it for later use */
+			type = btf_type_skip_modifiers(btf, member->type, NULL);
+			if (type && top < BTF_ANON_STACK_MAX)
+				anon_stack[top++] = type;
+		} else {
+			name = btf_name_by_offset(btf, member->name_off);
+			if (name && !strcmp(member_name, name))
+				return member;
+		}
+	}
+	if (top > 0) {
+		/* Pop from the anonymous stack and retry */
+		type = anon_stack[--top];
+		goto retry;
+	}
+
+	return NULL;
+}
+
 #define BTF_SHOW_MAX_ITER	10
 
 #define BTF_KIND_BIT(kind)	(1ULL << kind)
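
For illustration, here is a hedged sketch of how a caller (for example,
probe-event argument parsing) might use the new helper; the caller-side
function and the example struct below are hypothetical and not part of this
patch:

/*
 * Hypothetical caller: resolve "member_name" inside an already-looked-up
 * BTF struct type "st".
 *
 * Example target (hypothetical):
 *	struct foo {
 *		int x;
 *		union {			// anonymous union
 *			long y;
 *			void *p;
 *		};
 *	};
 * btf_find_struct_member(btf, foo_type, "y") finds "y" by searching the
 * anonymous union that was pushed on the internal stack.
 */
static int resolve_member(struct btf *btf, const struct btf_type *st,
			  const char *member_name, u32 *bit_offset)
{
	const struct btf_member *member;

	member = btf_find_struct_member(btf, st, member_name);
	if (IS_ERR(member))
		return PTR_ERR(member);
	if (!member)
		return -ENOENT;

	/*
	 * Note: as this version is written, a member found inside an
	 * anonymous struct/union has an offset relative to that inner
	 * type, not to the outermost struct, and member->offset here
	 * ignores the kflag/bitfield encoding for simplicity.
	 */
	*bit_offset = member->offset;
	return 0;
}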