Message ID | 169078863449.173706.2322042687021909241.stgit@devnote2 (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | tracing: Improbe BTF support on probe events | expand |
On Mon, Jul 31, 2023 at 12:30 AM Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: > > From: Masami Hiramatsu (Google) <mhiramat@kernel.org> > > Add btf_find_struct_member() API to search a member of a given data structure > or union from the member's name. > > Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> > Reviewed-by: Alan Maguire <alan.maguire@oracle.com> > --- > Changes in v3: > - Remove simple input check. > - Fix unneeded IS_ERR_OR_NULL() check for btf_type_by_id(). > - Move the code next to btf_get_func_param(). > - Use for_each_member() macro instead of for-loop. > - Use btf_type_skip_modifiers() instead of btf_type_by_id(). > Changes in v4: > - Use a stack for searching in anonymous members instead of nested call. > --- > include/linux/btf.h | 3 +++ > kernel/bpf/btf.c | 40 ++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 43 insertions(+) > > diff --git a/include/linux/btf.h b/include/linux/btf.h > index 20e3a07eef8f..4b10d57ceee0 100644 > --- a/include/linux/btf.h > +++ b/include/linux/btf.h > @@ -226,6 +226,9 @@ const struct btf_type *btf_find_func_proto(const char *func_name, > struct btf **btf_p); > const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, > s32 *nr); > +const struct btf_member *btf_find_struct_member(struct btf *btf, > + const struct btf_type *type, > + const char *member_name); > > #define for_each_member(i, struct_type, member) \ > for (i = 0, member = btf_type_member(struct_type); \ > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c > index f7b25c615269..8d81a4ffa67b 100644 > --- a/kernel/bpf/btf.c > +++ b/kernel/bpf/btf.c > @@ -958,6 +958,46 @@ const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, s3 > return NULL; > } > > +#define BTF_ANON_STACK_MAX 16 > + > +/* > + * Find a member of data structure/union by name and return it. > + * Return NULL if not found, or -EINVAL if parameter is invalid. > + */ > +const struct btf_member *btf_find_struct_member(struct btf *btf, > + const struct btf_type *type, > + const char *member_name) > +{ > + const struct btf_type *anon_stack[BTF_ANON_STACK_MAX]; > + const struct btf_member *member; > + const char *name; > + int i, top = 0; > + > +retry: > + if (!btf_type_is_struct(type)) > + return ERR_PTR(-EINVAL); > + > + for_each_member(i, type, member) { > + if (!member->name_off) { > + /* Anonymous union/struct: push it for later use */ > + type = btf_type_skip_modifiers(btf, member->type, NULL); > + if (type && top < BTF_ANON_STACK_MAX) > + anon_stack[top++] = type; > + } else { > + name = btf_name_by_offset(btf, member->name_off); > + if (name && !strcmp(member_name, name)) > + return member; > + } > + } > + if (top > 0) { > + /* Pop from the anonymous stack and retry */ > + type = anon_stack[--top]; > + goto retry; > + } Looks good, but I don't see a test case for this. The logic is a bit tricky. I'd like to have a selftest that covers it. You probably need to drop Alan's reviewed-by, since the patch is quite different from the time he reviewed it. Assuming that is addressed. How do we merge the series? The first 3 patches have serious conflicts with bpf trees. Maybe send the first 3 with extra selftest for above recursion targeting bpf-next then we can have a merge commit that Steven can pull into tracing? Or if we can have acks for patches 4-9 we can pull the whole set into bpf-next.
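For reference, the kind of structure such a selftest would need is one that forces the anonymous-member stack/retry path. A minimal sketch of a test type (the names are made up for illustration, not taken from the series):

struct btf_member_test {
        int a;
        union {                 /* anonymous union: pushed on anon_stack */
                int b;
                struct {        /* anonymous struct, nested one level deeper */
                        int c;
                        int d;
                };
        };
        int e;
};

Looking up "d" has to walk btf_member_test -> anonymous union -> anonymous struct, and (as the follow-up below points out) it should report the offset of d relative to btf_member_test, not relative to the innermost anonymous struct.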
On Mon, 31 Jul 2023 14:59:47 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > On Mon, Jul 31, 2023 at 12:30 AM Masami Hiramatsu (Google) > <mhiramat@kernel.org> wrote: > > > > From: Masami Hiramatsu (Google) <mhiramat@kernel.org> > > > > Add btf_find_struct_member() API to search a member of a given data structure > > or union from the member's name. > > > > Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> > > Reviewed-by: Alan Maguire <alan.maguire@oracle.com> > > --- > > Changes in v3: > > - Remove simple input check. > > - Fix unneeded IS_ERR_OR_NULL() check for btf_type_by_id(). > > - Move the code next to btf_get_func_param(). > > - Use for_each_member() macro instead of for-loop. > > - Use btf_type_skip_modifiers() instead of btf_type_by_id(). > > Changes in v4: > > - Use a stack for searching in anonymous members instead of nested call. > > --- > > include/linux/btf.h | 3 +++ > > kernel/bpf/btf.c | 40 ++++++++++++++++++++++++++++++++++++++++ > > 2 files changed, 43 insertions(+) > > > > diff --git a/include/linux/btf.h b/include/linux/btf.h > > index 20e3a07eef8f..4b10d57ceee0 100644 > > --- a/include/linux/btf.h > > +++ b/include/linux/btf.h > > @@ -226,6 +226,9 @@ const struct btf_type *btf_find_func_proto(const char *func_name, > > struct btf **btf_p); > > const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, > > s32 *nr); > > +const struct btf_member *btf_find_struct_member(struct btf *btf, > > + const struct btf_type *type, > > + const char *member_name); > > > > #define for_each_member(i, struct_type, member) \ > > for (i = 0, member = btf_type_member(struct_type); \ > > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c > > index f7b25c615269..8d81a4ffa67b 100644 > > --- a/kernel/bpf/btf.c > > +++ b/kernel/bpf/btf.c > > @@ -958,6 +958,46 @@ const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, s3 > > return NULL; > > } > > > > +#define BTF_ANON_STACK_MAX 16 > > + > > +/* > > + * Find a member of data structure/union by name and return it. > > + * Return NULL if not found, or -EINVAL if parameter is invalid. > > + */ > > +const struct btf_member *btf_find_struct_member(struct btf *btf, > > + const struct btf_type *type, > > + const char *member_name) > > +{ > > + const struct btf_type *anon_stack[BTF_ANON_STACK_MAX]; > > + const struct btf_member *member; > > + const char *name; > > + int i, top = 0; > > + > > +retry: > > + if (!btf_type_is_struct(type)) > > + return ERR_PTR(-EINVAL); > > + > > + for_each_member(i, type, member) { > > + if (!member->name_off) { > > + /* Anonymous union/struct: push it for later use */ > > + type = btf_type_skip_modifiers(btf, member->type, NULL); > > + if (type && top < BTF_ANON_STACK_MAX) > > + anon_stack[top++] = type; > > + } else { > > + name = btf_name_by_offset(btf, member->name_off); > > + if (name && !strcmp(member_name, name)) > > + return member; > > + } > > + } > > + if (top > 0) { > > + /* Pop from the anonymous stack and retry */ > > + type = anon_stack[--top]; > > + goto retry; > > + } > > Looks good, but I don't see a test case for this. > The logic is a bit tricky. I'd like to have a selftest that covers it. Thanks, and I agree about selftest. > > You probably need to drop Alan's reviewed-by, since the patch is quite > different from the time he reviewed it. OK. BTW, I found a problem on this function. I guess the member->offset will be the offset from the intermediate anonymous union, it is usually 0, but I need the offset from the given structure. 
Thus the interface design must be changed: pass a 'u32 *offset' and set the correct offset in it. If there are nested intermediate anonymous unions, that offset must also be pushed. > > Assuming that is addressed. How do we merge the series? > The first 3 patches have serious conflicts with bpf trees. > > Maybe send the first 3 with extra selftest for above recursion > targeting bpf-next then we can have a merge commit that Steven can pull > into tracing? > > Or if we can have acks for patches 4-9 we can pull the whole set into bpf-next. That's a good question. I don't like splitting the whole series across two -next branches, so I can send this to bpf-next. I need to work on another series(*) on fprobes which will not have conflicts with this series. (*Replacing pt_regs with ftrace_regs on fprobe, which will take more time and needs to be coordinated with eBPF). Thank you,
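To make the fix concrete, here is one possible shape of the revised interface Masami describes: a sketch only, assuming the (type id, offset) stacking mentioned later in the thread, ignoring bitfield-encoded member offsets, and not necessarily matching the next revision of the patch:

#define BTF_ANON_STACK_MAX      16

struct btf_anon_stack {
        u32 tid;        /* BTF type id of the anonymous struct/union */
        u32 offset;     /* bit offset of that member from the outer type */
};

const struct btf_member *btf_find_struct_member(struct btf *btf,
                                                const struct btf_type *type,
                                                const char *member_name,
                                                u32 *offset)
{
        struct btf_anon_stack anon_stack[BTF_ANON_STACK_MAX];
        const struct btf_member *member;
        u32 tid, cur_offset = 0;
        const char *name;
        int i, top = 0;

retry:
        if (!btf_type_is_struct(type))
                return ERR_PTR(-EINVAL);

        for_each_member(i, type, member) {
                if (!member->name_off) {
                        /* Anonymous union/struct: push it with its offset */
                        if (top < BTF_ANON_STACK_MAX &&
                            btf_type_skip_modifiers(btf, member->type, &tid)) {
                                anon_stack[top].tid = tid;
                                anon_stack[top++].offset =
                                        cur_offset + member->offset;
                        }
                } else {
                        name = btf_name_by_offset(btf, member->name_off);
                        if (name && !strcmp(member_name, name)) {
                                /* Bit offset from the outermost structure */
                                *offset = cur_offset + member->offset;
                                return member;
                        }
                }
        }
        if (top > 0) {
                /* Pop one anonymous member and search inside it */
                top--;
                cur_offset = anon_stack[top].offset;
                type = btf_type_by_id(btf, anon_stack[top].tid);
                goto retry;
        }
        return NULL;
}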
On Mon, Jul 31, 2023 at 4:57 PM Masami Hiramatsu <mhiramat@kernel.org> wrote: > > On Mon, 31 Jul 2023 14:59:47 -0700 > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > On Mon, Jul 31, 2023 at 12:30 AM Masami Hiramatsu (Google) > > <mhiramat@kernel.org> wrote: > > > > > > From: Masami Hiramatsu (Google) <mhiramat@kernel.org> > > > > > > Add btf_find_struct_member() API to search a member of a given data structure > > > or union from the member's name. > > > > > > Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> > > > Reviewed-by: Alan Maguire <alan.maguire@oracle.com> > > > --- > > > Changes in v3: > > > - Remove simple input check. > > > - Fix unneeded IS_ERR_OR_NULL() check for btf_type_by_id(). > > > - Move the code next to btf_get_func_param(). > > > - Use for_each_member() macro instead of for-loop. > > > - Use btf_type_skip_modifiers() instead of btf_type_by_id(). > > > Changes in v4: > > > - Use a stack for searching in anonymous members instead of nested call. > > > --- > > > include/linux/btf.h | 3 +++ > > > kernel/bpf/btf.c | 40 ++++++++++++++++++++++++++++++++++++++++ > > > 2 files changed, 43 insertions(+) > > > > > > diff --git a/include/linux/btf.h b/include/linux/btf.h > > > index 20e3a07eef8f..4b10d57ceee0 100644 > > > --- a/include/linux/btf.h > > > +++ b/include/linux/btf.h > > > @@ -226,6 +226,9 @@ const struct btf_type *btf_find_func_proto(const char *func_name, > > > struct btf **btf_p); > > > const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, > > > s32 *nr); > > > +const struct btf_member *btf_find_struct_member(struct btf *btf, > > > + const struct btf_type *type, > > > + const char *member_name); > > > > > > #define for_each_member(i, struct_type, member) \ > > > for (i = 0, member = btf_type_member(struct_type); \ > > > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c > > > index f7b25c615269..8d81a4ffa67b 100644 > > > --- a/kernel/bpf/btf.c > > > +++ b/kernel/bpf/btf.c > > > @@ -958,6 +958,46 @@ const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, s3 > > > return NULL; > > > } > > > > > > +#define BTF_ANON_STACK_MAX 16 > > > + > > > +/* > > > + * Find a member of data structure/union by name and return it. > > > + * Return NULL if not found, or -EINVAL if parameter is invalid. > > > + */ > > > +const struct btf_member *btf_find_struct_member(struct btf *btf, > > > + const struct btf_type *type, > > > + const char *member_name) > > > +{ > > > + const struct btf_type *anon_stack[BTF_ANON_STACK_MAX]; > > > + const struct btf_member *member; > > > + const char *name; > > > + int i, top = 0; > > > + > > > +retry: > > > + if (!btf_type_is_struct(type)) > > > + return ERR_PTR(-EINVAL); > > > + > > > + for_each_member(i, type, member) { > > > + if (!member->name_off) { > > > + /* Anonymous union/struct: push it for later use */ > > > + type = btf_type_skip_modifiers(btf, member->type, NULL); > > > + if (type && top < BTF_ANON_STACK_MAX) > > > + anon_stack[top++] = type; > > > + } else { > > > + name = btf_name_by_offset(btf, member->name_off); > > > + if (name && !strcmp(member_name, name)) > > > + return member; > > > + } > > > + } > > > + if (top > 0) { > > > + /* Pop from the anonymous stack and retry */ > > > + type = anon_stack[--top]; > > > + goto retry; > > > + } > > > > Looks good, but I don't see a test case for this. > > The logic is a bit tricky. I'd like to have a selftest that covers it. > > Thanks, and I agree about selftest. 
> > > > > You probably need to drop Alan's reviewed-by, since the patch is quite > > different from the time he reviewed it. > > OK. BTW, I found a problem on this function. I guess the member->offset will > be the offset from the intermediate anonymous union, it is usually 0, but > I need the offset from the given structure. Thus the interface design must > be changed. Passing a 'u32 *offset' and set the correct offset in it. If > it has nested intermediate anonymous unions, that offset must also be pushed. With all that piling up have you considering reusing btf_struct_walk() ? It's doing the opposite off -> btf_id while you need name -> btf_id. But it will give an idea of overall complexity if you want to solve it for nested arrays and struct/union. > > > > Assuming that is addressed. How do we merge the series? > > The first 3 patches have serious conflicts with bpf trees. > > > > Maybe send the first 3 with extra selftest for above recursion > > targeting bpf-next then we can have a merge commit that Steven can pull > > into tracing? > > > > Or if we can have acks for patches 4-9 we can pull the whole set into bpf-next. > > That's a good question. I don't like splitting the whole series in 2 -next > branches. So I can send this to the bpf-next. Works for me. > I need to work on another series(*) on fprobes which will not have conflicts with > this series. (*Replacing pt_regs with ftrace_regs on fprobe, which will take longer > time, and need to adjust with eBPF). ftrace_regs? Ouch. For bpf we rely on pt_regs being an argument. fprobe should be 100% compatible replacement of kprobe-at-the-func-start. If it diverges from that it's a big issue for bpf. We'd have to remove all of fprobe usage. I could be missing something, of course.
On Mon, 31 Jul 2023 14:59:47 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > Assuming that is addressed. How do we merge the series? > The first 3 patches have serious conflicts with bpf trees. > > Maybe send the first 3 with extra selftest for above recursion > targeting bpf-next then we can have a merge commit that Steven can pull > into tracing? Would it be possible to do this by basing it off of one of Linus's tags, and doing the merge and conflict resolution in your tree before it gets to Linus? That way we can pull in that clean branch without having to pull in anything else from BPF. I believe Linus prefers this over having tracing having extra changes from BPF that are not yet in his tree. We only need these particular changes, we shouldn't be pulling in anything specific for BPF, as I believe that will cause issues on Linus's side. -- Steve > > Or if we can have acks for patches 4-9 we can pull the whole set into bpf-next.
On Mon, Jul 31, 2023 at 6:15 PM Steven Rostedt <rostedt@goodmis.org> wrote: > > On Mon, 31 Jul 2023 14:59:47 -0700 > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > Assuming that is addressed. How do we merge the series? > > The first 3 patches have serious conflicts with bpf trees. > > > > Maybe send the first 3 with extra selftest for above recursion > > targeting bpf-next then we can have a merge commit that Steven can pull > > into tracing? > > Would it be possible to do this by basing it off of one of Linus's tags, > and doing the merge and conflict resolution in your tree before it gets to > Linus? > > That way we can pull in that clean branch without having to pull in > anything else from BPF. I believe Linus prefers this over having tracing > having extra changes from BPF that are not yet in his tree. We only need > these particular changes, we shouldn't be pulling in anything specific for > BPF, as I believe that will cause issues on Linus's side. We can try, but I suspect git tricks won't do it. Masami's changes depend on patches for kernel/bpf/btf.c that are already in bpf-next, so git would have to follow all commits that touch this file. I don't think git is smart enough to thread the needle and split the commit into files. If one commit touches btf.c and something else that whole commit becomes a dependency that pulls another commit with all files touched by the previous commit and so on. tbh for this set, the easiest for everyone, is to land the whole thing through bpf-next, since there are no conflicts on fprobe side.
On Mon, 31 Jul 2023 19:24:25 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > On Mon, Jul 31, 2023 at 6:15 PM Steven Rostedt <rostedt@goodmis.org> > wrote: > > > > On Mon, 31 Jul 2023 14:59:47 -0700 > > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > > > Assuming that is addressed. How do we merge the series? > > > The first 3 patches have serious conflicts with bpf trees. > > > > > > Maybe send the first 3 with extra selftest for above recursion > > > targeting bpf-next then we can have a merge commit that Steven can > > > pull into tracing? > > > > Would it be possible to do this by basing it off of one of Linus's tags, > > and doing the merge and conflict resolution in your tree before it gets > > to Linus? > > > > That way we can pull in that clean branch without having to pull in > > anything else from BPF. I believe Linus prefers this over having tracing > > having extra changes from BPF that are not yet in his tree. We only need > > these particular changes, we shouldn't be pulling in anything specific > > for BPF, as I believe that will cause issues on Linus's side. > > We can try, but I suspect git tricks won't do it. > Masami's changes depend on patches for kernel/bpf/btf.c that > are already in bpf-next, so git would have to follow all commits You mean other patches that Masami has sent are in the bpf tree already and these are on top of them? -- Steve > that touch this file. I don't think git is smart enough to > thread the needle and split the commit into files. If one commit touches > btf.c and something else that whole commit becomes a dependency > that pulls another commit with all files touched by > the previous commit and so on. > tbh for this set, the easiest for everyone, is to land the whole thing > through bpf-next, since there are no conflicts on fprobe side.
On Mon, 31 Jul 2023 17:29:49 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > On Mon, Jul 31, 2023 at 4:57 PM Masami Hiramatsu <mhiramat@kernel.org> wrote: > > > > On Mon, 31 Jul 2023 14:59:47 -0700 > > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > > > On Mon, Jul 31, 2023 at 12:30 AM Masami Hiramatsu (Google) > > > <mhiramat@kernel.org> wrote: > > > > > > > > From: Masami Hiramatsu (Google) <mhiramat@kernel.org> > > > > > > > > Add btf_find_struct_member() API to search a member of a given data structure > > > > or union from the member's name. > > > > > > > > Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> > > > > Reviewed-by: Alan Maguire <alan.maguire@oracle.com> > > > > --- > > > > Changes in v3: > > > > - Remove simple input check. > > > > - Fix unneeded IS_ERR_OR_NULL() check for btf_type_by_id(). > > > > - Move the code next to btf_get_func_param(). > > > > - Use for_each_member() macro instead of for-loop. > > > > - Use btf_type_skip_modifiers() instead of btf_type_by_id(). > > > > Changes in v4: > > > > - Use a stack for searching in anonymous members instead of nested call. > > > > --- > > > > include/linux/btf.h | 3 +++ > > > > kernel/bpf/btf.c | 40 ++++++++++++++++++++++++++++++++++++++++ > > > > 2 files changed, 43 insertions(+) > > > > > > > > diff --git a/include/linux/btf.h b/include/linux/btf.h > > > > index 20e3a07eef8f..4b10d57ceee0 100644 > > > > --- a/include/linux/btf.h > > > > +++ b/include/linux/btf.h > > > > @@ -226,6 +226,9 @@ const struct btf_type *btf_find_func_proto(const char *func_name, > > > > struct btf **btf_p); > > > > const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, > > > > s32 *nr); > > > > +const struct btf_member *btf_find_struct_member(struct btf *btf, > > > > + const struct btf_type *type, > > > > + const char *member_name); > > > > > > > > #define for_each_member(i, struct_type, member) \ > > > > for (i = 0, member = btf_type_member(struct_type); \ > > > > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c > > > > index f7b25c615269..8d81a4ffa67b 100644 > > > > --- a/kernel/bpf/btf.c > > > > +++ b/kernel/bpf/btf.c > > > > @@ -958,6 +958,46 @@ const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, s3 > > > > return NULL; > > > > } > > > > > > > > +#define BTF_ANON_STACK_MAX 16 > > > > + > > > > +/* > > > > + * Find a member of data structure/union by name and return it. > > > > + * Return NULL if not found, or -EINVAL if parameter is invalid. 
> > > > + */ > > > > +const struct btf_member *btf_find_struct_member(struct btf *btf, > > > > + const struct btf_type *type, > > > > + const char *member_name) > > > > +{ > > > > + const struct btf_type *anon_stack[BTF_ANON_STACK_MAX]; > > > > + const struct btf_member *member; > > > > + const char *name; > > > > + int i, top = 0; > > > > + > > > > +retry: > > > > + if (!btf_type_is_struct(type)) > > > > + return ERR_PTR(-EINVAL); > > > > + > > > > + for_each_member(i, type, member) { > > > > + if (!member->name_off) { > > > > + /* Anonymous union/struct: push it for later use */ > > > > + type = btf_type_skip_modifiers(btf, member->type, NULL); > > > > + if (type && top < BTF_ANON_STACK_MAX) > > > > + anon_stack[top++] = type; > > > > + } else { > > > > + name = btf_name_by_offset(btf, member->name_off); > > > > + if (name && !strcmp(member_name, name)) > > > > + return member; > > > > + } > > > > + } > > > > + if (top > 0) { > > > > + /* Pop from the anonymous stack and retry */ > > > > + type = anon_stack[--top]; > > > > + goto retry; > > > > + } > > > > > > Looks good, but I don't see a test case for this. > > > The logic is a bit tricky. I'd like to have a selftest that covers it. > > > > Thanks, and I agree about selftest. > > > > > > > > You probably need to drop Alan's reviewed-by, since the patch is quite > > > different from the time he reviewed it. > > > > OK. BTW, I found a problem on this function. I guess the member->offset will > > be the offset from the intermediate anonymous union, it is usually 0, but > > I need the offset from the given structure. Thus the interface design must > > be changed. Passing a 'u32 *offset' and set the correct offset in it. If > > it has nested intermediate anonymous unions, that offset must also be pushed. > > With all that piling up have you considering reusing btf_struct_walk() ? > It's doing the opposite off -> btf_id while you need name -> btf_id. > But it will give an idea of overall complexity if you want to solve it > for nested arrays and struct/union. No, it seems a bit different. (and it may not return the name correctly for anonymous struct/union) Of course it seems an interesting work. What I found is returning btf_member is not enough because btf_member in the nested union will have the offset from the nested structure. I have to accumulate the offset. It is easy to fix (just stacking (tid,offset) instead of type*) :) > > > > > > > Assuming that is addressed. How do we merge the series? > > > The first 3 patches have serious conflicts with bpf trees. > > > > > > Maybe send the first 3 with extra selftest for above recursion > > > targeting bpf-next then we can have a merge commit that Steven can pull > > > into tracing? > > > > > > Or if we can have acks for patches 4-9 we can pull the whole set into bpf-next. > > > > That's a good question. I don't like splitting the whole series in 2 -next > > branches. So I can send this to the bpf-next. > > Works for me. Or, yet another option is keeping new btf APIs in trace/trace_probe.c in this series, and move all of them to btf.c in the next series. This will not make any merge problem between trees, but just needs 2 series on different releases. (since unless the first one is merged, we cannot send the second one) > > > I need to work on another series(*) on fprobes which will not have conflicts with > > this series. (*Replacing pt_regs with ftrace_regs on fprobe, which will take longer > > time, and need to adjust with eBPF). > > ftrace_regs? > Ouch. 
> For bpf we rely on pt_regs being an argument. Yeah, that's a problem. > fprobe should be 100% compatible replacement of kprobe-at-the-func-start. No, fprobe is not such a feature. It must provide a more generic interface because it is a probe version of ftrace, not of kprobe. > If it diverges from that it's a big issue for bpf. > We'd have to remove all of fprobe usage. > I could be missing something, of course. Yes, so that's the discussion point. At first, I will disable fprobe on BPF if ftrace_regs is not compatible with pt_regs, but eventually it should be handled so that arm64 is supported. I believe BPF can do it, since ftrace can. Thank you,
On Mon, 31 Jul 2023 19:24:25 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > On Mon, Jul 31, 2023 at 6:15 PM Steven Rostedt <rostedt@goodmis.org> wrote: > > > > On Mon, 31 Jul 2023 14:59:47 -0700 > > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > > > Assuming that is addressed. How do we merge the series? > > > The first 3 patches have serious conflicts with bpf trees. > > > > > > Maybe send the first 3 with extra selftest for above recursion > > > targeting bpf-next then we can have a merge commit that Steven can pull > > > into tracing? > > > > Would it be possible to do this by basing it off of one of Linus's tags, > > and doing the merge and conflict resolution in your tree before it gets to > > Linus? > > > > That way we can pull in that clean branch without having to pull in > > anything else from BPF. I believe Linus prefers this over having tracing > > having extra changes from BPF that are not yet in his tree. We only need > > these particular changes, we shouldn't be pulling in anything specific for > > BPF, as I believe that will cause issues on Linus's side. > > We can try, but I suspect git tricks won't do it. > Masami's changes depend on patches for kernel/bpf/btf.c that > are already in bpf-next, so git would have to follow all commits > that touch this file.

This point is strange. I'm working on probe/fixes, which is based on v6.5-rc3, so no bpf-next change should be involved. Can you recheck this point?

> I don't think git is smart enough to > thread the needle and split the commit into files. If one commit touches > btf.c and something else that whole commit becomes a dependency > that pulls another commit with all files touched by > the previous commit and so on.

As far as I understand Steve's method, we will have an intermediate branch on the bpf or probe tree, like

  linus (some common commit) ---- probes/btf-find-api

and merge it into both the bpf-next and probes/for-next branches:

            +----------------------bpf-next --- (merge bpf patches)
           /                      / merge
  common -/\ probes/btf-find-api -/-\
           \                         \ merge
            +----------------------probes/for-next --- (merge probe patches)

Thus, we can merge both for-next branches in the next merge window without any issue. (But, yes, this is not simple, and needs maximum care.)

Thank you,

> tbh for this set, the easiest for everyone, is to land the whole thing > through bpf-next, since there are no conflicts on fprobe side.
On Wed, 2 Aug 2023 00:02:28 +0900 Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: > > If it diverges from that it's a big issue for bpf. > > We'd have to remove all of fprobe usage. > > I could be missing something, of course. > > Yes, so that's the discussion point. At first, I will disable fprobe on BPF > if ftrace_regs is not compatible with pt_regs, but eventually it should be > handled to support arm64. I believe BPF can do it since ftrace can do. Note, for FYI let me give you a little history of where ftrace_regs came from. When I realized that all function tracing had to save all the registers that represent the arguments of a function as well as the stack pointer, I wanted to change the non FTRACE_WITH_REGS to be able to have access to those registers. This is where FTRACE_WITH_ARGS came from. My first attempt was to pass a pt_regs that was partially filled, with only the registers required for the arguments. But the x86 maintainers NACK'd that. They refused to allow a partially filled pt_regs as that could cause bugs in the future when a user may assume that the pt_regs is filled but is not. The solution was to come up with ftrace_regs, which just means it has all the registers to extract the arguments of a function and nothing more. Most implementations just have a partially filled pt_regs within it, but an API needs to be used to get to the argument values. When you say BPF uses pt_regs, is the pt_regs full or does it get passed a partially filled structure? For fast function entry, ftrace_regs is what should be used if the pt_regs is not filled. As it is only for use for function entry. It supplies all regs and stack pointer to get to all the arguments. -- Steve
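As a concrete illustration of "an API needs to be used to get to the argument values", a FTRACE_WITH_ARGS callback looks roughly like this (a sketch; the accessors are the ones declared in <linux/ftrace.h>, and the stored values are just an example):

#include <linux/ftrace.h>

static unsigned long last_arg0, last_sp, last_ip;

static void my_entry_callback(unsigned long ip, unsigned long parent_ip,
                              struct ftrace_ops *op, struct ftrace_regs *fregs)
{
        /* Only what the trampoline saved: arguments, stack pointer, ip */
        last_arg0 = ftrace_regs_get_argument(fregs, 0);
        last_sp   = ftrace_regs_get_stack_pointer(fregs);
        last_ip   = ftrace_regs_get_instruction_pointer(fregs);
}

static struct ftrace_ops my_ops = {
        .func = my_entry_callback,
        /* no FTRACE_OPS_FL_SAVE_REGS: a full pt_regs is never requested */
};

/* registered as usual with ftrace_set_filter() + register_ftrace_function() */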
On Tue, 1 Aug 2023 11:20:36 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > The solution was to come up with ftrace_regs, which just means it has all > the registers to extract the arguments of a function and nothing more. Most This isn't 100% true. The ftrace_regs may hold a fully filled pt_regs. As the FTRACE_WITH_REGS callbacks still get passed a ftrace_regs pointer. They will do: void callback(..., struct ftrace_regs *fregs) { struct pt_regs *regs = ftrace_get_regs(fregs); Where ftrace_get_regs() will return the pt_regs only if it is fully filled. If it is not, then it returns NULL. This was what the x86 maintainers agreed with. -- Steve > implementations just have a partially filled pt_regs within it, but an API > needs to be used to get to the argument values.
On Tue, Aug 1, 2023 at 8:32 AM Steven Rostedt <rostedt@goodmis.org> wrote: > > On Tue, 1 Aug 2023 11:20:36 -0400 > Steven Rostedt <rostedt@goodmis.org> wrote: > > > The solution was to come up with ftrace_regs, which just means it has all > > the registers to extract the arguments of a function and nothing more. Most > > This isn't 100% true. The ftrace_regs may hold a fully filled pt_regs. As > the FTRACE_WITH_REGS callbacks still get passed a ftrace_regs pointer. They > will do: > > void callback(..., struct ftrace_regs *fregs) { > struct pt_regs *regs = ftrace_get_regs(fregs); > > > Where ftrace_get_regs() will return the pt_regs only if it is fully filled. > If it is not, then it returns NULL. This was what the x86 maintainers > agreed with. arch/arm64/include/asm/ftrace.h:#define arch_ftrace_get_regs(regs) NULL Ouch. That's very bad. We care a lot about bpf running well on arm64. If you guys decide to convert fprobe to ftrace_regs please make it depend on kconfig or something. bpf side needs full pt_regs. It's not about access to args. pt_regs is passed from bpf prog further into all kinds of perf event functions including stack walking. I think ORC unwinder might depend on availability of all registers. Other perf helpers might need it too. Like perf_event_output. bpf progs need to access arguments, no doubt about that. If ftrace_regs have them exactly in the same offsets as in pt_regs that might work transparently for bpf progs, but, I'm afraid, it's not the case on all archs. So we need full pt_regs to make sure all paths are still working. Adding Jiri and others.
On Tue, Aug 1, 2023 at 8:18 AM Masami Hiramatsu <mhiramat@kernel.org> wrote: > > On Mon, 31 Jul 2023 19:24:25 -0700 > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > On Mon, Jul 31, 2023 at 6:15 PM Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > > On Mon, 31 Jul 2023 14:59:47 -0700 > > > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > > > > > Assuming that is addressed. How do we merge the series? > > > > The first 3 patches have serious conflicts with bpf trees. > > > > > > > > Maybe send the first 3 with extra selftest for above recursion > > > > targeting bpf-next then we can have a merge commit that Steven can pull > > > > into tracing? > > > > > > Would it be possible to do this by basing it off of one of Linus's tags, > > > and doing the merge and conflict resolution in your tree before it gets to > > > Linus? > > > > > > That way we can pull in that clean branch without having to pull in > > > anything else from BPF. I believe Linus prefers this over having tracing > > > having extra changes from BPF that are not yet in his tree. We only need > > > these particular changes, we shouldn't be pulling in anything specific for > > > BPF, as I believe that will cause issues on Linus's side. > > > > We can try, but I suspect git tricks won't do it. > > Masami's changes depend on patches for kernel/bpf/btf.c that > > are already in bpf-next, so git would have to follow all commits > > that touch this file. > > This point is strange. I'm working on probe/fixes which is based on > v6.5-rc3, so any bpf-next change should not be involved. Can you recheck > this point? > > > I don't think git is smart enough to > > thread the needle and split the commit into files. If one commit touches > > btf.c and something else that whole commit becomes a dependency > > that pulls another commit with all files touched by > > the previous commit and so on. > > As far as I understand Steve's method, we will have an intermediate branch > on bpf or probe tree, like > > linus(some common commit) ---- probes/btf-find-api > > and merge it to both bpf-next and probes/for-next branch > > +----------------------bpf-next --- (merge bpf patches) > / / merge > common -/\ probes/btf-find-api -/-\ > \ \ merge > +----------------------probes/for-next --- (merge probe patches) > > Thus, we can merge both for-next branches at next merge window without > any issue. (But, yes, this is not simple, and needs maxium care) Sounds like the path of least resistance is to keep the changes in kernel/trace and consolidate with kernel/bpf/btf.c after the next merge window.
On Tue, 1 Aug 2023 15:18:56 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > On Tue, Aug 1, 2023 at 8:32 AM Steven Rostedt <rostedt@goodmis.org> wrote: > > > > On Tue, 1 Aug 2023 11:20:36 -0400 > > Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > The solution was to come up with ftrace_regs, which just means it has all > > > the registers to extract the arguments of a function and nothing more. Most > > > > This isn't 100% true. The ftrace_regs may hold a fully filled pt_regs. As > > the FTRACE_WITH_REGS callbacks still get passed a ftrace_regs pointer. They > > will do: > > > > void callback(..., struct ftrace_regs *fregs) { > > struct pt_regs *regs = ftrace_get_regs(fregs); > > > > > > Where ftrace_get_regs() will return the pt_regs only if it is fully filled. > > If it is not, then it returns NULL. This was what the x86 maintainers > > agreed with. > > arch/arm64/include/asm/ftrace.h:#define arch_ftrace_get_regs(regs) NULL > > Ouch. That's very bad. > We care a lot about bpf running well on arm64. [ Adding Mark and Florent ] That's because arm64 doesn't support FTRACE_WITH_REGS anymore. Their function handlers only care about the arguments. If you want full regs at function entry, then you need to take a breakpoint hit for a full kprobe. In fact, fprobes isn't even supported on arm64 because it it doesn't have DYNAMIC_FTRACE_WITH_REGS. I believe that was the reason Masami was trying to get it to work with ftrace_regs. To get it to work on arm64. Again, ftrace_get_regs(fregs) is only suppose to return something if the pt_regs is fully supplied. If they are not, then it must not be used. Are you not using a fully filled pt_regs? Because that's what both Thomas and Peter (also added) told me not to do! Otherwise, ftrace_regs() has support on arm64 for getting to the argument registers and the stack. Even live kernel patching now uses ftrace_regs(). > > If you guys decide to convert fprobe to ftrace_regs please > make it depend on kconfig or something. > bpf side needs full pt_regs. Then use kprobes. When I asked Masami what the difference between fprobes and kprobes was, he told me that it would be that it would no longer rely on the slower FTRACE_WITH_REGS. But currently, it still does. The reason I started the FTRACE_WITH_ARGS (which gave us ftrace_regs) in the first place, was because of the overhead you reported to me with ftrace_regs_caller and why you wanted to go the direct trampoline approach. That's when I realized I could use a subset because those registers were already being saved. The only reason FTRACE_WITH_REGS was created was it had to supply full pt_regs (including flags) and emulate a breakpoint for the kprobes interface. But in reality, nothing really needs all that. > It's not about access to args. > pt_regs is passed from bpf prog further into all kinds of perf event > functions including stack walking. ftrace_regs gives you the stack pointer. Basically, it gives you access to anything that is required to be saved to do a function call from fentry. > I think ORC unwinder might depend on availability of all registers. > Other perf helpers might need it too. Like perf_event_output. > bpf progs need to access arguments, no doubt about that. > If ftrace_regs have them exactly in the same offsets as in pt_regs > that might work transparently for bpf progs, but, I'm afraid, > it's not the case on all archs. > So we need full pt_regs to make sure all paths are still working. > > Adding Jiri and others. 
Then I recommend that you give up using fprobes and just stick with kprobes, as that's guaranteed to give you full pt_regs (at the overhead of doing things like filling in flags and such). And currently for arm64, fprobes can only work with ftrace_regs, without the full pt_regs. -- Steve
On Tue, 1 Aug 2023 15:21:59 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > On Tue, Aug 1, 2023 at 8:18 AM Masami Hiramatsu <mhiramat@kernel.org> wrote: > > > > On Mon, 31 Jul 2023 19:24:25 -0700 > > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > > > On Mon, Jul 31, 2023 at 6:15 PM Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > > > > On Mon, 31 Jul 2023 14:59:47 -0700 > > > > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > > > > > > > Assuming that is addressed. How do we merge the series? > > > > > The first 3 patches have serious conflicts with bpf trees. > > > > > > > > > > Maybe send the first 3 with extra selftest for above recursion > > > > > targeting bpf-next then we can have a merge commit that Steven can pull > > > > > into tracing? > > > > > > > > Would it be possible to do this by basing it off of one of Linus's tags, > > > > and doing the merge and conflict resolution in your tree before it gets to > > > > Linus? > > > > > > > > That way we can pull in that clean branch without having to pull in > > > > anything else from BPF. I believe Linus prefers this over having tracing > > > > having extra changes from BPF that are not yet in his tree. We only need > > > > these particular changes, we shouldn't be pulling in anything specific for > > > > BPF, as I believe that will cause issues on Linus's side. > > > > > > We can try, but I suspect git tricks won't do it. > > > Masami's changes depend on patches for kernel/bpf/btf.c that > > > are already in bpf-next, so git would have to follow all commits > > > that touch this file. > > > > This point is strange. I'm working on probe/fixes which is based on > > v6.5-rc3, so any bpf-next change should not be involved. Can you recheck > > this point? > > > > > I don't think git is smart enough to > > > thread the needle and split the commit into files. If one commit touches > > > btf.c and something else that whole commit becomes a dependency > > > that pulls another commit with all files touched by > > > the previous commit and so on. > > > > As far as I understand Steve's method, we will have an intermediate branch > > on bpf or probe tree, like > > > > linus(some common commit) ---- probes/btf-find-api > > > > and merge it to both bpf-next and probes/for-next branch > > > > +----------------------bpf-next --- (merge bpf patches) > > / / merge > > common -/\ probes/btf-find-api -/-\ > > \ \ merge > > +----------------------probes/for-next --- (merge probe patches) > > > > Thus, we can merge both for-next branches at next merge window without > > any issue. (But, yes, this is not simple, and needs maxium care) > > Sounds like the path of least resistance is to keep the changes > in kernel/trace and consolidate with kernel/bpf/btf.c after the next > merge window. OK, sounds good to me. I will only expose the bpf_find_btf_id() then. Thank you,
On Tue, Aug 1, 2023 at 4:09 PM Steven Rostedt <rostedt@goodmis.org> wrote> > Then I recommend that you give up using fprobes and just stick with kprobes > as that's guaranteed to give you full pt_regs (at the overhead of doing > things like filing in flags and such). And currently for arm64, fprobes can > only work with ftrace_regs, without the full pt_regs. bpf doesn't attach to fprobes directly. That was never requested. But Jiri's work to support multi attach https://lore.kernel.org/bpf/20220316122419.933957-1-jolsa@kernel.org/ was a joint effort with Masami that relied on fprobe multi attach api. register_fprobe_ips() in particular, because the promise you guys give us that callback will get pt_regs as described in Documentation/trace/fprobe.rst. From bpf side we don't care that such pt_regs is 100% filled in or only partial as long as this pt_regs pointer is valid for perf_event_output and stack walking that consume pt_regs. I believe that was and still is the case for both x86 and arm64. The way I understood Masami's intent is to change that promise and fprobe callback will receive ftrace_regs that is incompatible with pt_regs and that's obviously bad. What you're suggesting "give up on using fprobe" is not up to us. We're not using them. We care about register_fprobe_ips() and what callback receives. Whatever internal changes to fprobe you're doing are ok as long as the callback receives valid pt_regs (even partially filled).
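For readers following along, the contract Alexei refers to is roughly the following (a sketch against the fprobe API of that time; the handler signature is abbreviated and has grown extra parameters in some releases, and the address array is a placeholder):

#include <linux/fprobe.h>

static int my_entry(struct fprobe *fp, unsigned long fentry_ip,
                    struct pt_regs *regs, void *entry_data)
{
        /* 'regs' may be only partially populated, but the pointer must stay
         * usable for perf_event_output() and stack walking. */
        return 0;
}

static struct fprobe fp = {
        .entry_handler = my_entry,
};

static unsigned long addrs[1];  /* filled from kallsyms/BTF resolution */

static int __init probe_init(void)
{
        return register_fprobe_ips(&fp, addrs, ARRAY_SIZE(addrs));
}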
On Tue, 1 Aug 2023 19:09:20 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > On Tue, 1 Aug 2023 15:18:56 -0700 > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > On Tue, Aug 1, 2023 at 8:32 AM Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > > On Tue, 1 Aug 2023 11:20:36 -0400 > > > Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > > > The solution was to come up with ftrace_regs, which just means it has all > > > > the registers to extract the arguments of a function and nothing more. Most > > > > > > This isn't 100% true. The ftrace_regs may hold a fully filled pt_regs. As > > > the FTRACE_WITH_REGS callbacks still get passed a ftrace_regs pointer. They > > > will do: > > > > > > void callback(..., struct ftrace_regs *fregs) { > > > struct pt_regs *regs = ftrace_get_regs(fregs); > > > > > > > > > Where ftrace_get_regs() will return the pt_regs only if it is fully filled. > > > If it is not, then it returns NULL. This was what the x86 maintainers > > > agreed with. > > > > arch/arm64/include/asm/ftrace.h:#define arch_ftrace_get_regs(regs) NULL > > > > Ouch. That's very bad. > > We care a lot about bpf running well on arm64. > > [ Adding Mark and Florent ] > > That's because arm64 doesn't support FTRACE_WITH_REGS anymore. Their > function handlers only care about the arguments. If you want full regs at > function entry, then you need to take a breakpoint hit for a full kprobe. > > In fact, fprobes isn't even supported on arm64 because it it doesn't have > DYNAMIC_FTRACE_WITH_REGS. I believe that was the reason Masami was trying > to get it to work with ftrace_regs. To get it to work on arm64. That's right. And I think (agree) pt_regs is too heavy for function entry/exit because most users needs to access the function arguments or return value. kprobes is a bit different because it is for instruction level inspection tool. > > Again, ftrace_get_regs(fregs) is only suppose to return something if the > pt_regs is fully supplied. If they are not, then it must not be used. Are > you not using a fully filled pt_regs? Because that's what both Thomas and > Peter (also added) told me not to do! I guess that the user-land BPF tools (compliers etc.) only generates bytecode to access registers in pt_regs for kernel probes currently. This is why you are using "kprobes" as a naming. But I think you can be more flexible to generate the code to access registers in ftrace_regs. (because it's just a difference in the offset value) > > Otherwise, ftrace_regs() has support on arm64 for getting to the argument > registers and the stack. Even live kernel patching now uses ftrace_regs(). > > > > > If you guys decide to convert fprobe to ftrace_regs please > > make it depend on kconfig or something. > > bpf side needs full pt_regs. > > Then use kprobes. When I asked Masami what the difference between fprobes > and kprobes was, he told me that it would be that it would no longer rely > on the slower FTRACE_WITH_REGS. But currently, it still does. kprobes needs to keep using pt_regs because software-breakpoint exception handler gets that. And fprobe is used for bpf multi-kprobe interface, but I think it can be optional. 
So until user-land tool supports the ftrace_regs, you can just disable using fprobes if CONFIG_DYNAMIC_FTRACE_WITH_REGS=n Then you can safely use struct pt_regs *regs = ftrace_get_regs(fregs); I think we can just replace the CONFIG_FPROBE ifdefs with CONFIG_DYNAMIC_FTRACE_WITH_REGS in kernel/trace/bpf_trace.c And that will be the first version of using ftrace_regs in fprobe. > > The reason I started the FTRACE_WITH_ARGS (which gave us ftrace_regs) in > the first place, was because of the overhead you reported to me with > ftrace_regs_caller and why you wanted to go the direct trampoline approach. > That's when I realized I could use a subset because those registers were > already being saved. The only reason FTRACE_WITH_REGS was created was it > had to supply full pt_regs (including flags) and emulate a breakpoint for > the kprobes interface. But in reality, nothing really needs all that. > > > It's not about access to args. > > pt_regs is passed from bpf prog further into all kinds of perf event > > functions including stack walking. > > ftrace_regs gives you the stack pointer. Basically, it gives you access to > anything that is required to be saved to do a function call from fentry. Yeah, for stack walking, we usually need stack pointer and instruction pointer or frame pointer. But Alexei made a good point, linux/stacktrace.h provides pt_regs interaface because pt_regs is a generic (arch-independent) data structure. (see arch_stack_walk()) We need a new interface for it. > > > I think ORC unwinder might depend on availability of all registers. This is not correct. ORC uses limited registers (r10, r13, bp, sp, di, dx) on x86. Anyway, since ftrace can make a stacktrace, it should be possible to use ORC with ftrace_regs. > > Other perf helpers might need it too. Like perf_event_output. > > bpf progs need to access arguments, no doubt about that. > > If ftrace_regs have them exactly in the same offsets as in pt_regs > > that might work transparently for bpf progs, but, I'm afraid, > > it's not the case on all archs. > > So we need full pt_regs to make sure all paths are still working. > > > > Adding Jiri and others. > > Then I recommend that you give up using fprobes and just stick with kprobes > as that's guaranteed to give you full pt_regs (at the overhead of doing > things like filing in flags and such). And currently for arm64, fprobes can > only work with ftrace_regs, without the full pt_regs. I think we can continue to limit usage of fprobe(kprobe_multi) with CONFIG_DYNAMIC_FTRACE_WITH_REGS, which can be configured on x86. That will not change anything from the BPF point of view. Thank you, > > -- Steve
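A rough sketch of the first step Masami describes (not an actual patch): if fprobe hands its handlers a struct ftrace_regs, the BPF glue in kernel/trace/bpf_trace.c could stay pt_regs-based by depending on CONFIG_DYNAMIC_FTRACE_WITH_REGS, where ftrace_get_regs() is guaranteed to return a full pt_regs. The handler signature below is hypothetical:

#if defined(CONFIG_FPROBE) && defined(CONFIG_DYNAMIC_FTRACE_WITH_REGS)
static int kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
                                     struct ftrace_regs *fregs, void *data)
{
        /* With DYNAMIC_FTRACE_WITH_REGS, a full pt_regs sits behind fregs */
        struct pt_regs *regs = ftrace_get_regs(fregs);

        if (WARN_ON_ONCE(!regs))
                return 0;

        /* ... run the attached BPF program with 'regs', exactly as today ... */
        return 0;
}
#endif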
On Wed, 2 Aug 2023 09:21:46 +0900 Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: > > Then use kprobes. When I asked Masami what the difference between fprobes > > and kprobes was, he told me that it would be that it would no longer rely > > on the slower FTRACE_WITH_REGS. But currently, it still does. > > kprobes needs to keep using pt_regs because software-breakpoint exception > handler gets that. And fprobe is used for bpf multi-kprobe interface, > but I think it can be optional. > > So until user-land tool supports the ftrace_regs, you can just disable > using fprobes if CONFIG_DYNAMIC_FTRACE_WITH_REGS=n I'm confused. I asked about the difference between kprobes on ftrace and fprobes, and you said it was to get rid of the requirement of FTRACE_WITH_REGS. https://lore.kernel.org/all/20230120205535.98998636329ca4d5f8325bc3@kernel.org/ > > Then you can safely use > > struct pt_regs *regs = ftrace_get_regs(fregs); > > I think we can just replace the CONFIG_FPROBE ifdefs with > CONFIG_DYNAMIC_FTRACE_WITH_REGS in kernel/trace/bpf_trace.c > And that will be the first version of using ftrace_regs in fprobe. But it is still slow. FTRACE_WITH_REGS gives us the full pt_regs and saves all registers, including flags, which is a very slow operation (and noticeable in profilers). And this still doesn't work on arm64. Maybe we can add a ftrace_partial_regs(fregs) that returns a partially filled pt_regs, and the caller that uses this obviously knows it's partial (as it's in the name). But this doesn't quite help out arm64 because unlike x86, struct ftrace_regs does not contain an address compatibility with pt_regs fields. It would need to do a copy. ftrace_partial_regs(fregs, &regs) ? -- Steve
On Tue, 1 Aug 2023 20:40:54 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > Maybe we can add a ftrace_partial_regs(fregs) that returns a > partially filled pt_regs, and the caller that uses this obviously knows > its partial (as it's in the name). But this doesn't quite help out arm64 > because unlike x86, struct ftrace_regs does not contain an address > compatibility with pt_regs fields. It would need to do a copy. > > ftrace_partial_regs(fregs, &regs) ? Well, both would be pointers so you wouldn't need the "&", but it was to stress that it would be copying one to the other. void ftrace_partial_regs(const struct ftrace_regs *fregs, struct pt_regs *regs); -- Steve
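A sketch of what such a copy helper could look like on arm64, where struct ftrace_regs only holds x0-x8, fp, lr, sp and pc (field names as in the arm64 headers at the time of this thread; illustrative only, not merged code):

static inline void ftrace_partial_regs(const struct ftrace_regs *fregs,
                                       struct pt_regs *regs)
{
        /* x0 - x8: the argument registers saved by the ftrace trampoline */
        memcpy(regs->regs, fregs->regs, sizeof(fregs->regs));
        regs->regs[29] = fregs->fp;
        regs->regs[30] = fregs->lr;
        regs->sp = fregs->sp;
        regs->pc = fregs->pc;
        /* pstate is not saved by the trampoline and is left untouched */
}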
On Tue, Aug 1, 2023 at 5:44 PM Steven Rostedt <rostedt@goodmis.org> wrote: > > On Tue, 1 Aug 2023 20:40:54 -0400 > Steven Rostedt <rostedt@goodmis.org> wrote: > > > Maybe we can add a ftrace_partial_regs(fregs) that returns a > > partially filled pt_regs, and the caller that uses this obviously knows > > its partial (as it's in the name). But this doesn't quite help out arm64 > > because unlike x86, struct ftrace_regs does not contain an address > > compatibility with pt_regs fields. It would need to do a copy. > > > > ftrace_partial_regs(fregs, &regs) ? > > Well, both would be pointers so you wouldn't need the "&", but it was > to stress that it would be copying one to the other. > > void ftrace_partial_regs(const struct ftrace_regs *fregs, struct pt_regs regs); Copy works, but why did you pick a different layout? Why not use pt_regs? If saving the flags is slow, just skip that part and whatever else is slow. You don't even need to zero out unsaved fields. Just ask the caller to zero out pt_regs beforehand. Most users have a per-cpu pt_regs that is being reused, so there will be one zero-out at the beginning and every partial save of regs will be fast. Then there won't be any need for a copy-converter from ftrace_regs to pt_regs. Maybe too much churn at this point. Copy is fine.
On Tue, 1 Aug 2023 19:22:01 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > void ftrace_partial_regs(const struct ftrace_regs *fregs, struct pt_regs regs); > > Copy works, but why did you pick a different layout? I didn't. That code was written by the arm64 maintainers. -- Steve
On Tue, 1 Aug 2023 20:40:54 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > On Wed, 2 Aug 2023 09:21:46 +0900 > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: > > > > Then use kprobes. When I asked Masami what the difference between fprobes > > > and kprobes was, he told me that it would be that it would no longer rely > > > on the slower FTRACE_WITH_REGS. But currently, it still does. > > > > kprobes needs to keep using pt_regs because software-breakpoint exception > > handler gets that. And fprobe is used for bpf multi-kprobe interface, > > but I think it can be optional. > > > > So until user-land tool supports the ftrace_regs, you can just disable > > using fprobes if CONFIG_DYNAMIC_FTRACE_WITH_REGS=n > > I'm confused. I asked about the difference between kprobes on ftrace > and fprobes, and you said it was to get rid of the requirement of > FTRACE_WITH_REGS. > > https://lore.kernel.org/all/20230120205535.98998636329ca4d5f8325bc3@kernel.org/ Yes, it is for enabling fprobe (and fprobe-event) on more architectures. I don't think it's possible to change everything at once. So, it will be changed step by step. At the first step, I will replace pt_regs with ftrace_regs, and make bpf_trace.c and fprobe_event depends on FTRACE_WITH_REGS. At this point, we can split the problem into two, how to move bpf on ftrace_regs and how to move fprobe-event on ftrace_regs. fprobe-event change is not hard because it is closing in the kernel and I can do it. But for BPF, I need to ask BPF user-land tools to support ftrace_regs. > > > > > Then you can safely use > > > > struct pt_regs *regs = ftrace_get_regs(fregs); > > > > I think we can just replace the CONFIG_FPROBE ifdefs with > > CONFIG_DYNAMIC_FTRACE_WITH_REGS in kernel/trace/bpf_trace.c > > And that will be the first version of using ftrace_regs in fprobe. > > But it is still slow. The FTRACE_WITH_REGS gives us the full pt_regs > and saves all registers including flags, which is a very slow operation > (and noticeable in profilers). Yes, to solve this part, we need to work with BPF user-land people. I guess the BPF is accessing registers from pt_regs with fixed offset which is calculated from pt_regs layout in the user-space. > > And this still doesn't work on arm64. Yes, and this makes more motivation to move on ftrace_regs. Thank you,
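To illustrate what "fixed offset which is calculated from pt_regs layout in the user-space" means on the BPF side, this is roughly how a kprobe-multi program reads its arguments today (user-space BPF C using libbpf's bpf_tracing.h macros; the probed function is just an example):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("kprobe.multi/do_unlinkat")
int BPF_KPROBE(handle_unlinkat, int dfd, struct filename *name)
{
        /* BPF_KPROBE() expands into PT_REGS_PARM1()/PT_REGS_PARM2() loads at
         * compile-time-fixed offsets of the argument registers inside
         * struct pt_regs - the "difference in the offset value" Masami
         * refers to if the context became ftrace_regs instead. */
        return 0;
}

char LICENSE[] SEC("license") = "GPL";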
On Tue, 1 Aug 2023 19:22:01 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > On Tue, Aug 1, 2023 at 5:44 PM Steven Rostedt <rostedt@goodmis.org> wrote: > > > > On Tue, 1 Aug 2023 20:40:54 -0400 > > Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > Maybe we can add a ftrace_partial_regs(fregs) that returns a > > > partially filled pt_regs, and the caller that uses this obviously knows > > > its partial (as it's in the name). But this doesn't quite help out arm64 > > > because unlike x86, struct ftrace_regs does not contain an address > > > compatibility with pt_regs fields. It would need to do a copy. > > > > > > ftrace_partial_regs(fregs, ®s) ? > > > > Well, both would be pointers so you wouldn't need the "&", but it was > > to stress that it would be copying one to the other. > > > > void ftrace_partial_regs(const struct ftrace_regs *fregs, struct pt_regs regs); > > Copy works, but why did you pick a different layout? I think it is for minimize the stack consumption. pt_regs on arm64 will consume 42*u64 = 336 bytes, on the other hand ftrace_regs will use 14*unsigned long = 112 bytes. And most of the registers in pt_regs are not accessed usually. (as you may know RISC processors usually have many registers - and x86 will be if we use APX in kernel. So pt_regs is big.) > Why not to use pt_regs ? if save of flags is slow, just skip that part > and whatever else that is slow. You don't even need to zero out > unsaved fields. Just ask the caller to zero out pt_regs before hand. > Most users have per-cpu pt_regs that is being reused. > So there will be one zero-out in the beginning and every partial > save of regs will be fast. > Then there won't be any need for copy-converter from ftrace_regs to pt_regs. > Maybe too much churn at this point. copy is fine. If there is no nested call, yeah, per-cpu pt_regs will work. Thank you,
On Wed, Aug 2, 2023 at 1:09 AM Steven Rostedt <rostedt@goodmis.org> wrote: > > On Tue, 1 Aug 2023 15:18:56 -0700 > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > On Tue, Aug 1, 2023 at 8:32 AM Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > > On Tue, 1 Aug 2023 11:20:36 -0400 > > > Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > > > The solution was to come up with ftrace_regs, which just means it has all > > > > the registers to extract the arguments of a function and nothing more. Most > > > > > > This isn't 100% true. The ftrace_regs may hold a fully filled pt_regs. As > > > the FTRACE_WITH_REGS callbacks still get passed a ftrace_regs pointer. They > > > will do: > > > > > > void callback(..., struct ftrace_regs *fregs) { > > > struct pt_regs *regs = ftrace_get_regs(fregs); > > > > > > > > > Where ftrace_get_regs() will return the pt_regs only if it is fully filled. > > > If it is not, then it returns NULL. This was what the x86 maintainers > > > agreed with. > > > > arch/arm64/include/asm/ftrace.h:#define arch_ftrace_get_regs(regs) NULL > > > > Ouch. That's very bad. > > We care a lot about bpf running well on arm64. > > [ Adding Mark and Florent ] Ah, thanks Steve! That's my favorite can of worms :) I actually consider sending a talk proposal to the tracing MC at LPC "pt_regs - the good the bad and the ugly" on this very topic because I care about unblocking BPF "multi_kprobe" (which really is fprobe) on arm64, maybe it would be interesting. > That's because arm64 doesn't support FTRACE_WITH_REGS anymore. Their > function handlers only care about the arguments. If you want full regs at > function entry, then you need to take a breakpoint hit for a full kprobe. The main reason why arm64 dropped FTRACE_WITH_REGS is because some registers (like pstate) can not be saved outside of an exception entry (they are just wrong), so trampolines either have to make a pstate up or not populate it. The other reasons are: simplicity (for architectural reasons, it's a lot easier to have only one type of ftrace trampoline on arm64, the "with_args" one) and performance (as you said, why bother saving a pt_regs when most ftrace users don't need it anyway). If you need an actual full pt_regs, then your use case is debugging rather than tracing and you should be able to deal with the slowness and go through an exception (a kprobe). > In fact, fprobes isn't even supported on arm64 because it it doesn't have > DYNAMIC_FTRACE_WITH_REGS. I believe that was the reason Masami was trying > to get it to work with ftrace_regs. To get it to work on arm64. > > Again, ftrace_get_regs(fregs) is only suppose to return something if the > pt_regs is fully supplied. If they are not, then it must not be used. Are > you not using a fully filled pt_regs? Because that's what both Thomas and > Peter (also added) told me not to do! 
Funnily enough, there's another use of sparse pt_regs in the kernel, in Perf: https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git/tree/arch/arm64/include/asm/perf_event.h#n20 Notice how Perf on arm64 implicitly expects the "pstate" register to be set (the very register which we try so hard not to fake in ftrace_regs) because Perf happens to call the "user_mode()" macro somewhere which reads this field: https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git/tree/arch/arm64/include/asm/ptrace.h#n227 I pointed this out in https://lore.kernel.org/all/CABRcYm+esb8J2O1v6=C+h+HSa5NxraPUgo63w7-iZj0CXbpusg@mail.gmail.com/#t when Masami proposed adding calls from fprobe to perf. If every subsystem makes different assumptions about "how sparse" their pt_regs is and they call into one another, this could lead to... interesting bugs. (eg: currently, we don't populate a fake pstate in ftrace_regs. so we'd need to fake it when creating a sparse pt_regs _for Perf_, knowing that Perf specifically expects this reg to be set. this would require a struct copy anyway and some knowledge about how the data will be consumed, in an arch- and subsystem- specific way) On the other hand, untangling all code paths that come from trampolines (with a light regs structure) from those that come from an exception (with a pt_regs) could lead to a lot of duplicated code, and converting between each subsystem's idea of a light regs structure (what if perf introduces a perf_regs now ?) would be tedious and slow (lots of copies ?). > Otherwise, ftrace_regs() has support on arm64 for getting to the argument > registers and the stack. Even live kernel patching now uses ftrace_regs(). > > > > > If you guys decide to convert fprobe to ftrace_regs please > > make it depend on kconfig or something. > > bpf side needs full pt_regs. Some wild ideas that I brought up once in a BPF office hour: BPF "multi_kprobe" could provide a fake pt_regs (either by constructing a sparse one on the stack or by JIT-ing different offset accesses and/or by having the verifier deny access to unpopulated fields) or break the current API (is it conceivable to phase out BPF "multi_kprobe" programs in favor of BPF "fprobe" programs that don't lie about their API and guarantees and just provide a ftrace_regs ?) > Then use kprobes. When I asked Masami what the difference between fprobes > and kprobes was, he told me that it would be that it would no longer rely > on the slower FTRACE_WITH_REGS. But currently, it still does. Actually... Moving fprobe to ftrace_regs should get even more spicy! :) Fprobe also wraps "rethook" which is basically the same thing as kretprobe: a return trampoline that saves a pt_regs, to the point that on x86 kretprobe's trampoline got dropped in favor of rethook's trampoline. But for the same reasons that we don't want ftrace to save pt_regs on arm64, rethook should probably also just save a ftrace_regs ? (also, to keep the fprobe callback signatures consistent between pre- and post- handlers). But if we want fprobe "post" callbacks to save a ftrace_regs now, either we need to re-introduce the kretprobe trampoline or also change the API of kretprobe (and break its symmetry with kprobe and we'd have the same problem all over again with BPF kretprobe program types...). All of this is "beautifully" entangled... 
:) > The reason I started the FTRACE_WITH_ARGS (which gave us ftrace_regs) in > the first place, was because of the overhead you reported to me with > ftrace_regs_caller and why you wanted to go the direct trampoline approach. > That's when I realized I could use a subset because those registers were > already being saved. The only reason FTRACE_WITH_REGS was created was it > had to supply full pt_regs (including flags) and emulate a breakpoint for > the kprobes interface. But in reality, nothing really needs all that. > > > It's not about access to args. > > pt_regs is passed from bpf prog further into all kinds of perf event > > functions including stack walking. If all accesses are done in BPF bytecode, we could (theoretically) have the verifier and JIT work together to deny accesses to unpopulated fields, or relocate pt_regs accesses to ftrace_regs accesses to keep backward compatibility with existing multi_kprobe BPF programs. Is there a risk that a "multi_kprobe" program could call into a BPF helper or kfunc that reads this pt_regs pointer and expect certain fields to be set ? I suppose we could also deny giving that "pt_regs" pointer to a helper... :/ > ftrace_regs gives you the stack pointer. Basically, it gives you access to > anything that is required to be saved to do a function call from fentry. > > > I think ORC unwinder might depend on availability of all registers. > > Other perf helpers might need it too. Like perf_event_output. > > bpf progs need to access arguments, no doubt about that. > > If ftrace_regs have them exactly in the same offsets as in pt_regs > > that might work transparently for bpf progs, but, I'm afraid, > > it's not the case on all archs. > > So we need full pt_regs to make sure all paths are still working. > > > > Adding Jiri and others. > > Then I recommend that you give up using fprobes and just stick with kprobes > as that's guaranteed to give you full pt_regs (at the overhead of doing > things like filing in flags and such). And currently for arm64, fprobes can > only work with ftrace_regs, without the full pt_regs.
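[Editor's note: as a concrete illustration of the "arguments and stack are enough" point quoted above, an args-only ftrace callback can stay entirely within the ftrace_regs accessors and never call ftrace_get_regs(), so it also works on arm64 where that returns NULL. A sketch, assuming the generic ftrace_regs_get_*() helpers available with DYNAMIC_FTRACE_WITH_ARGS.]

#include <linux/ftrace.h>
#include <linux/printk.h>

/* Args-only handler: no ftrace_get_regs(), so no dependency on a full pt_regs. */
static void args_only_callback(unsigned long ip, unsigned long parent_ip,
			       struct ftrace_ops *op, struct ftrace_regs *fregs)
{
	unsigned long arg0 = ftrace_regs_get_argument(fregs, 0);
	unsigned long sp = ftrace_regs_get_stack_pointer(fregs);

	pr_debug("%ps called from %ps, arg0=%lx sp=%lx\n",
		 (void *)ip, (void *)parent_ip, arg0, sp);
}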
On Wed, Aug 2, 2023 at 3:56 PM Masami Hiramatsu <mhiramat@kernel.org> wrote: > > On Tue, 1 Aug 2023 20:40:54 -0400 > Steven Rostedt <rostedt@goodmis.org> wrote: > > > On Wed, 2 Aug 2023 09:21:46 +0900 > > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: > > > > > > Then use kprobes. When I asked Masami what the difference between fprobes > > > > and kprobes was, he told me that it would be that it would no longer rely > > > > on the slower FTRACE_WITH_REGS. But currently, it still does. > > > > > > kprobes needs to keep using pt_regs because software-breakpoint exception > > > handler gets that. And fprobe is used for bpf multi-kprobe interface, > > > but I think it can be optional. > > > > > > So until user-land tool supports the ftrace_regs, you can just disable > > > using fprobes if CONFIG_DYNAMIC_FTRACE_WITH_REGS=n > > > > I'm confused. I asked about the difference between kprobes on ftrace > > and fprobes, and you said it was to get rid of the requirement of > > FTRACE_WITH_REGS. > > > > https://lore.kernel.org/all/20230120205535.98998636329ca4d5f8325bc3@kernel.org/ > > Yes, it is for enabling fprobe (and fprobe-event) on more architectures. > I don't think it's possible to change everything at once. So, it will be > changed step by step. At the first step, I will replace pt_regs with > ftrace_regs, and make bpf_trace.c and fprobe_event depends on > FTRACE_WITH_REGS. Just a small note that, strictly speaking, CONFIG_DYNAMIC_FTRACE_WITH_REGS=y is not enough. fprobe_init() would also need a way to set FTRACE_OPS_FL_SAVE_REGS conditionally. (you could be on an arch that supports saving either regs or args and if you don't set FTRACE_OPS_FL_SAVE_REGS you'd go through the args trampoline and get a ftrace_regs that doesn't hold a pt_regs)
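[Editor's note: to make Florent's point concrete, a sketch of what the conditional flag setting could look like, assuming fprobe keeps its embedded ftrace_ops as fp->ops and fprobe_handler() as its callback; the want_regs knob is hypothetical and not the current API.]

#include <linux/fprobe.h>
#include <linux/ftrace.h>

/* Sketch only: let the caller decide whether the pt_regs-saving trampoline
 * is requested (on an arch that supports both trampoline variants). */
static void fprobe_init_ops(struct fprobe *fp, bool want_regs)
{
	fp->ops.func = fprobe_handler;	/* fprobe's existing ftrace callback */

	if (want_regs && IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS))
		fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS;
	/* otherwise the args trampoline is used and ftrace_get_regs() may be NULL */
}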
On Wed, Aug 2, 2023 at 4:07 PM Masami Hiramatsu <mhiramat@kernel.org> wrote: > > On Tue, 1 Aug 2023 19:22:01 -0700 > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > On Tue, Aug 1, 2023 at 5:44 PM Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > > On Tue, 1 Aug 2023 20:40:54 -0400 > > > Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > > > Maybe we can add a ftrace_partial_regs(fregs) that returns a > > > > partially filled pt_regs, and the caller that uses this obviously knows > > > > its partial (as it's in the name). But this doesn't quite help out arm64 > > > > because unlike x86, struct ftrace_regs does not contain an address > > > > compatibility with pt_regs fields. It would need to do a copy. > > > > > > > > ftrace_partial_regs(fregs, &regs) ? > > > > > > Well, both would be pointers so you wouldn't need the "&", but it was > > > to stress that it would be copying one to the other. > > > > > > void ftrace_partial_regs(const struct ftrace_regs *fregs, struct pt_regs regs); > > > > Copy works, but why did you pick a different layout? > > I think it is for minimize the stack consumption. pt_regs on arm64 will > consume 42*u64 = 336 bytes, on the other hand ftrace_regs will use > 14*unsigned long = 112 bytes. And most of the registers in pt_regs are not > accessed usually. (as you may know RISC processors usually have many > registers - and x86 will be if we use APX in kernel. So pt_regs is big.) > > > Why not to use pt_regs ? if save of flags is slow, just skip that part > > and whatever else that is slow. You don't even need to zero out > > unsaved fields. Just ask the caller to zero out pt_regs before hand. > > Most users have per-cpu pt_regs that is being reused. > > So there will be one zero-out in the beginning and every partial > > save of regs will be fast. > > Then there won't be any need for copy-converter from ftrace_regs to pt_regs. > > Maybe too much churn at this point. copy is fine. > > If there is no nested call, yeah, per-cpu pt_regs will work. BPF "multi_kprobe" programs (ugh, it's pretty awkward we called them that way given that kprobe is out of the picture and fprobe is subject to completely different constraints than kprobe) can't be nested, as checked here: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git/tree/kernel/trace/bpf_trace.c?id=4c9fbff54297471d4e2bbfe9c27e80067c722eae#n2642 (this is probably the place where we'd be calling a "ftrace_partial_regs" anyway so that's cool)
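[Editor's note: for readers who do not follow the link, the guard Florent points at looks roughly like the sketch below (paraphrased from kernel/trace/bpf_trace.c, not copied verbatim), and the partial-regs conversion discussed above would slot in right after it.]

#include <linux/bpf.h>
#include <linux/ftrace.h>
#include <linux/ptrace.h>

static long kprobe_multi_run_sketch(void *link, struct ftrace_regs *fregs)
{
	struct pt_regs regs;	/* or the per-CPU scratch area discussed above */
	long err = 0;

	/* No nesting: if a probe fires while another one is already running
	 * on this CPU, it is simply dropped. */
	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
		goto out;

	/* This is roughly where ftrace_partial_regs(fregs, &regs) would be
	 * called before handing &regs to the BPF program. */

out:
	__this_cpu_dec(bpf_prog_active);
	return err;
}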
On Wed, Aug 2, 2023 at 3:56 PM Masami Hiramatsu <mhiramat@kernel.org> wrote: > > On Tue, 1 Aug 2023 20:40:54 -0400 > Steven Rostedt <rostedt@goodmis.org> wrote: > > > On Wed, 2 Aug 2023 09:21:46 +0900 > > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: > > > > > > Then use kprobes. When I asked Masami what the difference between fprobes > > > > and kprobes was, he told me that it would be that it would no longer rely > > > > on the slower FTRACE_WITH_REGS. But currently, it still does. > > > > > > kprobes needs to keep using pt_regs because software-breakpoint exception > > > handler gets that. And fprobe is used for bpf multi-kprobe interface, > > > but I think it can be optional. > > > > > > So until user-land tool supports the ftrace_regs, you can just disable > > > using fprobes if CONFIG_DYNAMIC_FTRACE_WITH_REGS=n > > > > I'm confused. I asked about the difference between kprobes on ftrace > > and fprobes, and you said it was to get rid of the requirement of > > FTRACE_WITH_REGS. > > > > https://lore.kernel.org/all/20230120205535.98998636329ca4d5f8325bc3@kernel.org/ > > Yes, it is for enabling fprobe (and fprobe-event) on more architectures. > I don't think it's possible to change everything at once. So, it will be > changed step by step. At the first step, I will replace pt_regs with > ftrace_regs, and make bpf_trace.c and fprobe_event depends on > FTRACE_WITH_REGS. > > At this point, we can split the problem into two, how to move bpf on > ftrace_regs and how to move fprobe-event on ftrace_regs. fprobe-event > change is not hard because it is closing in the kernel and I can do it. > But for BPF, I need to ask BPF user-land tools to support ftrace_regs. Ah! I finally found the branch where I had pushed my proof of concept of fprobe with ftrace_regs... it's a few months old and I didn't get it in a state such that it could be sent to the list but maybe this can save you a little bit of lead time Masami :) (especially the bpf and arm64 specific bits) https://github.com/FlorentRevest/linux/commits/bpf-arm-complete 08afb628c6e1 ("ftrace: Add a macro to forge an incomplete pt_regs from a ftrace_regs") 203e96fe1790 ("fprobe, rethook: Use struct ftrace_regs instead of struct pt_regs") 1a9e280b9b16 ("arm64,rethook,kprobes: Replace kretprobe with rethook on arm64") 7751c6db9f9d ("bpf: Fix bpf get_func_ip() on arm64 multi-kprobe programs") a10c49c0d717 ("selftests/bpf: Update the tests deny list on aarch64")
On Wed, 2 Aug 2023 16:44:09 +0200 Florent Revest <revest@chromium.org> wrote: > > [ Adding Mark and Florent ] > > Ah, thanks Steve! That's my favorite can of worms :) I actually > consider sending a talk proposal to the tracing MC at LPC "pt_regs - > the good the bad and the ugly" on this very topic because I care about > unblocking BPF "multi_kprobe" (which really is fprobe) on arm64, maybe > it would be interesting. You bring up some excellent points, and the CFP for the Tracing MC is still open. Which reminds me, I need to write my blog post! -- Steve
On Wed, Aug 2, 2023 at 6:56 AM Masami Hiramatsu <mhiramat@kernel.org> wrote: > > On Tue, 1 Aug 2023 20:40:54 -0400 > Steven Rostedt <rostedt@goodmis.org> wrote: > > > On Wed, 2 Aug 2023 09:21:46 +0900 > > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: > > > > > > Then use kprobes. When I asked Masami what the difference between fprobes > > > > and kprobes was, he told me that it would be that it would no longer rely > > > > on the slower FTRACE_WITH_REGS. But currently, it still does. > > > > > > kprobes needs to keep using pt_regs because software-breakpoint exception > > > handler gets that. And fprobe is used for bpf multi-kprobe interface, > > > but I think it can be optional. > > > > > > So until user-land tool supports the ftrace_regs, you can just disable > > > using fprobes if CONFIG_DYNAMIC_FTRACE_WITH_REGS=n > > > > I'm confused. I asked about the difference between kprobes on ftrace > > and fprobes, and you said it was to get rid of the requirement of > > FTRACE_WITH_REGS. > > > > https://lore.kernel.org/all/20230120205535.98998636329ca4d5f8325bc3@kernel.org/ > > Yes, it is for enabling fprobe (and fprobe-event) on more architectures. > I don't think it's possible to change everything at once. So, it will be > changed step by step. At the first step, I will replace pt_regs with > ftrace_regs, and make bpf_trace.c and fprobe_event depends on > FTRACE_WITH_REGS. > > At this point, we can split the problem into two, how to move bpf on > ftrace_regs and how to move fprobe-event on ftrace_regs. fprobe-event > change is not hard because it is closing in the kernel and I can do it. > But for BPF, I need to ask BPF user-land tools to support ftrace_regs. > > > > > > > > > Then you can safely use > > > > > > struct pt_regs *regs = ftrace_get_regs(fregs); > > > > > > I think we can just replace the CONFIG_FPROBE ifdefs with > > > CONFIG_DYNAMIC_FTRACE_WITH_REGS in kernel/trace/bpf_trace.c > > > And that will be the first version of using ftrace_regs in fprobe. > > > > But it is still slow. The FTRACE_WITH_REGS gives us the full pt_regs > > and saves all registers including flags, which is a very slow operation > > (and noticeable in profilers). > > Yes, to solve this part, we need to work with BPF user-land people. > I guess the BPF is accessing registers from pt_regs with fixed offset > which is calculated from pt_regs layout in the user-space. This is a non starter. bpf progs expect arch dependent 'struct pt_regs *' and we cannot change that.
On Wed, 2 Aug 2023 11:24:12 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > This is a non starter. > bpf progs expect arch dependent 'struct pt_regs *' and we cannot change that. If the progs are compiled into native code, aren't there optimizations that could be done? That is, if ftrace_regs is available, and the bpf program is just using a subset of pt_regs, is it possible that it could be compiled to use ftrace_regs? Forgive my ignorance on how BPF programs turn into executables when running in the kernel. -- Steve
On Wed, Aug 2, 2023 at 11:38 AM Steven Rostedt <rostedt@goodmis.org> wrote: > > On Wed, 2 Aug 2023 11:24:12 -0700 > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > This is a non starter. > > bpf progs expect arch dependent 'struct pt_regs *' and we cannot change that. > > If the progs are compiled into native code, isn't there optimizations that > could be done? That is, if ftrace_regs is available, and the bpf program is > just using the subset of pt_regs, is it possible that it could be compiled > to use ftrace_regs? > > Forgive my ignorance on how BPF programs turn into executables when running > in the kernel. Right. It's possible for the verifier to do an offset rewrite, forbid certain access, always return 0 on load from certain offset, and so on. It's all non trivial amount of work. ftrace_partial_regs() from ftrace_regs into pt_regs is so much simpler.
On Wed, 2 Aug 2023 12:48:14 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > On Wed, Aug 2, 2023 at 11:38 AM Steven Rostedt <rostedt@goodmis.org> wrote: > > > > On Wed, 2 Aug 2023 11:24:12 -0700 > > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > > > This is a non starter. > > > bpf progs expect arch dependent 'struct pt_regs *' and we cannot change that. > > > > If the progs are compiled into native code, isn't there optimizations that > > could be done? That is, if ftrace_regs is available, and the bpf program is > > just using the subset of pt_regs, is it possible that it could be compiled > > to use ftrace_regs? > > > > Forgive my ignorance on how BPF programs turn into executables when running > > in the kernel. > > Right. It's possible for the verifier to do an offset rewrite, > forbid certain access, always return 0 on load from certain offset, > and so on. > It's all non trivial amount of work. > ftrace_partial_regs() from ftrace_regs into pt_regs is so much simpler. Sure, and the copy could be the solution we have in the near future, but if we could optimize it in the future, then perhaps it would be worth doing it. Also, how are the bpf programs referencing the pt_regs? Could a ftrace_regs API be added too? If the verifier sees that the program is using ftrace_regs, it could then use the lighter weight fprobes for access, otherwise it falls back to the kprobe version. -- Steve
On Wed, Aug 2, 2023 at 1:12 PM Steven Rostedt <rostedt@goodmis.org> wrote: > > On Wed, 2 Aug 2023 12:48:14 -0700 > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > On Wed, Aug 2, 2023 at 11:38 AM Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > > On Wed, 2 Aug 2023 11:24:12 -0700 > > > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > > > > > This is a non starter. > > > > bpf progs expect arch dependent 'struct pt_regs *' and we cannot change that. > > > > > > If the progs are compiled into native code, isn't there optimizations that > > > could be done? That is, if ftrace_regs is available, and the bpf program is > > > just using the subset of pt_regs, is it possible that it could be compiled > > > to use ftrace_regs? > > > > > > Forgive my ignorance on how BPF programs turn into executables when running > > > in the kernel. > > > > Right. It's possible for the verifier to do an offset rewrite, > > forbid certain access, always return 0 on load from certain offset, > > and so on. > > It's all non trivial amount of work. > > ftrace_partial_regs() from ftrace_regs into pt_regs is so much simpler. > > Sure, and the copy could be the solution we have in the near future, but if > we could optimize it in the future, then perhaps it would be worth doing it. > > Also, how are the bpf programs referencing the pt_regs? Typically through macros that abstract arch differences away in tools/lib/bpf/bpf_tracing.h PT_REGS_PARM1 PT_REGS_PARM1_CORE PT_REGS_PARM1_SYSCALL pt_regs at syscall entry is special, since syscall calling convention is different from the rest of the kernel. ftrace_regs cannot help with that either. > Could a ftrace_regs > API be added too? Potentially yes, but I don't see the value. bpf users are slowly migrating to fentry/fexit that has accurate args and return value and much faster. kprobes are still heavily used, of course. multi-kprobe (with fprobe_ips underneath) is a new addition that is also very important to some users. > If the verifier sees that the program is using > ftrace_regs, it could then use the lighter weight fprobes for access, > otherwise it falls back to the kprobe version. > > -- Steve
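[Editor's note: for illustration, this is what those macros look like from a BPF program's point of view; the probed function is arbitrary. The macro hides the arch-specific pt_regs layout, e.g. ctx->di on x86-64 vs ctx->regs[0] on arm64.]

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("kprobe/do_nanosleep")
int BPF_KPROBE(trace_do_nanosleep)
{
	/* PT_REGS_PARM1() expands to an arch-specific field of struct pt_regs. */
	unsigned long arg1 = PT_REGS_PARM1(ctx);

	bpf_printk("do_nanosleep: arg1=%lx", arg1);
	return 0;
}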
On Wed, 2 Aug 2023 17:47:03 +0200 Florent Revest <revest@chromium.org> wrote: > On Wed, Aug 2, 2023 at 3:56 PM Masami Hiramatsu <mhiramat@kernel.org> wrote: > > > > On Tue, 1 Aug 2023 20:40:54 -0400 > > Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > On Wed, 2 Aug 2023 09:21:46 +0900 > > > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: > > > > > > > > Then use kprobes. When I asked Masami what the difference between fprobes > > > > > and kprobes was, he told me that it would be that it would no longer rely > > > > > on the slower FTRACE_WITH_REGS. But currently, it still does. > > > > > > > > kprobes needs to keep using pt_regs because software-breakpoint exception > > > > handler gets that. And fprobe is used for bpf multi-kprobe interface, > > > > but I think it can be optional. > > > > > > > > So until user-land tool supports the ftrace_regs, you can just disable > > > > using fprobes if CONFIG_DYNAMIC_FTRACE_WITH_REGS=n > > > > > > I'm confused. I asked about the difference between kprobes on ftrace > > > and fprobes, and you said it was to get rid of the requirement of > > > FTRACE_WITH_REGS. > > > > > > https://lore.kernel.org/all/20230120205535.98998636329ca4d5f8325bc3@kernel.org/ > > > > Yes, it is for enabling fprobe (and fprobe-event) on more architectures. > > I don't think it's possible to change everything at once. So, it will be > > changed step by step. At the first step, I will replace pt_regs with > > ftrace_regs, and make bpf_trace.c and fprobe_event depends on > > FTRACE_WITH_REGS. > > > > At this point, we can split the problem into two, how to move bpf on > > ftrace_regs and how to move fprobe-event on ftrace_regs. fprobe-event > > change is not hard because it is closing in the kernel and I can do it. > > But for BPF, I need to ask BPF user-land tools to support ftrace_regs. > > Ah! I finally found the branch where I had pushed my proof of concept > of fprobe with ftrace_regs... it's a few months old and I didn't get > it in a state such that it could be sent to the list but maybe this > can save you a little bit of lead time Masami :) (especially the bpf > and arm64 specific bits) > > https://github.com/FlorentRevest/linux/commits/bpf-arm-complete > > 08afb628c6e1 ("ftrace: Add a macro to forge an incomplete pt_regs from > a ftrace_regs") > 203e96fe1790 ("fprobe, rethook: Use struct ftrace_regs instead of > struct pt_regs") > 1a9e280b9b16 ("arm64,rethook,kprobes: Replace kretprobe with rethook on arm64") > 7751c6db9f9d ("bpf: Fix bpf get_func_ip() on arm64 multi-kprobe programs") > a10c49c0d717 ("selftests/bpf: Update the tests deny list on aarch64") Thanks for the work! I also pushed my patches on https://kernel.googlesource.com/pub/scm/linux/kernel/git/mhiramat/linux/+/refs/heads/topic/fprobe-ftrace-regs 628e6c19d7dc ("tracing/fprobe: Enable fprobe events with CONFIG_DYNAMIC_FTRACE_WITH_ARGS") 311c98c29cfd ("fprobe: Use fprobe_regs in fprobe entry handler") This doesn't cover arm64 and rethook, but provides ftrace_regs optimized fprobe-event code, which uses a correct APIs for ftrace_regs. For the rethook we still need to provide 2 version for kretprobe(pt_regs) and fprobe(ftrace_regs). I think eventually we should replace the kretprobe with fprobe, but current rethook is tightly coupled with kretprobe and the kretprobe needs pt_regs. So, I would like to keep arm64 kretprobe impl, and add new rethook with ftrace_regs. Or, maybe we need these 2 configs intermediately. 
CONFIG_RETHOOK_WITH_REGS - in this case, kretprobe uses rethook
CONFIG_RETHOOK_WITH_ARGS - in this case, kretprobe uses its own stack

The problem is ftrace_regs only depends on CONFIG_DYNAMIC_FTRACE_WITH_*.

Thank you,
On Wed, 2 Aug 2023 16:44:09 +0200 Florent Revest <revest@chromium.org> wrote: > On Wed, Aug 2, 2023 at 1:09 AM Steven Rostedt <rostedt@goodmis.org> wrote: > > > > On Tue, 1 Aug 2023 15:18:56 -0700 > > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > > > On Tue, Aug 1, 2023 at 8:32 AM Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > > > > On Tue, 1 Aug 2023 11:20:36 -0400 > > > > Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > > > > > The solution was to come up with ftrace_regs, which just means it has all > > > > > the registers to extract the arguments of a function and nothing more. Most > > > > > > > > This isn't 100% true. The ftrace_regs may hold a fully filled pt_regs. As > > > > the FTRACE_WITH_REGS callbacks still get passed a ftrace_regs pointer. They > > > > will do: > > > > > > > > void callback(..., struct ftrace_regs *fregs) { > > > > struct pt_regs *regs = ftrace_get_regs(fregs); > > > > > > > > > > > > Where ftrace_get_regs() will return the pt_regs only if it is fully filled. > > > > If it is not, then it returns NULL. This was what the x86 maintainers > > > > agreed with. > > > > > > arch/arm64/include/asm/ftrace.h:#define arch_ftrace_get_regs(regs) NULL > > > > > > Ouch. That's very bad. > > > We care a lot about bpf running well on arm64. > > > > [ Adding Mark and Florent ] > > Ah, thanks Steve! That's my favorite can of worms :) I actually > consider sending a talk proposal to the tracing MC at LPC "pt_regs - > the good the bad and the ugly" on this very topic because I care about > unblocking BPF "multi_kprobe" (which really is fprobe) on arm64, maybe > it would be interesting. Ah, it is almost same as my talk :) > > > That's because arm64 doesn't support FTRACE_WITH_REGS anymore. Their > > function handlers only care about the arguments. If you want full regs at > > function entry, then you need to take a breakpoint hit for a full kprobe. > > The main reason why arm64 dropped FTRACE_WITH_REGS is because some > registers (like pstate) can not be saved outside of an exception entry > (they are just wrong), so trampolines either have to make a pstate up > or not populate it. > > The other reasons are: simplicity (for architectural reasons, it's a > lot easier to have only one type of ftrace trampoline on arm64, the > "with_args" one) and performance (as you said, why bother saving a > pt_regs when most ftrace users don't need it anyway). If you need an > actual full pt_regs, then your use case is debugging rather than > tracing and you should be able to deal with the slowness and go > through an exception (a kprobe). Agreed. Both reasons are reasonable. Especially function entry and exit tracing API, we don't need full pt_regs because there is established ABI. > > > In fact, fprobes isn't even supported on arm64 because it it doesn't have > > DYNAMIC_FTRACE_WITH_REGS. I believe that was the reason Masami was trying > > to get it to work with ftrace_regs. To get it to work on arm64. > > > > Again, ftrace_get_regs(fregs) is only suppose to return something if the > > pt_regs is fully supplied. If they are not, then it must not be used. Are > > you not using a fully filled pt_regs? Because that's what both Thomas and > > Peter (also added) told me not to do! 
> > Funnily enough, there's another use of sparse pt_regs in the kernel, in Perf: > https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git/tree/arch/arm64/include/asm/perf_event.h#n20 > Notice how Perf on arm64 implicitly expects the "pstate" register to > be set (the very register which we try so hard not to fake in > ftrace_regs) because Perf happens to call the "user_mode()" macro > somewhere which reads this field: > https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git/tree/arch/arm64/include/asm/ptrace.h#n227 I think interrupt/exception-based APIs like kprobes and perf (PMU) may need to use pt_regs. > > I pointed this out in > https://lore.kernel.org/all/CABRcYm+esb8J2O1v6=C+h+HSa5NxraPUgo63w7-iZj0CXbpusg@mail.gmail.com/#t > when Masami proposed adding calls from fprobe to perf. If every > subsystem makes different assumptions about "how sparse" their pt_regs > is and they call into one another, this could lead to... interesting > bugs. (eg: currently, we don't populate a fake pstate in ftrace_regs. > so we'd need to fake it when creating a sparse pt_regs _for Perf_, > knowing that Perf specifically expects this reg to be set. this would > require a struct copy anyway and some knowledge about how the data > will be consumed, in an arch- and subsystem- specific way) Yeah, sorry, I missed that point. I should remove it until we can fix it. I think we can add another kernel-event-only perf_trace_buf_submit() which doesn't have the user_mode() check. > > On the other hand, untangling all code paths that come from > trampolines (with a light regs structure) from those that come from an > exception (with a pt_regs) could lead to a lot of duplicated code, and > converting between each subsystem's idea of a light regs structure > (what if perf introduces a perf_regs now ?) would be tedious and slow > (lots of copies ?). This is one discussion point, I think. Actually, using pt_regs in kretprobe (and rethook) is a historical accident. Originally, it put a kprobe on the function return trampoline to hook it. So, to keep the API compatibility, I wrote the hand-assembled code to save the pt_regs on the stack. Another question of mine is: if we have fprobe to trace (hook) the function return, why do we still need kretprobe itself? I think we can remove kretprobe and use the fprobe exit handler, because "function" probing will be done by fprobe, not kprobe. Then we can simplify the kprobe interface and clarify what it is -- "kprobe is a wrapper of a software breakpoint". And we don't need to think about duplicated code anymore :) > > > Otherwise, ftrace_regs() has support on arm64 for getting to the argument > > registers and the stack. Even live kernel patching now uses ftrace_regs(). > > > > > > > > If you guys decide to convert fprobe to ftrace_regs please > > > make it depend on kconfig or something. > > > bpf side needs full pt_regs. > > Some wild ideas that I brought up once in a BPF office hour: BPF > "multi_kprobe" could provide a fake pt_regs (either by constructing a > sparse one on the stack or by JIT-ing different offset accesses and/or > by having the verifier deny access to unpopulated fields) or break the > current API (is it conceivable to phase out BPF "multi_kprobe" > programs in favor of BPF "fprobe" programs that don't lie about their > API and guarantees and just provide a ftrace_regs ?) +1 :) > > > Then use kprobes. 
When I asked Masami what the difference between fprobes > > and kprobes was, he told me that it would be that it would no longer rely > > on the slower FTRACE_WITH_REGS. But currently, it still does. > > Actually... Moving fprobe to ftrace_regs should get even more spicy! > :) Fprobe also wraps "rethook" which is basically the same thing as > kretprobe: a return trampoline that saves a pt_regs, to the point that > on x86 kretprobe's trampoline got dropped in favor of rethook's > trampoline. But for the same reasons that we don't want ftrace to save > pt_regs on arm64, rethook should probably also just save a ftrace_regs > ? (also, to keep the fprobe callback signatures consistent between > pre- and post- handlers). But if we want fprobe "post" callbacks to > save a ftrace_regs now, either we need to re-introduce the kretprobe > trampoline or also change the API of kretprobe (and break its symmetry > with kprobe and we'd have the same problem all over again with BPF > kretprobe program types...). All of this is "beautifully" entangled... > :) As I said, I would like to phase out the kretprobe itself because it provides the same feature of fprobe, which is confusing. jprobe was removed a while ago, and now kretprobe is. But we can not phase out it at once. So I think we will keep current kretprobe trampoline on arm64 and just add new ftrace_regs based rethook. Then remove the API next release. (after all users including systemtap is moved) > > > The reason I started the FTRACE_WITH_ARGS (which gave us ftrace_regs) in > > the first place, was because of the overhead you reported to me with > > ftrace_regs_caller and why you wanted to go the direct trampoline approach. > > That's when I realized I could use a subset because those registers were > > already being saved. The only reason FTRACE_WITH_REGS was created was it > > had to supply full pt_regs (including flags) and emulate a breakpoint for > > the kprobes interface. But in reality, nothing really needs all that. > > > > > It's not about access to args. > > > pt_regs is passed from bpf prog further into all kinds of perf event > > > functions including stack walking. > > If all accesses are done in BPF bytecode, we could (theoretically) > have the verifier and JIT work together to deny accesses to > unpopulated fields, or relocate pt_regs accesses to ftrace_regs > accesses to keep backward compatibility with existing multi_kprobe BPF > programs. Yeah, that is what I would like to suggest, and what my patch does. (let me update rethook too, it'll be a bit tricky since I don't want break anything) Thanks, > > Is there a risk that a "multi_kprobe" program could call into a BPF > helper or kfunc that reads this pt_regs pointer and expect certain > fields to be set ? I suppose we could also deny giving that "pt_regs" > pointer to a helper... :/ > > > ftrace_regs gives you the stack pointer. Basically, it gives you access to > > anything that is required to be saved to do a function call from fentry. > > > > > I think ORC unwinder might depend on availability of all registers. > > > Other perf helpers might need it too. Like perf_event_output. > > > bpf progs need to access arguments, no doubt about that. > > > If ftrace_regs have them exactly in the same offsets as in pt_regs > > > that might work transparently for bpf progs, but, I'm afraid, > > > it's not the case on all archs. > > > So we need full pt_regs to make sure all paths are still working. > > > > > > Adding Jiri and others. 
> > > > Then I recommend that you give up using fprobes and just stick with kprobes > > as that's guaranteed to give you full pt_regs (at the overhead of doing > > things like filing in flags and such). And currently for arm64, fprobes can > > only work with ftrace_regs, without the full pt_regs.
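[Editor's note: to illustrate the "use the fprobe exit handler instead of kretprobe" suggestion above, a sketch of the fprobe side. The handler signatures approximate the fprobe API of this time frame and would presumably switch from pt_regs to ftrace_regs once the conversion discussed here lands.]

#include <linux/fprobe.h>
#include <linux/init.h>

static int my_entry(struct fprobe *fp, unsigned long entry_ip,
		    struct pt_regs *regs, void *data)
{
	return 0;	/* 0 means: also invoke the exit handler */
}

static void my_exit(struct fprobe *fp, unsigned long entry_ip,
		    struct pt_regs *regs, void *data)
{
	/* runs at function return, i.e. what a kretprobe handler would see */
}

static struct fprobe fp = {
	.entry_handler	= my_entry,
	.exit_handler	= my_exit,
};

static int __init fprobe_example_init(void)
{
	/* probe entry and exit of vfs_read without any software breakpoint */
	return register_fprobe(&fp, "vfs_read", NULL);
}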
On Thu, Aug 3, 2023 at 5:42 PM Masami Hiramatsu <mhiramat@kernel.org> wrote: > > On Wed, 2 Aug 2023 16:44:09 +0200 > Florent Revest <revest@chromium.org> wrote: > > > On Wed, Aug 2, 2023 at 1:09 AM Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > > On Tue, 1 Aug 2023 15:18:56 -0700 > > > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > > > > > On Tue, Aug 1, 2023 at 8:32 AM Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > > > > > > On Tue, 1 Aug 2023 11:20:36 -0400 > > > > > Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > > > > > > > The solution was to come up with ftrace_regs, which just means it has all > > > > > > the registers to extract the arguments of a function and nothing more. Most > > > > > > > > > > This isn't 100% true. The ftrace_regs may hold a fully filled pt_regs. As > > > > > the FTRACE_WITH_REGS callbacks still get passed a ftrace_regs pointer. They > > > > > will do: > > > > > > > > > > void callback(..., struct ftrace_regs *fregs) { > > > > > struct pt_regs *regs = ftrace_get_regs(fregs); > > > > > > > > > > > > > > > Where ftrace_get_regs() will return the pt_regs only if it is fully filled. > > > > > If it is not, then it returns NULL. This was what the x86 maintainers > > > > > agreed with. > > > > > > > > arch/arm64/include/asm/ftrace.h:#define arch_ftrace_get_regs(regs) NULL > > > > > > > > Ouch. That's very bad. > > > > We care a lot about bpf running well on arm64. > > > > > > [ Adding Mark and Florent ] > > > > Ah, thanks Steve! That's my favorite can of worms :) I actually > > consider sending a talk proposal to the tracing MC at LPC "pt_regs - > > the good the bad and the ugly" on this very topic because I care about > > unblocking BPF "multi_kprobe" (which really is fprobe) on arm64, maybe > > it would be interesting. > > Ah, it is almost same as my talk :) Oh, I didn't know! I submitted a proposal today but if the talks have a lot of overlap maybe it's best that only you give your talk, since you're the actual maintainer :) or we could co-present if there's something I could add but I think you have all the background anyway > > I pointed this out in > > https://lore.kernel.org/all/CABRcYm+esb8J2O1v6=C+h+HSa5NxraPUgo63w7-iZj0CXbpusg@mail.gmail.com/#t > > when Masami proposed adding calls from fprobe to perf. If every > > subsystem makes different assumptions about "how sparse" their pt_regs > > is and they call into one another, this could lead to... interesting > > bugs. (eg: currently, we don't populate a fake pstate in ftrace_regs. > > so we'd need to fake it when creating a sparse pt_regs _for Perf_, > > knowing that Perf specifically expects this reg to be set. this would > > require a struct copy anyway and some knowledge about how the data > > will be consumed, in an arch- and subsystem- specific way) > > yeah, sorry I missed that point. I should remove it until we can fix it. Uh, I shouldn't have buried my important comments so far down the email :/ I wasn't sure whether you had missed the paragraph. > > On the other hand, untangling all code paths that come from > > trampolines (with a light regs structure) from those that come from an > > exception (with a pt_regs) could lead to a lot of duplicated code, and > > converting between each subsystem's idea of a light regs structure > > (what if perf introduces a perf_regs now ?) would be tedious and slow > > (lots of copies ?). > > This is one discussion point I think. Actually, using pt_regs in kretprobe > (and rethook) is histrical accident. 
Originally, it had put a kprobe on > the function return trampoline to hook it. So keep the API compatiblity > I made the hand assembled code to save the pt_regs on the stack. > > My another question is if we have the fprobe to trace (hook) the function > return, why we still need the kretprobe itself. I think we can remove > kretprobe and use fprobe exit handler, because "function" probing will > be done by fprobe, not kprobe. And then, we can simplify the kprobe > interface and clarify what it is -- "kprobe is a wrapper of software > breakpoint". And we don't need to think about duplicated code anymore :) That sounds reasonable to me > As I said, I would like to phase out the kretprobe itself because it > provides the same feature of fprobe, which is confusing. jprobe was > removed a while ago, and now kretprobe is. But we can not phase out > it at once. So I think we will keep current kretprobe trampoline on > arm64 and just add new ftrace_regs based rethook. Then remove the > API next release. (after all users including systemtap is moved) Heads up to BPF folks though since they also have BPF "kretprobe" program types which would break in a similar fashion as multi_kprobe (even though BPF kretprobe programs have also been discouraged for a while in favor of BPF fexit programs) > > > The reason I started the FTRACE_WITH_ARGS (which gave us ftrace_regs) in > > > the first place, was because of the overhead you reported to me with > > > ftrace_regs_caller and why you wanted to go the direct trampoline approach. > > > That's when I realized I could use a subset because those registers were > > > already being saved. The only reason FTRACE_WITH_REGS was created was it > > > had to supply full pt_regs (including flags) and emulate a breakpoint for > > > the kprobes interface. But in reality, nothing really needs all that. > > > > > > > It's not about access to args. > > > > pt_regs is passed from bpf prog further into all kinds of perf event > > > > functions including stack walking. > > > > If all accesses are done in BPF bytecode, we could (theoretically) > > have the verifier and JIT work together to deny accesses to > > unpopulated fields, or relocate pt_regs accesses to ftrace_regs > > accesses to keep backward compatibility with existing multi_kprobe BPF > > programs. > > Yeah, that is what I would like to suggest, and what my patch does. > (let me update rethook too, it'll be a bit tricky since I don't want > break anything) I agree with Alexei that this is an unnecessary amount of complexity in the verifier just to avoid a struct copy though. It's good to know that we _could_ do it if we really need to someday but then again, if a user chooses an interface that gets a pt_regs they shouldn't expect high performance. Therefore, I think it's ok for BPF multi_kprobe to copy fields from a ftrace_regs to a pt_regs on stack, especially if it avoids so much additional complexity in the verifier.
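[Editor's note: since BPF fexit programs come up here as the preferred replacement for BPF kretprobe programs, a small illustrative example of that flavour; the probed function is arbitrary. fexit programs receive typed arguments plus the return value directly, with no pt_regs involved at all.]

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("fexit/vfs_read")
int BPF_PROG(trace_vfs_read_exit, struct file *file, char *buf, size_t count,
	     loff_t *pos, ssize_t ret)
{
	/* ret is the return value of vfs_read(), appended as the last arg */
	bpf_printk("vfs_read() -> %ld", ret);
	return 0;
}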
On Fri, Aug 04, 2023 at 12:42:06AM +0900, Masami Hiramatsu wrote: SNIP > > > > On the other hand, untangling all code paths that come from > > trampolines (with a light regs structure) from those that come from an > > exception (with a pt_regs) could lead to a lot of duplicated code, and > > converting between each subsystem's idea of a light regs structure > > (what if perf introduces a perf_regs now ?) would be tedious and slow > > (lots of copies ?). > > This is one discussion point I think. Actually, using pt_regs in kretprobe > (and rethook) is histrical accident. Originally, it had put a kprobe on > the function return trampoline to hook it. So keep the API compatiblity > I made the hand assembled code to save the pt_regs on the stack. > > My another question is if we have the fprobe to trace (hook) the function > return, why we still need the kretprobe itself. I think we can remove > kretprobe and use fprobe exit handler, because "function" probing will > be done by fprobe, not kprobe. And then, we can simplify the kprobe > interface and clarify what it is -- "kprobe is a wrapper of software > breakpoint". And we don't need to think about duplicated code anymore :) 1+ sounds like good idea > > > > > > Otherwise, ftrace_regs() has support on arm64 for getting to the argument > > > registers and the stack. Even live kernel patching now uses ftrace_regs(). > > > > > > > > > > > If you guys decide to convert fprobe to ftrace_regs please > > > > make it depend on kconfig or something. > > > > bpf side needs full pt_regs. > > > > Some wild ideas that I brought up once in a BPF office hour: BPF > > "multi_kprobe" could provide a fake pt_regs (either by constructing a > > sparse one on the stack or by JIT-ing different offset accesses and/or > > by having the verifier deny access to unpopulated fields) or break the > > current API (is it conceivable to phase out BPF "multi_kprobe" > > programs in favor of BPF "fprobe" programs that don't lie about their > > API and guarantees and just provide a ftrace_regs ?) > > +1 :) so multi_kprobe link was created to allow fast attach of BPF kprobe-type programs to multiple functions.. I don't think there's need for new fprobe program > > > > > > Then use kprobes. When I asked Masami what the difference between fprobes > > > and kprobes was, he told me that it would be that it would no longer rely > > > on the slower FTRACE_WITH_REGS. But currently, it still does. > > > > Actually... Moving fprobe to ftrace_regs should get even more spicy! > > :) Fprobe also wraps "rethook" which is basically the same thing as > > kretprobe: a return trampoline that saves a pt_regs, to the point that > > on x86 kretprobe's trampoline got dropped in favor of rethook's > > trampoline. But for the same reasons that we don't want ftrace to save > > pt_regs on arm64, rethook should probably also just save a ftrace_regs > > ? (also, to keep the fprobe callback signatures consistent between > > pre- and post- handlers). But if we want fprobe "post" callbacks to > > save a ftrace_regs now, either we need to re-introduce the kretprobe > > trampoline or also change the API of kretprobe (and break its symmetry > > with kprobe and we'd have the same problem all over again with BPF > > kretprobe program types...). All of this is "beautifully" entangled... > > :) > > As I said, I would like to phase out the kretprobe itself because it > provides the same feature of fprobe, which is confusing. jprobe was > removed a while ago, and now kretprobe is. 
But we can not phase out > it at once. So I think we will keep current kretprobe trampoline on > arm64 and just add new ftrace_regs based rethook. Then remove the > API next release. (after all users including systemtap is moved) > > > > > > The reason I started the FTRACE_WITH_ARGS (which gave us ftrace_regs) in > > > the first place, was because of the overhead you reported to me with > > > ftrace_regs_caller and why you wanted to go the direct trampoline approach. > > > That's when I realized I could use a subset because those registers were > > > already being saved. The only reason FTRACE_WITH_REGS was created was it > > > had to supply full pt_regs (including flags) and emulate a breakpoint for > > > the kprobes interface. But in reality, nothing really needs all that. > > > > > > > It's not about access to args. > > > > pt_regs is passed from bpf prog further into all kinds of perf event > > > > functions including stack walking. > > > > If all accesses are done in BPF bytecode, we could (theoretically) > > have the verifier and JIT work together to deny accesses to > > unpopulated fields, or relocate pt_regs accesses to ftrace_regs > > accesses to keep backward compatibility with existing multi_kprobe BPF > > programs. > > Yeah, that is what I would like to suggest, and what my patch does. > (let me update rethook too, it'll be a bit tricky since I don't want > break anything) > > Thanks, > > > > > Is there a risk that a "multi_kprobe" program could call into a BPF > > helper or kfunc that reads this pt_regs pointer and expect certain > > fields to be set ? I suppose we could also deny giving that "pt_regs" > > pointer to a helper... :/ I think Alexei answered this earlier in the thread: >From bpf side we don't care that such pt_regs is 100% filled in or >only partial as long as this pt_regs pointer is valid for perf_event_output >and stack walking that consume pt_regs. >I believe that was and still is the case for both x86 and arm64. jirka
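[Editor's note: as background for the quoted point, the "consumers" in question are calls like the one below - the context a (multi_)kprobe program hands to bpf_perf_event_output() is exactly that pt_regs pointer, so it has to be valid even if only partially filled. Illustrative only; the attach pattern and probed functions are arbitrary.]

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(u32));
	__uint(value_size, sizeof(u32));
} events SEC(".maps");

SEC("kprobe.multi/vfs_*")
int sample_hit(struct pt_regs *ctx)
{
	u64 ip = PT_REGS_IP(ctx);

	/* ctx is the (possibly sparse) pt_regs; the helper consumes it. */
	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &ip, sizeof(ip));
	return 0;
}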
On Mon, 7 Aug 2023 22:48:29 +0200 Jiri Olsa <olsajiri@gmail.com> wrote: > On Fri, Aug 04, 2023 at 12:42:06AM +0900, Masami Hiramatsu wrote: > > SNIP > > > > > > > On the other hand, untangling all code paths that come from > > > trampolines (with a light regs structure) from those that come from an > > > exception (with a pt_regs) could lead to a lot of duplicated code, and > > > converting between each subsystem's idea of a light regs structure > > > (what if perf introduces a perf_regs now ?) would be tedious and slow > > > (lots of copies ?). > > > > This is one discussion point I think. Actually, using pt_regs in kretprobe > > (and rethook) is histrical accident. Originally, it had put a kprobe on > > the function return trampoline to hook it. So keep the API compatiblity > > I made the hand assembled code to save the pt_regs on the stack. > > > > My another question is if we have the fprobe to trace (hook) the function > > return, why we still need the kretprobe itself. I think we can remove > > kretprobe and use fprobe exit handler, because "function" probing will > > be done by fprobe, not kprobe. And then, we can simplify the kprobe > > interface and clarify what it is -- "kprobe is a wrapper of software > > breakpoint". And we don't need to think about duplicated code anymore :) > > 1+ sounds like good idea Thanks! the downside will be that it requires to enable CONFIG_FPROBE instead of CONFIG_KPROBES, but I think it is natural that the user, who wants to trace function boundary, enables CONFIG_FUNCTION_TRACER. > > > > Otherwise, ftrace_regs() has support on arm64 for getting to the argument > > > > registers and the stack. Even live kernel patching now uses ftrace_regs(). > > > > > > > > > > > > > > If you guys decide to convert fprobe to ftrace_regs please > > > > > make it depend on kconfig or something. > > > > > bpf side needs full pt_regs. > > > > > > Some wild ideas that I brought up once in a BPF office hour: BPF > > > "multi_kprobe" could provide a fake pt_regs (either by constructing a > > > sparse one on the stack or by JIT-ing different offset accesses and/or > > > by having the verifier deny access to unpopulated fields) or break the > > > current API (is it conceivable to phase out BPF "multi_kprobe" > > > programs in favor of BPF "fprobe" programs that don't lie about their > > > API and guarantees and just provide a ftrace_regs ?) > > > > +1 :) > > so multi_kprobe link was created to allow fast attach of BPF kprobe-type > programs to multiple functions.. I don't think there's need for new fprobe > program Ah, OK. So the focus point is shortening registration time. > > > > > > > > > > Then use kprobes. When I asked Masami what the difference between fprobes > > > > and kprobes was, he told me that it would be that it would no longer rely > > > > on the slower FTRACE_WITH_REGS. But currently, it still does. > > > > > > Actually... Moving fprobe to ftrace_regs should get even more spicy! > > > :) Fprobe also wraps "rethook" which is basically the same thing as > > > kretprobe: a return trampoline that saves a pt_regs, to the point that > > > on x86 kretprobe's trampoline got dropped in favor of rethook's > > > trampoline. But for the same reasons that we don't want ftrace to save > > > pt_regs on arm64, rethook should probably also just save a ftrace_regs > > > ? (also, to keep the fprobe callback signatures consistent between > > > pre- and post- handlers). 
But if we want fprobe "post" callbacks to > > > save a ftrace_regs now, either we need to re-introduce the kretprobe > > > trampoline or also change the API of kretprobe (and break its symmetry > > > with kprobe and we'd have the same problem all over again with BPF > > > kretprobe program types...). All of this is "beautifully" entangled... > > > :) > > > > As I said, I would like to phase out the kretprobe itself because it > > provides the same feature of fprobe, which is confusing. jprobe was > > removed a while ago, and now kretprobe is. But we can not phase out > > it at once. So I think we will keep current kretprobe trampoline on > > arm64 and just add new ftrace_regs based rethook. Then remove the > > API next release. (after all users including systemtap is moved) > > > > > > > > > The reason I started the FTRACE_WITH_ARGS (which gave us ftrace_regs) in > > > > the first place, was because of the overhead you reported to me with > > > > ftrace_regs_caller and why you wanted to go the direct trampoline approach. > > > > That's when I realized I could use a subset because those registers were > > > > already being saved. The only reason FTRACE_WITH_REGS was created was it > > > > had to supply full pt_regs (including flags) and emulate a breakpoint for > > > > the kprobes interface. But in reality, nothing really needs all that. > > > > > > > > > It's not about access to args. > > > > > pt_regs is passed from bpf prog further into all kinds of perf event > > > > > functions including stack walking. > > > > > > If all accesses are done in BPF bytecode, we could (theoretically) > > > have the verifier and JIT work together to deny accesses to > > > unpopulated fields, or relocate pt_regs accesses to ftrace_regs > > > accesses to keep backward compatibility with existing multi_kprobe BPF > > > programs. > > > > Yeah, that is what I would like to suggest, and what my patch does. > > (let me update rethook too, it'll be a bit tricky since I don't want > > break anything) > > > > Thanks, > > > > > > > > Is there a risk that a "multi_kprobe" program could call into a BPF > > > helper or kfunc that reads this pt_regs pointer and expect certain > > > fields to be set ? I suppose we could also deny giving that "pt_regs" > > > pointer to a helper... :/ > > I think Alexei answered this earlier in the thread: > > >From bpf side we don't care that such pt_regs is 100% filled in or > >only partial as long as this pt_regs pointer is valid for perf_event_output > >and stack walking that consume pt_regs. > >I believe that was and still is the case for both x86 and arm64. OK, so I've made the ftrace_partial_regs() according to the idea now. Thanks, > > > jirka
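[Editor's note: for completeness, the helper Masami refers to could look roughly like this on arm64. This is a sketch assuming the arm64 ftrace_regs layout (x0-x8 plus fp/lr/sp/pc) and is not necessarily what the posted patches do, in particular regarding pstate.]

/* Sketch of an arm64 ftrace_partial_regs(), illustration only. */
static __always_inline struct pt_regs *
ftrace_partial_regs(const struct ftrace_regs *fregs, struct pt_regs *regs)
{
	memcpy(regs->regs, fregs->regs, sizeof(fregs->regs));	/* x0 - x8 */
	regs->regs[29] = fregs->fp;
	regs->regs[30] = fregs->lr;
	regs->sp = fregs->sp;
	regs->pc = fregs->pc;
	/* pstate is deliberately not forged here; callers must not rely on it. */
	return regs;
}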
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 20e3a07eef8f..4b10d57ceee0 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -226,6 +226,9 @@ const struct btf_type *btf_find_func_proto(const char *func_name,
 					   struct btf **btf_p);
 const struct btf_param *btf_get_func_param(const struct btf_type *func_proto,
 					   s32 *nr);
+const struct btf_member *btf_find_struct_member(struct btf *btf,
+						const struct btf_type *type,
+						const char *member_name);
 
 #define for_each_member(i, struct_type, member)			\
 	for (i = 0, member = btf_type_member(struct_type);		\
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index f7b25c615269..8d81a4ffa67b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -958,6 +958,46 @@ const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, s3
 	return NULL;
 }
 
+#define BTF_ANON_STACK_MAX	16
+
+/*
+ * Find a member of data structure/union by name and return it.
+ * Return NULL if not found, or -EINVAL if parameter is invalid.
+ */
+const struct btf_member *btf_find_struct_member(struct btf *btf,
+						const struct btf_type *type,
+						const char *member_name)
+{
+	const struct btf_type *anon_stack[BTF_ANON_STACK_MAX];
+	const struct btf_member *member;
+	const char *name;
+	int i, top = 0;
+
+retry:
+	if (!btf_type_is_struct(type))
+		return ERR_PTR(-EINVAL);
+
+	for_each_member(i, type, member) {
+		if (!member->name_off) {
+			/* Anonymous union/struct: push it for later use */
+			type = btf_type_skip_modifiers(btf, member->type, NULL);
+			if (type && top < BTF_ANON_STACK_MAX)
+				anon_stack[top++] = type;
+		} else {
+			name = btf_name_by_offset(btf, member->name_off);
+			if (name && !strcmp(member_name, name))
+				return member;
+		}
+	}
+	if (top > 0) {
+		/* Pop from the anonymous stack and retry */
+		type = anon_stack[--top];
+		goto retry;
+	}
+
+	return NULL;
+}
+
 #define BTF_SHOW_MAX_ITER	10
 
 #define BTF_KIND_BIT(kind)	(1ULL << kind)
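[Editor's note: for readers skimming the patch above, a hedged example of how the new helper might be called; the struct and member names looked up here are arbitrary, and only btf_find_struct_member() itself comes from this patch.]

#include <linux/btf.h>
#include <linux/err.h>
#include <linux/errno.h>

static int example_lookup(struct btf *btf)
{
	const struct btf_type *t;
	const struct btf_member *m;
	s32 id;

	id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT);
	if (id < 0)
		return id;

	t = btf_type_by_id(btf, id);
	/* Searches named members first, then descends into anonymous
	 * structs/unions via the internal stack. */
	m = btf_find_struct_member(btf, t, "pid");
	if (IS_ERR_OR_NULL(m))
		return m ? PTR_ERR(m) : -ENOENT;

	/* m->type and m->offset describe the member from here on. */
	return 0;
}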