
[bpf-next,v2,1/9] bpf: tracing: add support to record and check the accessed args

Message ID 20240311093526.1010158-2-dongmenglong.8@bytedance.com (mailing list archive)
State New, archived
Series bpf: make tracing program support multi-link

Commit Message

梦龙董 March 11, 2024, 9:35 a.m. UTC
In this commit, we add the 'accessed_args' field to struct bpf_prog_aux,
which is used to record which function args are accessed in
btf_ctx_access().

Meanwhile, we add the function btf_check_func_part_match() to compare
the accessed function args of two function prototypes. This function
will be used in the following commit.

Signed-off-by: Menglong Dong <dongmenglong.8@bytedance.com>
---
 include/linux/bpf.h |   4 ++
 kernel/bpf/btf.c    | 108 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 110 insertions(+), 2 deletions(-)

Comments

Alexei Starovoitov March 12, 2024, 1:46 a.m. UTC | #1
On Mon, Mar 11, 2024 at 2:34 AM Menglong Dong
<dongmenglong.8@bytedance.com> wrote:
>
> In this commit, we add the 'accessed_args' field to struct bpf_prog_aux,
> which is used to record which function args are accessed in
> btf_ctx_access().
>
> Meanwhile, we add the function btf_check_func_part_match() to compare
> the accessed function args of two function prototypes. This function
> will be used in the following commit.
>
> Signed-off-by: Menglong Dong <dongmenglong.8@bytedance.com>
> ---
>  include/linux/bpf.h |   4 ++
>  kernel/bpf/btf.c    | 108 +++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 110 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 95e07673cdc1..0f677fdcfcc7 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1461,6 +1461,7 @@ struct bpf_prog_aux {
>         const struct btf_type *attach_func_proto;
>         /* function name for valid attach_btf_id */
>         const char *attach_func_name;
> +       u64 accessed_args;
>         struct bpf_prog **func;
>         void *jit_data; /* JIT specific data. arch dependent */
>         struct bpf_jit_poke_descriptor *poke_tab;
> @@ -2565,6 +2566,9 @@ struct bpf_reg_state;
>  int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog);
>  int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
>                          struct btf *btf, const struct btf_type *t);
> +int btf_check_func_part_match(struct btf *btf1, const struct btf_type *t1,
> +                             struct btf *btf2, const struct btf_type *t2,
> +                             u64 func_args);
>  const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
>                                     int comp_idx, const char *tag_key);
>  int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt,
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index 170d017e8e4a..c2a0299d4358 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -6125,19 +6125,24 @@ static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
>  }
>
>  static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
> -                          int off)
> +                          int off, int *aligned_idx)
>  {
>         const struct btf_param *args;
>         const struct btf_type *t;
>         u32 offset = 0, nr_args;
>         int i;
>
> +       if (aligned_idx)
> +               *aligned_idx = -ENOENT;
> +
>         if (!func_proto)
>                 return off / 8;
>
>         nr_args = btf_type_vlen(func_proto);
>         args = (const struct btf_param *)(func_proto + 1);
>         for (i = 0; i < nr_args; i++) {
> +               if (aligned_idx && offset == off)
> +                       *aligned_idx = i;
>                 t = btf_type_skip_modifiers(btf, args[i].type, NULL);
>                 offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
>                 if (off < offset)
> @@ -6207,7 +6212,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
>                         tname, off);
>                 return false;
>         }
> -       arg = get_ctx_arg_idx(btf, t, off);
> +       arg = get_ctx_arg_idx(btf, t, off, NULL);
>         args = (const struct btf_param *)(t + 1);
>         /* if (t == NULL) Fall back to default BPF prog with
>          * MAX_BPF_FUNC_REG_ARGS u64 arguments.
> @@ -6217,6 +6222,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
>                 /* skip first 'void *__data' argument in btf_trace_##name typedef */
>                 args++;
>                 nr_args--;
> +               prog->aux->accessed_args |= (1 << (arg + 1));
> +       } else {
> +               prog->aux->accessed_args |= (1 << arg);

What do you need this aligned_idx for?
I'd expect that the above "accessed_args |= (1 << arg);" is enough.

>         }
>
>         if (arg > nr_args) {
> @@ -7024,6 +7032,102 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
>         return btf_check_func_type_match(log, btf1, t1, btf2, t2);
>  }
>
> +static u32 get_ctx_arg_total_size(struct btf *btf, const struct btf_type *t)
> +{
> +       const struct btf_param *args;
> +       u32 size = 0, nr_args;
> +       int i;
> +
> +       nr_args = btf_type_vlen(t);
> +       args = (const struct btf_param *)(t + 1);
> +       for (i = 0; i < nr_args; i++) {
> +               t = btf_type_skip_modifiers(btf, args[i].type, NULL);
> +               size += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
> +       }
> +
> +       return size;
> +}
> +
> +/* This function is similar to btf_check_func_type_match(), except that it
> + * only compare some function args of the function prototype t1 and t2.
> + */
> +int btf_check_func_part_match(struct btf *btf1, const struct btf_type *func1,
> +                             struct btf *btf2, const struct btf_type *func2,
> +                             u64 func_args)

This is way too much copy paste.
Please share the code with btf_check_func_type_match.
梦龙董 March 12, 2024, 2:01 a.m. UTC | #2
On Tue, Mar 12, 2024 at 9:46 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Mar 11, 2024 at 2:34 AM Menglong Dong
> <dongmenglong.8@bytedance.com> wrote:
> >
> > In this commit, we add the 'accessed_args' field to struct bpf_prog_aux,
> > which is used to record which function args are accessed in
> > btf_ctx_access().
> >
> > Meanwhile, we add the function btf_check_func_part_match() to compare
> > the accessed function args of two function prototypes. This function
> > will be used in the following commit.
> >
> > Signed-off-by: Menglong Dong <dongmenglong.8@bytedance.com>
> > ---
> >  include/linux/bpf.h |   4 ++
> >  kernel/bpf/btf.c    | 108 +++++++++++++++++++++++++++++++++++++++++++-
> >  2 files changed, 110 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 95e07673cdc1..0f677fdcfcc7 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -1461,6 +1461,7 @@ struct bpf_prog_aux {
> >         const struct btf_type *attach_func_proto;
> >         /* function name for valid attach_btf_id */
> >         const char *attach_func_name;
> > +       u64 accessed_args;
> >         struct bpf_prog **func;
> >         void *jit_data; /* JIT specific data. arch dependent */
> >         struct bpf_jit_poke_descriptor *poke_tab;
> > @@ -2565,6 +2566,9 @@ struct bpf_reg_state;
> >  int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog);
> >  int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
> >                          struct btf *btf, const struct btf_type *t);
> > +int btf_check_func_part_match(struct btf *btf1, const struct btf_type *t1,
> > +                             struct btf *btf2, const struct btf_type *t2,
> > +                             u64 func_args);
> >  const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
> >                                     int comp_idx, const char *tag_key);
> >  int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt,
> > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> > index 170d017e8e4a..c2a0299d4358 100644
> > --- a/kernel/bpf/btf.c
> > +++ b/kernel/bpf/btf.c
> > @@ -6125,19 +6125,24 @@ static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
> >  }
> >
> >  static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
> > -                          int off)
> > +                          int off, int *aligned_idx)
> >  {
> >         const struct btf_param *args;
> >         const struct btf_type *t;
> >         u32 offset = 0, nr_args;
> >         int i;
> >
> > +       if (aligned_idx)
> > +               *aligned_idx = -ENOENT;
> > +
> >         if (!func_proto)
> >                 return off / 8;
> >
> >         nr_args = btf_type_vlen(func_proto);
> >         args = (const struct btf_param *)(func_proto + 1);
> >         for (i = 0; i < nr_args; i++) {
> > +               if (aligned_idx && offset == off)
> > +                       *aligned_idx = i;
> >                 t = btf_type_skip_modifiers(btf, args[i].type, NULL);
> >                 offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
> >                 if (off < offset)
> > @@ -6207,7 +6212,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
> >                         tname, off);
> >                 return false;
> >         }
> > -       arg = get_ctx_arg_idx(btf, t, off);
> > +       arg = get_ctx_arg_idx(btf, t, off, NULL);
> >         args = (const struct btf_param *)(t + 1);
> >         /* if (t == NULL) Fall back to default BPF prog with
> >          * MAX_BPF_FUNC_REG_ARGS u64 arguments.
> > @@ -6217,6 +6222,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
> >                 /* skip first 'void *__data' argument in btf_trace_##name typedef */
> >                 args++;
> >                 nr_args--;
> > +               prog->aux->accessed_args |= (1 << (arg + 1));
> > +       } else {
> > +               prog->aux->accessed_args |= (1 << arg);
>
> What do you need this aligned_idx for?
> I'd expect that the above "accessed_args |= (1 << arg);" is enough.
>

Which aligned_idx? There is no aligned_idx in btf_ctx_access();
aligned_idx is only used in btf_check_func_part_match().

In btf_check_func_part_match(), I need to compare t1->args[i] and
t2->args[j], which have the same offset, and aligned_idx is used to
find "j" from the offset of t1->args[i].

> >         }
> >
> >         if (arg > nr_args) {
> > @@ -7024,6 +7032,102 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
> >         return btf_check_func_type_match(log, btf1, t1, btf2, t2);
> >  }
> >
> > +static u32 get_ctx_arg_total_size(struct btf *btf, const struct btf_type *t)
> > +{
> > +       const struct btf_param *args;
> > +       u32 size = 0, nr_args;
> > +       int i;
> > +
> > +       nr_args = btf_type_vlen(t);
> > +       args = (const struct btf_param *)(t + 1);
> > +       for (i = 0; i < nr_args; i++) {
> > +               t = btf_type_skip_modifiers(btf, args[i].type, NULL);
> > +               size += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
> > +       }
> > +
> > +       return size;
> > +}
> > +
> > +/* This function is similar to btf_check_func_type_match(), except that it
> > + * only compare some function args of the function prototype t1 and t2.
> > + */
> > +int btf_check_func_part_match(struct btf *btf1, const struct btf_type *func1,
> > +                             struct btf *btf2, const struct btf_type *func2,
> > +                             u64 func_args)
>
> This is way too much copy paste.
> Please share the code with btf_check_func_type_match.

Okay!

Thanks!
Menglong Dong
Alexei Starovoitov March 12, 2024, 2:09 a.m. UTC | #3
On Mon, Mar 11, 2024 at 7:01 PM 梦龙董 <dongmenglong.8@bytedance.com> wrote:
>
> On Tue, Mar 12, 2024 at 9:46 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Mon, Mar 11, 2024 at 2:34 AM Menglong Dong
> > <dongmenglong.8@bytedance.com> wrote:
> > >
> > > In this commit, we add the 'accessed_args' field to struct bpf_prog_aux,
> > > which is used to record which function args are accessed in
> > > btf_ctx_access().
> > >
> > > Meanwhile, we add the function btf_check_func_part_match() to compare
> > > the accessed function args of two function prototypes. This function
> > > will be used in the following commit.
> > >
> > > Signed-off-by: Menglong Dong <dongmenglong.8@bytedance.com>
> > > ---
> > >  include/linux/bpf.h |   4 ++
> > >  kernel/bpf/btf.c    | 108 +++++++++++++++++++++++++++++++++++++++++++-
> > >  2 files changed, 110 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > index 95e07673cdc1..0f677fdcfcc7 100644
> > > --- a/include/linux/bpf.h
> > > +++ b/include/linux/bpf.h
> > > @@ -1461,6 +1461,7 @@ struct bpf_prog_aux {
> > >         const struct btf_type *attach_func_proto;
> > >         /* function name for valid attach_btf_id */
> > >         const char *attach_func_name;
> > > +       u64 accessed_args;
> > >         struct bpf_prog **func;
> > >         void *jit_data; /* JIT specific data. arch dependent */
> > >         struct bpf_jit_poke_descriptor *poke_tab;
> > > @@ -2565,6 +2566,9 @@ struct bpf_reg_state;
> > >  int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog);
> > >  int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
> > >                          struct btf *btf, const struct btf_type *t);
> > > +int btf_check_func_part_match(struct btf *btf1, const struct btf_type *t1,
> > > +                             struct btf *btf2, const struct btf_type *t2,
> > > +                             u64 func_args);
> > >  const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
> > >                                     int comp_idx, const char *tag_key);
> > >  int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt,
> > > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> > > index 170d017e8e4a..c2a0299d4358 100644
> > > --- a/kernel/bpf/btf.c
> > > +++ b/kernel/bpf/btf.c
> > > @@ -6125,19 +6125,24 @@ static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
> > >  }
> > >
> > >  static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
> > > -                          int off)
> > > +                          int off, int *aligned_idx)
> > >  {
> > >         const struct btf_param *args;
> > >         const struct btf_type *t;
> > >         u32 offset = 0, nr_args;
> > >         int i;
> > >
> > > +       if (aligned_idx)
> > > +               *aligned_idx = -ENOENT;
> > > +
> > >         if (!func_proto)
> > >                 return off / 8;
> > >
> > >         nr_args = btf_type_vlen(func_proto);
> > >         args = (const struct btf_param *)(func_proto + 1);
> > >         for (i = 0; i < nr_args; i++) {
> > > +               if (aligned_idx && offset == off)
> > > +                       *aligned_idx = i;
> > >                 t = btf_type_skip_modifiers(btf, args[i].type, NULL);
> > >                 offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
> > >                 if (off < offset)
> > > @@ -6207,7 +6212,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
> > >                         tname, off);
> > >                 return false;
> > >         }
> > > -       arg = get_ctx_arg_idx(btf, t, off);
> > > +       arg = get_ctx_arg_idx(btf, t, off, NULL);
> > >         args = (const struct btf_param *)(t + 1);
> > >         /* if (t == NULL) Fall back to default BPF prog with
> > >          * MAX_BPF_FUNC_REG_ARGS u64 arguments.
> > > @@ -6217,6 +6222,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
> > >                 /* skip first 'void *__data' argument in btf_trace_##name typedef */
> > >                 args++;
> > >                 nr_args--;
> > > +               prog->aux->accessed_args |= (1 << (arg + 1));
> > > +       } else {
> > > +               prog->aux->accessed_args |= (1 << arg);
> >
> > What do you need this aligned_idx for?
> > I'd expect that the above "accessed_args |= (1 << arg);" is enough.
> >
>
> Which aligned_idx? There is no aligned_idx in btf_ctx_access();
> aligned_idx is only used in btf_check_func_part_match().
>
> In btf_check_func_part_match(), I need to compare t1->args[i] and
> t2->args[j], which have the same offset, and aligned_idx is used to
> find "j" from the offset of t1->args[i].

And that's my question.
Why don't you take the max of accessed_args across all attach
points and do btf_check_func_type_match() up to that argno
instead of nargs1?
This 'offset += btf_type_is_ptr(t1) ? 8 : roundup...
is odd.
梦龙董 March 12, 2024, 2:42 a.m. UTC | #4
On Tue, Mar 12, 2024 at 10:09 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Mar 11, 2024 at 7:01 PM 梦龙董 <dongmenglong.8@bytedance.com> wrote:
> >
> > On Tue, Mar 12, 2024 at 9:46 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Mon, Mar 11, 2024 at 2:34 AM Menglong Dong
> > > <dongmenglong.8@bytedance.com> wrote:
> > > >
> > > > In this commit, we add the 'accessed_args' field to struct bpf_prog_aux,
> > > > which is used to record which function args are accessed in
> > > > btf_ctx_access().
> > > >
> > > > Meanwhile, we add the function btf_check_func_part_match() to compare
> > > > the accessed function args of two function prototypes. This function
> > > > will be used in the following commit.
> > > >
> > > > Signed-off-by: Menglong Dong <dongmenglong.8@bytedance.com>
> > > > ---
> > > >  include/linux/bpf.h |   4 ++
> > > >  kernel/bpf/btf.c    | 108 +++++++++++++++++++++++++++++++++++++++++++-
> > > >  2 files changed, 110 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > index 95e07673cdc1..0f677fdcfcc7 100644
> > > > --- a/include/linux/bpf.h
> > > > +++ b/include/linux/bpf.h
> > > > @@ -1461,6 +1461,7 @@ struct bpf_prog_aux {
> > > >         const struct btf_type *attach_func_proto;
> > > >         /* function name for valid attach_btf_id */
> > > >         const char *attach_func_name;
> > > > +       u64 accessed_args;
> > > >         struct bpf_prog **func;
> > > >         void *jit_data; /* JIT specific data. arch dependent */
> > > >         struct bpf_jit_poke_descriptor *poke_tab;
> > > > @@ -2565,6 +2566,9 @@ struct bpf_reg_state;
> > > >  int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog);
> > > >  int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
> > > >                          struct btf *btf, const struct btf_type *t);
> > > > +int btf_check_func_part_match(struct btf *btf1, const struct btf_type *t1,
> > > > +                             struct btf *btf2, const struct btf_type *t2,
> > > > +                             u64 func_args);
> > > >  const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
> > > >                                     int comp_idx, const char *tag_key);
> > > >  int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt,
> > > > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> > > > index 170d017e8e4a..c2a0299d4358 100644
> > > > --- a/kernel/bpf/btf.c
> > > > +++ b/kernel/bpf/btf.c
> > > > @@ -6125,19 +6125,24 @@ static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
> > > >  }
> > > >
> > > >  static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
> > > > -                          int off)
> > > > +                          int off, int *aligned_idx)
> > > >  {
> > > >         const struct btf_param *args;
> > > >         const struct btf_type *t;
> > > >         u32 offset = 0, nr_args;
> > > >         int i;
> > > >
> > > > +       if (aligned_idx)
> > > > +               *aligned_idx = -ENOENT;
> > > > +
> > > >         if (!func_proto)
> > > >                 return off / 8;
> > > >
> > > >         nr_args = btf_type_vlen(func_proto);
> > > >         args = (const struct btf_param *)(func_proto + 1);
> > > >         for (i = 0; i < nr_args; i++) {
> > > > +               if (aligned_idx && offset == off)
> > > > +                       *aligned_idx = i;
> > > >                 t = btf_type_skip_modifiers(btf, args[i].type, NULL);
> > > >                 offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
> > > >                 if (off < offset)
> > > > @@ -6207,7 +6212,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
> > > >                         tname, off);
> > > >                 return false;
> > > >         }
> > > > -       arg = get_ctx_arg_idx(btf, t, off);
> > > > +       arg = get_ctx_arg_idx(btf, t, off, NULL);
> > > >         args = (const struct btf_param *)(t + 1);
> > > >         /* if (t == NULL) Fall back to default BPF prog with
> > > >          * MAX_BPF_FUNC_REG_ARGS u64 arguments.
> > > > @@ -6217,6 +6222,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
> > > >                 /* skip first 'void *__data' argument in btf_trace_##name typedef */
> > > >                 args++;
> > > >                 nr_args--;
> > > > +               prog->aux->accessed_args |= (1 << (arg + 1));
> > > > +       } else {
> > > > +               prog->aux->accessed_args |= (1 << arg);
> > >
> > > What do you need this aligned_idx for?
> > > I'd expect that the above "accessed_args |= (1 << arg);" is enough.
> > >
> >
> > Which aligned_idx? There is no aligned_idx in btf_ctx_access();
> > aligned_idx is only used in btf_check_func_part_match().
> >
> > In btf_check_func_part_match(), I need to compare t1->args[i] and
> > t2->args[j], which have the same offset, and aligned_idx is used to
> > find "j" from the offset of t1->args[i].
>
> And that's my question.
> Why don't you take the max of accessed_args across all attach
> points and do btf_check_func_type_match() up to that argno
> instead of nargs1?
> This 'offset += btf_type_is_ptr(t1) ? 8 : roundup...
> is odd.

Hi, I'm trying to make bpf flexible enough. Let's take an example:
suppose we have this bpf program:

int test1_result = 0;
int BPF_PROG(test1, int a, long b, char c)
{
    test1_result = a + c;
    return 0;
}

In this program, only the 1st and 3rd args are accessed. So this bpf
program can be attached to all kernel functions whose 1st arg is int
and 3rd arg is char, even if their 2nd arg is different.
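
For example, with the recording done in btf_ctx_access() above, this
program would end up with (a worked illustration, not output from the
patch):

    /* only args 0 ('a') and 2 ('c') are read */
    prog->aux->accessed_args = (1 << 0) | (1 << 2); /* 0b101 */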

And let's take another example with a struct. This is our bpf program:

int test1_result = 0;
int BPF_PROG(test1, long a, long b, char c)
{
    test1_result = c;
    return 0;
}

Only the 3rd arg is accessed. And we have the following kernel functions:

int kernel_function1(long a, long b, char c)
{
xxx
}

struct test1 {
    long a;
    long b;
};
int kernel_function2(struct test1 a, char b)
{
xxx
}

kernel_function1 and kernel_function2 should be compatible,
as the bpf program only accesses ctx[2], whose offset is 16.
The arg of kernel_function1() at offset 16 is "char c", and the
arg of kernel_function2() at offset 16 is "char b", which is
compatible.
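
Spelled out with the layout rule from the patch (a pointer takes 8
bytes, anything else roundup(t->size, 8)), the offsets line up as:

/* kernel_function1: long a @ 0, long b @ 8,       char c @ 16
 * kernel_function2: struct test1 a @ 0 (size 16), char b @ 16
 *
 * the prog reads ctx[2] (offset 16), and both targets have a char
 * there, so both attachments can be allowed
 */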

That's why we need to check the consistency of accessed args
by offset instead of function arg index.

I'm not sure whether I have expressed my idea clearly; is this what
you are asking?

Thanks!
Menglong Dong
梦龙董 March 12, 2024, 2:49 a.m. UTC | #5
On Tue, Mar 12, 2024 at 10:42 AM 梦龙董 <dongmenglong.8@bytedance.com> wrote:
>
> On Tue, Mar 12, 2024 at 10:09 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Mon, Mar 11, 2024 at 7:01 PM 梦龙董 <dongmenglong.8@bytedance.com> wrote:
> > >
> > > On Tue, Mar 12, 2024 at 9:46 AM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Mon, Mar 11, 2024 at 2:34 AM Menglong Dong
> > > > <dongmenglong.8@bytedance.com> wrote:
> > > > >
> > > > > In this commit, we add the 'accessed_args' field to struct bpf_prog_aux,
> > > > > which is used to record which function args are accessed in
> > > > > btf_ctx_access().
> > > > >
> > > > > Meanwhile, we add the function btf_check_func_part_match() to compare
> > > > > the accessed function args of two function prototypes. This function
> > > > > will be used in the following commit.
> > > > >
> > > > > Signed-off-by: Menglong Dong <dongmenglong.8@bytedance.com>
> > > > > ---
> > > > >  include/linux/bpf.h |   4 ++
> > > > >  kernel/bpf/btf.c    | 108 +++++++++++++++++++++++++++++++++++++++++++-
> > > > >  2 files changed, 110 insertions(+), 2 deletions(-)
> > > > >
> > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > > index 95e07673cdc1..0f677fdcfcc7 100644
> > > > > --- a/include/linux/bpf.h
> > > > > +++ b/include/linux/bpf.h
> > > > > @@ -1461,6 +1461,7 @@ struct bpf_prog_aux {
> > > > >         const struct btf_type *attach_func_proto;
> > > > >         /* function name for valid attach_btf_id */
> > > > >         const char *attach_func_name;
> > > > > +       u64 accessed_args;
> > > > >         struct bpf_prog **func;
> > > > >         void *jit_data; /* JIT specific data. arch dependent */
> > > > >         struct bpf_jit_poke_descriptor *poke_tab;
> > > > > @@ -2565,6 +2566,9 @@ struct bpf_reg_state;
> > > > >  int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog);
> > > > >  int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
> > > > >                          struct btf *btf, const struct btf_type *t);
> > > > > +int btf_check_func_part_match(struct btf *btf1, const struct btf_type *t1,
> > > > > +                             struct btf *btf2, const struct btf_type *t2,
> > > > > +                             u64 func_args);
> > > > >  const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
> > > > >                                     int comp_idx, const char *tag_key);
> > > > >  int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt,
> > > > > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> > > > > index 170d017e8e4a..c2a0299d4358 100644
> > > > > --- a/kernel/bpf/btf.c
> > > > > +++ b/kernel/bpf/btf.c
> > > > > @@ -6125,19 +6125,24 @@ static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
> > > > >  }
> > > > >
> > > > >  static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
> > > > > -                          int off)
> > > > > +                          int off, int *aligned_idx)
> > > > >  {
> > > > >         const struct btf_param *args;
> > > > >         const struct btf_type *t;
> > > > >         u32 offset = 0, nr_args;
> > > > >         int i;
> > > > >
> > > > > +       if (aligned_idx)
> > > > > +               *aligned_idx = -ENOENT;
> > > > > +
> > > > >         if (!func_proto)
> > > > >                 return off / 8;
> > > > >
> > > > >         nr_args = btf_type_vlen(func_proto);
> > > > >         args = (const struct btf_param *)(func_proto + 1);
> > > > >         for (i = 0; i < nr_args; i++) {
> > > > > +               if (aligned_idx && offset == off)
> > > > > +                       *aligned_idx = i;
> > > > >                 t = btf_type_skip_modifiers(btf, args[i].type, NULL);
> > > > >                 offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
> > > > >                 if (off < offset)
> > > > > @@ -6207,7 +6212,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
> > > > >                         tname, off);
> > > > >                 return false;
> > > > >         }
> > > > > -       arg = get_ctx_arg_idx(btf, t, off);
> > > > > +       arg = get_ctx_arg_idx(btf, t, off, NULL);
> > > > >         args = (const struct btf_param *)(t + 1);
> > > > >         /* if (t == NULL) Fall back to default BPF prog with
> > > > >          * MAX_BPF_FUNC_REG_ARGS u64 arguments.
> > > > > @@ -6217,6 +6222,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
> > > > >                 /* skip first 'void *__data' argument in btf_trace_##name typedef */
> > > > >                 args++;
> > > > >                 nr_args--;
> > > > > +               prog->aux->accessed_args |= (1 << (arg + 1));
> > > > > +       } else {
> > > > > +               prog->aux->accessed_args |= (1 << arg);
> > > >
> > > > What do you need this aligned_idx for?
> > > > I'd expect that the above "accessed_args |= (1 << arg);" is enough.
> > > >
> > >
> > > Which aligned_idx? There is no aligned_idx in btf_ctx_access();
> > > aligned_idx is only used in btf_check_func_part_match().
> > >
> > > In btf_check_func_part_match(), I need to compare t1->args[i] and
> > > t2->args[j], which have the same offset, and aligned_idx is used to
> > > find "j" from the offset of t1->args[i].
> >
> > And that's my question.
> > Why don't you take the max of accessed_args across all attach
> > points and do btf_check_func_type_match() up to that argno
> > instead of nargs1?
> > This 'offset += btf_type_is_ptr(t1) ? 8 : roundup...
> > is odd.
>
> Hi, I'm trying to make bpf flexible enough. Let's take an example:
> suppose we have this bpf program:
>
> int test1_result = 0;
> int BPF_PROG(test1, int a, long b, char c)
> {
>     test1_result = a + c;
>     return 0;
> }
>
> In this program, only the 1st and 3rd args are accessed. So this bpf
> program can be attached to all kernel functions whose 1st arg is int
> and 3rd arg is char, even if their 2nd arg is different.
>
> And let's take another example with a struct. This is our bpf program:
>
> int test1_result = 0;
> int BPF_PROG(test1, long a, long b, char c)
> {
>     test1_result = c;
>     return 0;
> }
>
> Only the 3rd arg is accessed. And we have the following kernel functions:
>
> int kernel_function1(long a, long b, char c)
> {
> xxx
> }
>
> struct test1 {
>     long a;
>     long b;
> };
> int kernel_function2(struct test1 a, char b)
> {
> xxx
> }
>
> kernel_function1 and kernel_function2 should be compatible,
> as the bpf program only accesses ctx[2], whose offset is 16.
> The arg of kernel_function1() at offset 16 is "char c", and the
> arg of kernel_function2() at offset 16 is "char b", which is
> compatible.
>
> That's why we need to check the consistency of accessed args
> by offset instead of function arg index.
>

And that's why I didn't share the code with btf_check_func_type_match().
In btf_check_func_part_match(), I'm trying to check the "real" accessed
args of t1 and t2 by offset, not by function arg index, which is quite
different from btf_check_func_type_match().

> I'm not sure whether I have expressed my idea clearly; is this what
> you are asking?
>
> Thanks!
> Menglong Dong
Alexei Starovoitov March 12, 2024, 4:42 p.m. UTC | #6
On Mon, Mar 11, 2024 at 7:42 PM 梦龙董 <dongmenglong.8@bytedance.com> wrote:
>
> On Tue, Mar 12, 2024 at 10:09 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Mon, Mar 11, 2024 at 7:01 PM 梦龙董 <dongmenglong.8@bytedance.com> wrote:
> > >
> > > On Tue, Mar 12, 2024 at 9:46 AM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Mon, Mar 11, 2024 at 2:34 AM Menglong Dong
> > > > <dongmenglong.8@bytedance.com> wrote:
> > > > >
> > > > > In this commit, we add the 'accessed_args' field to struct bpf_prog_aux,
> > > > > which is used to record which function args are accessed in
> > > > > btf_ctx_access().
> > > > >
> > > > > Meanwhile, we add the function btf_check_func_part_match() to compare
> > > > > the accessed function args of two function prototypes. This function
> > > > > will be used in the following commit.
> > > > >
> > > > > Signed-off-by: Menglong Dong <dongmenglong.8@bytedance.com>
> > > > > ---
> > > > >  include/linux/bpf.h |   4 ++
> > > > >  kernel/bpf/btf.c    | 108 +++++++++++++++++++++++++++++++++++++++++++-
> > > > >  2 files changed, 110 insertions(+), 2 deletions(-)
> > > > >
> > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > > index 95e07673cdc1..0f677fdcfcc7 100644
> > > > > --- a/include/linux/bpf.h
> > > > > +++ b/include/linux/bpf.h
> > > > > @@ -1461,6 +1461,7 @@ struct bpf_prog_aux {
> > > > >         const struct btf_type *attach_func_proto;
> > > > >         /* function name for valid attach_btf_id */
> > > > >         const char *attach_func_name;
> > > > > +       u64 accessed_args;
> > > > >         struct bpf_prog **func;
> > > > >         void *jit_data; /* JIT specific data. arch dependent */
> > > > >         struct bpf_jit_poke_descriptor *poke_tab;
> > > > > @@ -2565,6 +2566,9 @@ struct bpf_reg_state;
> > > > >  int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog);
> > > > >  int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
> > > > >                          struct btf *btf, const struct btf_type *t);
> > > > > +int btf_check_func_part_match(struct btf *btf1, const struct btf_type *t1,
> > > > > +                             struct btf *btf2, const struct btf_type *t2,
> > > > > +                             u64 func_args);
> > > > >  const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
> > > > >                                     int comp_idx, const char *tag_key);
> > > > >  int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt,
> > > > > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> > > > > index 170d017e8e4a..c2a0299d4358 100644
> > > > > --- a/kernel/bpf/btf.c
> > > > > +++ b/kernel/bpf/btf.c
> > > > > @@ -6125,19 +6125,24 @@ static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
> > > > >  }
> > > > >
> > > > >  static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
> > > > > -                          int off)
> > > > > +                          int off, int *aligned_idx)
> > > > >  {
> > > > >         const struct btf_param *args;
> > > > >         const struct btf_type *t;
> > > > >         u32 offset = 0, nr_args;
> > > > >         int i;
> > > > >
> > > > > +       if (aligned_idx)
> > > > > +               *aligned_idx = -ENOENT;
> > > > > +
> > > > >         if (!func_proto)
> > > > >                 return off / 8;
> > > > >
> > > > >         nr_args = btf_type_vlen(func_proto);
> > > > >         args = (const struct btf_param *)(func_proto + 1);
> > > > >         for (i = 0; i < nr_args; i++) {
> > > > > +               if (aligned_idx && offset == off)
> > > > > +                       *aligned_idx = i;
> > > > >                 t = btf_type_skip_modifiers(btf, args[i].type, NULL);
> > > > >                 offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
> > > > >                 if (off < offset)
> > > > > @@ -6207,7 +6212,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
> > > > >                         tname, off);
> > > > >                 return false;
> > > > >         }
> > > > > -       arg = get_ctx_arg_idx(btf, t, off);
> > > > > +       arg = get_ctx_arg_idx(btf, t, off, NULL);
> > > > >         args = (const struct btf_param *)(t + 1);
> > > > >         /* if (t == NULL) Fall back to default BPF prog with
> > > > >          * MAX_BPF_FUNC_REG_ARGS u64 arguments.
> > > > > @@ -6217,6 +6222,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
> > > > >                 /* skip first 'void *__data' argument in btf_trace_##name typedef */
> > > > >                 args++;
> > > > >                 nr_args--;
> > > > > +               prog->aux->accessed_args |= (1 << (arg + 1));
> > > > > +       } else {
> > > > > +               prog->aux->accessed_args |= (1 << arg);
> > > >
> > > > What do you need this aligned_idx for?
> > > > I'd expect that the above "accessed_args |= (1 << arg);" is enough.
> > > >
> > >
> > > Which aligned_idx? There is no aligned_idx in btf_ctx_access();
> > > aligned_idx is only used in btf_check_func_part_match().
> > >
> > > In btf_check_func_part_match(), I need to compare t1->args[i] and
> > > t2->args[j], which have the same offset, and aligned_idx is used to
> > > find "j" from the offset of t1->args[i].
> >
> > And that's my question.
> > Why don't you take the max of accessed_args across all attach
> > points and do btf_check_func_type_match() up to that argno
> > instead of nargs1?
> > This 'offset += btf_type_is_ptr(t1) ? 8 : roundup...
> > is odd.
>
> Hi, I'm trying to make bpf flexible enough. Let's take an example:
> suppose we have this bpf program:
>
> int test1_result = 0;
> int BPF_PROG(test1, int a, long b, char c)
> {
>     test1_result = a + c;
>     return 0;
> }
>
> In this program, only the 1st and 3rd args are accessed. So this bpf
> program can be attached to all kernel functions whose 1st arg is int
> and 3rd arg is char, even if their 2nd arg is different.
>
> And let's take another example with a struct. This is our bpf program:
>
> int test1_result = 0;
> int BPF_PROG(test1, long a, long b, char c)
> {
>     test1_result = c;
>     return 0;
> }
>
> Only the 3rd arg is accessed. And we have the following kernel functions:
>
> int kernel_function1(long a, long b, char c)
> {
> xxx
> }
>
> struct test1 {
>     long a;
>     long b;
> };
> int kernel_function2(struct test1 a, char b)
> {
> xxx
> }
>
> kernel_function1 and kernel_function2 should be compatible,
> as the bpf program only accesses ctx[2], whose offset is 16.
> The arg of kernel_function1() at offset 16 is "char c", and the
> arg of kernel_function2() at offset 16 is "char b", which is
> compatible.

I see.
I thought you were sharing the trampoline across attachments
(since the bpf prog is the same).
But the above approach cannot possibly work with a shared trampoline.
You need to create an individual trampoline for every attachment
and point them all to a single bpf prog.

tbh I'm less excited about this feature now, since sharing
the prog across different attachments is nice, but it won't scale
to thousands of attachments.
I assumed that there would be a single trampoline with max(argno)
across attachments and that attach/detach would scale to thousands.

With individual trampolines this will work for up to a hundred
attachments max.

Let's step back.
What is the exact use case you're trying to solve?
Not an artificial one like the selftest in patch 9, but the real use case?
梦龙董 March 13, 2024, 1:53 a.m. UTC | #7
On Wed, Mar 13, 2024 at 12:42 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Mar 11, 2024 at 7:42 PM 梦龙董 <dongmenglong.8@bytedance.com> wrote:
> >
[......]
>
> I see.
> I thought you were sharing the trampoline across attachments
> (since the bpf prog is the same).

That seems to be a good idea, which I hadn't thought of before.

> But the above approach cannot possibly work with a shared trampoline.
> You need to create an individual trampoline for every attachment
> and point them all to a single bpf prog.
>
> tbh I'm less excited about this feature now, since sharing
> the prog across different attachments is nice, but it won't scale
> to thousands of attachments.
> I assumed that there would be a single trampoline with max(argno)
> across attachments and that attach/detach would scale to thousands.
>
> With individual trampolines this will work for up to a hundred
> attachments max.

What does "a hundred attachments max" mean? Can't I
trace thousands of kernel functions with a tracing
multi-link bpf program?

>
> Let's step back.
> What is the exact use case you're trying to solve?
> Not an artificial one like the selftest in patch 9, but the real use case?

I have a tool named "nettrace", which is used to diagnose
network problems. It traces many kernel functions whose
function args contain an "skb", like this:

./nettrace -p icmp
begin trace...
***************** ffff889be8fbd500,ffff889be8fbcd00 ***************
[1272349.614564] [dev_gro_receive     ] ICMP: 169.254.128.15 ->
172.27.0.6 ping request, seq: 48220
[1272349.614579] [__netif_receive_skb_core] ICMP: 169.254.128.15 ->
172.27.0.6 ping request, seq: 48220
[1272349.614585] [ip_rcv              ] ICMP: 169.254.128.15 ->
172.27.0.6 ping request, seq: 48220
[1272349.614592] [ip_rcv_core         ] ICMP: 169.254.128.15 ->
172.27.0.6 ping request, seq: 48220
[1272349.614599] [skb_clone           ] ICMP: 169.254.128.15 ->
172.27.0.6 ping request, seq: 48220
[1272349.614616] [nf_hook_slow        ] ICMP: 169.254.128.15 ->
172.27.0.6 ping request, seq: 48220
[1272349.614629] [nft_do_chain        ] ICMP: 169.254.128.15 ->
172.27.0.6 ping request, seq: 48220
[1272349.614635] [ip_rcv_finish       ] ICMP: 169.254.128.15 ->
172.27.0.6 ping request, seq: 48220
[1272349.614643] [ip_route_input_slow ] ICMP: 169.254.128.15 ->
172.27.0.6 ping request, seq: 48220
[1272349.614647] [fib_validate_source ] ICMP: 169.254.128.15 ->
172.27.0.6 ping request, seq: 48220
[1272349.614652] [ip_local_deliver    ] ICMP: 169.254.128.15 ->
172.27.0.6 ping request, seq: 48220
[1272349.614658] [nf_hook_slow        ] ICMP: 169.254.128.15 ->
172.27.0.6 ping request, seq: 48220
[1272349.614663] [ip_local_deliver_finish] ICMP: 169.254.128.15 ->
172.27.0.6 ping request, seq: 48220
[1272349.614666] [icmp_rcv            ] ICMP: 169.254.128.15 ->
172.27.0.6 ping request, seq: 48220
[1272349.614671] [icmp_echo           ] ICMP: 169.254.128.15 ->
172.27.0.6 ping request, seq: 48220
[1272349.614675] [icmp_reply          ] ICMP: 169.254.128.15 ->
172.27.0.6 ping request, seq: 48220
[1272349.614715] [consume_skb         ] ICMP: 169.254.128.15 ->
172.27.0.6 ping request, seq: 48220
[1272349.614722] [packet_rcv          ] ICMP: 169.254.128.15 ->
172.27.0.6 ping request, seq: 48220
[1272349.614725] [consume_skb         ] ICMP: 169.254.128.15 ->
172.27.0.6 ping request, seq: 48220

For now, I have to create a bpf program for every kernel
function that I want to trace, which adds up to 200 programs.

With this multi-link, I only need to create 5 bpf programs,
like this:

int BPF_PROG(trace_skb_1, struct sk_buff *skb);
int BPF_PROG(trace_skb_2, u64 arg0, struct sk_buff *skb);
int BPF_PROG(trace_skb_3, u64 arg0, u64 arg1, struct sk_buff *skb);
int BPF_PROG(trace_skb_4, u64 arg0, u64 arg1, u64 arg2, struct sk_buff *skb);
int BPF_PROG(trace_skb_5, u64 arg0, u64 arg1, u64 arg2, u64 arg3, struct sk_buff *skb);

Then, I can attach trace_skb_1 to all the kernel functions that
I want to trace and whose first arg is skb; attach trace_skb_2 to kernel
functions whose 2nd arg is skb, etc.

Or, I can create only one bpf program, store the index of
the skb arg in the attachment cookie, and attach this program
to all the kernel functions that I want to trace.
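
A sketch of the cookie variant (assuming the loader stores the skb
arg index in the attach cookie; bpf_get_attach_cookie() and
bpf_get_func_arg() are existing helpers, the rest is illustrative):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("fentry")
int BPF_PROG(trace_skb)
{
	u64 skb_idx = bpf_get_attach_cookie(ctx);
	u64 arg;

	/* dynamic arg index, checked against nr_args at runtime */
	if (bpf_get_func_arg(ctx, skb_idx, &arg))
		return 0;

	/* ... parse (struct sk_buff *)arg as before ... */
	return 0;
}

char _license[] SEC("license") = "GPL";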

This is my use case. With the multi-link, now I only have
1 bpf program, 1 bpf link and 200 trampolines, instead of 200
bpf programs, 200 bpf links and 200 trampolines.

The shared trampoline you mentioned seems to be a
wonderful idea, which can reduce the 200 trampolines
to one. Let me have a look: we create a trampoline and
record the max args count across all the target functions;
let's call it arg_count.

When generating the trampoline, we assume that the
function args count is arg_count. During attach, we
check the consistency of all the target functions, just as
we do now.

Am I right?

Thanks!
Menglong Dong
Alexei Starovoitov March 14, 2024, 12:25 a.m. UTC | #8
On Tue, Mar 12, 2024 at 6:53 PM 梦龙董 <dongmenglong.8@bytedance.com> wrote:
>
> On Wed, Mar 13, 2024 at 12:42 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Mon, Mar 11, 2024 at 7:42 PM 梦龙董 <dongmenglong.8@bytedance.com> wrote:
> > >
> [......]
> >
> > I see.
> > I thought you were sharing the trampoline across attachments
> > (since the bpf prog is the same).
>
> That seems to be a good idea, which I hadn't thought of before.
>
> > But the above approach cannot possibly work with a shared trampoline.
> > You need to create an individual trampoline for every attachment
> > and point them all to a single bpf prog.
> >
> > tbh I'm less excited about this feature now, since sharing
> > the prog across different attachments is nice, but it won't scale
> > to thousands of attachments.
> > I assumed that there would be a single trampoline with max(argno)
> > across attachments and that attach/detach would scale to thousands.
> >
> > With individual trampolines this will work for up to a hundred
> > attachments max.
>
> What does "a hundred attachments max" mean? Can't I
> trace thousands of kernel functions with a tracing
> multi-link bpf program?

I mean, how long does it take to attach one program
to 100 fentry-s?
What is the time for 1k and for 10k?

The kprobe multi test attaches to pretty much all funcs in
/sys/kernel/tracing/available_filter_functions
and it's fast enough to run in test_progs on every commit in bpf CI.
See get_syms() in prog_tests/kprobe_multi_test.c

Can this new multi fentry do that?
and at what speed?
The answer will decide how applicable this api is going to be.
Generating different trampolines for every attach point
is an approach as well. Pls benchmark it too.

> >
> > Let's step back.
> > What is the exact use case you're trying to solve?
> > Not an artificial one like the selftest in patch 9, but the real use case?
>
> I have a tool named "nettrace", which is used to diagnose
> network problems. It traces many kernel functions whose
> function args contain an "skb", like this:
>
> ./nettrace -p icmp
> begin trace...
> ***************** ffff889be8fbd500,ffff889be8fbcd00 ***************
> [1272349.614564] [dev_gro_receive     ] ICMP: 169.254.128.15 ->
> 172.27.0.6 ping request, seq: 48220
> [1272349.614579] [__netif_receive_skb_core] ICMP: 169.254.128.15 ->
> 172.27.0.6 ping request, seq: 48220
> [1272349.614585] [ip_rcv              ] ICMP: 169.254.128.15 ->
> 172.27.0.6 ping request, seq: 48220
> [1272349.614592] [ip_rcv_core         ] ICMP: 169.254.128.15 ->
> 172.27.0.6 ping request, seq: 48220
> [1272349.614599] [skb_clone           ] ICMP: 169.254.128.15 ->
> 172.27.0.6 ping request, seq: 48220
> [1272349.614616] [nf_hook_slow        ] ICMP: 169.254.128.15 ->
> 172.27.0.6 ping request, seq: 48220
> [1272349.614629] [nft_do_chain        ] ICMP: 169.254.128.15 ->
> 172.27.0.6 ping request, seq: 48220
> [1272349.614635] [ip_rcv_finish       ] ICMP: 169.254.128.15 ->
> 172.27.0.6 ping request, seq: 48220
> [1272349.614643] [ip_route_input_slow ] ICMP: 169.254.128.15 ->
> 172.27.0.6 ping request, seq: 48220
> [1272349.614647] [fib_validate_source ] ICMP: 169.254.128.15 ->
> 172.27.0.6 ping request, seq: 48220
> [1272349.614652] [ip_local_deliver    ] ICMP: 169.254.128.15 ->
> 172.27.0.6 ping request, seq: 48220
> [1272349.614658] [nf_hook_slow        ] ICMP: 169.254.128.15 ->
> 172.27.0.6 ping request, seq: 48220
> [1272349.614663] [ip_local_deliver_finish] ICMP: 169.254.128.15 ->
> 172.27.0.6 ping request, seq: 48220
> [1272349.614666] [icmp_rcv            ] ICMP: 169.254.128.15 ->
> 172.27.0.6 ping request, seq: 48220
> [1272349.614671] [icmp_echo           ] ICMP: 169.254.128.15 ->
> 172.27.0.6 ping request, seq: 48220
> [1272349.614675] [icmp_reply          ] ICMP: 169.254.128.15 ->
> 172.27.0.6 ping request, seq: 48220
> [1272349.614715] [consume_skb         ] ICMP: 169.254.128.15 ->
> 172.27.0.6 ping request, seq: 48220
> [1272349.614722] [packet_rcv          ] ICMP: 169.254.128.15 ->
> 172.27.0.6 ping request, seq: 48220
> [1272349.614725] [consume_skb         ] ICMP: 169.254.128.15 ->
> 172.27.0.6 ping request, seq: 48220
>
> For now, I have to create a bpf program for every kernel
> function that I want to trace, which adds up to 200 programs.
>
> With this multi-link, I only need to create 5 bpf programs,
> like this:
>
> int BPF_PROG(trace_skb_1, struct sk_buff *skb);
> int BPF_PROG(trace_skb_2, u64 arg0, struct sk_buff *skb);
> int BPF_PROG(trace_skb_3, u64 arg0, u64 arg1, struct sk_buff *skb);
> int BPF_PROG(trace_skb_4, u64 arg0, u64 arg1, u64 arg2, struct sk_buff *skb);
> int BPF_PROG(trace_skb_5, u64 arg0, u64 arg1, u64 arg2, u64 arg3, struct sk_buff *skb);
>
> Then, I can attach trace_skb_1 to all the kernel functions that
> I want to trace and whose first arg is skb; attach trace_skb_2 to kernel
> functions whose 2nd arg is skb, etc.
>
> Or, I can create only one bpf program, store the index of
> the skb arg in the attachment cookie, and attach this program
> to all the kernel functions that I want to trace.
>
> This is my use case. With the multi-link, now I only have
> 1 bpf program, 1 bpf link and 200 trampolines, instead of 200
> bpf programs, 200 bpf links and 200 trampolines.

I see. The use case makes sense to me.
Andrii's retsnoop was used to do a similar thing before kprobe
multi was introduced.

> The shared trampoline you mentioned seems to be a
> wonderful idea, which can reduce the 200 trampolines
> to one. Let me have a look: we create a trampoline and
> record the max args count across all the target functions;
> let's call it arg_count.
>
> When generating the trampoline, we assume that the
> function args count is arg_count. During attach, we
> check the consistency of all the target functions, just as
> we do now.

For one trampoline to handle all attach points we might
need some arch support, but we can start simple.
Make a btf_func_model with MAX_BPF_FUNC_REG_ARGS args
by calling btf_distill_func_proto() with func == NULL,
and use that to build the trampoline.
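
In code, that could look roughly like this (a sketch; 'im', 'image',
'tlinks' and friends stand for the usual trampoline setup, and error
handling is omitted):

struct btf_func_model m;
int err;

/* func == NULL makes btf_distill_func_proto() fall back to a generic
 * model: MAX_BPF_FUNC_REG_ARGS args, each treated as an 8-byte value
 */
err = btf_distill_func_proto(NULL, NULL, NULL, NULL, &m);
if (err)
	return err;

/* one trampoline image built from the generic model, reusable for
 * every target that passes the accessed-args checks
 */
err = arch_prepare_bpf_trampoline(im, image, image_end, &m, flags,
				  tlinks, func_addr);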

The challenge is how to use a minimal number of trampolines
when bpf_progA is attached to func1, func2, func3
and bpf_progB is attached to func3, func4, func5.
We'd still need 3 trampolines:
one for func[12] to call bpf_progA,
one for func3 to call bpf_progA and bpf_progB,
and one for func[45] to call bpf_progB.

Jiri was trying to solve it in the past. His slides from LPC:
https://lpc.events/event/16/contributions/1350/attachments/1033/1983/plumbers.pdf

Pls study them and his prior patchsets to avoid stepping on the same rakes.
Jiri Olsa March 14, 2024, 6:27 a.m. UTC | #9
On Wed, Mar 13, 2024 at 05:25:35PM -0700, Alexei Starovoitov wrote:
> On Tue, Mar 12, 2024 at 6:53 PM 梦龙董 <dongmenglong.8@bytedance.com> wrote:
> >
> > On Wed, Mar 13, 2024 at 12:42 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Mon, Mar 11, 2024 at 7:42 PM 梦龙董 <dongmenglong.8@bytedance.com> wrote:
> > > >
> > [......]
> > >
> > > I see.
> > > I thought you were sharing the trampoline across attachments
> > > (since the bpf prog is the same).
> >
> > That seems to be a good idea, which I hadn't thought of before.
> >
> > > But the above approach cannot possibly work with a shared trampoline.
> > > You need to create an individual trampoline for every attachment
> > > and point them all to a single bpf prog.
> > >
> > > tbh I'm less excited about this feature now, since sharing
> > > the prog across different attachments is nice, but it won't scale
> > > to thousands of attachments.
> > > I assumed that there would be a single trampoline with max(argno)
> > > across attachments and that attach/detach would scale to thousands.
> > >
> > > With individual trampolines this will work for up to a hundred
> > > attachments max.
> >
> > What does "a hundred attachments max" mean? Can't I
> > trace thousands of kernel functions with a tracing
> > multi-link bpf program?
> 
> I mean, how long does it take to attach one program
> to 100 fentry-s?
> What is the time for 1k and for 10k?
> 
> The kprobe multi test attaches to pretty much all funcs in
> /sys/kernel/tracing/available_filter_functions
> and it's fast enough to run in test_progs on every commit in bpf CI.
> See get_syms() in prog_tests/kprobe_multi_test.c
> 
> Can this new multi fentry do that?
> and at what speed?
> The answer will decide how applicable this api is going to be.
> Generating different trampolines for every attach point
> is an approach as well. Pls benchmark it too.
> 
> > >
> > > Let's step back.
> > > What is the exact use case you're trying to solve?
> > > Not an artificial one like the selftest in patch 9, but the real use case?
> >
> > I have a tool named "nettrace", which is used to diagnose
> > network problems. It traces many kernel functions whose
> > function args contain an "skb", like this:
> >
> > ./nettrace -p icmp
> > begin trace...
> > ***************** ffff889be8fbd500,ffff889be8fbcd00 ***************
> > [1272349.614564] [dev_gro_receive     ] ICMP: 169.254.128.15 ->
> > 172.27.0.6 ping request, seq: 48220
> > [1272349.614579] [__netif_receive_skb_core] ICMP: 169.254.128.15 ->
> > 172.27.0.6 ping request, seq: 48220
> > [1272349.614585] [ip_rcv              ] ICMP: 169.254.128.15 ->
> > 172.27.0.6 ping request, seq: 48220
> > [1272349.614592] [ip_rcv_core         ] ICMP: 169.254.128.15 ->
> > 172.27.0.6 ping request, seq: 48220
> > [1272349.614599] [skb_clone           ] ICMP: 169.254.128.15 ->
> > 172.27.0.6 ping request, seq: 48220
> > [1272349.614616] [nf_hook_slow        ] ICMP: 169.254.128.15 ->
> > 172.27.0.6 ping request, seq: 48220
> > [1272349.614629] [nft_do_chain        ] ICMP: 169.254.128.15 ->
> > 172.27.0.6 ping request, seq: 48220
> > [1272349.614635] [ip_rcv_finish       ] ICMP: 169.254.128.15 ->
> > 172.27.0.6 ping request, seq: 48220
> > [1272349.614643] [ip_route_input_slow ] ICMP: 169.254.128.15 ->
> > 172.27.0.6 ping request, seq: 48220
> > [1272349.614647] [fib_validate_source ] ICMP: 169.254.128.15 ->
> > 172.27.0.6 ping request, seq: 48220
> > [1272349.614652] [ip_local_deliver    ] ICMP: 169.254.128.15 ->
> > 172.27.0.6 ping request, seq: 48220
> > [1272349.614658] [nf_hook_slow        ] ICMP: 169.254.128.15 ->
> > 172.27.0.6 ping request, seq: 48220
> > [1272349.614663] [ip_local_deliver_finish] ICMP: 169.254.128.15 ->
> > 172.27.0.6 ping request, seq: 48220
> > [1272349.614666] [icmp_rcv            ] ICMP: 169.254.128.15 ->
> > 172.27.0.6 ping request, seq: 48220
> > [1272349.614671] [icmp_echo           ] ICMP: 169.254.128.15 ->
> > 172.27.0.6 ping request, seq: 48220
> > [1272349.614675] [icmp_reply          ] ICMP: 169.254.128.15 ->
> > 172.27.0.6 ping request, seq: 48220
> > [1272349.614715] [consume_skb         ] ICMP: 169.254.128.15 ->
> > 172.27.0.6 ping request, seq: 48220
> > [1272349.614722] [packet_rcv          ] ICMP: 169.254.128.15 ->
> > 172.27.0.6 ping request, seq: 48220
> > [1272349.614725] [consume_skb         ] ICMP: 169.254.128.15 ->
> > 172.27.0.6 ping request, seq: 48220
> >
> > For now, I have to create a bpf program for every kernel
> > function that I want to trace, which adds up to 200 programs.
> >
> > With this multi-link, I only need to create 5 bpf programs,
> > like this:
> >
> > int BPF_PROG(trace_skb_1, struct sk_buff *skb);
> > int BPF_PROG(trace_skb_2, u64 arg0, struct sk_buff *skb);
> > int BPF_PROG(trace_skb_3, u64 arg0, u64 arg1, struct sk_buff *skb);
> > int BPF_PROG(trace_skb_4, u64 arg0, u64 arg1, u64 arg2, struct sk_buff *skb);
> > int BPF_PROG(trace_skb_5, u64 arg0, u64 arg1, u64 arg2, u64 arg3, struct sk_buff *skb);
> >
> > Then, I can attach trace_skb_1 to all the kernel functions that
> > I want to trace and whose first arg is skb; attach trace_skb_2 to kernel
> > functions whose 2nd arg is skb, etc.
> >
> > Or, I can create only one bpf program, store the index of
> > the skb arg in the attachment cookie, and attach this program
> > to all the kernel functions that I want to trace.
> >
> > This is my use case. With the multi-link, now I only have
> > 1 bpf program, 1 bpf link and 200 trampolines, instead of 200
> > bpf programs, 200 bpf links and 200 trampolines.
> 
> I see. The use case makes sense to me.
> Andrii's retsnoop was used to do a similar thing before kprobe
> multi was introduced.
> 
> > The shared trampoline you mentioned seems to be a
> > wonderful idea, which can reduce the 200 trampolines
> > to one. Let me have a look: we create a trampoline and
> > record the max args count across all the target functions;
> > let's call it arg_count.
> >
> > When generating the trampoline, we assume that the
> > function args count is arg_count. During attach, we
> > check the consistency of all the target functions, just as
> > we do now.
> 
> For one trampoline to handle all attach points we might
> need some arch support, but we can start simple.
> Make a btf_func_model with MAX_BPF_FUNC_REG_ARGS args
> by calling btf_distill_func_proto() with func == NULL,
> and use that to build the trampoline.
>
> The challenge is how to use a minimal number of trampolines
> when bpf_progA is attached to func1, func2, func3
> and bpf_progB is attached to func3, func4, func5.
> We'd still need 3 trampolines:
> one for func[12] to call bpf_progA,
> one for func3 to call bpf_progA and bpf_progB,
> and one for func[45] to call bpf_progB.
> 
> Jiri was trying to solve it in the past. His slides from LPC:
> https://lpc.events/event/16/contributions/1350/attachments/1033/1983/plumbers.pdf
> 
> Pls study them and his prior patchsets to avoid stepping on the same rakes.

yep, I refrained from commenting not to take you down the same path
I did, but if you insist.. ;-) 

I managed to forget almost all of it, but IIRC the main pain point
was that at some point I had to split an existing trampoline, which caused
the whole trampoline management and error paths to become a mess

I tried to explain things in [1] changelog and the latest patchset is in [0]

feel free to use/take anything, but I advise strongly against it ;-)
please let me know if I can help

jirka


[0] https://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git/log/?h=bpf/batch
[1] https://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git/commit/?h=bpf/batch&id=52a1d4acdf55df41e99ca2cea51865e6821036ce
梦龙董 March 15, 2024, 8 a.m. UTC | #10
On Thu, Mar 14, 2024 at 8:27 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Mar 12, 2024 at 6:53 PM 梦龙董 <dongmenglong.8@bytedance.com> wrote:
[......]
> > What does "a hundred attachments max" mean? Can't I
> > trace thousands of kernel functions with a bpf program of
> > tracing multi-link?
>
> I mean what time does it take to attach one program
> to 100 fentry-s ?
> What is the time for 1k and for 10k ?
>
> The kprobe multi test attaches to pretty much all funcs in
> /sys/kernel/tracing/available_filter_functions
> and it's fast enough to run in test_progs on every commit in bpf CI.
> See get_syms() in prog_tests/kprobe_multi_test.c
>
> Can this new multi fentry do that?
> and at what speed?
> The answer will decide how applicable this api is going to be.
> Generating different trampolines for every attach point
> is an approach as well. Pls benchmark it too.

I see. Creating plenty of trampolines does take a lot of time,
and I'll do testing on it.

>
> > >
> > > Let's step back.
[......]
>
> For one trampoline to handle all attach points we might
> need some arch support, but we can start simple.
> Make btf_func_model with MAX_BPF_FUNC_REG_ARGS
> by calling btf_distill_func_proto() with func==NULL.
> And use that to build a trampoline.
>
> The challenge is how to use minimal number of trampolines
> when bpf_progA is attached for func1, func2, func3
> and bpf_progB is attached to func3, func4, func5.
> We'd still need 3 trampolines:
> for func[12] to call bpf_progA,
> for func3 to call bpf_progA and bpf_progB,
> for func[45] to call bpf_progB.
>
> Jiri was trying to solve it in the past. His slides from LPC:
> https://lpc.events/event/16/contributions/1350/attachments/1033/1983/plumbers.pdf
>
> Pls study them and his prior patchsets to avoid stepping on the same rakes.
梦龙董 March 15, 2024, 8:17 a.m. UTC | #11
On Thu, Mar 14, 2024 at 2:29 PM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Wed, Mar 13, 2024 at 05:25:35PM -0700, Alexei Starovoitov wrote:
> > On Tue, Mar 12, 2024 at 6:53 PM 梦龙董 <dongmenglong.8@bytedance.com> wrote:
> > >
> > > On Wed, Mar 13, 2024 at 12:42 AM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Mon, Mar 11, 2024 at 7:42 PM 梦龙董 <dongmenglong.8@bytedance.com> wrote:
> > > > >
> > > [......]
> > > >
> > > > I see.
> > > > I thought you're sharing the trampoline across attachments.
> > > > (since bpf prog is the same).
> > >
> > > That seems to be a good idea, which I hadn't thought of before.
> > >
> > > > But above approach cannot possibly work with a shared trampoline.
> > > > You need to create individual trampoline for all attachment
> > > > and point them to single bpf prog.
> > > >
> > > > tbh I'm less excited about this feature now, since sharing
> > > > the prog across different attachments is nice, but it won't scale
> > > > to thousands of attachments.
> > > > I assumed that there will be a single trampoline with max(argno)
> > > > across attachments and attach/detach will scale to thousands.
> > > >
> > > > With individual trampoline this will work for up to a hundred
> > > > attachments max.
> > >
> > > What does "a hundred attachments max" means? Can't I
> > > trace thousands of kernel functions with a bpf program of
> > > tracing multi-link?
> >
> > I mean what time does it take to attach one program
> > to 100 fentry-s ?
> > What is the time for 1k and for 10k ?
> >
> > The kprobe multi test attaches to pretty much all funcs in
> > /sys/kernel/tracing/available_filter_functions
> > and it's fast enough to run in test_progs on every commit in bpf CI.
> > See get_syms() in prog_tests/kprobe_multi_test.c
> >
> > Can this new multi fentry do that?
> > and at what speed?
> > The answer will decide how applicable this api is going to be.
> > Generating different trampolines for every attach point
> > is an approach as well. Pls benchmark it too.
> >
> > > >
> > > > Let's step back.
> > > > What is the exact use case you're trying to solve?
> > > > Not an artificial one as selftest in patch 9, but the real use case?
> > >
> > > I have a tool, which is used to diagnose network problems,
> > > and its name is "nettrace". It will trace many kernel functions, whose
> > > function args contain "skb", like this:
> > >
> > > ./nettrace -p icmp
> > > begin trace...
> > > ***************** ffff889be8fbd500,ffff889be8fbcd00 ***************
> > > [1272349.614564] [dev_gro_receive     ] ICMP: 169.254.128.15 ->
> > > 172.27.0.6 ping request, seq: 48220
> > > [1272349.614579] [__netif_receive_skb_core] ICMP: 169.254.128.15 ->
> > > 172.27.0.6 ping request, seq: 48220
> > > [1272349.614585] [ip_rcv              ] ICMP: 169.254.128.15 ->
> > > 172.27.0.6 ping request, seq: 48220
> > > [1272349.614592] [ip_rcv_core         ] ICMP: 169.254.128.15 ->
> > > 172.27.0.6 ping request, seq: 48220
> > > [1272349.614599] [skb_clone           ] ICMP: 169.254.128.15 ->
> > > 172.27.0.6 ping request, seq: 48220
> > > [1272349.614616] [nf_hook_slow        ] ICMP: 169.254.128.15 ->
> > > 172.27.0.6 ping request, seq: 48220
> > > [1272349.614629] [nft_do_chain        ] ICMP: 169.254.128.15 ->
> > > 172.27.0.6 ping request, seq: 48220
> > > [1272349.614635] [ip_rcv_finish       ] ICMP: 169.254.128.15 ->
> > > 172.27.0.6 ping request, seq: 48220
> > > [1272349.614643] [ip_route_input_slow ] ICMP: 169.254.128.15 ->
> > > 172.27.0.6 ping request, seq: 48220
> > > [1272349.614647] [fib_validate_source ] ICMP: 169.254.128.15 ->
> > > 172.27.0.6 ping request, seq: 48220
> > > [1272349.614652] [ip_local_deliver    ] ICMP: 169.254.128.15 ->
> > > 172.27.0.6 ping request, seq: 48220
> > > [1272349.614658] [nf_hook_slow        ] ICMP: 169.254.128.15 ->
> > > 172.27.0.6 ping request, seq: 48220
> > > [1272349.614663] [ip_local_deliver_finish] ICMP: 169.254.128.15 ->
> > > 172.27.0.6 ping request, seq: 48220
> > > [1272349.614666] [icmp_rcv            ] ICMP: 169.254.128.15 ->
> > > 172.27.0.6 ping request, seq: 48220
> > > [1272349.614671] [icmp_echo           ] ICMP: 169.254.128.15 ->
> > > 172.27.0.6 ping request, seq: 48220
> > > [1272349.614675] [icmp_reply          ] ICMP: 169.254.128.15 ->
> > > 172.27.0.6 ping request, seq: 48220
> > > [1272349.614715] [consume_skb         ] ICMP: 169.254.128.15 ->
> > > 172.27.0.6 ping request, seq: 48220
> > > [1272349.614722] [packet_rcv          ] ICMP: 169.254.128.15 ->
> > > 172.27.0.6 ping request, seq: 48220
> > > [1272349.614725] [consume_skb         ] ICMP: 169.254.128.15 ->
> > > 172.27.0.6 ping request, seq: 48220
> > >
> > > For now, I have to create a bpf program for every kernel
> > > function that I want to trace, which is up to 200.
> > >
> > > With this multi-link, I only need to create 5 bpf programs,
> > > like this:
> > >
> > > int BPF_PROG(trace_skb_1, struct sk_buff *skb);
> > > int BPF_PROG(trace_skb_2, u64 arg0, struct sk_buff *skb);
> > > int BPF_PROG(trace_skb_3, u64 arg0, u64 arg1, struct sk_buff *skb);
> > > int BPF_PROG(trace_skb_4, u64 arg0, u64 arg1, u64 arg2, struct sk_buff *skb);
> > > int BPF_PROG(trace_skb_5, u64 arg0, u64 arg1, u64 arg2, u64 arg3, struct sk_buff *skb);
> > >
> > > Then, I can attach trace_skb_1 to all the kernel functions that
> > > I want to trace and whose first arg is skb; attach trace_skb_2 to kernel
> > > functions whose 2nd arg is skb, etc.
> > >
> > > Or, I can create only one bpf program and store the index
> > > of skb in the attachment cookie, and attach this program to all
> > > the kernel functions that I want to trace.
> > >
> > > This is my use case. With the multi-link, now I only have
> > > 1 bpf program, 1 bpf link, 200 trampolines, instead of 200
> > > bpf programs, 200 bpf links and 200 trampolines.
> >
> > I see. The use case makes sense to me.
> > Andrii's retsnoop is used to do similar thing before kprobe multi was
> > introduced.
> >
> > > The shared trampoline you mentioned seems to be a
> > > wonderful idea, which can make the 200 trampolines
> > > to one. Let me have a look, we create a trampoline and
> > > record the max args count of all the target functions, let's
> > > mark it as arg_count.
> > >
> > > During generating the trampoline, we assume that the
> > > function args count is arg_count. During attaching, we
> > > check the consistency of all the target functions, just like
> > > what we do now.
> >
> > For one trampoline to handle all attach points we might
> > need some arch support, but we can start simple.
> > Make btf_func_model with MAX_BPF_FUNC_REG_ARGS
> > by calling btf_distill_func_proto() with func==NULL.
> > And use that to build a trampoline.
> >
> > The challenge is how to use minimal number of trampolines
> > when bpf_progA is attached for func1, func2, func3
> > and bpf_progB is attached to func3, func4, func5.
> > We'd still need 3 trampolines:
> > for func[12] to call bpf_progA,
> > for func3 to call bpf_progA and bpf_progB,
> > for func[45] to call bpf_progB.
> >
> > Jiri was trying to solve it in the past. His slides from LPC:
> > https://lpc.events/event/16/contributions/1350/attachments/1033/1983/plumbers.pdf
> >
> > Pls study them and his prior patchsets to avoid stepping on the same rakes.
>
> yep, I refrained from commenting not to take you down the same path
> I did, but if you insist.. ;-)
>
> I managed to forget almost all of it, but IIRC the main pain point
> was that at some point I had to split an existing trampoline, which caused
> the whole trampoline management and error paths to become a mess
>
> I tried to explain things in [1] changelog and the latest patchset is in [0]
>
> feel free to use/take anything, but I advise strongly against it ;-)
> please let me know if I can help

I have to say that I have not gone far enough to encounter
this problem, and I didn't dig enough to be aware of the
complexity.

I suspect that I can't overcome this challenge. The only thing
I thought of when I heard about the "shared trampoline" is to fall back
and not use the shared trampoline for the kernel functions that
already have a trampoline.

Anyway, let's give it a try, based on your research.

Thanks!
Menglong Dong

>
> jirka
>
>
> [0] https://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git/log/?h=bpf/batch
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git/commit/?h=bpf/batch&id=52a1d4acdf55df41e99ca2cea51865e6821036ce
梦龙董 March 28, 2024, 2:43 p.m. UTC | #12
On Fri, Mar 15, 2024 at 4:00 PM 梦龙董 <dongmenglong.8@bytedance.com> wrote:
>
> On Thu, Mar 14, 2024 at 8:27 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Mar 12, 2024 at 6:53 PM 梦龙董 <dongmenglong.8@bytedance.com> wrote:
> [......]
> > > What does "a hundred attachments max" means? Can't I
> > > trace thousands of kernel functions with a bpf program of
> > > tracing multi-link?
> >
> > I mean what time does it take to attach one program
> > to 100 fentry-s ?
> > What is the time for 1k and for 10k ?
> >
> > The kprobe multi test attaches to pretty much all funcs in
> > /sys/kernel/tracing/available_filter_functions
> > and it's fast enough to run in test_progs on every commit in bpf CI.
> > See get_syms() in prog_tests/kprobe_multi_test.c
> >
> > Can this new multi fentry do that?
> > and at what speed?
> > The answer will decide how applicable this api is going to be.
> > Generating different trampolines for every attach point
> > is an approach as well. Pls benchmark it too.
>
> I see. Creating plenty of trampolines does take a lot of time,
> and I'll do testing on it.
>

I have done a simple benchmark on creating 1000
trampolines. It is slow, quite slow, consuming up to
60s. We can't do it this way.

Now, I have a bad idea. How about we introduce
a "dynamic trampoline"? The basic logic of it can be:

"""
save regs
bpfs = trampoline_lookup_ip(ip)
fentry = bpfs->fentries
while fentry:
  fentry(ctx)
  fentry = fentry->next

call origin
save return value

fexit = bpfs->fexits
while fexit:
  fexit(ctx)
  fexit = fexit->next

xxxxxx
"""

And we look up the "bpfs" by the function ip in a hash map
in trampoline_lookup_ip. The type of "bpfs" is:

struct bpf_array {
  struct bpf_prog *fentries;
  struct bpf_prog *fexits;
  struct bpf_prog *modify_returns;
};

When we need to attach the bpf progA to function A/B/C,
we only need to create the bpf_arrayA, bpf_arrayB, bpf_arrayC,
add the progA to them, insert them into the hash map
"direct_call_bpfs", and attach the "dynamic trampoline" to
A/B/C. If bpf_arrayA exists, just add progA to the tail of
bpf_arrayA->fentries. When we need to attach progB to
B/C, just add progB to bpf_arrayB->fentries and
bpf_arrayC->fentries.

Compared to the trampoline, extra overhead is introduced
by the hash lookup.

I have not begun to code yet, and I am not sure the overhead is
acceptable. Considering that kprobe_multi also needs to do a hash
lookup by function address, maybe the overhead is acceptable?
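For illustration, trampoline_lookup_ip() and the data it walks could
look roughly like this (all names here are hypothetical, and
locking/RCU is omitted):

#include <linux/hashtable.h>

struct bpf_prog_item {
	struct bpf_prog *prog;
	struct bpf_prog_item *next;
};

struct bpf_array_node {
	struct hlist_node node;
	unsigned long ip;		/* traced function address */
	struct bpf_prog_item *fentries;
	struct bpf_prog_item *fexits;
	struct bpf_prog_item *modify_returns;
};

/* keyed by function ip, 1024 buckets */
static DEFINE_HASHTABLE(direct_call_bpfs, 10);

static struct bpf_array_node *trampoline_lookup_ip(unsigned long ip)
{
	struct bpf_array_node *n;

	hash_for_each_possible_rcu(direct_call_bpfs, n, node, ip)
		if (n->ip == ip)
			return n;
	return NULL;
}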

Thanks!
Menglong Dong

> >
> > > >
> > > > Let's step back.
> [......]
> >
> > For one trampoline to handle all attach points we might
> > need some arch support, but we can start simple.
> > Make btf_func_model with MAX_BPF_FUNC_REG_ARGS
> > by calling btf_distill_func_proto() with func==NULL.
> > And use that to build a trampoline.
> >
> > The challenge is how to use minimal number of trampolines
> > when bpf_progA is attached for func1, func2, func3
> > and bpf_progB is attached to func3, func4, func5.
> > We'd still need 3 trampolines:
> > for func[12] to call bpf_progA,
> > for func3 to call bpf_progA and bpf_progB,
> > for func[45] to call bpf_progB.
> >
> > Jiri was trying to solve it in the past. His slides from LPC:
> > https://lpc.events/event/16/contributions/1350/attachments/1033/1983/plumbers.pdf
> >
> > Pls study them and his prior patchsets to avoid stepping on the same rakes.
Steven Rostedt March 28, 2024, 3:13 p.m. UTC | #13
On Thu, 28 Mar 2024 22:43:46 +0800
梦龙董 <dongmenglong.8@bytedance.com> wrote:

> I have done a simple benchmark on creating 1000
> trampolines. It is slow, quite slow, which consume up to
> 60s. We can't do it this way.
> 
> Now, I have a bad idea. How about we introduce
> a "dynamic trampoline"? The basic logic of it can be:
> 
> """
> save regs
> bpfs = trampoline_lookup_ip(ip)
> fentry = bpfs->fentries
> while fentry:
>   fentry(ctx)
>   fentry = fentry->next
> 
> call origin
> save return value
> 
> fexit = bpfs->fexits
> while fexit:
>   fexit(ctx)
>   fexit = fexit->next
> 
> xxxxxx
> """
> 
> And we lookup the "bpfs" by the function ip in a hash map
> in trampoline_lookup_ip. The type of "bpfs" is:
> 
> struct bpf_array {
>   struct bpf_prog *fentries;
>  struct bpf_prog *fexits;
>   struct bpf_prog *modify_returns;
> }
> 
> When we need to attach the bpf progA to function A/B/C,
> we only need to create the bpf_arrayA, bpf_arrayB, bpf_arrayC
> and add the progA to them, and insert them to the hash map
> "direct_call_bpfs", and attach the "dynamic trampoline" to
> A/B/C. If bpf_arrayA exist, just add progA to the tail of
> bpf_arrayA->fentries. When we need to attach progB to
> B/C, just add progB to bpf_arrayB->fentries and
> bpf_arrayC->fentries.
> 
> Compared to the trampoline, extra overhead is introduced
> by the hash lookuping.
> 
> I have not begun to code yet, and I am not sure the overhead is
> acceptable. Considering that we also need to do hash lookup
> by the function in kprobe_multi, maybe the overhead is
> acceptable?

Sounds like you are just recreating the function management that ftrace
has. It also can add thousands of trampolines very quickly, because it does
it in batches. It takes special synchronization steps to attach to fentry.
ftrace (and I believe multi-kprobes) updates all the attachments for each
step, so the synchronization needed is only done once.

If you really want to have thousands of functions, why not just register it
with ftrace itself. It will give you the arguments via the ftrace_regs
structure. Can't you just register a program as the callback?

It will probably make your accounting much easier, and just let ftrace
handle the fentry logic. That's what it was made to do.
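A sketch of what registering such a callback looks like with the
existing ftrace API (argument access assumes
CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS; the BPF dispatch itself is left
out):

#include <linux/ftrace.h>

static void bpf_multi_callback(unsigned long ip, unsigned long parent_ip,
			       struct ftrace_ops *ops,
			       struct ftrace_regs *fregs)
{
	/* n-th register argument of the traced function */
	unsigned long arg0 = ftrace_regs_get_argument(fregs, 0);

	/* ... look up the programs attached at 'ip' and run them
	 * with arg0 etc. ... */
}

static struct ftrace_ops bpf_multi_ops = {
	.func = bpf_multi_callback,
};

static int attach_many(unsigned char *funcs, int len)
{
	/* batch-select the targets, then a single registration */
	int err = ftrace_set_filter(&bpf_multi_ops, funcs, len, 1);

	return err ? err : register_ftrace_function(&bpf_multi_ops);
}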

-- Steve
Alexei Starovoitov March 28, 2024, 11:17 p.m. UTC | #14
On Thu, Mar 28, 2024 at 8:10 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Thu, 28 Mar 2024 22:43:46 +0800
> 梦龙董 <dongmenglong.8@bytedance.com> wrote:
>
> > I have done a simple benchmark on creating 1000
> > trampolines. It is slow, quite slow, which consume up to
> > 60s. We can't do it this way.
> >
> > Now, I have a bad idea. How about we introduce
> > a "dynamic trampoline"? The basic logic of it can be:
> >
> > """
> > save regs
> > bpfs = trampoline_lookup_ip(ip)
> > fentry = bpfs->fentries
> > while fentry:
> >   fentry(ctx)
> >   fentry = fentry->next
> >
> > call origin
> > save return value
> >
> > fexit = bpfs->fexits
> > while fexit:
> >   fexit(ctx)
> >   fexit = fexit->next
> >
> > xxxxxx
> > """
> >
> > And we lookup the "bpfs" by the function ip in a hash map
> > in trampoline_lookup_ip. The type of "bpfs" is:
> >
> > struct bpf_array {
> >   struct bpf_prog *fentries;
> >  struct bpf_prog *fexits;
> >   struct bpf_prog *modify_returns;
> > }
> >
> > When we need to attach the bpf progA to function A/B/C,
> > we only need to create the bpf_arrayA, bpf_arrayB, bpf_arrayC
> > and add the progA to them, and insert them to the hash map
> > "direct_call_bpfs", and attach the "dynamic trampoline" to
> > A/B/C. If bpf_arrayA exist, just add progA to the tail of
> > bpf_arrayA->fentries. When we need to attach progB to
> > B/C, just add progB to bpf_arrayB->fentries and
> > bpf_arrayC->fentries.
> >
> > Compared to the trampoline, extra overhead is introduced
> > by the hash lookuping.
> >
> > I have not begun to code yet, and I am not sure the overhead is
> > acceptable. Considering that we also need to do hash lookup
> > by the function in kprobe_multi, maybe the overhead is
> > acceptable?
>
> Sounds like you are just recreating the function management that ftrace
> has. It also can add thousands of trampolines very quickly, because it does
> it in batches. It takes special synchronization steps to attach to fentry.
> ftrace (and I believe multi-kprobes) updates all the attachments for each
> step, so the synchronization needed is only done once.
>
> If you really want to have thousands of functions, why not just register it
> with ftrace itself. It will give you the arguments via the ftrace_regs
> structure. Can't you just register a program as the callback?
>
> It will probably make your accounting much easier, and just let ftrace
> handle the fentry logic. That's what it was made to do.

Absolutely agree.
There is no point re-inventing this logic.

Menglong,
before you hook up into ftrace check whether
it's going to be any different from kprobe-multi,
since it's the same ftrace underneath.
I suspect it will look exactly the same.
So it sounds like multi-fentry idea will be shelved once again.
Andrii Nakryiko March 29, 2024, 11:28 p.m. UTC | #15
On Thu, Mar 28, 2024 at 8:10 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Thu, 28 Mar 2024 22:43:46 +0800
> 梦龙董 <dongmenglong.8@bytedance.com> wrote:
>
> > I have done a simple benchmark on creating 1000
> > trampolines. It is slow, quite slow, which consume up to
> > 60s. We can't do it this way.
> >
> > Now, I have a bad idea. How about we introduce
> > a "dynamic trampoline"? The basic logic of it can be:
> >
> > """
> > save regs
> > bpfs = trampoline_lookup_ip(ip)
> > fentry = bpfs->fentries
> > while fentry:
> >   fentry(ctx)
> >   fentry = fentry->next
> >
> > call origin
> > save return value
> >
> > fexit = bpfs->fexits
> > while fexit:
> >   fexit(ctx)
> >   fexit = fexit->next
> >
> > xxxxxx
> > """
> >
> > And we lookup the "bpfs" by the function ip in a hash map
> > in trampoline_lookup_ip. The type of "bpfs" is:
> >
> > struct bpf_array {
> >   struct bpf_prog *fentries;
> >  struct bpf_prog *fexits;
> >   struct bpf_prog *modify_returns;
> > }
> >
> > When we need to attach the bpf progA to function A/B/C,
> > we only need to create the bpf_arrayA, bpf_arrayB, bpf_arrayC
> > and add the progA to them, and insert them to the hash map
> > "direct_call_bpfs", and attach the "dynamic trampoline" to
> > A/B/C. If bpf_arrayA exist, just add progA to the tail of
> > bpf_arrayA->fentries. When we need to attach progB to
> > B/C, just add progB to bpf_arrayB->fentries and
> > > bpf_arrayC->fentries.
> >
> > Compared to the trampoline, extra overhead is introduced
> > by the hash lookuping.
> >
> > I have not begun to code yet, and I am not sure the overhead is
> > acceptable. Considering that we also need to do hash lookup
> > by the function in kprobe_multi, maybe the overhead is
> > acceptable?
>
> Sounds like you are just recreating the function management that ftrace
> has. It also can add thousands of trampolines very quickly, because it does
> it in batches. It takes special synchronization steps to attach to fentry.
> ftrace (and I believe multi-kprobes) updates all the attachments for each
> step, so the synchronization needed is only done once.
>
> If you really want to have thousands of functions, why not just register it
> with ftrace itself. It will give you the arguments via the ftrace_regs
> structure. Can't you just register a program as the callback?
>
> It will probably make your accounting much easier, and just let ftrace
> handle the fentry logic. That's what it was made to do.
>

I thought I'd just ask instead of digging through code, sorry for
being lazy :) Is there any way to pass pt_regs/ftrace_regs captured
before function execution to a return probe (fexit/kretprobe)? I.e.,
how hard is it to pass input function arguments to a kretprobe? That's
the biggest advantage of fexit over kretprobe, and if we can make
these original pt_regs/ftrace_regs available to kretprobe, then
multi-kretprobe will effectively be this multi-fexit.

> -- Steve
梦龙董 March 30, 2024, 3:18 a.m. UTC | #16
On Thu, Mar 28, 2024 at 11:11 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Thu, 28 Mar 2024 22:43:46 +0800
> 梦龙董 <dongmenglong.8@bytedance.com> wrote:
>
> > I have done a simple benchmark on creating 1000
> > trampolines. It is slow, quite slow, which consume up to
> > 60s. We can't do it this way.
> >
> > Now, I have a bad idea. How about we introduce
> > a "dynamic trampoline"? The basic logic of it can be:
> >
> > """
> > save regs
> > bpfs = trampoline_lookup_ip(ip)
> > fentry = bpfs->fentries
> > while fentry:
> >   fentry(ctx)
> >   fentry = fentry->next
> >
> > call origin
> > save return value
> >
> > fexit = bpfs->fexits
> > while fexit:
> >   fexit(ctx)
> >   fexit = fexit->next
> >
> > xxxxxx
> > """
> >
> > And we lookup the "bpfs" by the function ip in a hash map
> > in trampoline_lookup_ip. The type of "bpfs" is:
> >
> > struct bpf_array {
> >   struct bpf_prog *fentries;
> >  struct bpf_prog *fexits;
> >   struct bpf_prog *modify_returns;
> > }
> >
> > When we need to attach the bpf progA to function A/B/C,
> > we only need to create the bpf_arrayA, bpf_arrayB, bpf_arrayC
> > and add the progA to them, and insert them to the hash map
> > "direct_call_bpfs", and attach the "dynamic trampoline" to
> > A/B/C. If bpf_arrayA exist, just add progA to the tail of
> > bpf_arrayA->fentries. When we need to attach progB to
> > B/C, just add progB to bpf_arrayB->fentries and
> > bpf_arrayC->fentries.
> >
> > Compared to the trampoline, extra overhead is introduced
> > by the hash lookuping.
> >
> > I have not begun to code yet, and I am not sure the overhead is
> > acceptable. Considering that we also need to do hash lookup
> > by the function in kprobe_multi, maybe the overhead is
> > acceptable?
>
> Sounds like you are just recreating the function management that ftrace
> has. It also can add thousands of trampolines very quickly, because it does
> it in batches. It takes special synchronization steps to attach to fentry.
> ftrace (and I believe multi-kprobes) updates all the attachments for each
> step, so the synchronization needed is only done once.
>

Yes, it is fast to register a trampoline for a kernel function
with the managed ftrace, via
register_fentry->register_ftrace_direct->ftrace_add_rec_direct.
And it will add the trampoline to the hash table "direct_functions".

And the trampoline will be called along the following
path (I'm not sure if I understand it correctly):

ftrace_regs_caller
|
__ftrace_ops_list_func -> call_direct_funcs -> save trampoline to
pt_regs->orig_ax
|
call pt_regs->orig_ax if not NULL

The logic above means that only one trampoline can be
called on a function's entry, i.e. a kernel function can
only have one direct trampoline.

My original idea was to register all the shared
trampolines with the managed ftrace. For example, if we have
the shared trampoline1 for function A/B/C, and shared
trampoline2 for function B/C/D, then I register trampoline1
and trampoline2 for function B/C. However, it can't work,
as we can't call 2 trampolines for a function.

Then, I thought that we could create a "dynamic trampoline".
The logic for the non-ftrace-managed case is simple: we
only need to replace the "nop" of all the target functions
with "call dynamic_trampoline". And for the ftrace-managed
case, the logic is the same, except that the trampoline
that we add to the "direct_functions" hash is the
dynamic-trampoline:

ftrace_regs_caller
|
__ftrace_ops_list_func -> call_direct_funcs -> save dynamic-trampoline
to pt_regs->orig_ax
|
call pt_regs->orig_ax (the dynamic trampoline) if not NULL

And in the dynamic-trampoline, we can call prog1 for
A, call prog1 and prog2 for B/C, call prog2 for D.

And the registration is fast enough.
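For reference, the batched direct-call registration could look like
this with the current API (a sketch; the dynamic trampoline itself is
assumed to exist already):

#include <linux/ftrace.h>

static struct ftrace_ops dyn_tramp_ops;

static int attach_dynamic_trampoline(unsigned long *ips, unsigned int cnt,
				     void *dyn_trampoline)
{
	int err;

	/* select all target functions in one batch */
	err = ftrace_set_filter_ips(&dyn_tramp_ops, ips, cnt, 0, 0);
	if (err)
		return err;

	/* every selected site now calls dyn_trampoline directly; this
	 * is the address that ends up in the direct_functions hash */
	return register_ftrace_direct(&dyn_tramp_ops,
				      (unsigned long)dyn_trampoline);
}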

> If you really want to have thousands of functions, why not just register it
> with ftrace itself. It will give you the arguments via the ftrace_regs
> structure. Can't you just register a program as the callback?
>

Ennn...I don't understand. The main purpose for
me to use TRACING is:

1. we can directly access the memory, which is more
   efficient.
2. we can obtain the function args in FEXIT, which
   kretprobe can't do. And this is the main reason (see the
   sketch below).
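A standard fexit program, for illustration (plain single-attach, not
the multi-link API; ip_rcv()'s prototype is taken from the kernel):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("fexit/ip_rcv")
int BPF_PROG(ip_rcv_exit, struct sk_buff *skb, struct net_device *dev,
	     struct packet_type *pt, struct net_device *orig_dev, int ret)
{
	/* entry arguments and return value available in one place,
	 * with direct, verifier-checked memory access */
	bpf_printk("ip_rcv: skb len %u -> ret %d", skb->len, ret);
	return 0;
}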

Thanks!
Menglong Dong

> It will probably make your accounting much easier, and just let ftrace
> handle the fentry logic. That's what it was made to do.
>
> -- Steve
梦龙董 March 30, 2024, 3:36 a.m. UTC | #17
On Fri, Mar 29, 2024 at 7:17 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Mar 28, 2024 at 8:10 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > On Thu, 28 Mar 2024 22:43:46 +0800
> > 梦龙董 <dongmenglong.8@bytedance.com> wrote:
> >
> > > I have done a simple benchmark on creating 1000
> > > trampolines. It is slow, quite slow, which consume up to
> > > 60s. We can't do it this way.
> > >
> > > Now, I have a bad idea. How about we introduce
> > > a "dynamic trampoline"? The basic logic of it can be:
> > >
> > > """
> > > save regs
> > > bpfs = trampoline_lookup_ip(ip)
> > > fentry = bpfs->fentries
> > > while fentry:
> > >   fentry(ctx)
> > >   fentry = fentry->next
> > >
> > > call origin
> > > save return value
> > >
> > > fexit = bpfs->fexits
> > > while fexit:
> > >   fexit(ctx)
> > >   fexit = fexit->next
> > >
> > > xxxxxx
> > > """
> > >
> > > And we lookup the "bpfs" by the function ip in a hash map
> > > in trampoline_lookup_ip. The type of "bpfs" is:
> > >
> > > struct bpf_array {
> > >   struct bpf_prog *fentries;
> > >  struct bpf_prog *fexits;
> > >   struct bpf_prog *modify_returns;
> > > }
> > >
> > > When we need to attach the bpf progA to function A/B/C,
> > > we only need to create the bpf_arrayA, bpf_arrayB, bpf_arrayC
> > > and add the progA to them, and insert them to the hash map
> > > "direct_call_bpfs", and attach the "dynamic trampoline" to
> > > A/B/C. If bpf_arrayA exist, just add progA to the tail of
> > > bpf_arrayA->fentries. When we need to attach progB to
> > > B/C, just add progB to bpf_arrayB->fentries and
> > > bpf_arrayC->fentries.
> > >
> > > Compared to the trampoline, extra overhead is introduced
> > > by the hash lookuping.
> > >
> > > I have not begun to code yet, and I am not sure the overhead is
> > > acceptable. Considering that we also need to do hash lookup
> > > by the function in kprobe_multi, maybe the overhead is
> > > acceptable?
> >
> > Sounds like you are just recreating the function management that ftrace
> > has. It also can add thousands of trampolines very quickly, because it does
> > it in batches. It takes special synchronization steps to attach to fentry.
> > ftrace (and I believe multi-kprobes) updates all the attachments for each
> > step, so the synchronization needed is only done once.
> >
> > If you really want to have thousands of functions, why not just register it
> > with ftrace itself. It will give you the arguments via the ftrace_regs
> > structure. Can't you just register a program as the callback?
> >
> > It will probably make your accounting much easier, and just let ftrace
> > handle the fentry logic. That's what it was made to do.
>
> Absolutely agree.
> There is no point re-inventing this logic.
>
> Menlong,
> before you hook up into ftrace check whether
> it's going to be any different from kprobe-multi,
> since it's the same ftrace underneath.
> I suspect it will look exactly the same.

Yeah, I dug into it a little. I think it is different. For multi-kprobe,
it registers a ftrace_ops on the ftrace_ops_list for every bpf
program. This means that we can register 2 or more
multi-kprobes on the same function. The bpf program is called
along the following path:

ftrace_regs_caller
|
__ftrace_ops_list_func -> fprobe_handler -> kprobe_multi_link_handler -> run BPF

And for the trampoline, it needs to be called directly,
so it can't be registered as a callback on the ftrace_ops_list.
It needs to be called along the following path:

ftrace_regs_caller
|
__ftrace_ops_list_func -> call_direct_funcs -> save trampoline to
pt_regs->orig_ax
|
call pt_regs->orig_ax if not NULL

> So it sounds like multi-fentry idea will be shelved once again.

Enn...this is the best solution that I can think of. If it
doesn't work, I suspect it will be shelved again.

Thanks!
Menglong Dong
梦龙董 March 30, 2024, 4:16 a.m. UTC | #18
On Sat, Mar 30, 2024 at 7:28 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Thu, Mar 28, 2024 at 8:10 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > On Thu, 28 Mar 2024 22:43:46 +0800
> > 梦龙董 <dongmenglong.8@bytedance.com> wrote:
> >
> > > I have done a simple benchmark on creating 1000
> > > trampolines. It is slow, quite slow, which consume up to
> > > 60s. We can't do it this way.
> > >
> > > Now, I have a bad idea. How about we introduce
> > > a "dynamic trampoline"? The basic logic of it can be:
> > >
> > > """
> > > save regs
> > > bpfs = trampoline_lookup_ip(ip)
> > > fentry = bpfs->fentries
> > > while fentry:
> > >   fentry(ctx)
> > >   fentry = fentry->next
> > >
> > > call origin
> > > save return value
> > >
> > > fexit = bpfs->fexits
> > > while fexit:
> > >   fexit(ctx)
> > >   fexit = fexit->next
> > >
> > > xxxxxx
> > > """
> > >
> > > And we lookup the "bpfs" by the function ip in a hash map
> > > in trampoline_lookup_ip. The type of "bpfs" is:
> > >
> > > struct bpf_array {
> > >   struct bpf_prog *fentries;
> > >  struct bpf_prog *fexits;
> > >   struct bpf_prog *modify_returns;
> > > }
> > >
> > > When we need to attach the bpf progA to function A/B/C,
> > > we only need to create the bpf_arrayA, bpf_arrayB, bpf_arrayC
> > > and add the progA to them, and insert them to the hash map
> > > "direct_call_bpfs", and attach the "dynamic trampoline" to
> > > A/B/C. If bpf_arrayA exist, just add progA to the tail of
> > > bpf_arrayA->fentries. When we need to attach progB to
> > > B/C, just add progB to bpf_arrayB->fentries and
> > > bpf_arrayC->fentries.
> > >
> > > Compared to the trampoline, extra overhead is introduced
> > > by the hash lookuping.
> > >
> > > I have not begun to code yet, and I am not sure the overhead is
> > > acceptable. Considering that we also need to do hash lookup
> > > by the function in kprobe_multi, maybe the overhead is
> > > acceptable?
> >
> > Sounds like you are just recreating the function management that ftrace
> > has. It also can add thousands of trampolines very quickly, because it does
> > it in batches. It takes special synchronization steps to attach to fentry.
> > ftrace (and I believe multi-kprobes) updates all the attachments for each
> > step, so the synchronization needed is only done once.
> >
> > If you really want to have thousands of functions, why not just register it
> > with ftrace itself. It will give you the arguments via the ftrace_regs
> > structure. Can't you just register a program as the callback?
> >
> > It will probably make your accounting much easier, and just let ftrace
> > handle the fentry logic. That's what it was made to do.
> >
>
> I thought I'll just ask instead of digging through code, sorry for
> being lazy :) Is there any way to pass pt_regs/ftrace_regs captured
> before function execution to a return probe (fexit/kretprobe)? I.e.,
> how hard is it to pass input function arguments to a kretprobe? That's
> the biggest advantage of fexit over kretprobe, and if we can make
> these original pt_regs/ftrace_regs available to kretprobe, then
> multi-kretprobe will effectively be this multi-fexit.

Yes, we can use multi-kretprobe instead of multi-fexit
if we can obtain the function args in kretprobe.

I think it's hard. The reason that we can obtain the
function args is that we have a trampoline, and it
calls the origin function for FEXIT. If we do the same
for multi-kretprobe, we need to modify ftrace_regs_caller
to:

ftrace_regs_caller
|
__ftrace_ops_list_func
|
call all multi-kprobe callbacks
|
call origin
|
call all multi-kretprobe callbacks
|
call bpf trampoline(for TRACING)

However, this logic conflicts with bpf trampoline,
as it can also call the origin function. What's more,
the FENTRY should be called before the "call origin"
above.

I'm not sure if I understand correctly, as I have not
figured out how multi-kretprobe works in fprobe.

Thanks!
Menglong Dong

>
> > -- Steve
Steven Rostedt March 30, 2024, 12:27 p.m. UTC | #19
On Fri, 29 Mar 2024 16:28:33 -0700
Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:

> I thought I'll just ask instead of digging through code, sorry for
> being lazy :) Is there any way to pass pt_regs/ftrace_regs captured
> before function execution to a return probe (fexit/kretprobe)? I.e.,
> how hard is it to pass input function arguments to a kretprobe? That's
> the biggest advantage of fexit over kretprobe, and if we can make
> these original pt_regs/ftrace_regs available to kretprobe, then
> multi-kretprobe will effectively be this multi-fexit.

This should be possible with the updates that Masami is doing with the
fgraph code.

-- Steve
Jiri Olsa March 30, 2024, 5:52 p.m. UTC | #20
On Sat, Mar 30, 2024 at 08:27:55AM -0400, Steven Rostedt wrote:
> On Fri, 29 Mar 2024 16:28:33 -0700
> Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> 
> > I thought I'll just ask instead of digging through code, sorry for
> > being lazy :) Is there any way to pass pt_regs/ftrace_regs captured
> > before function execution to a return probe (fexit/kretprobe)? I.e.,
> > how hard is it to pass input function arguments to a kretprobe? That's
> > the biggest advantage of fexit over kretprobe, and if we can make
> > these original pt_regs/ftrace_regs available to kretprobe, then
> > multi-kretprobe will effectively be this multi-fexit.
> 
> This should be possible with the updates that Masami is doing with the
> fgraph code.

yes, I have bpf kprobe-multi link support for that [0] (it's on top of
Masami's fprobe-over-fgraph changes) we discussed that in [1]

jirka

[0] https://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git/log/?h=bpf/session_data
[1] https://lore.kernel.org/bpf/20240228090242.4040210-1-jolsa@kernel.org/
Steven Rostedt March 30, 2024, 7:37 p.m. UTC | #21
On Sat, 30 Mar 2024 11:18:29 +0800
梦龙董 <dongmenglong.8@bytedance.com> wrote:

> > If you really want to have thousands of functions, why not just register it
> > with ftrace itself. It will give you the arguments via the ftrace_regs
> > structure. Can't you just register a program as the callback?
> >  
> 
> Ennn...I don't understand. The main purpose for
> me to use TRACING is:
> 
> 1. we can directly access the memory, which is more
>    efficient.

I'm not sure what you mean by the above. Access what memory?

> 2. we can obtain the function args in FEXIT, which
>     kretprobe can't do it. And this is the main reason.

I didn't mention kretprobe. If you need access to the exit of the function,
you can use Masami's fgraph update.

 fentry -> ftrace_trampoline -> your_code

For fgraph:

 fentry -> ftrace_trampoline -> fgraph [sets up return call] -> your_entry_code

 function ret -> fgraph_ret_handler -> your_exit_code

And you will be able to pass data from the entry to the exit code,
including parameters.
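A sketch against the fgraph API from that work (the callback
signatures and the fgraph_reserve_data()/fgraph_retrieve_data()
helpers follow Masami's in-flight patches and may differ in detail):

#include <linux/ftrace.h>

struct entry_data {
	unsigned long func;	/* whatever we want to carry to the exit */
};

static int my_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops)
{
	struct entry_data *d;

	/* stash per-call data on the shadow stack at function entry */
	d = fgraph_reserve_data(gops->idx, sizeof(*d));
	if (d)
		d->func = trace->func;
	return 1;
}

static void my_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops)
{
	int size;
	struct entry_data *d = fgraph_retrieve_data(gops->idx, &size);

	/* d (if set) holds what the entry handler captured */
}

static struct fgraph_ops my_gops = {
	.entryfunc	= my_entry,
	.retfunc	= my_return,
};
/* register_ftrace_graph(&my_gops); */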

-- Steve
Andrii Nakryiko March 31, 2024, 2:34 a.m. UTC | #22
On Sat, Mar 30, 2024 at 10:52 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Sat, Mar 30, 2024 at 08:27:55AM -0400, Steven Rostedt wrote:
> > On Fri, 29 Mar 2024 16:28:33 -0700
> > Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> >
> > > I thought I'll just ask instead of digging through code, sorry for
> > > being lazy :) Is there any way to pass pt_regs/ftrace_regs captured
> > > before function execution to a return probe (fexit/kretprobe)? I.e.,
> > > how hard is it to pass input function arguments to a kretprobe? That's
> > > the biggest advantage of fexit over kretprobe, and if we can make
> > > these original pt_regs/ftrace_regs available to kretprobe, then
> > > multi-kretprobe will effectively be this multi-fexit.
> >
> > This should be possible with the updates that Masami is doing with the
> > fgraph code.
>
> yes, I have bpf kprobe-multi link support for that [0] (it's on top of
> Masami's fprobe-over-fgraph changes) we discussed that in [1]

Sorry, I forgot the regs/args part, mostly remembering we discussed
session cookie ideas. Thanks for the reminder!

>
> jirka
>
> [0] https://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git/log/?h=bpf/session_data
> [1] https://lore.kernel.org/bpf/20240228090242.4040210-1-jolsa@kernel.org/
梦龙董 April 1, 2024, 2:28 a.m. UTC | #23
On Sun, Mar 31, 2024 at 3:34 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Sat, 30 Mar 2024 11:18:29 +0800
> 梦龙董 <dongmenglong.8@bytedance.com> wrote:
>
> > > If you really want to have thousands of functions, why not just register it
> > > with ftrace itself. It will give you the arguments via the ftrace_regs
> > > structure. Can't you just register a program as the callback?
> > >
> >
> > Ennn...I don't understand. The main purpose for
> > me to use TRACING is:
> >
> > 1. we can directly access the memory, which is more
> >    efficient.
>
> I'm not sure what you mean by the above. Access what memory?
>

We need to use the bpf_probe_read_kernel helper
when we read "skb->sk" in a kprobe, where "skb" is the
1st arg of ip_rcv(). And we can directly read "skb->sk"
in tracing, which is more efficient. Isn't it?
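A small comparison of the two (standard single-attach programs, just
to illustrate the difference in memory access):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

/* kprobe: 'skb' is an untrusted pointer, so reading skb->sk has to
 * go through bpf_probe_read_kernel() */
SEC("kprobe/ip_rcv")
int BPF_KPROBE(kprobe_ip_rcv, struct sk_buff *skb)
{
	struct sock *sk;

	bpf_probe_read_kernel(&sk, sizeof(sk), &skb->sk);
	bpf_printk("kprobe sk %p", sk);
	return 0;
}

/* fentry: 'skb' is a verifier-checked BTF pointer, so the same read
 * is a plain dereference */
SEC("fentry/ip_rcv")
int BPF_PROG(fentry_ip_rcv, struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	bpf_printk("fentry sk %p", sk);
	return 0;
}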

> > 2. we can obtain the function args in FEXIT, which
> >     kretprobe can't do it. And this is the main reason.
>
> I didn't mention kretprobe. If you need access to the exit of the function,
> you can use Masami's fgraph update.
>
>  fentry -> ftrace_trampoline -> your_code
>
> For fgraph:
>
>  fentry -> ftrace_trampoline -> fgraph [sets up return call] -> your_entry_code
>
>  function ret -> fgraph_ret_handler -> your_exit_code
>
> And you will be able to pass data from the entry to the exit code,
> including parameters.

Yeah, the fgraph sounds like a nice solution to my problem.
I'll give it a try.

Thanks!
Menglong Dong

>
> -- Steve
>
>
Steven Rostedt April 1, 2024, 3:59 p.m. UTC | #24
On Mon, 1 Apr 2024 10:28:17 +0800
梦龙董 <dongmenglong.8@bytedance.com> wrote:

> On Sun, Mar 31, 2024 at 3:34 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > On Sat, 30 Mar 2024 11:18:29 +0800
> > 梦龙董 <dongmenglong.8@bytedance.com> wrote:
> >  
> > > > If you really want to have thousands of functions, why not just register it
> > > > with ftrace itself. It will give you the arguments via the ftrace_regs
> > > > structure. Can't you just register a program as the callback?
> > > >  
> > >
> > > Ennn...I don't understand. The main purpose for
> > > me to use TRACING is:
> > >
> > > 1. we can directly access the memory, which is more
> > >    efficient.  
> >
> > I'm not sure what you mean by the above. Access what memory?
> >  
> 
> We need to use the helper of bpf_probe_read_kernel
> when we read "skb->sk" in kprobe, and the "skb" is the
> 1st arg in ip_rcv(). And we can directly read "skb->sk"
> in tracing, which is more efficient. Isn't it?

If you add a ftrace_ops function handler that calls a BPF program, I don't
see why you can't just give it the parameters it needs instead of using bpf
helpers. It's no different than using a trampoline to do the same thing.

-- Steve
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 95e07673cdc1..0f677fdcfcc7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1461,6 +1461,7 @@  struct bpf_prog_aux {
 	const struct btf_type *attach_func_proto;
 	/* function name for valid attach_btf_id */
 	const char *attach_func_name;
+	u64 accessed_args;
 	struct bpf_prog **func;
 	void *jit_data; /* JIT specific data. arch dependent */
 	struct bpf_jit_poke_descriptor *poke_tab;
@@ -2565,6 +2566,9 @@  struct bpf_reg_state;
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
 			 struct btf *btf, const struct btf_type *t);
+int btf_check_func_part_match(struct btf *btf1, const struct btf_type *t1,
+			      struct btf *btf2, const struct btf_type *t2,
+			      u64 func_args);
 const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
 				    int comp_idx, const char *tag_key);
 int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 170d017e8e4a..c2a0299d4358 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6125,19 +6125,24 @@  static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
 }
 
 static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
-			   int off)
+			   int off, int *aligned_idx)
 {
 	const struct btf_param *args;
 	const struct btf_type *t;
 	u32 offset = 0, nr_args;
 	int i;
 
+	if (aligned_idx)
+		*aligned_idx = -ENOENT;
+
 	if (!func_proto)
 		return off / 8;
 
 	nr_args = btf_type_vlen(func_proto);
 	args = (const struct btf_param *)(func_proto + 1);
 	for (i = 0; i < nr_args; i++) {
+		if (aligned_idx && offset == off)
+			*aligned_idx = i;
 		t = btf_type_skip_modifiers(btf, args[i].type, NULL);
 		offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
 		if (off < offset)
@@ -6207,7 +6212,7 @@  bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 			tname, off);
 		return false;
 	}
-	arg = get_ctx_arg_idx(btf, t, off);
+	arg = get_ctx_arg_idx(btf, t, off, NULL);
 	args = (const struct btf_param *)(t + 1);
 	/* if (t == NULL) Fall back to default BPF prog with
 	 * MAX_BPF_FUNC_REG_ARGS u64 arguments.
@@ -6217,6 +6222,9 @@  bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		/* skip first 'void *__data' argument in btf_trace_##name typedef */
 		args++;
 		nr_args--;
+		prog->aux->accessed_args |= (1 << (arg + 1));
+	} else {
+		prog->aux->accessed_args |= (1 << arg);
 	}
 
 	if (arg > nr_args) {
@@ -7024,6 +7032,102 @@  int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
 	return btf_check_func_type_match(log, btf1, t1, btf2, t2);
 }
 
+static u32 get_ctx_arg_total_size(struct btf *btf, const struct btf_type *t)
+{
+	const struct btf_param *args;
+	u32 size = 0, nr_args;
+	int i;
+
+	nr_args = btf_type_vlen(t);
+	args = (const struct btf_param *)(t + 1);
+	for (i = 0; i < nr_args; i++) {
+		t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+		size += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
+	}
+
+	return size;
+}
+
+/* This function is similar to btf_check_func_type_match(), except that it
+ * only compares some function args of the function prototypes t1 and t2.
+ */
+int btf_check_func_part_match(struct btf *btf1, const struct btf_type *func1,
+			      struct btf *btf2, const struct btf_type *func2,
+			      u64 func_args)
+{
+	const struct btf_param *args1, *args2;
+	u32 nargs1, i, offset = 0;
+	const char *s1, *s2;
+
+	if (!btf_type_is_func_proto(func1) || !btf_type_is_func_proto(func2))
+		return -EINVAL;
+
+	args1 = (const struct btf_param *)(func1 + 1);
+	args2 = (const struct btf_param *)(func2 + 1);
+	nargs1 = btf_type_vlen(func1);
+
+	for (i = 0; i <= nargs1; i++) {
+		const struct btf_type *t1, *t2;
+
+		if (!(func_args & (1 << i)))
+			goto next;
+
+		if (i < nargs1) {
+			int t2_index;
+
+			/* get the index of the arg corresponding to args1[i]
+			 * by the offset.
+			 */
+			get_ctx_arg_idx(btf2, func2, offset, &t2_index);
+			if (t2_index < 0)
+				return -EINVAL;
+
+			t1 = btf_type_skip_modifiers(btf1, args1[i].type, NULL);
+			t2 = btf_type_skip_modifiers(btf2, args2[t2_index].type,
+						     NULL);
+		} else {
+			/* i == nargs1, this is the index of return value of t1 */
+			if (get_ctx_arg_total_size(btf1, func1) !=
+			    get_ctx_arg_total_size(btf2, func2))
+				return -EINVAL;
+
+			/* check the return type of t1 and t2 */
+			t1 = btf_type_skip_modifiers(btf1, func1->type, NULL);
+			t2 = btf_type_skip_modifiers(btf2, func2->type, NULL);
+		}
+
+		if (t1->info != t2->info ||
+		    (btf_type_has_size(t1) && t1->size != t2->size))
+			return -EINVAL;
+		if (btf_type_is_int(t1) || btf_is_any_enum(t1))
+			goto next;
+
+		if (btf_type_is_struct(t1))
+			goto on_struct;
+
+		if (!btf_type_is_ptr(t1))
+			return -EINVAL;
+
+		t1 = btf_type_skip_modifiers(btf1, t1->type, NULL);
+		t2 = btf_type_skip_modifiers(btf2, t2->type, NULL);
+		if (!btf_type_is_struct(t1) || !btf_type_is_struct(t2))
+			return -EINVAL;
+
+on_struct:
+		s1 = btf_name_by_offset(btf1, t1->name_off);
+		s2 = btf_name_by_offset(btf2, t2->name_off);
+		if (strcmp(s1, s2))
+			return -EINVAL;
+next:
+		if (i < nargs1) {
+			t1 = btf_type_skip_modifiers(btf1, args1[i].type, NULL);
+			offset += btf_type_is_ptr(t1) ? 8 : roundup(t1->size, 8);
+		}
+	}
+
+	return 0;
+}
+
 static bool btf_is_dynptr_ptr(const struct btf *btf, const struct btf_type *t)
 {
 	const char *name;