diff mbox series

[bpf-next,v3,1/2] libbpf: Add BPF_KPROBE_SYSCALL macro

Message ID 20220207143134.2977852-2-hengqi.chen@gmail.com (mailing list archive)
State Accepted
Delegated to: BPF
Headers show
Series libbpf: Add syscall-specific variant of BPF_KPROBE | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline fail Detected static functions without inline keyword in header files: 1
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 8 maintainers not CCed: kpsingh@kernel.org daniel@iogearbox.net john.fastabend@gmail.com kafai@fb.com songliubraving@fb.com ast@kernel.org yhs@fb.com netdev@vger.kernel.org
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch fail ERROR: Macros with complex values should be enclosed in parentheses ERROR: Macros with multiple statements should be enclosed in a do - while loop WARNING: Macros with flow control statements should be avoided WARNING: Prefer __always_inline over __attribute__((always_inline)) WARNING: line length of 105 exceeds 80 columns WARNING: line length of 109 exceeds 80 columns WARNING: line length of 99 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next fail VM_Test

Commit Message

Hengqi Chen Feb. 7, 2022, 2:31 p.m. UTC
Add syscall-specific variant of BPF_KPROBE named BPF_KPROBE_SYSCALL ([0]).
The new macro hides the underlying way of getting syscall input arguments.
With the new macro, the following code:

    SEC("kprobe/__x64_sys_close")
    int BPF_KPROBE(do_sys_close, struct pt_regs *regs)
    {
        int fd;

        fd = PT_REGS_PARM1_CORE(regs);
        /* do something with fd */
    }

can be written as:

    SEC("kprobe/__x64_sys_close")
    int BPF_KPROBE_SYSCALL(do_sys_close, int fd)
    {
        /* do something with fd */
    }

  [0] Closes: https://github.com/libbpf/libbpf/issues/425

Signed-off-by: Hengqi Chen <hengqi.chen@gmail.com>
---
 tools/lib/bpf/bpf_tracing.h | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

--
2.30.2

Comments

Andrii Nakryiko Feb. 7, 2022, 9:58 p.m. UTC | #1
On Mon, Feb 7, 2022 at 6:31 AM Hengqi Chen <hengqi.chen@gmail.com> wrote:
>
> Add syscall-specific variant of BPF_KPROBE named BPF_KPROBE_SYSCALL ([0]).
> The new macro hides the underlying way of getting syscall input arguments.
> With the new macro, the following code:
>
>     SEC("kprobe/__x64_sys_close")
>     int BPF_KPROBE(do_sys_close, struct pt_regs *regs)
>     {
>         int fd;
>
>         fd = PT_REGS_PARM1_CORE(regs);
>         /* do something with fd */
>     }
>
> can be written as:
>
>     SEC("kprobe/__x64_sys_close")
>     int BPF_KPROBE_SYSCALL(do_sys_close, int fd)
>     {
>         /* do something with fd */
>     }
>
>   [0] Closes: https://github.com/libbpf/libbpf/issues/425
>
> Signed-off-by: Hengqi Chen <hengqi.chen@gmail.com>
> ---
>  tools/lib/bpf/bpf_tracing.h | 33 +++++++++++++++++++++++++++++++++
>  1 file changed, 33 insertions(+)
>
> diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
> index cf980e54d331..7ad9cdea99e1 100644
> --- a/tools/lib/bpf/bpf_tracing.h
> +++ b/tools/lib/bpf/bpf_tracing.h
> @@ -461,4 +461,37 @@ typeof(name(0)) name(struct pt_regs *ctx)                              \
>  }                                                                          \
>  static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args)
>
> +#define ___bpf_syscall_args0()           ctx
> +#define ___bpf_syscall_args1(x)          ___bpf_syscall_args0(), (void *)PT_REGS_PARM1_CORE_SYSCALL(regs)
> +#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (void *)PT_REGS_PARM2_CORE_SYSCALL(regs)
> +#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (void *)PT_REGS_PARM3_CORE_SYSCALL(regs)
> +#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (void *)PT_REGS_PARM4_CORE_SYSCALL(regs)
> +#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (void *)PT_REGS_PARM5_CORE_SYSCALL(regs)
> +#define ___bpf_syscall_args(args...)     ___bpf_apply(___bpf_syscall_args, ___bpf_narg(args))(args)
> +
> +/*
> + * BPF_KPROBE_SYSCALL is a variant of BPF_KPROBE, which is intended for
> + * tracing syscall functions, like __x64_sys_close. It hides the underlying
> + * platform-specific low-level way of getting syscall input arguments from
> + * struct pt_regs, and provides a familiar typed and named function arguments
> + * syntax and semantics of accessing syscall input parameters.
> + *
> + * Original struct pt_regs* context is preserved as 'ctx' argument. This might
> + * be necessary when using BPF helpers like bpf_perf_event_output().
> + */

LGTM. Please also mention that this macro relies on CO-RE so that
users are aware.

Unfortunately we had to back out Ilya's patches with
PT_REGS_SYSCALL_REGS() and PT_REGS_PARMx_CORE_SYSCALL(), so we'll need
to wait a bit before merging this.


> +#define BPF_KPROBE_SYSCALL(name, args...)                                  \
> +name(struct pt_regs *ctx);                                                 \
> +static __attribute__((always_inline)) typeof(name(0))                      \
> +____##name(struct pt_regs *ctx, ##args);                                   \
> +typeof(name(0)) name(struct pt_regs *ctx)                                  \
> +{                                                                          \
> +       struct pt_regs *regs = PT_REGS_SYSCALL_REGS(ctx);                   \
> +       _Pragma("GCC diagnostic push")                                      \
> +       _Pragma("GCC diagnostic ignored \"-Wint-conversion\"")              \
> +       return ____##name(___bpf_syscall_args(args));                       \
> +       _Pragma("GCC diagnostic pop")                                       \
> +}                                                                          \
> +static __attribute__((always_inline)) typeof(name(0))                      \
> +____##name(struct pt_regs *ctx, ##args)
> +
>  #endif
> --
> 2.30.2
Andrii Nakryiko Feb. 9, 2022, 5:53 a.m. UTC | #2
On Mon, Feb 7, 2022 at 1:58 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Mon, Feb 7, 2022 at 6:31 AM Hengqi Chen <hengqi.chen@gmail.com> wrote:
> >
> > Add syscall-specific variant of BPF_KPROBE named BPF_KPROBE_SYSCALL ([0]).
> > The new macro hides the underlying way of getting syscall input arguments.
> > With the new macro, the following code:
> >
> >     SEC("kprobe/__x64_sys_close")
> >     int BPF_KPROBE(do_sys_close, struct pt_regs *regs)
> >     {
> >         int fd;
> >
> >         fd = PT_REGS_PARM1_CORE(regs);
> >         /* do something with fd */
> >     }
> >
> > can be written as:
> >
> >     SEC("kprobe/__x64_sys_close")
> >     int BPF_KPROBE_SYSCALL(do_sys_close, int fd)
> >     {
> >         /* do something with fd */
> >     }
> >
> >   [0] Closes: https://github.com/libbpf/libbpf/issues/425
> >
> > Signed-off-by: Hengqi Chen <hengqi.chen@gmail.com>
> > ---
> >  tools/lib/bpf/bpf_tracing.h | 33 +++++++++++++++++++++++++++++++++
> >  1 file changed, 33 insertions(+)
> >
> > diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
> > index cf980e54d331..7ad9cdea99e1 100644
> > --- a/tools/lib/bpf/bpf_tracing.h
> > +++ b/tools/lib/bpf/bpf_tracing.h
> > @@ -461,4 +461,37 @@ typeof(name(0)) name(struct pt_regs *ctx)                              \
> >  }                                                                          \
> >  static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args)
> >
> > +#define ___bpf_syscall_args0()           ctx
> > +#define ___bpf_syscall_args1(x)          ___bpf_syscall_args0(), (void *)PT_REGS_PARM1_CORE_SYSCALL(regs)
> > +#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (void *)PT_REGS_PARM2_CORE_SYSCALL(regs)
> > +#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (void *)PT_REGS_PARM3_CORE_SYSCALL(regs)
> > +#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (void *)PT_REGS_PARM4_CORE_SYSCALL(regs)
> > +#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (void *)PT_REGS_PARM5_CORE_SYSCALL(regs)
> > +#define ___bpf_syscall_args(args...)     ___bpf_apply(___bpf_syscall_args, ___bpf_narg(args))(args)
> > +
> > +/*
> > + * BPF_KPROBE_SYSCALL is a variant of BPF_KPROBE, which is intended for
> > + * tracing syscall functions, like __x64_sys_close. It hides the underlying
> > + * platform-specific low-level way of getting syscall input arguments from
> > + * struct pt_regs, and provides a familiar typed and named function arguments
> > + * syntax and semantics of accessing syscall input parameters.
> > + *
> > + * Original struct pt_regs* context is preserved as 'ctx' argument. This might
> > + * be necessary when using BPF helpers like bpf_perf_event_output().
> > + */
>
> LGTM. Please also mention that this macro relies on CO-RE so that
> users are aware.
>

Now that Ilya's fixes are in again, added a small note about reliance
on BPF CO-RE and pushed to bpf-next, thanks.


On a relevant note. The whole __x64_sys_close vs sys_close depending
on architecture and kernel version was always super annoying. BCC
makes this transparent to users (AFAIK) and it always bothered me a
little, but I didn't see a clean solution that fits libbpf.

I think I finally found it, though. Instead of guessing whether the
kprobe function is a syscall or not based on "sys_" prefix of a kernel
function, we can use libbpf SEC() handling to do this transparently.
What if we define two new SEC() definitions:

SEC("ksyscall/write") and SEC("kretsyscall/write") (or maybe
SEC("kprobe.syscall/write") and SEC("kretprobe.syscall/write"), not
sure which one is better, voice your opinion, please). And for such
special kprobes, libbpf will perform feature detection of this
ARCH_SYSCALL_WRAPPER (we'll need to see the best way to do this in a
simple and fast way, preferably without parsing kallsyms) and
depending on it substitute either sys_write (or should it be
__se_sys_write, according to Naveen) or __<arch>_sys_write. You get
the idea.

I like that this is still explicit and in the spirit of libbpf, but
offloads the burden of knowing these intricate differences from users.

Thoughts?


> Unfortunately we had to back out Ilya's patches with
> PT_REGS_SYSCALL_REGS() and PT_REGS_PARMx_CORE_SYSCALL(), so we'll need
> to wait a bit before merging this.
>
>
> > +#define BPF_KPROBE_SYSCALL(name, args...)                                  \
> > +name(struct pt_regs *ctx);                                                 \
> > +static __attribute__((always_inline)) typeof(name(0))                      \
> > +____##name(struct pt_regs *ctx, ##args);                                   \
> > +typeof(name(0)) name(struct pt_regs *ctx)                                  \
> > +{                                                                          \
> > +       struct pt_regs *regs = PT_REGS_SYSCALL_REGS(ctx);                   \
> > +       _Pragma("GCC diagnostic push")                                      \
> > +       _Pragma("GCC diagnostic ignored \"-Wint-conversion\"")              \
> > +       return ____##name(___bpf_syscall_args(args));                       \
> > +       _Pragma("GCC diagnostic pop")                                       \
> > +}                                                                          \
> > +static __attribute__((always_inline)) typeof(name(0))                      \
> > +____##name(struct pt_regs *ctx, ##args)
> > +
> >  #endif
> > --
> > 2.30.2
Alexei Starovoitov Feb. 9, 2022, 4:38 p.m. UTC | #3
On Wed, Feb 9, 2022 at 2:25 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Mon, Feb 7, 2022 at 1:58 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Mon, Feb 7, 2022 at 6:31 AM Hengqi Chen <hengqi.chen@gmail.com> wrote:
> > >
> > > Add syscall-specific variant of BPF_KPROBE named BPF_KPROBE_SYSCALL ([0]).
> > > The new macro hides the underlying way of getting syscall input arguments.
> > > With the new macro, the following code:
> > >
> > >     SEC("kprobe/__x64_sys_close")
> > >     int BPF_KPROBE(do_sys_close, struct pt_regs *regs)
> > >     {
> > >         int fd;
> > >
> > >         fd = PT_REGS_PARM1_CORE(regs);
> > >         /* do something with fd */
> > >     }
> > >
> > > can be written as:
> > >
> > >     SEC("kprobe/__x64_sys_close")
> > >     int BPF_KPROBE_SYSCALL(do_sys_close, int fd)
> > >     {
> > >         /* do something with fd */
> > >     }
> > >
> > >   [0] Closes: https://github.com/libbpf/libbpf/issues/425
> > >
> > > Signed-off-by: Hengqi Chen <hengqi.chen@gmail.com>
> > > ---
> > >  tools/lib/bpf/bpf_tracing.h | 33 +++++++++++++++++++++++++++++++++
> > >  1 file changed, 33 insertions(+)
> > >
> > > diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
> > > index cf980e54d331..7ad9cdea99e1 100644
> > > --- a/tools/lib/bpf/bpf_tracing.h
> > > +++ b/tools/lib/bpf/bpf_tracing.h
> > > @@ -461,4 +461,37 @@ typeof(name(0)) name(struct pt_regs *ctx)                              \
> > >  }                                                                          \
> > >  static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args)
> > >
> > > +#define ___bpf_syscall_args0()           ctx
> > > +#define ___bpf_syscall_args1(x)          ___bpf_syscall_args0(), (void *)PT_REGS_PARM1_CORE_SYSCALL(regs)
> > > +#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (void *)PT_REGS_PARM2_CORE_SYSCALL(regs)
> > > +#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (void *)PT_REGS_PARM3_CORE_SYSCALL(regs)
> > > +#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (void *)PT_REGS_PARM4_CORE_SYSCALL(regs)
> > > +#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (void *)PT_REGS_PARM5_CORE_SYSCALL(regs)
> > > +#define ___bpf_syscall_args(args...)     ___bpf_apply(___bpf_syscall_args, ___bpf_narg(args))(args)
> > > +
> > > +/*
> > > + * BPF_KPROBE_SYSCALL is a variant of BPF_KPROBE, which is intended for
> > > + * tracing syscall functions, like __x64_sys_close. It hides the underlying
> > > + * platform-specific low-level way of getting syscall input arguments from
> > > + * struct pt_regs, and provides a familiar typed and named function arguments
> > > + * syntax and semantics of accessing syscall input parameters.
> > > + *
> > > + * Original struct pt_regs* context is preserved as 'ctx' argument. This might
> > > + * be necessary when using BPF helpers like bpf_perf_event_output().
> > > + */
> >
> > LGTM. Please also mention that this macro relies on CO-RE so that
> > users are aware.
> >
>
> Now that Ilya's fixes are in again, added a small note about reliance
> on BPF CO-RE and pushed to bpf-next, thanks.
>
>
> On a relevant note. The whole __x64_sys_close vs sys_close depending
> on architecture and kernel version was always super annoying. BCC
> makes this transparent to users (AFAIK) and it always bothered me a
> little, but I didn't see a clean solution that fits libbpf.
>
> I think I finally found it, though. Instead of guessing whether the
> kprobe function is a syscall or not based on "sys_" prefix of a kernel
> function, we can use libbpf SEC() handling to do this transparently.
> What if we define two new SEC() definitions:
>
> SEC("ksyscall/write") and SEC("kretsyscall/write") (or maybe
> SEC("kprobe.syscall/write") and SEC("kretprobe.syscall/write"), not
> sure which one is better, voice your opinion, please). And for such
> special kprobes, libbpf will perform feature detection of this
> ARCH_SYSCALL_WRAPPER (we'll need to see the best way to do this in a
> simple and fast way, preferably without parsing kallsyms) and
> depending on it substitute either sys_write (or should it be
> __se_sys_write, according to Naveen) or __<arch>_sys_write. You get
> the idea.
>
> I like that this is still explicit and in the spirit of libbpf, but
> offloads the burden of knowing these intricate differences from users.
>
> Thoughts?

I think it will be just as fragile.
That syscall prefix was changed by the kernel few times now.
libbpf will be chasing the moving target.
I think keeping the magic in .h is simpler and less of a maintenance burden.
Andrii Nakryiko Feb. 9, 2022, 5:47 p.m. UTC | #4
On Wed, Feb 9, 2022 at 8:38 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Feb 9, 2022 at 2:25 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Mon, Feb 7, 2022 at 1:58 PM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Mon, Feb 7, 2022 at 6:31 AM Hengqi Chen <hengqi.chen@gmail.com> wrote:
> > > >
> > > > Add syscall-specific variant of BPF_KPROBE named BPF_KPROBE_SYSCALL ([0]).
> > > > The new macro hides the underlying way of getting syscall input arguments.
> > > > With the new macro, the following code:
> > > >
> > > >     SEC("kprobe/__x64_sys_close")
> > > >     int BPF_KPROBE(do_sys_close, struct pt_regs *regs)
> > > >     {
> > > >         int fd;
> > > >
> > > >         fd = PT_REGS_PARM1_CORE(regs);
> > > >         /* do something with fd */
> > > >     }
> > > >
> > > > can be written as:
> > > >
> > > >     SEC("kprobe/__x64_sys_close")
> > > >     int BPF_KPROBE_SYSCALL(do_sys_close, int fd)
> > > >     {
> > > >         /* do something with fd */
> > > >     }
> > > >
> > > >   [0] Closes: https://github.com/libbpf/libbpf/issues/425
> > > >
> > > > Signed-off-by: Hengqi Chen <hengqi.chen@gmail.com>
> > > > ---
> > > >  tools/lib/bpf/bpf_tracing.h | 33 +++++++++++++++++++++++++++++++++
> > > >  1 file changed, 33 insertions(+)
> > > >
> > > > diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
> > > > index cf980e54d331..7ad9cdea99e1 100644
> > > > --- a/tools/lib/bpf/bpf_tracing.h
> > > > +++ b/tools/lib/bpf/bpf_tracing.h
> > > > @@ -461,4 +461,37 @@ typeof(name(0)) name(struct pt_regs *ctx)                              \
> > > >  }                                                                          \
> > > >  static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args)
> > > >
> > > > +#define ___bpf_syscall_args0()           ctx
> > > > +#define ___bpf_syscall_args1(x)          ___bpf_syscall_args0(), (void *)PT_REGS_PARM1_CORE_SYSCALL(regs)
> > > > +#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (void *)PT_REGS_PARM2_CORE_SYSCALL(regs)
> > > > +#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (void *)PT_REGS_PARM3_CORE_SYSCALL(regs)
> > > > +#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (void *)PT_REGS_PARM4_CORE_SYSCALL(regs)
> > > > +#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (void *)PT_REGS_PARM5_CORE_SYSCALL(regs)
> > > > +#define ___bpf_syscall_args(args...)     ___bpf_apply(___bpf_syscall_args, ___bpf_narg(args))(args)
> > > > +
> > > > +/*
> > > > + * BPF_KPROBE_SYSCALL is a variant of BPF_KPROBE, which is intended for
> > > > + * tracing syscall functions, like __x64_sys_close. It hides the underlying
> > > > + * platform-specific low-level way of getting syscall input arguments from
> > > > + * struct pt_regs, and provides a familiar typed and named function arguments
> > > > + * syntax and semantics of accessing syscall input parameters.
> > > > + *
> > > > + * Original struct pt_regs* context is preserved as 'ctx' argument. This might
> > > > + * be necessary when using BPF helpers like bpf_perf_event_output().
> > > > + */
> > >
> > > LGTM. Please also mention that this macro relies on CO-RE so that
> > > users are aware.
> > >
> >
> > Now that Ilya's fixes are in again, added a small note about reliance
> > on BPF CO-RE and pushed to bpf-next, thanks.
> >
> >
> > On a relevant note. The whole __x64_sys_close vs sys_close depending
> > on architecture and kernel version was always super annoying. BCC
> > makes this transparent to users (AFAIK) and it always bothered me a
> > little, but I didn't see a clean solution that fits libbpf.
> >
> > I think I finally found it, though. Instead of guessing whether the
> > kprobe function is a syscall or not based on "sys_" prefix of a kernel
> > function, we can use libbpf SEC() handling to do this transparently.
> > What if we define two new SEC() definitions:
> >
> > SEC("ksyscall/write") and SEC("kretsyscall/write") (or maybe
> > SEC("kprobe.syscall/write") and SEC("kretprobe.syscall/write"), not
> > sure which one is better, voice your opinion, please). And for such
> > special kprobes, libbpf will perform feature detection of this
> > ARCH_SYSCALL_WRAPPER (we'll need to see the best way to do this in a
> > simple and fast way, preferably without parsing kallsyms) and
> > depending on it substitute either sys_write (or should it be
> > __se_sys_write, according to Naveen) or __<arch>_sys_write. You get
> > the idea.
> >
> > I like that this is still explicit and in the spirit of libbpf, but
> > offloads the burden of knowing these intricate differences from users.
> >
> > Thoughts?
>
> I think it will be just as fragile.
> That syscall prefix was changed by the kernel few times now.
> libbpf will be chasing the moving target.
> I think keeping the magic in .h is simpler and less of a maintenance burden.

Absolutely it is simpler, but it only works for selftests (and even
then, when prefix changes again we'll need to adjust selftests). But
the point here is to not require end users to chase this instead. And
in the real world where users want to work on as wide a range of
kernel versions as possible SYS_PREFIX trick doesn't work (which is
why I didn't want to add that to bpf_tracing.h).

It's cheaper (maintenance-wise) to do it in libbpf than require all
the kprobe users to keep track of this, no? And good thing is that
libbpf is part of the kernel repo, so whenever someone changes this in
the kernel we'll get a failing selftest on next net-next -> bpf->next
sync (at least for architectures that we do test), so we'll be able to
fix and adjust quickly.
Hengqi Chen Feb. 10, 2022, 4:52 p.m. UTC | #5
On 2/9/22 1:53 PM, Andrii Nakryiko wrote:
> On Mon, Feb 7, 2022 at 1:58 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
>>
>> On Mon, Feb 7, 2022 at 6:31 AM Hengqi Chen <hengqi.chen@gmail.com> wrote:
>>>

...

>>
> 
> Now that Ilya's fixes are in again, added a small note about reliance
> on BPF CO-RE and pushed to bpf-next, thanks.
> 
> 

Thanks.

> On a relevant note. The whole __x64_sys_close vs sys_close depending
> on architecture and kernel version was always super annoying. BCC
> makes this transparent to users (AFAIK) and it always bothered me a

BCC does this by 'guessing' the syscall prefix ([0]).

  [0]: https://github.com/iovisor/bcc/blob/v0.24.0/src/python/bcc/__init__.py#L787-L794

> little, but I didn't see a clean solution that fits libbpf.
> 
> I think I finally found it, though. Instead of guessing whether the
> kprobe function is a syscall or not based on "sys_" prefix of a kernel
> function, we can use libbpf SEC() handling to do this transparently.
> What if we define two new SEC() definitions:
> 
> SEC("ksyscall/write") and SEC("kretsyscall/write") (or maybe
> SEC("kprobe.syscall/write") and SEC("kretprobe.syscall/write"), not
> sure which one is better, voice your opinion, please). And for such

ksyscall/kretsyscall is more succinct and in the same vein as kfunc/kretfunc.

> special kprobes, libbpf will perform feature detection of this
> ARCH_SYSCALL_WRAPPER (we'll need to see the best way to do this in a
> simple and fast way, preferably without parsing kallsyms) and

By detecting special structs like struct cpuinfo_x86/cpuinfo_arm64 in BTF ?

In a word, I like the idea and it would make libbpf-based tools more portable. :)

> depending on it substitute either sys_write (or should it be
> __se_sys_write, according to Naveen) or __<arch>_sys_write. You get
> the idea.
> 
> I like that this is still explicit and in the spirit of libbpf, but
> offloads the burden of knowing these intricate differences from users.
> 
> Thoughts?
> 
> 
>> Unfortunately we had to back out Ilya's patches with
>> PT_REGS_SYSCALL_REGS() and PT_REGS_PARMx_CORE_SYSCALL(), so we'll need
>> to wait a bit before merging this.
>>
>>
>>> +#define BPF_KPROBE_SYSCALL(name, args...)                                  \
>>> +name(struct pt_regs *ctx);                                                 \
>>> +static __attribute__((always_inline)) typeof(name(0))                      \
>>> +____##name(struct pt_regs *ctx, ##args);                                   \
>>> +typeof(name(0)) name(struct pt_regs *ctx)                                  \
>>> +{                                                                          \
>>> +       struct pt_regs *regs = PT_REGS_SYSCALL_REGS(ctx);                   \
>>> +       _Pragma("GCC diagnostic push")                                      \
>>> +       _Pragma("GCC diagnostic ignored \"-Wint-conversion\"")              \
>>> +       return ____##name(___bpf_syscall_args(args));                       \
>>> +       _Pragma("GCC diagnostic pop")                                       \
>>> +}                                                                          \
>>> +static __attribute__((always_inline)) typeof(name(0))                      \
>>> +____##name(struct pt_regs *ctx, ##args)
>>> +
>>>  #endif
>>> --
>>> 2.30.2

--
Hengqi
diff mbox series

Patch

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index cf980e54d331..7ad9cdea99e1 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -461,4 +461,37 @@  typeof(name(0)) name(struct pt_regs *ctx)				    \
 }									    \
 static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args)

+#define ___bpf_syscall_args0()           ctx
+#define ___bpf_syscall_args1(x)          ___bpf_syscall_args0(), (void *)PT_REGS_PARM1_CORE_SYSCALL(regs)
+#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (void *)PT_REGS_PARM2_CORE_SYSCALL(regs)
+#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (void *)PT_REGS_PARM3_CORE_SYSCALL(regs)
+#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (void *)PT_REGS_PARM4_CORE_SYSCALL(regs)
+#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (void *)PT_REGS_PARM5_CORE_SYSCALL(regs)
+#define ___bpf_syscall_args(args...)     ___bpf_apply(___bpf_syscall_args, ___bpf_narg(args))(args)
+
+/*
+ * BPF_KPROBE_SYSCALL is a variant of BPF_KPROBE, which is intended for
+ * tracing syscall functions, like __x64_sys_close. It hides the underlying
+ * platform-specific low-level way of getting syscall input arguments from
+ * struct pt_regs, and provides a familiar typed and named function arguments
+ * syntax and semantics of accessing syscall input parameters.
+ *
+ * Original struct pt_regs* context is preserved as 'ctx' argument. This might
+ * be necessary when using BPF helpers like bpf_perf_event_output().
+ */
+#define BPF_KPROBE_SYSCALL(name, args...)				    \
+name(struct pt_regs *ctx);						    \
+static __attribute__((always_inline)) typeof(name(0))			    \
+____##name(struct pt_regs *ctx, ##args);				    \
+typeof(name(0)) name(struct pt_regs *ctx)				    \
+{									    \
+	struct pt_regs *regs = PT_REGS_SYSCALL_REGS(ctx);		    \
+	_Pragma("GCC diagnostic push")					    \
+	_Pragma("GCC diagnostic ignored \"-Wint-conversion\"")		    \
+	return ____##name(___bpf_syscall_args(args));			    \
+	_Pragma("GCC diagnostic pop")					    \
+}									    \
+static __attribute__((always_inline)) typeof(name(0))			    \
+____##name(struct pt_regs *ctx, ##args)
+
 #endif