
[bpf-next] bpf: ringbuf: Support consuming BPF_MAP_TYPE_RINGBUF from prog

Message ID 18a9ddacc99bb95e9802f8ad1e81214433df496c.1725929645.git.dxu@dxuuu.xyz (mailing list archive)
State Changes Requested
Delegated to: BPF
Series [bpf-next] bpf: ringbuf: Support consuming BPF_MAP_TYPE_RINGBUF from prog

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 16 this patch: 16
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 2 maintainers not CCed: andrea.righi@canonical.com martin.kelly@crowdstrike.com
netdev/build_clang success Errors and warnings before: 17 this patch: 17
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 32 this patch: 32
netdev/checkpatch warning WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Daniel Xu Sept. 10, 2024, 12:54 a.m. UTC
Right now there exists prog produce / userspace consume and userspace
produce / prog consume support. But it is also useful to have prog
produce / prog consume.

For example, we want to track the latency overhead of cpumap in
production. Since we need to store enqueue timestamps somewhere and
cpumap is MPSC, we need an MPSC data structure to shadow cpumap. BPF
ringbuf is such a data structure. Rather than reimplement (possibly
poorly) a custom ringbuffer in BPF, extend the existing interface to
allow the final quadrant of ringbuf usecases to be filled. Note we
ignore userspace to userspace use case - there is no need to involve
kernel for that.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
---
 kernel/bpf/verifier.c                         |  6 +-
 tools/testing/selftests/bpf/Makefile          |  3 +-
 .../selftests/bpf/prog_tests/ringbuf.c        | 50 +++++++++++++++
 .../bpf/progs/test_ringbuf_bpf_to_bpf.c       | 64 +++++++++++++++++++
 4 files changed, 120 insertions(+), 3 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf_bpf_to_bpf.c

Comments

Alexei Starovoitov Sept. 10, 2024, 6:36 p.m. UTC | #1
On Mon, Sep 9, 2024 at 5:55 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
>
> Right now there exists prog produce / userspace consume and userspace
> produce / prog consume support. But it is also useful to have prog
> produce / prog consume.
>
> For example, we want to track the latency overhead of cpumap in
> production. Since we need to store enqueue timestamps somewhere and
> cpumap is MPSC, we need an MPSC data structure to shadow cpumap. BPF
> ringbuf is such a data structure. Rather than reimplement (possibly
> poorly) a custom ringbuffer in BPF, extend the existing interface to
> allow the final quadrant of ringbuf usecases to be filled. Note we
> ignore userspace to userspace use case - there is no need to involve
> kernel for that.
>
> Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
> ---
>  kernel/bpf/verifier.c                         |  6 +-
>  tools/testing/selftests/bpf/Makefile          |  3 +-
>  .../selftests/bpf/prog_tests/ringbuf.c        | 50 +++++++++++++++
>  .../bpf/progs/test_ringbuf_bpf_to_bpf.c       | 64 +++++++++++++++++++
>  4 files changed, 120 insertions(+), 3 deletions(-)
>  create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf_bpf_to_bpf.c
>
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 53d0556fbbf3..56bfe559f228 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -9142,7 +9142,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
>                     func_id != BPF_FUNC_ringbuf_query &&
>                     func_id != BPF_FUNC_ringbuf_reserve_dynptr &&
>                     func_id != BPF_FUNC_ringbuf_submit_dynptr &&
> -                   func_id != BPF_FUNC_ringbuf_discard_dynptr)
> +                   func_id != BPF_FUNC_ringbuf_discard_dynptr &&
> +                   func_id != BPF_FUNC_user_ringbuf_drain)
>                         goto error;
>                 break;
>         case BPF_MAP_TYPE_USER_RINGBUF:
> @@ -9276,7 +9277,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
>                         goto error;
>                 break;
>         case BPF_FUNC_user_ringbuf_drain:
> -               if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
> +               if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF &&
> +                   map->map_type != BPF_MAP_TYPE_RINGBUF)
>                         goto error;

I think it should work.

Andrii,

do you see any issues with such use?

>                 break;
>         case BPF_FUNC_get_stackid:
> diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
> index 9905e3739dd0..233109843d4d 100644
> --- a/tools/testing/selftests/bpf/Makefile
> +++ b/tools/testing/selftests/bpf/Makefile
> @@ -503,7 +503,8 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h               \
>  LSKELS := fentry_test.c fexit_test.c fexit_sleep.c atomics.c           \
>         trace_printk.c trace_vprintk.c map_ptr_kern.c                   \
>         core_kern.c core_kern_overflow.c test_ringbuf.c                 \
> -       test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c
> +       test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c    \
> +       test_ringbuf_bpf_to_bpf.c

Do you need it to be lskel ?

Regular skels are easier to debug.

Also pls split selftest into a separate patch.

>
>  # Generate both light skeleton and libbpf skeleton for these
>  LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test.c \
> diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c
> index da430df45aa4..3e7955573ac5 100644
> --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c
> +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c
> @@ -17,6 +17,7 @@
>  #include "test_ringbuf_n.lskel.h"
>  #include "test_ringbuf_map_key.lskel.h"
>  #include "test_ringbuf_write.lskel.h"
> +#include "test_ringbuf_bpf_to_bpf.lskel.h"
>
>  #define EDONE 7777
>
> @@ -497,6 +498,53 @@ static void ringbuf_map_key_subtest(void)
>         test_ringbuf_map_key_lskel__destroy(skel_map_key);
>  }
>
> +static void ringbuf_bpf_to_bpf_subtest(void)
> +{
> +       struct test_ringbuf_bpf_to_bpf_lskel *skel;
> +       int err, i;
> +
> +       skel = test_ringbuf_bpf_to_bpf_lskel__open();
> +       if (!ASSERT_OK_PTR(skel, "test_ringbuf_bpf_to_bpf_lskel__open"))
> +               return;
> +
> +       skel->maps.ringbuf.max_entries = getpagesize();
> +       skel->bss->pid = getpid();
> +
> +       err = test_ringbuf_bpf_to_bpf_lskel__load(skel);
> +       if (!ASSERT_OK(err, "test_ringbuf_bpf_to_bpf_lskel__load"))
> +               goto cleanup;
> +
> +       ringbuf = ring_buffer__new(skel->maps.ringbuf.map_fd, NULL, NULL, NULL);
> +       if (!ASSERT_OK_PTR(ringbuf, "ring_buffer__new"))
> +               goto cleanup;
> +
> +       err = test_ringbuf_bpf_to_bpf_lskel__attach(skel);
> +       if (!ASSERT_OK(err, "test_ringbuf_bpf_to_bpf_lskel__attach"))
> +               goto cleanup_ringbuf;
> +
> +       /* Produce N_SAMPLES samples in the ring buffer by calling getpid() */
> +       skel->bss->value = SAMPLE_VALUE;
> +       for (i = 0; i < N_SAMPLES; i++)
> +               syscall(__NR_getpgid);
> +
> +       /* Trigger bpf-side consumption */
> +       syscall(__NR_prctl);

This might conflict with other selftests that run in parallel.

Just load the skel and bpf_prog_run(prog_fd).
No need to attach anywhere in the kernel.
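
For reference, a minimal sketch of what the subtest could look like with that
approach. This assumes the BPF programs are switched to a test_run-able
program type (e.g. SEC("syscall")), a regular rather than light skeleton, and
hypothetical program names produce_sample / consume_samples:

        struct test_ringbuf_bpf_to_bpf *skel;
        LIBBPF_OPTS(bpf_test_run_opts, opts);
        int err, i;

        skel = test_ringbuf_bpf_to_bpf__open_and_load();
        if (!ASSERT_OK_PTR(skel, "open_and_load"))
                return;

        /* produce N_SAMPLES records without attaching anything in the kernel */
        for (i = 0; i < N_SAMPLES; i++) {
                err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.produce_sample),
                                             &opts);
                if (!ASSERT_OK(err, "produce_sample"))
                        goto cleanup;
        }

        /* drain from the BPF side, also via test_run */
        err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.consume_samples),
                                     &opts);
        ASSERT_OK(err, "consume_samples");

cleanup:
        test_ringbuf_bpf_to_bpf__destroy(skel);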

pw-bot: cr
Andrii Nakryiko Sept. 10, 2024, 8:41 p.m. UTC | #2
On Tue, Sep 10, 2024 at 11:36 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Sep 9, 2024 at 5:55 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
> >
> > Right now there exists prog produce / userspace consume and userspace
> > produce / prog consume support. But it is also useful to have prog
> > produce / prog consume.
> >
> > For example, we want to track the latency overhead of cpumap in
> > production. Since we need to store enqueue timestamps somewhere and
> > cpumap is MPSC, we need an MPSC data structure to shadow cpumap. BPF
> > ringbuf is such a data structure. Rather than reimplement (possibly
> > poorly) a custom ringbuffer in BPF, extend the existing interface to
> > allow the final quadrant of ringbuf usecases to be filled. Note we
> > ignore userspace to userspace use case - there is no need to involve
> > kernel for that.
> >
> > Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
> > ---
> >  kernel/bpf/verifier.c                         |  6 +-
> >  tools/testing/selftests/bpf/Makefile          |  3 +-
> >  .../selftests/bpf/prog_tests/ringbuf.c        | 50 +++++++++++++++
> >  .../bpf/progs/test_ringbuf_bpf_to_bpf.c       | 64 +++++++++++++++++++
> >  4 files changed, 120 insertions(+), 3 deletions(-)
> >  create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf_bpf_to_bpf.c
> >
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index 53d0556fbbf3..56bfe559f228 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -9142,7 +9142,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
> >                     func_id != BPF_FUNC_ringbuf_query &&
> >                     func_id != BPF_FUNC_ringbuf_reserve_dynptr &&
> >                     func_id != BPF_FUNC_ringbuf_submit_dynptr &&
> > -                   func_id != BPF_FUNC_ringbuf_discard_dynptr)
> > +                   func_id != BPF_FUNC_ringbuf_discard_dynptr &&
> > +                   func_id != BPF_FUNC_user_ringbuf_drain)
> >                         goto error;
> >                 break;
> >         case BPF_MAP_TYPE_USER_RINGBUF:
> > @@ -9276,7 +9277,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
> >                         goto error;
> >                 break;
> >         case BPF_FUNC_user_ringbuf_drain:
> > -               if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
> > +               if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF &&
> > +                   map->map_type != BPF_MAP_TYPE_RINGBUF)
> >                         goto error;
>
> I think it should work.
>
> Andrii,
>
> do you see any issues with such use?
>

Not from a quick glance. Both ringbufs have the same memory layout, so
user_ringbuf_drain() should probably work fine for normal BPF ringbuf
(and either way bpf_user_ringbuf_drain() has to protect from malicious
user space, so its code is pretty unassuming).

We should make it very explicit, though, that the user is responsible
for making sure that bpf_user_ringbuf_drain() will not be called
simultaneously in two threads, kernel or user space.
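
To make the intended usage concrete, here is a rough sketch (not taken from
the patch; map sizing, the SEC() names and helper availability for this
program type are assumptions) of a BPF-side consumer of a BPF_MAP_TYPE_RINGBUF
once drain is allowed on it:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_RINGBUF);
        __uint(max_entries, 4096);
} ringbuf SEC(".maps");

__u64 total;

static long record_cb(struct bpf_dynptr *dynptr, void *ctx)
{
        __u64 val = 0;

        /* copy the sample out; ideally the dynptr would be read-only */
        if (!bpf_dynptr_read(&val, sizeof(val), dynptr, 0, 0))
                total += val;
        return 0;       /* 0 == keep draining, 1 == stop */
}

SEC("syscall")
int consume(void *ctx)
{
        /* with this patch drain also accepts BPF_MAP_TYPE_RINGBUF; the
         * caller must guarantee there is no concurrent drainer, kernel
         * or user space
         */
        return bpf_user_ringbuf_drain(&ringbuf, record_cb, NULL, 0);
}

char _license[] SEC("license") = "GPL";

The producer side stays the usual bpf_ringbuf_reserve()/bpf_ringbuf_submit()
from some other program.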

Also, Daniel, can you please make sure that dynptr we return for each
sample is read-only? We shouldn't let consumer BPF program ability to
corrupt ringbuf record headers (accidentally or otherwise).

And as a thought exercise. I wonder what would it take to have an
open-coded iterator returning these read-only dynptrs for each
consumed record? Maybe we already have all the pieces together. So
consider looking into that as well.

P.S. And yeah "user_" part in helper name is kind of unfortunate given
it will work for both ringbufs. Can/should we add some sort of alias
for this helper so it can be used with both bpf_user_ringbuf_drain()
and bpf_ringbuf_drain() names?


> >                 break;
> >         case BPF_FUNC_get_stackid:
> > diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
> > index 9905e3739dd0..233109843d4d 100644
> > --- a/tools/testing/selftests/bpf/Makefile
> > +++ b/tools/testing/selftests/bpf/Makefile
> > @@ -503,7 +503,8 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h               \
> >  LSKELS := fentry_test.c fexit_test.c fexit_sleep.c atomics.c           \
> >         trace_printk.c trace_vprintk.c map_ptr_kern.c                   \
> >         core_kern.c core_kern_overflow.c test_ringbuf.c                 \
> > -       test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c
> > +       test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c    \
> > +       test_ringbuf_bpf_to_bpf.c

[...]
Daniel Xu Sept. 10, 2024, 8:59 p.m. UTC | #3
On Tue, Sep 10, 2024 at 11:36:36AM GMT, Alexei Starovoitov wrote:
> On Mon, Sep 9, 2024 at 5:55 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
> >
> > Right now there exists prog produce / userspace consume and userspace
> > produce / prog consume support. But it is also useful to have prog
> > produce / prog consume.
> >
> > For example, we want to track the latency overhead of cpumap in
> > production. Since we need to store enqueue timestamps somewhere and
> > cpumap is MPSC, we need an MPSC data structure to shadow cpumap. BPF
> > ringbuf is such a data structure. Rather than reimplement (possibly
> > poorly) a custom ringbuffer in BPF, extend the existing interface to
> > allow the final quadrant of ringbuf usecases to be filled. Note we
> > ignore userspace to userspace use case - there is no need to involve
> > kernel for that.
> >
> > Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
> > ---
> >  kernel/bpf/verifier.c                         |  6 +-
> >  tools/testing/selftests/bpf/Makefile          |  3 +-
> >  .../selftests/bpf/prog_tests/ringbuf.c        | 50 +++++++++++++++
> >  .../bpf/progs/test_ringbuf_bpf_to_bpf.c       | 64 +++++++++++++++++++
> >  4 files changed, 120 insertions(+), 3 deletions(-)
> >  create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf_bpf_to_bpf.c
> >
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index 53d0556fbbf3..56bfe559f228 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -9142,7 +9142,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
> >                     func_id != BPF_FUNC_ringbuf_query &&
> >                     func_id != BPF_FUNC_ringbuf_reserve_dynptr &&
> >                     func_id != BPF_FUNC_ringbuf_submit_dynptr &&
> > -                   func_id != BPF_FUNC_ringbuf_discard_dynptr)
> > +                   func_id != BPF_FUNC_ringbuf_discard_dynptr &&
> > +                   func_id != BPF_FUNC_user_ringbuf_drain)
> >                         goto error;
> >                 break;
> >         case BPF_MAP_TYPE_USER_RINGBUF:
> > @@ -9276,7 +9277,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
> >                         goto error;
> >                 break;
> >         case BPF_FUNC_user_ringbuf_drain:
> > -               if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
> > +               if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF &&
> > +                   map->map_type != BPF_MAP_TYPE_RINGBUF)
> >                         goto error;
> 
> I think it should work.
> 
> Andrii,
> 
> do you see any issues with such use?
> 
> >                 break;
> >         case BPF_FUNC_get_stackid:
> > diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
> > index 9905e3739dd0..233109843d4d 100644
> > --- a/tools/testing/selftests/bpf/Makefile
> > +++ b/tools/testing/selftests/bpf/Makefile
> > @@ -503,7 +503,8 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h               \
> >  LSKELS := fentry_test.c fexit_test.c fexit_sleep.c atomics.c           \
> >         trace_printk.c trace_vprintk.c map_ptr_kern.c                   \
> >         core_kern.c core_kern_overflow.c test_ringbuf.c                 \
> > -       test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c
> > +       test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c    \
> > +       test_ringbuf_bpf_to_bpf.c
> 
> Do you need it to be lskel ?
> 
> Regular skels are easier to debug.

I'm actually unsure of the difference, still. But all the other tests in
the file were using lskel so I just copy/pasted.

> 
> Also pls split selftest into a separate patch.

Ack.

> 
> >
> >  # Generate both light skeleton and libbpf skeleton for these
> >  LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test.c \
> > diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c
> > index da430df45aa4..3e7955573ac5 100644
> > --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c
> > +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c
> > @@ -17,6 +17,7 @@
> >  #include "test_ringbuf_n.lskel.h"
> >  #include "test_ringbuf_map_key.lskel.h"
> >  #include "test_ringbuf_write.lskel.h"
> > +#include "test_ringbuf_bpf_to_bpf.lskel.h"
> >
> >  #define EDONE 7777
> >
> > @@ -497,6 +498,53 @@ static void ringbuf_map_key_subtest(void)
> >         test_ringbuf_map_key_lskel__destroy(skel_map_key);
> >  }
> >
> > +static void ringbuf_bpf_to_bpf_subtest(void)
> > +{
> > +       struct test_ringbuf_bpf_to_bpf_lskel *skel;
> > +       int err, i;
> > +
> > +       skel = test_ringbuf_bpf_to_bpf_lskel__open();
> > +       if (!ASSERT_OK_PTR(skel, "test_ringbuf_bpf_to_bpf_lskel__open"))
> > +               return;
> > +
> > +       skel->maps.ringbuf.max_entries = getpagesize();
> > +       skel->bss->pid = getpid();
> > +
> > +       err = test_ringbuf_bpf_to_bpf_lskel__load(skel);
> > +       if (!ASSERT_OK(err, "test_ringbuf_bpf_to_bpf_lskel__load"))
> > +               goto cleanup;
> > +
> > +       ringbuf = ring_buffer__new(skel->maps.ringbuf.map_fd, NULL, NULL, NULL);
> > +       if (!ASSERT_OK_PTR(ringbuf, "ring_buffer__new"))
> > +               goto cleanup;
> > +
> > +       err = test_ringbuf_bpf_to_bpf_lskel__attach(skel);
> > +       if (!ASSERT_OK(err, "test_ringbuf_bpf_to_bpf_lskel__attach"))
> > +               goto cleanup_ringbuf;
> > +
> > +       /* Produce N_SAMPLES samples in the ring buffer by calling getpid() */
> > +       skel->bss->value = SAMPLE_VALUE;
> > +       for (i = 0; i < N_SAMPLES; i++)
> > +               syscall(__NR_getpgid);
> > +
> > +       /* Trigger bpf-side consumption */
> > +       syscall(__NR_prctl);
> 
> This might conflict with other selftests that run in parallel.
> 
> Just load the skel and bpf_prog_run(prog_fd).
> No need to attach anywhere in the kernel.

Ack.

> 
> pw-bot: cr
Daniel Xu Sept. 10, 2024, 9:07 p.m. UTC | #4
On Tue, Sep 10, 2024 at 01:41:41PM GMT, Andrii Nakryiko wrote:
> On Tue, Sep 10, 2024 at 11:36 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Mon, Sep 9, 2024 at 5:55 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
> > >
> > > Right now there exists prog produce / userspace consume and userspace
> > > produce / prog consume support. But it is also useful to have prog
> > > produce / prog consume.
> > >
> > > For example, we want to track the latency overhead of cpumap in
> > > production. Since we need to store enqueue timestamps somewhere and
> > > cpumap is MPSC, we need an MPSC data structure to shadow cpumap. BPF
> > > ringbuf is such a data structure. Rather than reimplement (possibly
> > > poorly) a custom ringbuffer in BPF, extend the existing interface to
> > > allow the final quadrant of ringbuf usecases to be filled. Note we
> > > ignore userspace to userspace use case - there is no need to involve
> > > kernel for that.
> > >
> > > Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
> > > ---
> > >  kernel/bpf/verifier.c                         |  6 +-
> > >  tools/testing/selftests/bpf/Makefile          |  3 +-
> > >  .../selftests/bpf/prog_tests/ringbuf.c        | 50 +++++++++++++++
> > >  .../bpf/progs/test_ringbuf_bpf_to_bpf.c       | 64 +++++++++++++++++++
> > >  4 files changed, 120 insertions(+), 3 deletions(-)
> > >  create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf_bpf_to_bpf.c
> > >
> > > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > > index 53d0556fbbf3..56bfe559f228 100644
> > > --- a/kernel/bpf/verifier.c
> > > +++ b/kernel/bpf/verifier.c
> > > @@ -9142,7 +9142,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
> > >                     func_id != BPF_FUNC_ringbuf_query &&
> > >                     func_id != BPF_FUNC_ringbuf_reserve_dynptr &&
> > >                     func_id != BPF_FUNC_ringbuf_submit_dynptr &&
> > > -                   func_id != BPF_FUNC_ringbuf_discard_dynptr)
> > > +                   func_id != BPF_FUNC_ringbuf_discard_dynptr &&
> > > +                   func_id != BPF_FUNC_user_ringbuf_drain)
> > >                         goto error;
> > >                 break;
> > >         case BPF_MAP_TYPE_USER_RINGBUF:
> > > @@ -9276,7 +9277,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
> > >                         goto error;
> > >                 break;
> > >         case BPF_FUNC_user_ringbuf_drain:
> > > -               if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
> > > +               if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF &&
> > > +                   map->map_type != BPF_MAP_TYPE_RINGBUF)
> > >                         goto error;
> >
> > I think it should work.
> >
> > Andrii,
> >
> > do you see any issues with such use?
> >
> 
> Not from a quick glance. Both ringbufs have the same memory layout, so
> user_ringbuf_drain() should probably work fine for normal BPF ringbuf
> (and either way bpf_user_ringbuf_drain() has to protect from malicious
> user space, so its code is pretty unassuming).
> 
> We should make it very explicit, though, that the user is responsible
> for making sure that bpf_user_ringbuf_drain() will not be called
> simultaneously in two threads, kernel or user space.

I see an atomic_try_cmpxchg() protecting the drain. So it should be
safe, right? What are they supposed to expect?

> 
> Also, Daniel, can you please make sure that dynptr we return for each
> sample is read-only? We shouldn't let consumer BPF program ability to
> corrupt ringbuf record headers (accidentally or otherwise).

Sure.

> 
> And as a thought exercise. I wonder what would it take to have an
> open-coded iterator returning these read-only dynptrs for each
> consumed record? Maybe we already have all the pieces together. So
> consider looking into that as well.
> 
> P.S. And yeah "user_" part in helper name is kind of unfortunate given
> it will work for both ringbufs. Can/should we add some sort of alias
> for this helper so it can be used with both bpf_user_ringbuf_drain()
> and bpf_ringbuf_drain() names?

You mean register a new helper that shares the impl? Easy enough, but I
thought we didn't want to add more uapi helpers.

Thanks,
Daniel
Daniel Xu Sept. 10, 2024, 10:16 p.m. UTC | #5
On Tue, Sep 10, 2024, at 2:07 PM, Daniel Xu wrote:
> On Tue, Sep 10, 2024 at 01:41:41PM GMT, Andrii Nakryiko wrote:
>> On Tue, Sep 10, 2024 at 11:36 AM Alexei Starovoitov
[...]
>
>> 
>> Also, Daniel, can you please make sure that dynptr we return for each
>> sample is read-only? We shouldn't let consumer BPF program ability to
>> corrupt ringbuf record headers (accidentally or otherwise).
>
> Sure.

So the sample is not read-only. But I think prog is prevented from messing
with header regardless.

__bpf_user_ringbuf_peek() returns sample past the header:

        *sample = (void *)((uintptr_t)rb->data +
                           (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));

dynptr is initialized with the above ptr:

        bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);

So I don't think there's a way for the prog to access the header thru the dynptr.

Thanks,
Daniel
Andrii Nakryiko Sept. 10, 2024, 10:18 p.m. UTC | #6
On Tue, Sep 10, 2024 at 2:07 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
>
> On Tue, Sep 10, 2024 at 01:41:41PM GMT, Andrii Nakryiko wrote:
> > On Tue, Sep 10, 2024 at 11:36 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Mon, Sep 9, 2024 at 5:55 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
> > > >
> > > > Right now there exists prog produce / userspace consume and userspace
> > > > produce / prog consume support. But it is also useful to have prog
> > > > produce / prog consume.
> > > >
> > > > For example, we want to track the latency overhead of cpumap in
> > > > production. Since we need to store enqueue timestamps somewhere and
> > > > cpumap is MPSC, we need an MPSC data structure to shadow cpumap. BPF
> > > > ringbuf is such a data structure. Rather than reimplement (possibly
> > > > poorly) a custom ringbuffer in BPF, extend the existing interface to
> > > > allow the final quadrant of ringbuf usecases to be filled. Note we
> > > > ignore userspace to userspace use case - there is no need to involve
> > > > kernel for that.
> > > >
> > > > Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
> > > > ---
> > > >  kernel/bpf/verifier.c                         |  6 +-
> > > >  tools/testing/selftests/bpf/Makefile          |  3 +-
> > > >  .../selftests/bpf/prog_tests/ringbuf.c        | 50 +++++++++++++++
> > > >  .../bpf/progs/test_ringbuf_bpf_to_bpf.c       | 64 +++++++++++++++++++
> > > >  4 files changed, 120 insertions(+), 3 deletions(-)
> > > >  create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf_bpf_to_bpf.c
> > > >
> > > > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > > > index 53d0556fbbf3..56bfe559f228 100644
> > > > --- a/kernel/bpf/verifier.c
> > > > +++ b/kernel/bpf/verifier.c
> > > > @@ -9142,7 +9142,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
> > > >                     func_id != BPF_FUNC_ringbuf_query &&
> > > >                     func_id != BPF_FUNC_ringbuf_reserve_dynptr &&
> > > >                     func_id != BPF_FUNC_ringbuf_submit_dynptr &&
> > > > -                   func_id != BPF_FUNC_ringbuf_discard_dynptr)
> > > > +                   func_id != BPF_FUNC_ringbuf_discard_dynptr &&
> > > > +                   func_id != BPF_FUNC_user_ringbuf_drain)
> > > >                         goto error;
> > > >                 break;
> > > >         case BPF_MAP_TYPE_USER_RINGBUF:
> > > > @@ -9276,7 +9277,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
> > > >                         goto error;
> > > >                 break;
> > > >         case BPF_FUNC_user_ringbuf_drain:
> > > > -               if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
> > > > +               if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF &&
> > > > +                   map->map_type != BPF_MAP_TYPE_RINGBUF)
> > > >                         goto error;
> > >
> > > I think it should work.
> > >
> > > Andrii,
> > >
> > > do you see any issues with such use?
> > >
> >
> > Not from a quick glance. Both ringbufs have the same memory layout, so
> > user_ringbuf_drain() should probably work fine for normal BPF ringbuf
> > (and either way bpf_user_ringbuf_drain() has to protect from malicious
> > user space, so its code is pretty unassuming).
> >
> > We should make it very explicit, though, that the user is responsible
> > for making sure that bpf_user_ringbuf_drain() will not be called
> > simultaneously in two threads, kernel or user space.
>
> I see an atomic_try_cmpxchg() protecting the drain. So it should be
> safe, right? What are they supposed to expect?

User space can ignore rb->busy and start consuming in parallel. Ok,
given we had this atomic_try_cmpxchg() it was rather OK for user
ringbuf, but it's not so great for BPF ringbuf, way too easy to screw
up...

>
> >
> > Also, Daniel, can you please make sure that dynptr we return for each
> > sample is read-only? We shouldn't let consumer BPF program ability to
> > corrupt ringbuf record headers (accidentally or otherwise).
>
> Sure.
>
> >
> > And as a thought exercise. I wonder what would it take to have an
> > open-coded iterator returning these read-only dynptrs for each
> > consumed record? Maybe we already have all the pieces together. So
> > consider looking into that as well.
> >
> > P.S. And yeah "user_" part in helper name is kind of unfortunate given
> > it will work for both ringbufs. Can/should we add some sort of alias
> > for this helper so it can be used with both bpf_user_ringbuf_drain()
> > and bpf_ringbuf_drain() names?
>
> You mean register a new helper that shares the impl? Easy enough, but I
> thought we didn't want to add more uapi helpers.

No, not a new helper. Just an alternative name for it,
"bpf_ringbuf_drain()". Might not be worth doing, I haven't checked
what would be the best way to do this, can't tell right now.

>
> Thanks,
> Daniel
Andrii Nakryiko Sept. 10, 2024, 10:21 p.m. UTC | #7
On Tue, Sep 10, 2024 at 3:16 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
>
>
>
> On Tue, Sep 10, 2024, at 2:07 PM, Daniel Xu wrote:
> > On Tue, Sep 10, 2024 at 01:41:41PM GMT, Andrii Nakryiko wrote:
> >> On Tue, Sep 10, 2024 at 11:36 AM Alexei Starovoitov
> [...]
> >
> >>
> >> Also, Daniel, can you please make sure that dynptr we return for each
> >> sample is read-only? We shouldn't let consumer BPF program ability to
> >> corrupt ringbuf record headers (accidentally or otherwise).
> >
> > Sure.
>
> So the sample is not read-only. But I think prog is prevented from messing
> with header regardless.
>
> __bpf_user_ringbuf_peek() returns sample past the header:
>
>         *sample = (void *)((uintptr_t)rb->data +
>                            (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
>
> dynptr is initialized with the above ptr:
>
>         bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
>
> So I don't think there's a way for the prog to access the header thru the dynptr.
>

By "header" I mean 8 bytes that precede each submitted ringbuf record.
That header is part of ringbuf data area. Given user space can set
consumer_pos to arbitrary value, kernel can return arbitrary part of
ringbuf data area, including that 8 byte header. If that data is
writable, it's easy to screw up that header and crash another BPF
program that reserves/submits a new record. User space can only read
data area for BPF ringbuf, and so we rely heavily on a tight control
of who can write what into those 8 bytes.
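
For readers following along, that 8-byte header looks roughly like this
(paraphrased from kernel/bpf/ringbuf.c; the constants live in the uapi
header):

struct bpf_ringbuf_hdr {
        __u32 len;      /* sample length; top bits carry BPF_RINGBUF_BUSY_BIT
                         * and BPF_RINGBUF_DISCARD_BIT used to commit/discard
                         * the record
                         */
        __u32 pg_off;   /* page offset used to get back to struct bpf_ringbuf */
};

/* BPF_RINGBUF_HDR_SZ == 8; a writable dynptr that can span this header would
 * let a consumer prog flip those bits and confuse producers and consumers.
 */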

> Thanks,
> Daniel
Daniel Xu Sept. 10, 2024, 11:44 p.m. UTC | #8
On Tue, Sep 10, 2024 at 03:21:04PM GMT, Andrii Nakryiko wrote:
> On Tue, Sep 10, 2024 at 3:16 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
> >
> >
> >
> > On Tue, Sep 10, 2024, at 2:07 PM, Daniel Xu wrote:
> > > On Tue, Sep 10, 2024 at 01:41:41PM GMT, Andrii Nakryiko wrote:
> > >> On Tue, Sep 10, 2024 at 11:36 AM Alexei Starovoitov
> > [...]
> > >
> > >>
> > >> Also, Daniel, can you please make sure that dynptr we return for each
> > >> sample is read-only? We shouldn't let consumer BPF program ability to
> > >> corrupt ringbuf record headers (accidentally or otherwise).
> > >
> > > Sure.
> >
> > So the sample is not read-only. But I think prog is prevented from messing
> > with header regardless.
> >
> > __bpf_user_ringbuf_peek() returns sample past the header:
> >
> >         *sample = (void *)((uintptr_t)rb->data +
> >                            (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
> >
> > dynptr is initialized with the above ptr:
> >
> >         bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
> >
> > So I don't think there's a way for the prog to access the header thru the dynptr.
> >
> 
> By "header" I mean 8 bytes that precede each submitted ringbuf record.
> That header is part of ringbuf data area. Given user space can set
> consumer_pos to arbitrary value, kernel can return arbitrary part of
> ringbuf data area, including that 8 byte header. If that data is
> writable, it's easy to screw up that header and crash another BPF
> program that reserves/submits a new record. User space can only read
> data area for BPF ringbuf, and so we rely heavily on a tight control
> of who can write what into those 8 bytes.

Ah, ok. I think I understand.

Given this and your other comments about rb->busy, what about enforcing
bpf_user_ringbuf_drain() NAND mmap? I think the use cases here are
different enough where this makes sense.
Andrii Nakryiko Sept. 11, 2024, 12:39 a.m. UTC | #9
On Tue, Sep 10, 2024 at 4:44 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
>
> On Tue, Sep 10, 2024 at 03:21:04PM GMT, Andrii Nakryiko wrote:
> > On Tue, Sep 10, 2024 at 3:16 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
> > >
> > >
> > >
> > > On Tue, Sep 10, 2024, at 2:07 PM, Daniel Xu wrote:
> > > > On Tue, Sep 10, 2024 at 01:41:41PM GMT, Andrii Nakryiko wrote:
> > > >> On Tue, Sep 10, 2024 at 11:36 AM Alexei Starovoitov
> > > [...]
> > > >
> > > >>
> > > >> Also, Daniel, can you please make sure that dynptr we return for each
> > > >> sample is read-only? We shouldn't let consumer BPF program ability to
> > > >> corrupt ringbuf record headers (accidentally or otherwise).
> > > >
> > > > Sure.
> > >
> > > So the sample is not read-only. But I think prog is prevented from messing
> > > with header regardless.
> > >
> > > __bpf_user_ringbuf_peek() returns sample past the header:
> > >
> > >         *sample = (void *)((uintptr_t)rb->data +
> > >                            (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
> > >
> > > dynptr is initialized with the above ptr:
> > >
> > >         bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
> > >
> > > So I don't think there's a way for the prog to access the header thru the dynptr.
> > >
> >
> > By "header" I mean 8 bytes that precede each submitted ringbuf record.
> > That header is part of ringbuf data area. Given user space can set
> > consumer_pos to arbitrary value, kernel can return arbitrary part of
> > ringbuf data area, including that 8 byte header. If that data is
> > writable, it's easy to screw up that header and crash another BPF
> > program that reserves/submits a new record. User space can only read
> > data area for BPF ringbuf, and so we rely heavily on a tight control
> > of who can write what into those 8 bytes.
>
> Ah, ok. I think I understand.
>
> Given this and your other comments about rb->busy, what about enforcing
> bpf_user_ringbuf_drain() NAND mmap? I think the use cases here are
> different enough where this makes sense.

You mean disabling user-space mmap()? TBH, I'd like to understand the
use case first before we make such decisions. Maybe what you need is
not really a BPF ringbuf? Can you give us a bit more details on what
you are trying to achieve?
Daniel Xu Sept. 11, 2024, 3:31 a.m. UTC | #10
On Tue, Sep 10, 2024 at 05:39:55PM GMT, Andrii Nakryiko wrote:
> On Tue, Sep 10, 2024 at 4:44 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
> >
> > On Tue, Sep 10, 2024 at 03:21:04PM GMT, Andrii Nakryiko wrote:
> > > On Tue, Sep 10, 2024 at 3:16 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
> > > >
> > > >
> > > >
> > > > On Tue, Sep 10, 2024, at 2:07 PM, Daniel Xu wrote:
> > > > > On Tue, Sep 10, 2024 at 01:41:41PM GMT, Andrii Nakryiko wrote:
> > > > >> On Tue, Sep 10, 2024 at 11:36 AM Alexei Starovoitov
> > > > [...]
> > > > >
> > > > >>
> > > > >> Also, Daniel, can you please make sure that dynptr we return for each
> > > > >> sample is read-only? We shouldn't let consumer BPF program ability to
> > > > >> corrupt ringbuf record headers (accidentally or otherwise).
> > > > >
> > > > > Sure.
> > > >
> > > > So the sample is not read-only. But I think prog is prevented from messing
> > > > with header regardless.
> > > >
> > > > __bpf_user_ringbuf_peek() returns sample past the header:
> > > >
> > > >         *sample = (void *)((uintptr_t)rb->data +
> > > >                            (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
> > > >
> > > > dynptr is initialized with the above ptr:
> > > >
> > > >         bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
> > > >
> > > > So I don't think there's a way for the prog to access the header thru the dynptr.
> > > >
> > >
> > > By "header" I mean 8 bytes that precede each submitted ringbuf record.
> > > That header is part of ringbuf data area. Given user space can set
> > > consumer_pos to arbitrary value, kernel can return arbitrary part of
> > > ringbuf data area, including that 8 byte header. If that data is
> > > writable, it's easy to screw up that header and crash another BPF
> > > program that reserves/submits a new record. User space can only read
> > > data area for BPF ringbuf, and so we rely heavily on a tight control
> > > of who can write what into those 8 bytes.
> >
> > Ah, ok. I think I understand.
> >
> > Given this and your other comments about rb->busy, what about enforcing
> > bpf_user_ringbuf_drain() NAND mmap? I think the use cases here are
> > different enough where this makes sense.
> 
> You mean disabling user-space mmap()? TBH, I'd like to understand the
> use case first before we make such decisions. Maybe what you need is
> not really a BPF ringbuf? Can you give us a bit more details on what
> you are trying to achieve?

BPF cpumap, under the hood, has one MPSC ring buffer (ptr_ring) for each
entry in the cpumap. When a prog redirects to an entry in the cpumap,
the machinery queues up the xdp frame onto the destination CPU ptr_ring.
This can occur on any cpu, thus multi-producer. On processing side,
there is only the kthread created by the cpumap entry and bound to the
specific cpu that is consuming entries. So single consumer.

Goal is to track the latency overhead added by ptr_ring and the
kthread (versus softirq, where there is less overhead). Ideally we want
p50, p90, p95, p99 percentiles.

To do this, we need to track every single entry enqueue time as well as
dequeue time - events that occur in the tail are quite important.

Since ptr_ring is also a ring buffer, I thought it would be easy,
reliable, and fast to just create a "shadow" ring buffer. Every time the
producer enqueues entries, I'd enqueue the same number of current
timestamps onto the shadow RB. Same thing on the consumer side, except
dequeue and calculate the timestamp delta.

I was originally planning on writing my own lockless ring buffer in pure
BPF (b/c spinlocks cannot be used w/ tracepoints yet) but was hoping I
could avoid that with this patch.

About disabling user-space mmap: yeah, that's what I was suggesting. I
think it'd be a bit odd if you wanted BPF RB to support consumption from
both userspace && prog at the same time. And since draining from prog is
new functionality (and thus the NAND isn't a regression), you could
relax the restriction later without issues.
Daniel Xu Sept. 11, 2024, 4:43 a.m. UTC | #11
[cc Jesper]

On Tue, Sep 10, 2024, at 8:31 PM, Daniel Xu wrote:
> On Tue, Sep 10, 2024 at 05:39:55PM GMT, Andrii Nakryiko wrote:
>> On Tue, Sep 10, 2024 at 4:44 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
>> >
>> > On Tue, Sep 10, 2024 at 03:21:04PM GMT, Andrii Nakryiko wrote:
>> > > On Tue, Sep 10, 2024 at 3:16 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
>> > > >
>> > > >
>> > > >
>> > > > On Tue, Sep 10, 2024, at 2:07 PM, Daniel Xu wrote:
>> > > > > On Tue, Sep 10, 2024 at 01:41:41PM GMT, Andrii Nakryiko wrote:
>> > > > >> On Tue, Sep 10, 2024 at 11:36 AM Alexei Starovoitov
>> > > > [...]
>> > > > >
>> > > > >>
>> > > > >> Also, Daniel, can you please make sure that dynptr we return for each
>> > > > >> sample is read-only? We shouldn't let consumer BPF program ability to
>> > > > >> corrupt ringbuf record headers (accidentally or otherwise).
>> > > > >
>> > > > > Sure.
>> > > >
>> > > > So the sample is not read-only. But I think prog is prevented from messing
>> > > > with header regardless.
>> > > >
>> > > > __bpf_user_ringbuf_peek() returns sample past the header:
>> > > >
>> > > >         *sample = (void *)((uintptr_t)rb->data +
>> > > >                            (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
>> > > >
>> > > > dynptr is initialized with the above ptr:
>> > > >
>> > > >         bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
>> > > >
>> > > > So I don't think there's a way for the prog to access the header thru the dynptr.
>> > > >
>> > >
>> > > By "header" I mean 8 bytes that precede each submitted ringbuf record.
>> > > That header is part of ringbuf data area. Given user space can set
>> > > consumer_pos to arbitrary value, kernel can return arbitrary part of
>> > > ringbuf data area, including that 8 byte header. If that data is
>> > > writable, it's easy to screw up that header and crash another BPF
>> > > program that reserves/submits a new record. User space can only read
>> > > data area for BPF ringbuf, and so we rely heavily on a tight control
>> > > of who can write what into those 8 bytes.
>> >
>> > Ah, ok. I think I understand.
>> >
>> > Given this and your other comments about rb->busy, what about enforcing
>> > bpf_user_ringbuf_drain() NAND mmap? I think the use cases here are
>> > different enough where this makes sense.
>> 
>> You mean disabling user-space mmap()? TBH, I'd like to understand the
>> use case first before we make such decisions. Maybe what you need is
>> not really a BPF ringbuf? Can you give us a bit more details on what
>> you are trying to achieve?
>
> BPF cpumap, under the hood, has one MPSC ring buffer (ptr_ring) for each
> entry in the cpumap. When a prog redirects to an entry in the cpumap,
> the machinery queues up the xdp frame onto the destination CPU ptr_ring.
> This can occur on any cpu, thus multi-producer. On processing side,
> there is only the kthread created by the cpumap entry and bound to the
> specific cpu that is consuming entries. So single consumer.
>
> Goal is to track the latency overhead added from ptr_ring and the
> kthread (versus softirq where is less overhead). Ideally we want p50,
> p90, p95, p99 percentiles.
>
> To do this, we need to track every single entry enqueue time as well as
> dequeue time - events that occur in the tail are quite important.
>
> Since ptr_ring is also a ring buffer, I thought it would be easy,
> reliable, and fast to just create a "shadow" ring buffer. Every time
> producer enqueues entries, I'd enqueue the same number of current
> timestamp onto shadow RB. Same thing on consumer side, except dequeue
> and calculate timestamp delta.
>
> I was originally planning on writing my own lockless ring buffer in pure
> BPF (b/c spinlocks cannot be used w/ tracepoints yet) but was hoping I
> could avoid that with this patch.

[...]

Alternatively, could add a u64 timestamp to xdp_frame, which makes all
this tracking inline (and thus more reliable). But I'm not sure how precious
the space in that struct is - I see some references online saying most drivers
save 128B headroom. I also see:

        #define XDP_PACKET_HEADROOM 256

Could probably amortize the timestamp read by setting it in
bq_flush_to_queue().
Jesper Dangaard Brouer Sept. 11, 2024, 8:32 a.m. UTC | #12
On 11/09/2024 06.43, Daniel Xu wrote:
> [cc Jesper]
> 
> On Tue, Sep 10, 2024, at 8:31 PM, Daniel Xu wrote:
>> On Tue, Sep 10, 2024 at 05:39:55PM GMT, Andrii Nakryiko wrote:
>>> On Tue, Sep 10, 2024 at 4:44 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
>>>>
>>>> On Tue, Sep 10, 2024 at 03:21:04PM GMT, Andrii Nakryiko wrote:
>>>>> On Tue, Sep 10, 2024 at 3:16 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
>>>>>>
[...cut...]

>>> Can you give us a bit more details on what
>>> you are trying to achieve?
>>
>> BPF cpumap, under the hood, has one MPSC ring buffer (ptr_ring) for each
>> entry in the cpumap. When a prog redirects to an entry in the cpumap,
>> the machinery queues up the xdp frame onto the destination CPU ptr_ring.
>> This can occur on any cpu, thus multi-producer. On processing side,
>> there is only the kthread created by the cpumap entry and bound to the
>> specific cpu that is consuming entries. So single consumer.
>>

An important detail: to get Multi-Producer (MP) to scale, the CPUMAP does
bulk enqueue into the ptr_ring.  It stores the xdp_frames in a per-CPU
array and does the flush/enqueue as part of xdp_do_flush(). Because
I was afraid of this adding latency, I chose to also flush every 8
frames (CPU_MAP_BULK_SIZE).

Looking at code I see this is also explained in a comment:

/* General idea: XDP packets getting XDP redirected to another CPU,
  * will maximum be stored/queued for one driver ->poll() call.  It is
  * guaranteed that queueing the frame and the flush operation happen on
  * same CPU.  Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
  * which queue in bpf_cpu_map_entry contains packets.
  */


>> Goal is to track the latency overhead added from ptr_ring and the
>> kthread (versus softirq where is less overhead). Ideally we want p50,
>> p90, p95, p99 percentiles.
>>

I'm very interested in this use case of understanding the latency of
CPUMAP.
I'm a fan of latency histograms that I turn into heatmaps in grafana.

>> To do this, we need to track every single entry enqueue time as well as
>> dequeue time - events that occur in the tail are quite important.
>>
>> Since ptr_ring is also a ring buffer, I thought it would be easy,
>> reliable, and fast to just create a "shadow" ring buffer. Every time
>> producer enqueues entries, I'd enqueue the same number of current
>> timestamp onto shadow RB. Same thing on consumer side, except dequeue
>> and calculate timestamp delta.
>>

This idea seems overkill and will likely produce unreliable results.
E.g. the overhead of this additional ring buffer will also affect the
measurements.

>> I was originally planning on writing my own lockless ring buffer in pure
>> BPF (b/c spinlocks cannot be used w/ tracepoints yet) but was hoping I
>> could avoid that with this patch.
> 
> [...]
> 
> Alternatively, could add a u64 timestamp to xdp_frame, which makes all
> this tracking inline (and thus more reliable). But I'm not sure how precious
> the space in that struct is - I see some references online saying most drivers
> save 128B headroom. I also see:
> 
>          #define XDP_PACKET_HEADROOM 256
> 

I like the inline idea. I would suggest adding a u64 timestamp into the
XDP-metadata area (ctx->data_meta, code example [1]) when XDP runs in
RX-NAPI.  Then on the remote CPU you can run another CPUMAP-XDP program
that picks up this timestamp and calculates a delta from the "now" timestamp.


  [1] 
https://github.com/xdp-project/bpf-examples/blob/master/AF_XDP-interaction/af_xdp_kern.c#L62-L77
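
A rough sketch of that inline idea (SEC() names, map sizing and the metadata
layout are assumptions; the second program is the one installed into each
cpumap entry via bpf_cpumap_val.bpf_prog):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct meta {
        __u64 rx_ts;
};

struct {
        __uint(type, BPF_MAP_TYPE_CPUMAP);
        __uint(max_entries, 64);
        __type(key, __u32);
        __type(value, struct bpf_cpumap_val);
} cpu_map SEC(".maps");

SEC("xdp")
int xdp_stamp(struct xdp_md *ctx)
{
        struct meta *m;
        void *data;

        /* make room for a timestamp in the metadata area */
        if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*m)))
                return XDP_PASS;

        m = (void *)(long)ctx->data_meta;
        data = (void *)(long)ctx->data;
        if ((void *)(m + 1) > data) /* bounds check for the verifier */
                return XDP_PASS;

        m->rx_ts = bpf_ktime_get_ns();
        return bpf_redirect_map(&cpu_map, 0, 0); /* CPU index 0, illustrative */
}

SEC("xdp/cpumap")
int xdp_delta(struct xdp_md *ctx)
{
        struct meta *m = (void *)(long)ctx->data_meta;
        void *data = (void *)(long)ctx->data;

        if ((void *)(m + 1) > data) /* no metadata carried, skip */
                return XDP_PASS;

        /* feed this delta into a (percpu) histogram map for p50/p90/p99 */
        bpf_printk("cpumap latency %llu ns", bpf_ktime_get_ns() - m->rx_ts);
        return XDP_PASS;
}

char _license[] SEC("license") = "GPL";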


> Could probably amortize the timestamp read by setting it in
> bq_flush_to_queue().

To amortize, consider that you might not need to timestamp EVERY packet 
to get sufficient statistics on the latency.

Regarding bq_flush_to_queue() and the enqueue tracepoint:
   trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu)

I have an idea for you, on how to measure the latency overhead from XDP 
RX-processing to when enqueue "flush" happens.  It is a little tricky to 
explain, so I will outline the steps.

1. XDP bpf_prog store timestamp in per-CPU array,
    unless timestamp is already set.

2. trace_xdp_cpumap_enqueue bpf_prog reads per-CPU timestamp
    and calc latency diff, and clears timestamp.

This measures the latency overhead of bulk enqueue. (Notice: Only the
first XDP redirect frame after a bq_flush_to_queue() will set the
timestamp). This per-CPU store should work as this all runs under same
RX-NAPI "poll" execution.
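
A sketch of steps 1-2 (the tp_btf argument layout follows the
trace_xdp_cpumap_enqueue() call quoted above; SEC() names and the rest are
illustrative):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
        __uint(max_entries, 1);
        __type(key, __u32);
        __type(value, __u64);
} first_enq_ts SEC(".maps");

SEC("xdp")
int xdp_stamp_redirect(struct xdp_md *ctx)
{
        __u32 key = 0;
        __u64 *ts = bpf_map_lookup_elem(&first_enq_ts, &key);

        /* step 1: only the first redirect since the last flush sets the stamp */
        if (ts && !*ts)
                *ts = bpf_ktime_get_ns();

        /* a real prog would bpf_redirect_map() into the cpumap here */
        return XDP_PASS;
}

/* step 2: on the bulk-enqueue flush, compute the latency and clear the stamp */
SEC("tp_btf/xdp_cpumap_enqueue")
int BPF_PROG(cpumap_enqueue, int map_id, unsigned int processed,
             unsigned int drops, int to_cpu)
{
        __u32 key = 0;
        __u64 *ts = bpf_map_lookup_elem(&first_enq_ts, &key);

        if (ts && *ts) {
                /* record the delta into a histogram instead of printing */
                bpf_printk("bulk enqueue latency %llu ns",
                           bpf_ktime_get_ns() - *ts);
                *ts = 0;
        }
        return 0;
}

char _license[] SEC("license") = "GPL";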

This latency overhead of bulk enqueue will (unfortunately) also
count/measure the XDP_PASS packets that get processed by the normal
netstack.  So, watch out for this, e.g. you could have XDP action (e.g.
XDP_PASS) counters as part of step 1, and have statistics for cases where
XDP_PASS interfered.


--Jesper
Daniel Xu Sept. 11, 2024, 6:53 p.m. UTC | #13
On Wed, Sep 11, 2024 at 10:32:56AM GMT, Jesper Dangaard Brouer wrote:
> 
> 
> On 11/09/2024 06.43, Daniel Xu wrote:
> > [cc Jesper]
> > 
> > On Tue, Sep 10, 2024, at 8:31 PM, Daniel Xu wrote:
> > > On Tue, Sep 10, 2024 at 05:39:55PM GMT, Andrii Nakryiko wrote:
> > > > On Tue, Sep 10, 2024 at 4:44 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
> > > > > 
> > > > > On Tue, Sep 10, 2024 at 03:21:04PM GMT, Andrii Nakryiko wrote:
> > > > > > On Tue, Sep 10, 2024 at 3:16 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
> > > > > > > 
> [...cut...]
> 
> > > > Can you give us a bit more details on what
> > > > you are trying to achieve?
> > > 
> > > BPF cpumap, under the hood, has one MPSC ring buffer (ptr_ring) for each
> > > entry in the cpumap. When a prog redirects to an entry in the cpumap,
> > > the machinery queues up the xdp frame onto the destination CPU ptr_ring.
> > > This can occur on any cpu, thus multi-producer. On processing side,
> > > there is only the kthread created by the cpumap entry and bound to the
> > > specific cpu that is consuming entries. So single consumer.
> > > 
> 
> An important detail: to get Multi-Producer (MP) to scale the CPUMAP does
> bulk enqueue into the ptr_ring.  It stores the xdp_frame's in a per-CPU
> array and does the flush/enqueue as part of the xdp_do_flush(). Because
> I was afraid of this adding latency, I choose to also flush every 8
> frames (CPU_MAP_BULK_SIZE).
> 
> Looking at code I see this is also explained in a comment:
> 
> /* General idea: XDP packets getting XDP redirected to another CPU,
>  * will maximum be stored/queued for one driver ->poll() call.  It is
>  * guaranteed that queueing the frame and the flush operation happen on
>  * same CPU.  Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
>  * which queue in bpf_cpu_map_entry contains packets.
>  */
> 
> 
> > > Goal is to track the latency overhead added from ptr_ring and the
> > > kthread (versus softirq where is less overhead). Ideally we want p50,
> > > p90, p95, p99 percentiles.
> > > 
> 
> I'm very interesting in this use-case of understanding the latency of
> CPUMAP.
> I'm a fan of latency histograms that I turn into heatmaps in grafana.
> 
> > > To do this, we need to track every single entry enqueue time as well as
> > > dequeue time - events that occur in the tail are quite important.
> > > 
> > > Since ptr_ring is also a ring buffer, I thought it would be easy,
> > > reliable, and fast to just create a "shadow" ring buffer. Every time
> > > producer enqueues entries, I'd enqueue the same number of current
> > > timestamp onto shadow RB. Same thing on consumer side, except dequeue
> > > and calculate timestamp delta.
> > > 
> 
> This idea seems overkill and will likely produce unreliable results.
> E.g. the overhead of this additional ring buffer will also affect the
> measurements.

Yeah, good point.

> 
> > > I was originally planning on writing my own lockless ring buffer in pure
> > > BPF (b/c spinlocks cannot be used w/ tracepoints yet) but was hoping I
> > > could avoid that with this patch.
> > 
> > [...]
> > 
> > Alternatively, could add a u64 timestamp to xdp_frame, which makes all
> > this tracking inline (and thus more reliable). But I'm not sure how precious
> > the space in that struct is - I see some references online saying most drivers
> > save 128B headroom. I also see:
> > 
> >          #define XDP_PACKET_HEADROOM 256
> > 
> 
> I like the inline idea. I would suggest to add u64 timestamp into
> XDP-metadata area (ctx->data_meta code example[1]) , when XDP runs in
> RX-NAPI.  Then at the remote CPU you can run another CPUMAP-XDP program that
> pickup this timestamp, and then calc a delta from "now" timestamp.
> 
> 
>  [1] https://github.com/xdp-project/bpf-examples/blob/master/AF_XDP-interaction/af_xdp_kern.c#L62-L77

Cool! This is a much better idea than mine :)

I'll give this a try.

> 
> 
> > Could probably amortize the timestamp read by setting it in
> > bq_flush_to_queue().
> 
> To amortize, consider that you might not need to timestamp EVERY packet to
> get sufficient statistics on the latency.
> 
> Regarding bq_flush_to_queue() and the enqueue tracepoint:
>   trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu)
> 
> I have an idea for you, on how to measure the latency overhead from XDP
> RX-processing to when enqueue "flush" happens.  It is a little tricky to
> explain, so I will outline the steps.
> 
> 1. XDP bpf_prog store timestamp in per-CPU array,
>    unless timestamp is already set.
> 
> 2. trace_xdp_cpumap_enqueue bpf_prog reads per-CPU timestamp
>    and calc latency diff, and clears timestamp.
> 
> This measures the latency overhead of bulk enqueue. (Notice: Only the
> first XDP redirect frame after a bq_flush_to_queue() will set the
> timestamp). This per-CPU store should work as this all runs under same
> RX-NAPI "poll" execution.

Makes sense to me. This breaks down the latency even further. I'll keep
it in mind if we need further troubleshooting.
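
For posterity, I'd imagine it looking roughly like the below (untested
sketch; the map names, the cpumap sizing, and the bpf_printk output are
placeholders I made up):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
	__uint(type, BPF_MAP_TYPE_CPUMAP);
	__uint(max_entries, 64);
	__type(key, __u32);
	__type(value, struct bpf_cpumap_val);
} cpu_map SEC(".maps");

/* per-CPU slot: timestamp of the first redirect since the last flush */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u64);
} first_enq_ts SEC(".maps");

/* step 1: only the first redirected frame after a flush sets the slot */
SEC("xdp")
int xdp_redirect_mark(struct xdp_md *ctx)
{
	__u32 key = 0;
	__u64 *ts = bpf_map_lookup_elem(&first_enq_ts, &key);

	if (ts && !*ts)
		*ts = bpf_ktime_get_ns();

	return bpf_redirect_map(&cpu_map, 0, 0);
}

/* step 2: on the enqueue tracepoint, report the delta and clear the slot */
SEC("tp_btf/xdp_cpumap_enqueue")
int BPF_PROG(on_cpumap_enqueue, int map_id, unsigned int processed,
	     unsigned int drops, int to_cpu)
{
	__u32 key = 0;
	__u64 *ts = bpf_map_lookup_elem(&first_enq_ts, &key);

	if (ts && *ts) {
		bpf_printk("enqueue flush latency: %llu ns",
			   bpf_ktime_get_ns() - *ts);
		*ts = 0;
	}
	return 0;
}

char _license[] SEC("license") = "GPL";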

> This latency overhead of bulk enqueue, will (unfortunately) also
> count/measure the XDP_PASS packets that gets processed by the normal
> netstack.  So, watch out for this. e.g could have XDP actions (e.g
> XDP_PASS) counters as part of step 1, and have statistic for cases where
> XDP_PASS interfered.

Not sure I got this. If we only set the percpu timestamp for
XDP_REDIRECT frames, then I don't see how XDP_PASS interferes. Maybe I
misunderstand something.

Thanks,
Daniel
Andrii Nakryiko Sept. 11, 2024, 8:04 p.m. UTC | #14
On Tue, Sep 10, 2024 at 8:31 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
>
> On Tue, Sep 10, 2024 at 05:39:55PM GMT, Andrii Nakryiko wrote:
> > On Tue, Sep 10, 2024 at 4:44 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
> > >
> > > On Tue, Sep 10, 2024 at 03:21:04PM GMT, Andrii Nakryiko wrote:
> > > > On Tue, Sep 10, 2024 at 3:16 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
> > > > >
> > > > >
> > > > >
> > > > > On Tue, Sep 10, 2024, at 2:07 PM, Daniel Xu wrote:
> > > > > > On Tue, Sep 10, 2024 at 01:41:41PM GMT, Andrii Nakryiko wrote:
> > > > > >> On Tue, Sep 10, 2024 at 11:36 AM Alexei Starovoitov
> > > > > [...]
> > > > > >
> > > > > >>
> > > > > >> Also, Daniel, can you please make sure that dynptr we return for each
> > > > > >> sample is read-only? We shouldn't let consumer BPF program ability to
> > > > > >> corrupt ringbuf record headers (accidentally or otherwise).
> > > > > >
> > > > > > Sure.
> > > > >
> > > > > So the sample is not read-only. But I think prog is prevented from messing
> > > > > with header regardless.
> > > > >
> > > > > __bpf_user_ringbuf_peek() returns sample past the header:
> > > > >
> > > > >         *sample = (void *)((uintptr_t)rb->data +
> > > > >                            (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
> > > > >
> > > > > dynptr is initialized with the above ptr:
> > > > >
> > > > >         bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
> > > > >
> > > > > So I don't think there's a way for the prog to access the header thru the dynptr.
> > > > >
> > > >
> > > > By "header" I mean 8 bytes that precede each submitted ringbuf record.
> > > > That header is part of ringbuf data area. Given user space can set
> > > > consumer_pos to arbitrary value, kernel can return arbitrary part of
> > > > ringbuf data area, including that 8 byte header. If that data is
> > > > writable, it's easy to screw up that header and crash another BPF
> > > > program that reserves/submits a new record. User space can only read
> > > > data area for BPF ringbuf, and so we rely heavily on a tight control
> > > > of who can write what into those 8 bytes.
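
(To be concrete, the 8 bytes I mean are the per-record header kept at
the start of each sample in kernel/bpf/ringbuf.c, roughly paraphrased:

/* 8-byte per-record header, paraphrased from kernel/bpf/ringbuf.c */
struct bpf_ringbuf_hdr {
	u32 len;    /* sample length; top bits carry the BUSY/DISCARD flags */
	u32 pg_off; /* page offset back to the owning ringbuf, used on submit */
};

Corrupting len or pg_off from the consumer side is what would break
producers.)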
> > >
> > > Ah, ok. I think I understand.
> > >
> > > Given this and your other comments about rb->busy, what about enforcing
> > > bpf_user_ringbuf_drain() NAND mmap? I think the use cases here are
> > > different enough where this makes sense.
> >
> > You mean disabling user-space mmap()? TBH, I'd like to understand the
> > use case first before we make such decisions. Maybe what you need is
> > not really a BPF ringbuf? Can you give us a bit more details on what
> > you are trying to achieve?
>
> BPF cpumap, under the hood, has one MPSC ring buffer (ptr_ring) for each
> entry in the cpumap. When a prog redirects to an entry in the cpumap,
> the machinery queues up the xdp frame onto the destination CPU ptr_ring.
> This can occur on any cpu, thus multi-producer. On processing side,
> there is only the kthread created by the cpumap entry and bound to the
> specific cpu that is consuming entries. So single consumer.
>
> Goal is to track the latency overhead added from ptr_ring and the
> kthread (versus softirq where is less overhead). Ideally we want p50,
> p90, p95, p99 percentiles.
>
> To do this, we need to track every single entry enqueue time as well as
> dequeue time - events that occur in the tail are quite important.
>
> Since ptr_ring is also a ring buffer, I thought it would be easy,
> reliable, and fast to just create a "shadow" ring buffer. Every time
> producer enqueues entries, I'd enqueue the same number of current
> timestamp onto shadow RB. Same thing on consumer side, except dequeue
> and calculate timestamp delta.
>
> I was originally planning on writing my own lockless ring buffer in pure
> BPF (b/c spinlocks cannot be used w/ tracepoints yet) but was hoping I
> could avoid that with this patch.

TBH, I'd just use a fixed-size array with an atomic counter for the
enqueue position and a consumer position for dequeue. But it seems like
you might get away without any circular buffer at all, based on
Jesper's suggestion.
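
For the record, very roughly the kind of thing I mean (untested, names
made up, and it skips the reserve/commit handshake a real MPSC ring
would need so the consumer never sees half-written slots):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define SLOTS 4096 /* power of two */

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, SLOTS);
	__type(key, __u32);
	__type(value, __u64);
} ring SEC(".maps");

__u64 prod_pos = 0; /* bumped atomically by any producer */
__u64 cons_pos = 0; /* only advanced by the single consumer */

/* producer side; needs clang -mcpu=v3 for the returning fetch-and-add */
static __always_inline void produce(__u64 val)
{
	__u32 idx = __sync_fetch_and_add(&prod_pos, 1) & (SLOTS - 1);
	__u64 *slot = bpf_map_lookup_elem(&ring, &idx);

	if (slot)
		*slot = val; /* old entries get overwritten on wrap */
}

/* consumer side; single consumer, so plain loads/stores are enough here */
static __always_inline int consume(__u64 *val)
{
	__u32 idx;
	__u64 *slot;

	if (cons_pos >= prod_pos)
		return -1; /* empty */

	idx = cons_pos++ & (SLOTS - 1);
	slot = bpf_map_lookup_elem(&ring, &idx);
	if (!slot)
		return -1;

	*val = *slot;
	return 0;
}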

>
> About disabling user-space mmap: yeah, that's what I was suggesting. I
> think it'd be a bit odd if you wanted BPF RB to support consumption from
> both userspace && prog at the same time. And since draining from prog is
> new functionality (and thus the NAND isn't a regression), you could
> relax the restriction later without issues.

I probably wouldn't disable mmap()ing from user space; I'd let the user
handle coordination of consumers, if necessary (we could probably
expose the "busy" field through the consumer metadata page, if it isn't
already, to help with that a bit). This is more flexible, as it allows
alternating who is consuming, if necessary. The consumer side can't
compromise kernel-side producers (as long as we prevent writes from the
consumer side).

Still, all this feels a bit kludgy anyway. There is also the
unnecessary epoll notification, which will happen by default unless the
consumer explicitly requests no notification.

Anyway, if there is a better way for your task, see if that works first.
Jesper Dangaard Brouer Sept. 12, 2024, 9:40 a.m. UTC | #15
On 11/09/2024 20.53, Daniel Xu wrote:
> On Wed, Sep 11, 2024 at 10:32:56AM GMT, Jesper Dangaard Brouer wrote:
>>
>>
>> On 11/09/2024 06.43, Daniel Xu wrote:
>>> [cc Jesper]
>>>
>>> On Tue, Sep 10, 2024, at 8:31 PM, Daniel Xu wrote:
>>>> On Tue, Sep 10, 2024 at 05:39:55PM GMT, Andrii Nakryiko wrote:
>>>>> On Tue, Sep 10, 2024 at 4:44 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
>>>>>>
>>>>>> On Tue, Sep 10, 2024 at 03:21:04PM GMT, Andrii Nakryiko wrote:
>>>>>>> On Tue, Sep 10, 2024 at 3:16 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
>>>>>>>>
>> [...cut...]
>>
>>>>> Can you give us a bit more details on what
>>>>> you are trying to achieve?
>>>>
>>>> BPF cpumap, under the hood, has one MPSC ring buffer (ptr_ring) for each
>>>> entry in the cpumap. When a prog redirects to an entry in the cpumap,
>>>> the machinery queues up the xdp frame onto the destination CPU ptr_ring.
>>>> This can occur on any cpu, thus multi-producer. On processing side,
>>>> there is only the kthread created by the cpumap entry and bound to the
>>>> specific cpu that is consuming entries. So single consumer.
>>>>
>>
>> An important detail: to get Multi-Producer (MP) to scale the CPUMAP does
>> bulk enqueue into the ptr_ring.  It stores the xdp_frame's in a per-CPU
>> array and does the flush/enqueue as part of the xdp_do_flush(). Because
>> I was afraid of this adding latency, I choose to also flush every 8
>> frames (CPU_MAP_BULK_SIZE).
>>
>> Looking at code I see this is also explained in a comment:
>>
>> /* General idea: XDP packets getting XDP redirected to another CPU,
>>   * will maximum be stored/queued for one driver ->poll() call.  It is
>>   * guaranteed that queueing the frame and the flush operation happen on
>>   * same CPU.  Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
>>   * which queue in bpf_cpu_map_entry contains packets.
>>   */
>>
>>
>>>> Goal is to track the latency overhead added from ptr_ring and the
>>>> kthread (versus softirq where is less overhead). Ideally we want p50,
>>>> p90, p95, p99 percentiles.
>>>>
>>
>> I'm very interesting in this use-case of understanding the latency of
>> CPUMAP.
>> I'm a fan of latency histograms that I turn into heatmaps in grafana.
>>
>>>> To do this, we need to track every single entry enqueue time as well as
>>>> dequeue time - events that occur in the tail are quite important.
>>>>
>>>> Since ptr_ring is also a ring buffer, I thought it would be easy,
>>>> reliable, and fast to just create a "shadow" ring buffer. Every time
>>>> producer enqueues entries, I'd enqueue the same number of current
>>>> timestamp onto shadow RB. Same thing on consumer side, except dequeue
>>>> and calculate timestamp delta.
>>>>
>>
>> This idea seems overkill and will likely produce unreliable results.
>> E.g. the overhead of this additional ring buffer will also affect the
>> measurements.
> 
> Yeah, good point.
> 
>>
>>>> I was originally planning on writing my own lockless ring buffer in pure
>>>> BPF (b/c spinlocks cannot be used w/ tracepoints yet) but was hoping I
>>>> could avoid that with this patch.
>>>
>>> [...]
>>>
>>> Alternatively, could add a u64 timestamp to xdp_frame, which makes all
>>> this tracking inline (and thus more reliable). But I'm not sure how precious
>>> the space in that struct is - I see some references online saying most drivers
>>> save 128B headroom. I also see:
>>>
>>>           #define XDP_PACKET_HEADROOM 256
>>>
>>
>> I like the inline idea. I would suggest to add u64 timestamp into
>> XDP-metadata area (ctx->data_meta code example[1]) , when XDP runs in
>> RX-NAPI.  Then at the remote CPU you can run another CPUMAP-XDP program that
>> pickup this timestamp, and then calc a delta from "now" timestamp.
>>
>>
>>   [1] https://github.com/xdp-project/bpf-examples/blob/master/AF_XDP-interaction/af_xdp_kern.c#L62-L77
> 
> Cool! This is a much better idea than mine :)
> 
> I'll give this a try.
> 
>>
>>
>>> Could probably amortize the timestamp read by setting it in
>>> bq_flush_to_queue().
>>
>> To amortize, consider that you might not need to timestamp EVERY packet to
>> get sufficient statistics on the latency.
>>
>> Regarding bq_flush_to_queue() and the enqueue tracepoint:
>>    trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu)
>>
>> I have an idea for you, on how to measure the latency overhead from XDP
>> RX-processing to when enqueue "flush" happens.  It is a little tricky to
>> explain, so I will outline the steps.
>>
>> 1. XDP bpf_prog store timestamp in per-CPU array,
>>     unless timestamp is already set.
>>
>> 2. trace_xdp_cpumap_enqueue bpf_prog reads per-CPU timestamp
>>     and calc latency diff, and clears timestamp.
>>
>> This measures the latency overhead of bulk enqueue. (Notice: Only the
>> first XDP redirect frame after a bq_flush_to_queue() will set the
>> timestamp). This per-CPU store should work as this all runs under same
>> RX-NAPI "poll" execution.
> 
> Makes sense to me. This breaks down the latency even further. I'll keep
> it in mind if we need further troubleshooting.
> 

Yes, this breaks down the latency even further :-)

>> This latency overhead of bulk enqueue, will (unfortunately) also
>> count/measure the XDP_PASS packets that gets processed by the normal
>> netstack.  So, watch out for this. e.g could have XDP actions (e.g
>> XDP_PASS) counters as part of step 1, and have statistic for cases where
>> XDP_PASS interfered.
> 
> Not sure I got this. If we only set the percpu timestamp for
> XDP_REDIRECT frames, then I don't see how XDP_PASS interferes. Maybe I
> misunderstand something.
> 

Not quite. You only set the timestamp for the first XDP_REDIRECT frame.
I'm simply saying that all the packets processed *after* this timestamp
will contribute to the time it takes until trace_xdp_cpumap_enqueue()
runs.  That part should be obvious.  Then, I'm saying remember that an
XDP_PASS packet takes more time than an XDP_REDIRECT packet.  I hope
this makes it more clear. Point: this is a pitfall you need to be aware
of when looking at your metrics.

For the inline timestamping the same pitfall actually applies. There
you timestamp the packets themselves.  The CPUMAP enqueue happens as
the *LAST* thing in the NAPI loop, during the xdp_do_flush() call. This
means that interleaved normal netstack (XDP_PASS) packets will be
processed BEFORE this call to xdp_do_flush().  As noted earlier, to
compensate for this effect, the code also enqueue-flushes every 8
frames (CPU_MAP_BULK_SIZE).

I hope I've not confused you more :-|

--Jesper
Daniel Xu Sept. 13, 2024, 2:15 a.m. UTC | #16
On Thu, Sep 12, 2024, at 2:40 AM, Jesper Dangaard Brouer wrote:
> On 11/09/2024 20.53, Daniel Xu wrote:
>> On Wed, Sep 11, 2024 at 10:32:56AM GMT, Jesper Dangaard Brouer wrote:
>>>
>>>
>>> On 11/09/2024 06.43, Daniel Xu wrote:
>>>> [cc Jesper]
>>>>
>>>> On Tue, Sep 10, 2024, at 8:31 PM, Daniel Xu wrote:
>>>>> On Tue, Sep 10, 2024 at 05:39:55PM GMT, Andrii Nakryiko wrote:
>>>>>> On Tue, Sep 10, 2024 at 4:44 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
>>>>>>>
>>>>>>> On Tue, Sep 10, 2024 at 03:21:04PM GMT, Andrii Nakryiko wrote:
>>>>>>>> On Tue, Sep 10, 2024 at 3:16 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
>>>>>>>>>
>>> [...cut...]
>>>
>>>>>> Can you give us a bit more details on what
>>>>>> you are trying to achieve?
>>>>>
>>>>> BPF cpumap, under the hood, has one MPSC ring buffer (ptr_ring) for each
>>>>> entry in the cpumap. When a prog redirects to an entry in the cpumap,
>>>>> the machinery queues up the xdp frame onto the destination CPU ptr_ring.
>>>>> This can occur on any cpu, thus multi-producer. On processing side,
>>>>> there is only the kthread created by the cpumap entry and bound to the
>>>>> specific cpu that is consuming entries. So single consumer.
>>>>>
>>>
>>> An important detail: to get Multi-Producer (MP) to scale the CPUMAP does
>>> bulk enqueue into the ptr_ring.  It stores the xdp_frame's in a per-CPU
>>> array and does the flush/enqueue as part of the xdp_do_flush(). Because
>>> I was afraid of this adding latency, I choose to also flush every 8
>>> frames (CPU_MAP_BULK_SIZE).
>>>
>>> Looking at code I see this is also explained in a comment:
>>>
>>> /* General idea: XDP packets getting XDP redirected to another CPU,
>>>   * will maximum be stored/queued for one driver ->poll() call.  It is
>>>   * guaranteed that queueing the frame and the flush operation happen on
>>>   * same CPU.  Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
>>>   * which queue in bpf_cpu_map_entry contains packets.
>>>   */
>>>
>>>
>>>>> Goal is to track the latency overhead added from ptr_ring and the
>>>>> kthread (versus softirq where is less overhead). Ideally we want p50,
>>>>> p90, p95, p99 percentiles.
>>>>>
>>>
>>> I'm very interesting in this use-case of understanding the latency of
>>> CPUMAP.
>>> I'm a fan of latency histograms that I turn into heatmaps in grafana.
>>>
>>>>> To do this, we need to track every single entry enqueue time as well as
>>>>> dequeue time - events that occur in the tail are quite important.
>>>>>
>>>>> Since ptr_ring is also a ring buffer, I thought it would be easy,
>>>>> reliable, and fast to just create a "shadow" ring buffer. Every time
>>>>> producer enqueues entries, I'd enqueue the same number of current
>>>>> timestamp onto shadow RB. Same thing on consumer side, except dequeue
>>>>> and calculate timestamp delta.
>>>>>
>>>
>>> This idea seems overkill and will likely produce unreliable results.
>>> E.g. the overhead of this additional ring buffer will also affect the
>>> measurements.
>> 
>> Yeah, good point.
>> 
>>>
>>>>> I was originally planning on writing my own lockless ring buffer in pure
>>>>> BPF (b/c spinlocks cannot be used w/ tracepoints yet) but was hoping I
>>>>> could avoid that with this patch.
>>>>
>>>> [...]
>>>>
>>>> Alternatively, could add a u64 timestamp to xdp_frame, which makes all
>>>> this tracking inline (and thus more reliable). But I'm not sure how precious
>>>> the space in that struct is - I see some references online saying most drivers
>>>> save 128B headroom. I also see:
>>>>
>>>>           #define XDP_PACKET_HEADROOM 256
>>>>
>>>
>>> I like the inline idea. I would suggest to add u64 timestamp into
>>> XDP-metadata area (ctx->data_meta code example[1]) , when XDP runs in
>>> RX-NAPI.  Then at the remote CPU you can run another CPUMAP-XDP program that
>>> pickup this timestamp, and then calc a delta from "now" timestamp.
>>>
>>>
>>>   [1] https://github.com/xdp-project/bpf-examples/blob/master/AF_XDP-interaction/af_xdp_kern.c#L62-L77
>> 
>> Cool! This is a much better idea than mine :)
>> 
>> I'll give this a try.
>> 
>>>
>>>
>>>> Could probably amortize the timestamp read by setting it in
>>>> bq_flush_to_queue().
>>>
>>> To amortize, consider that you might not need to timestamp EVERY packet to
>>> get sufficient statistics on the latency.
>>>
>>> Regarding bq_flush_to_queue() and the enqueue tracepoint:
>>>    trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu)
>>>
>>> I have an idea for you, on how to measure the latency overhead from XDP
>>> RX-processing to when enqueue "flush" happens.  It is a little tricky to
>>> explain, so I will outline the steps.
>>>
>>> 1. XDP bpf_prog store timestamp in per-CPU array,
>>>     unless timestamp is already set.
>>>
>>> 2. trace_xdp_cpumap_enqueue bpf_prog reads per-CPU timestamp
>>>     and calc latency diff, and clears timestamp.
>>>
>>> This measures the latency overhead of bulk enqueue. (Notice: Only the
>>> first XDP redirect frame after a bq_flush_to_queue() will set the
>>> timestamp). This per-CPU store should work as this all runs under same
>>> RX-NAPI "poll" execution.
>> 
>> Makes sense to me. This breaks down the latency even further. I'll keep
>> it in mind if we need further troubleshooting.
>> 
>
> Yes, this breaks down the latency even further :-)
>
>>> This latency overhead of bulk enqueue, will (unfortunately) also
>>> count/measure the XDP_PASS packets that gets processed by the normal
>>> netstack.  So, watch out for this. e.g could have XDP actions (e.g
>>> XDP_PASS) counters as part of step 1, and have statistic for cases where
>>> XDP_PASS interfered.
>> 
>> Not sure I got this. If we only set the percpu timestamp for
>> XDP_REDIRECT frames, then I don't see how XDP_PASS interferes. Maybe I
>> misunderstand something.
>> 
>
> Not quite. You only set the timestamp for the first XDP_REDIRECT frame.
> I'm simply saying that all the packets processed *after* this timestamp
> will contribute to the time it takes until trace_xdp_cpumap_enqueue()
> runs.  That part should be obvious.  Then, I'm saying remember that an
> XDP_PASS packet takes more time than an XDP_REDIRECT packet.  I hope
> this makes it more clear. Point: this is a pitfall you need to be aware
> of when looking at your metrics.
>
> For the inline timestamping the same pitfall actually applies. There
> you timestamp the packets themselves.  The CPUMAP enqueue happens as
> the *LAST* thing in the NAPI loop, during the xdp_do_flush() call. This
> means that interleaved normal netstack (XDP_PASS) packets will be
> processed BEFORE this call to xdp_do_flush().  As noted earlier, to
> compensate for this effect, the code also enqueue-flushes every 8
> frames (CPU_MAP_BULK_SIZE).

Ah, that makes sense. Thanks for explaining.

>
> I hope I've not confused you more :-|
>

No, that was helpful!

Thanks,
Daniel
diff mbox series

Patch

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 53d0556fbbf3..56bfe559f228 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9142,7 +9142,8 @@  static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		    func_id != BPF_FUNC_ringbuf_query &&
 		    func_id != BPF_FUNC_ringbuf_reserve_dynptr &&
 		    func_id != BPF_FUNC_ringbuf_submit_dynptr &&
-		    func_id != BPF_FUNC_ringbuf_discard_dynptr)
+		    func_id != BPF_FUNC_ringbuf_discard_dynptr &&
+		    func_id != BPF_FUNC_user_ringbuf_drain)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_USER_RINGBUF:
@@ -9276,7 +9277,8 @@  static int check_map_func_compatibility(struct bpf_verifier_env *env,
 			goto error;
 		break;
 	case BPF_FUNC_user_ringbuf_drain:
-		if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
+		if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF &&
+		    map->map_type != BPF_MAP_TYPE_RINGBUF)
 			goto error;
 		break;
 	case BPF_FUNC_get_stackid:
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 9905e3739dd0..233109843d4d 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -503,7 +503,8 @@  LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h		\
 LSKELS := fentry_test.c fexit_test.c fexit_sleep.c atomics.c 		\
 	trace_printk.c trace_vprintk.c map_ptr_kern.c 			\
 	core_kern.c core_kern_overflow.c test_ringbuf.c			\
-	test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c
+	test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c    \
+	test_ringbuf_bpf_to_bpf.c
 
 # Generate both light skeleton and libbpf skeleton for these
 LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test.c \
diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c
index da430df45aa4..3e7955573ac5 100644
--- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c
+++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c
@@ -17,6 +17,7 @@ 
 #include "test_ringbuf_n.lskel.h"
 #include "test_ringbuf_map_key.lskel.h"
 #include "test_ringbuf_write.lskel.h"
+#include "test_ringbuf_bpf_to_bpf.lskel.h"
 
 #define EDONE 7777
 
@@ -497,6 +498,53 @@  static void ringbuf_map_key_subtest(void)
 	test_ringbuf_map_key_lskel__destroy(skel_map_key);
 }
 
+static void ringbuf_bpf_to_bpf_subtest(void)
+{
+	struct test_ringbuf_bpf_to_bpf_lskel *skel;
+	int err, i;
+
+	skel = test_ringbuf_bpf_to_bpf_lskel__open();
+	if (!ASSERT_OK_PTR(skel, "test_ringbuf_bpf_to_bpf_lskel__open"))
+		return;
+
+	skel->maps.ringbuf.max_entries = getpagesize();
+	skel->bss->pid = getpid();
+
+	err = test_ringbuf_bpf_to_bpf_lskel__load(skel);
+	if (!ASSERT_OK(err, "test_ringbuf_bpf_to_bpf_lskel__load"))
+		goto cleanup;
+
+	ringbuf = ring_buffer__new(skel->maps.ringbuf.map_fd, NULL, NULL, NULL);
+	if (!ASSERT_OK_PTR(ringbuf, "ring_buffer__new"))
+		goto cleanup;
+
+	err = test_ringbuf_bpf_to_bpf_lskel__attach(skel);
+	if (!ASSERT_OK(err, "test_ringbuf_bpf_to_bpf_lskel__attach"))
+		goto cleanup_ringbuf;
+
+	/* Produce N_SAMPLES samples in the ring buffer by calling getpgid() */
+	skel->bss->value = SAMPLE_VALUE;
+	for (i = 0; i < N_SAMPLES; i++)
+		syscall(__NR_getpgid);
+
+	/* Trigger bpf-side consumption */
+	syscall(__NR_prctl);
+
+	/* Check the correct number of samples were consumed */
+	if (!ASSERT_EQ(skel->bss->round_tripped, N_SAMPLES, "round_tripped"))
+		goto cleanup_ringbuf;
+
+	/* Check all samples were consumed */
+	err = ring_buffer__consume(ringbuf);
+	if (!ASSERT_EQ(err, 0, "rb_consume"))
+		goto cleanup_ringbuf;
+
+cleanup_ringbuf:
+	ring_buffer__free(ringbuf);
+cleanup:
+	test_ringbuf_bpf_to_bpf_lskel__destroy(skel);
+}
+
 void test_ringbuf(void)
 {
 	if (test__start_subtest("ringbuf"))
@@ -507,4 +555,6 @@  void test_ringbuf(void)
 		ringbuf_map_key_subtest();
 	if (test__start_subtest("ringbuf_write"))
 		ringbuf_write_subtest();
+	if (test__start_subtest("ringbuf_bpf_to_bpf"))
+		ringbuf_bpf_to_bpf_subtest();
 }
diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_bpf_to_bpf.c b/tools/testing/selftests/bpf/progs/test_ringbuf_bpf_to_bpf.c
new file mode 100644
index 000000000000..378154922024
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_ringbuf_bpf_to_bpf.c
@@ -0,0 +1,64 @@ 
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <unistd.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+struct sample {
+	long value;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_RINGBUF);
+} ringbuf SEC(".maps");
+
+int pid = 0;
+long value = 0;
+int round_tripped = 0;
+
+SEC("fentry/" SYS_PREFIX "sys_getpgid")
+int test_ringbuf_bpf_to_bpf_produce(void *ctx)
+{
+	int cur_pid = bpf_get_current_pid_tgid() >> 32;
+	struct sample *sample;
+
+	if (cur_pid != pid)
+		return 0;
+
+	sample = bpf_ringbuf_reserve(&ringbuf, sizeof(*sample), 0);
+	if (!sample)
+		return 0;
+	sample->value = value;
+
+	bpf_ringbuf_submit(sample, 0);
+	return 0;
+}
+
+static long consume_cb(struct bpf_dynptr *dynptr, void *context)
+{
+	struct sample *sample = NULL;
+
+	sample = bpf_dynptr_data(dynptr, 0, sizeof(*sample));
+	if (!sample)
+		return 0;
+
+	if (sample->value == value)
+		round_tripped++;
+
+	return 0;
+}
+
+SEC("fentry/" SYS_PREFIX "sys_prctl")
+int test_ringbuf_bpf_to_bpf_consume(void *ctx)
+{
+	int cur_pid = bpf_get_current_pid_tgid() >> 32;
+
+	if (cur_pid != pid)
+		return 0;
+
+	bpf_user_ringbuf_drain(&ringbuf, consume_cb, NULL, 0);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";