[bpf-next] selftests/bpf: add multi-uprobe benchmarks

Message ID 20240806042935.3867862-1-andrii@kernel.org (mailing list archive)
State Accepted
Commit f727b13dbea16c5e117e263aa8aea59d632d5660
Delegated to: BPF
Series [bpf-next] selftests/bpf: add multi-uprobe benchmarks

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 7 this patch: 7
netdev/build_tools success Errors and warnings before: 10 this patch: 10
netdev/cc_maintainers warning 12 maintainers not CCed: kpsingh@kernel.org shuah@kernel.org haoluo@google.com john.fastabend@gmail.com jolsa@kernel.org linux-kselftest@vger.kernel.org yonghong.song@linux.dev martin.lau@linux.dev mykolal@fb.com song@kernel.org eddyz87@gmail.com sdf@fomichev.me
netdev/build_clang success Errors and warnings before: 7 this patch: 7
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 7 this patch: 7
netdev/checkpatch warning CHECK: Alignment should match open parenthesis CHECK: Lines should not end with a '(' WARNING: 'peformance' may be misspelled - perhaps 'performance'? WARNING: externs should be avoided in .c files WARNING: line length of 86 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns WARNING: line length of 97 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18

Commit Message

Andrii Nakryiko Aug. 6, 2024, 4:29 a.m. UTC
Add multi-uprobe and multi-uretprobe benchmarks to bench tool.
Multi- and classic uprobes/uretprobes have different low-level
triggering code paths, so it's sometimes important to be able to
benchmark both flavors of uprobes/uretprobes.

Sample results from my dev machine are below. Single-threaded performance
is almost identical, but the difference grows as more CPUs trigger the
same uprobe/uretprobe in parallel. This might be due to [0], but given
that the code paths differ slightly, there could be other sources of
slowdown.

Note, all these numbers will change due to ongoing work to improve
uprobe/uretprobe scalability (e.g., [1]), but having benchmarks like
these is useful for measurement and debugging nevertheless.

uprobe-nop            ( 1 cpus):    1.020 ± 0.005M/s  (  1.020M/s/cpu)
uretprobe-nop         ( 1 cpus):    0.515 ± 0.009M/s  (  0.515M/s/cpu)
uprobe-multi-nop      ( 1 cpus):    1.036 ± 0.004M/s  (  1.036M/s/cpu)
uretprobe-multi-nop   ( 1 cpus):    0.512 ± 0.005M/s  (  0.512M/s/cpu)

uprobe-nop            ( 8 cpus):    3.481 ± 0.030M/s  (  0.435M/s/cpu)
uretprobe-nop         ( 8 cpus):    2.222 ± 0.008M/s  (  0.278M/s/cpu)
uprobe-multi-nop      ( 8 cpus):    3.769 ± 0.094M/s  (  0.471M/s/cpu)
uretprobe-multi-nop   ( 8 cpus):    2.482 ± 0.007M/s  (  0.310M/s/cpu)

uprobe-nop            (16 cpus):    2.968 ± 0.011M/s  (  0.185M/s/cpu)
uretprobe-nop         (16 cpus):    1.870 ± 0.002M/s  (  0.117M/s/cpu)
uprobe-multi-nop      (16 cpus):    3.541 ± 0.037M/s  (  0.221M/s/cpu)
uretprobe-multi-nop   (16 cpus):    2.123 ± 0.026M/s  (  0.133M/s/cpu)

uprobe-nop            (32 cpus):    2.524 ± 0.026M/s  (  0.079M/s/cpu)
uretprobe-nop         (32 cpus):    1.572 ± 0.003M/s  (  0.049M/s/cpu)
uprobe-multi-nop      (32 cpus):    2.717 ± 0.003M/s  (  0.085M/s/cpu)
uretprobe-multi-nop   (32 cpus):    1.687 ± 0.007M/s  (  0.053M/s/cpu)

  [0] https://lore.kernel.org/linux-trace-kernel/20240805202803.1813090-1-andrii@kernel.org/
  [1] https://lore.kernel.org/linux-trace-kernel/20240731214256.3588718-1-andrii@kernel.org/

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 tools/testing/selftests/bpf/bench.c           | 12 +++
 .../selftests/bpf/benchs/bench_trigger.c      | 81 +++++++++++++++----
 .../selftests/bpf/progs/trigger_bench.c       |  7 ++
 3 files changed, 85 insertions(+), 15 deletions(-)
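
For reference, each new benchmark can be run individually through the
bench tool CLI. A usage sketch (flag meanings as in the measurement
script quoted later in this thread: -w warmup seconds, -d duration
seconds, -p number of parallel triggering threads, -a CPU affinity):

    # benchmark the multi-uprobe flavor with 8 triggering threads
    sudo ./bench -w1 -d3 -p8 -a trig-uprobe-multi-nop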

Comments

Jiri Olsa Aug. 6, 2024, 7:25 a.m. UTC | #1
On Mon, Aug 05, 2024 at 09:29:35PM -0700, Andrii Nakryiko wrote:
> Add multi-uprobe and multi-uretprobe benchmarks to bench tool.
> Multi- and classic uprobes/uretprobes have different low-level
> triggering code paths, so it's sometimes important to be able to
> benchmark both flavors of uprobes/uretprobes.
> 
> Sample results from my dev machine are below. Single-threaded performance
> is almost identical, but the difference grows as more CPUs trigger the
> same uprobe/uretprobe in parallel. This might be due to [0], but given
> that the code paths differ slightly, there could be other sources of
> slowdown.
> 
> Note, all these numbers will change due to ongoing work to improve
> uprobe/uretprobe scalability (e.g., [1]), but having benchmarks like
> these is useful for measurement and debugging nevertheless.
> 
> uprobe-nop            ( 1 cpus):    1.020 ± 0.005M/s  (  1.020M/s/cpu)
> uretprobe-nop         ( 1 cpus):    0.515 ± 0.009M/s  (  0.515M/s/cpu)
> uprobe-multi-nop      ( 1 cpus):    1.036 ± 0.004M/s  (  1.036M/s/cpu)
> uretprobe-multi-nop   ( 1 cpus):    0.512 ± 0.005M/s  (  0.512M/s/cpu)
> 
> uprobe-nop            ( 8 cpus):    3.481 ± 0.030M/s  (  0.435M/s/cpu)
> uretprobe-nop         ( 8 cpus):    2.222 ± 0.008M/s  (  0.278M/s/cpu)
> uprobe-multi-nop      ( 8 cpus):    3.769 ± 0.094M/s  (  0.471M/s/cpu)
> uretprobe-multi-nop   ( 8 cpus):    2.482 ± 0.007M/s  (  0.310M/s/cpu)
> 
> uprobe-nop            (16 cpus):    2.968 ± 0.011M/s  (  0.185M/s/cpu)
> uretprobe-nop         (16 cpus):    1.870 ± 0.002M/s  (  0.117M/s/cpu)
> uprobe-multi-nop      (16 cpus):    3.541 ± 0.037M/s  (  0.221M/s/cpu)
> uretprobe-multi-nop   (16 cpus):    2.123 ± 0.026M/s  (  0.133M/s/cpu)
> 
> uprobe-nop            (32 cpus):    2.524 ± 0.026M/s  (  0.079M/s/cpu)
> uretprobe-nop         (32 cpus):    1.572 ± 0.003M/s  (  0.049M/s/cpu)
> uprobe-multi-nop      (32 cpus):    2.717 ± 0.003M/s  (  0.085M/s/cpu)
> uretprobe-multi-nop   (32 cpus):    1.687 ± 0.007M/s  (  0.053M/s/cpu)

nice, do you have a script for this output?
we could add it to benchs/run_bench_uprobes.sh

lgtm

Acked-by: Jiri Olsa <jolsa@kernel.org>

jirka

> 
>   [0] https://lore.kernel.org/linux-trace-kernel/20240805202803.1813090-1-andrii@kernel.org/
>   [1] https://lore.kernel.org/linux-trace-kernel/20240731214256.3588718-1-andrii@kernel.org/
> 
> Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> ---
>  tools/testing/selftests/bpf/bench.c           | 12 +++
>  .../selftests/bpf/benchs/bench_trigger.c      | 81 +++++++++++++++----
>  .../selftests/bpf/progs/trigger_bench.c       |  7 ++
>  3 files changed, 85 insertions(+), 15 deletions(-)
> 
> diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
> index 90dc3aca32bd..1bd403a5ef7b 100644
> --- a/tools/testing/selftests/bpf/bench.c
> +++ b/tools/testing/selftests/bpf/bench.c
> @@ -520,6 +520,12 @@ extern const struct bench bench_trig_uprobe_push;
>  extern const struct bench bench_trig_uretprobe_push;
>  extern const struct bench bench_trig_uprobe_ret;
>  extern const struct bench bench_trig_uretprobe_ret;
> +extern const struct bench bench_trig_uprobe_multi_nop;
> +extern const struct bench bench_trig_uretprobe_multi_nop;
> +extern const struct bench bench_trig_uprobe_multi_push;
> +extern const struct bench bench_trig_uretprobe_multi_push;
> +extern const struct bench bench_trig_uprobe_multi_ret;
> +extern const struct bench bench_trig_uretprobe_multi_ret;

SNIP
Andrii Nakryiko Aug. 6, 2024, 5:30 p.m. UTC | #2
On Tue, Aug 6, 2024 at 12:25 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Mon, Aug 05, 2024 at 09:29:35PM -0700, Andrii Nakryiko wrote:
> > Add multi-uprobe and multi-uretprobe benchmarks to bench tool.
> > Multi- and classic uprobes/uretprobes have different low-level
> > triggering code paths, so it's sometimes important to be able to
> > benchmark both flavors of uprobes/uretprobes.
> >
> > Sample results from my dev machine are below. Single-threaded performance
> > is almost identical, but the difference grows as more CPUs trigger the
> > same uprobe/uretprobe in parallel. This might be due to [0], but given
> > that the code paths differ slightly, there could be other sources of
> > slowdown.
> >
> > Note, all these numbers will change due to ongoing work to improve
> > uprobe/uretprobe scalability (e.g., [1]), but having benchmarks like
> > these is useful for measurement and debugging nevertheless.
> >
> > uprobe-nop            ( 1 cpus):    1.020 ± 0.005M/s  (  1.020M/s/cpu)
> > uretprobe-nop         ( 1 cpus):    0.515 ± 0.009M/s  (  0.515M/s/cpu)
> > uprobe-multi-nop      ( 1 cpus):    1.036 ± 0.004M/s  (  1.036M/s/cpu)
> > uretprobe-multi-nop   ( 1 cpus):    0.512 ± 0.005M/s  (  0.512M/s/cpu)
> >
> > uprobe-nop            ( 8 cpus):    3.481 ± 0.030M/s  (  0.435M/s/cpu)
> > uretprobe-nop         ( 8 cpus):    2.222 ± 0.008M/s  (  0.278M/s/cpu)
> > uprobe-multi-nop      ( 8 cpus):    3.769 ± 0.094M/s  (  0.471M/s/cpu)
> > uretprobe-multi-nop   ( 8 cpus):    2.482 ± 0.007M/s  (  0.310M/s/cpu)
> >
> > uprobe-nop            (16 cpus):    2.968 ± 0.011M/s  (  0.185M/s/cpu)
> > uretprobe-nop         (16 cpus):    1.870 ± 0.002M/s  (  0.117M/s/cpu)
> > uprobe-multi-nop      (16 cpus):    3.541 ± 0.037M/s  (  0.221M/s/cpu)
> > uretprobe-multi-nop   (16 cpus):    2.123 ± 0.026M/s  (  0.133M/s/cpu)
> >
> > uprobe-nop            (32 cpus):    2.524 ± 0.026M/s  (  0.079M/s/cpu)
> > uretprobe-nop         (32 cpus):    1.572 ± 0.003M/s  (  0.049M/s/cpu)
> > uprobe-multi-nop      (32 cpus):    2.717 ± 0.003M/s  (  0.085M/s/cpu)
> > uretprobe-multi-nop   (32 cpus):    1.687 ± 0.007M/s  (  0.053M/s/cpu)
>
> nice, do you have a script for this output?
> we could add it to benchs/run_bench_uprobes.sh
>

I keep tuning those scripts to my own needs, so I'm not sure if it's
worth adding all of them to selftests. It's very similar to what we
already have, but see the exact script below:

#!/bin/bash

set -eufo pipefail

for p in 1 8 16 32; do
    for i in uprobe-nop uretprobe-nop uprobe-multi-nop uretprobe-multi-nop; do
        summary=$(sudo ./bench -w1 -d3 -p$p -a trig-$i | tail -n1)
        total=$(echo "$summary" | cut -d'(' -f1 | cut -d' ' -f3-)
        percpu=$(echo "$summary" | cut -d'(' -f2 | cut -d')' -f1 | cut -d'/' -f1)
        printf "%-21s (%2d cpus): %s (%s/s/cpu)\n" $i $p "$total" "$percpu"
    done
    echo
done
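
(For reference: in the script above, the first cut pipeline takes
everything before the opening '(' of the bench summary line and drops
its first two words, leaving the total throughput; the second pipeline
takes the text between the parentheses and splits on '/' to keep just
the per-CPU value, with the "/s/cpu" suffix re-added by the printf.)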


> lgtm
>
> Acked-by: Jiri Olsa <jolsa@kernel.org>
>
> jirka
>
> >
> >   [0] https://lore.kernel.org/linux-trace-kernel/20240805202803.1813090-1-andrii@kernel.org/
> >   [1] https://lore.kernel.org/linux-trace-kernel/20240731214256.3588718-1-andrii@kernel.org/
> >
> > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > ---
> >  tools/testing/selftests/bpf/bench.c           | 12 +++
> >  .../selftests/bpf/benchs/bench_trigger.c      | 81 +++++++++++++++----
> >  .../selftests/bpf/progs/trigger_bench.c       |  7 ++
> >  3 files changed, 85 insertions(+), 15 deletions(-)
> >
> > diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
> > index 90dc3aca32bd..1bd403a5ef7b 100644
> > --- a/tools/testing/selftests/bpf/bench.c
> > +++ b/tools/testing/selftests/bpf/bench.c
> > @@ -520,6 +520,12 @@ extern const struct bench bench_trig_uprobe_push;
> >  extern const struct bench bench_trig_uretprobe_push;
> >  extern const struct bench bench_trig_uprobe_ret;
> >  extern const struct bench bench_trig_uretprobe_ret;
> > +extern const struct bench bench_trig_uprobe_multi_nop;
> > +extern const struct bench bench_trig_uretprobe_multi_nop;
> > +extern const struct bench bench_trig_uprobe_multi_push;
> > +extern const struct bench bench_trig_uretprobe_multi_push;
> > +extern const struct bench bench_trig_uprobe_multi_ret;
> > +extern const struct bench bench_trig_uretprobe_multi_ret;
>
> SNIP
Alexei Starovoitov Aug. 23, 2024, 5:02 p.m. UTC | #3
On Tue, Aug 6, 2024 at 10:31 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Tue, Aug 6, 2024 at 12:25 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> >
> > On Mon, Aug 05, 2024 at 09:29:35PM -0700, Andrii Nakryiko wrote:
> > > Add multi-uprobe and multi-uretprobe benchmarks to bench tool.
> > > Multi- and classic uprobes/uretprobes have different low-level
> > > triggering code paths, so it's sometimes important to be able to
> > > benchmark both flavors of uprobes/uretprobes.
> > >
> > > Sample results from my dev machine are below. Single-threaded performance
> > > is almost identical, but the difference grows as more CPUs trigger the
> > > same uprobe/uretprobe in parallel. This might be due to [0], but given
> > > that the code paths differ slightly, there could be other sources of
> > > slowdown.
> > >
> > > Note, all these numbers will change due to ongoing work to improve
> > > uprobe/uretprobe scalability (e.g., [1]), but having benchmarks like
> > > these is useful for measurement and debugging nevertheless.
> > >
> > > uprobe-nop            ( 1 cpus):    1.020 ± 0.005M/s  (  1.020M/s/cpu)
> > > uretprobe-nop         ( 1 cpus):    0.515 ± 0.009M/s  (  0.515M/s/cpu)
> > > uprobe-multi-nop      ( 1 cpus):    1.036 ± 0.004M/s  (  1.036M/s/cpu)
> > > uretprobe-multi-nop   ( 1 cpus):    0.512 ± 0.005M/s  (  0.512M/s/cpu)
> > >
> > > uprobe-nop            ( 8 cpus):    3.481 ± 0.030M/s  (  0.435M/s/cpu)
> > > uretprobe-nop         ( 8 cpus):    2.222 ± 0.008M/s  (  0.278M/s/cpu)
> > > uprobe-multi-nop      ( 8 cpus):    3.769 ± 0.094M/s  (  0.471M/s/cpu)
> > > uretprobe-multi-nop   ( 8 cpus):    2.482 ± 0.007M/s  (  0.310M/s/cpu)
> > >
> > > uprobe-nop            (16 cpus):    2.968 ± 0.011M/s  (  0.185M/s/cpu)
> > > uretprobe-nop         (16 cpus):    1.870 ± 0.002M/s  (  0.117M/s/cpu)
> > > uprobe-multi-nop      (16 cpus):    3.541 ± 0.037M/s  (  0.221M/s/cpu)
> > > uretprobe-multi-nop   (16 cpus):    2.123 ± 0.026M/s  (  0.133M/s/cpu)
> > >
> > > uprobe-nop            (32 cpus):    2.524 ± 0.026M/s  (  0.079M/s/cpu)
> > > uretprobe-nop         (32 cpus):    1.572 ± 0.003M/s  (  0.049M/s/cpu)
> > > uprobe-multi-nop      (32 cpus):    2.717 ± 0.003M/s  (  0.085M/s/cpu)
> > > uretprobe-multi-nop   (32 cpus):    1.687 ± 0.007M/s  (  0.053M/s/cpu)
> >
> > nice, do you have a script for this output?
> > we could add it to benchs/run_bench_uprobes.sh
> >
>
> I keep tuning those scripts to my own needs, so I'm not sure if it's
> worth adding all of them to selftests. It's very similar to what we
> already have, but see the exact script below:
>
> #!/bin/bash
>
> set -eufo pipefail
>
> for p in 1 8 16 32; do
>     for i in uprobe-nop uretprobe-nop uprobe-multi-nop uretprobe-multi-nop; do
>         summary=$(sudo ./bench -w1 -d3 -p$p -a trig-$i | tail -n1)
>         total=$(echo "$summary" | cut -d'(' -f1 | cut -d' ' -f3-)
>         percpu=$(echo "$summary" | cut -d'(' -f2 | cut -d')' -f1 | cut -d'/' -f1)
>         printf "%-21s (%2d cpus): %s (%s/s/cpu)\n" $i $p "$total" "$percpu"
>     done
>     echo
> done

Added this script to the commit log while applying.
patchwork-bot+netdevbpf@kernel.org Aug. 23, 2024, 5:10 p.m. UTC | #4
Hello:

This patch was applied to bpf/bpf-next.git (master)
by Alexei Starovoitov <ast@kernel.org>:

On Mon,  5 Aug 2024 21:29:35 -0700 you wrote:
> Add multi-uprobe and multi-uretprobe benchmarks to bench tool.
> Multi- and classic uprobes/uretprobes have different low-level
> triggering code paths, so it's sometimes important to be able to
> benchmark both flavors of uprobes/uretprobes.
> 
> Sample results from my dev machine are below. Single-threaded performance
> is almost identical, but the difference grows as more CPUs trigger the
> same uprobe/uretprobe in parallel. This might be due to [0], but given
> that the code paths differ slightly, there could be other sources of
> slowdown.
> 
> [...]

Here is the summary with links:
  - [bpf-next] selftests/bpf: add multi-uprobe benchmarks
    https://git.kernel.org/bpf/bpf-next/c/f727b13dbea1

You are awesome, thank you!

Patch

diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index 90dc3aca32bd..1bd403a5ef7b 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -520,6 +520,12 @@  extern const struct bench bench_trig_uprobe_push;
 extern const struct bench bench_trig_uretprobe_push;
 extern const struct bench bench_trig_uprobe_ret;
 extern const struct bench bench_trig_uretprobe_ret;
+extern const struct bench bench_trig_uprobe_multi_nop;
+extern const struct bench bench_trig_uretprobe_multi_nop;
+extern const struct bench bench_trig_uprobe_multi_push;
+extern const struct bench bench_trig_uretprobe_multi_push;
+extern const struct bench bench_trig_uprobe_multi_ret;
+extern const struct bench bench_trig_uretprobe_multi_ret;
 
 extern const struct bench bench_rb_libbpf;
 extern const struct bench bench_rb_custom;
@@ -574,6 +580,12 @@  static const struct bench *benchs[] = {
 	&bench_trig_uretprobe_push,
 	&bench_trig_uprobe_ret,
 	&bench_trig_uretprobe_ret,
+	&bench_trig_uprobe_multi_nop,
+	&bench_trig_uretprobe_multi_nop,
+	&bench_trig_uprobe_multi_push,
+	&bench_trig_uretprobe_multi_push,
+	&bench_trig_uprobe_multi_ret,
+	&bench_trig_uretprobe_multi_ret,
 	/* ringbuf/perfbuf benchmarks */
 	&bench_rb_libbpf,
 	&bench_rb_custom,
diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
index 4b05539f167d..a220545a3238 100644
--- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -332,7 +332,7 @@  static void *uprobe_producer_ret(void *input)
 	return NULL;
 }
 
-static void usetup(bool use_retprobe, void *target_addr)
+static void usetup(bool use_retprobe, bool use_multi, void *target_addr)
 {
 	size_t uprobe_offset;
 	struct bpf_link *link;
@@ -346,7 +346,10 @@  static void usetup(bool use_retprobe, void *target_addr)
 		exit(1);
 	}
 
-	bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe, true);
+	if (use_multi)
+		bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe_multi, true);
+	else
+		bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe, true);
 
 	err = trigger_bench__load(ctx.skel);
 	if (err) {
@@ -355,16 +358,28 @@  static void usetup(bool use_retprobe, void *target_addr)
 	}
 
 	uprobe_offset = get_uprobe_offset(target_addr);
-	link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe,
-					  use_retprobe,
-					  -1 /* all PIDs */,
-					  "/proc/self/exe",
-					  uprobe_offset);
+	if (use_multi) {
+		LIBBPF_OPTS(bpf_uprobe_multi_opts, opts,
+			.retprobe = use_retprobe,
+			.cnt = 1,
+			.offsets = &uprobe_offset,
+		);
+		link = bpf_program__attach_uprobe_multi(
+			ctx.skel->progs.bench_trigger_uprobe_multi,
+			-1 /* all PIDs */, "/proc/self/exe", NULL, &opts);
+		ctx.skel->links.bench_trigger_uprobe_multi = link;
+	} else {
+		link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe,
+						  use_retprobe,
+						  -1 /* all PIDs */,
+						  "/proc/self/exe",
+						  uprobe_offset);
+		ctx.skel->links.bench_trigger_uprobe = link;
+	}
 	if (!link) {
-		fprintf(stderr, "failed to attach uprobe!\n");
+		fprintf(stderr, "failed to attach %s!\n", use_multi ? "multi-uprobe" : "uprobe");
 		exit(1);
 	}
-	ctx.skel->links.bench_trigger_uprobe = link;
 }
 
 static void usermode_count_setup(void)
@@ -374,32 +389,62 @@  static void usermode_count_setup(void)
 
 static void uprobe_nop_setup(void)
 {
-	usetup(false, &uprobe_target_nop);
+	usetup(false, false /* !use_multi */, &uprobe_target_nop);
 }
 
 static void uretprobe_nop_setup(void)
 {
-	usetup(true, &uprobe_target_nop);
+	usetup(true, false /* !use_multi */, &uprobe_target_nop);
 }
 
 static void uprobe_push_setup(void)
 {
-	usetup(false, &uprobe_target_push);
+	usetup(false, false /* !use_multi */, &uprobe_target_push);
 }
 
 static void uretprobe_push_setup(void)
 {
-	usetup(true, &uprobe_target_push);
+	usetup(true, false /* !use_multi */, &uprobe_target_push);
 }
 
 static void uprobe_ret_setup(void)
 {
-	usetup(false, &uprobe_target_ret);
+	usetup(false, false /* !use_multi */, &uprobe_target_ret);
 }
 
 static void uretprobe_ret_setup(void)
 {
-	usetup(true, &uprobe_target_ret);
+	usetup(true, false /* !use_multi */, &uprobe_target_ret);
+}
+
+static void uprobe_multi_nop_setup(void)
+{
+	usetup(false, true /* use_multi */, &uprobe_target_nop);
+}
+
+static void uretprobe_multi_nop_setup(void)
+{
+	usetup(true, true /* use_multi */, &uprobe_target_nop);
+}
+
+static void uprobe_multi_push_setup(void)
+{
+	usetup(false, true /* use_multi */, &uprobe_target_push);
+}
+
+static void uretprobe_multi_push_setup(void)
+{
+	usetup(true, true /* use_multi */, &uprobe_target_push);
+}
+
+static void uprobe_multi_ret_setup(void)
+{
+	usetup(false, true /* use_multi */, &uprobe_target_ret);
+}
+
+static void uretprobe_multi_ret_setup(void)
+{
+	usetup(true, true /* use_multi */, &uprobe_target_ret);
 }
 
 const struct bench bench_trig_syscall_count = {
@@ -454,3 +499,9 @@  BENCH_TRIG_USERMODE(uprobe_ret, ret, "uprobe-ret");
 BENCH_TRIG_USERMODE(uretprobe_nop, nop, "uretprobe-nop");
 BENCH_TRIG_USERMODE(uretprobe_push, push, "uretprobe-push");
 BENCH_TRIG_USERMODE(uretprobe_ret, ret, "uretprobe-ret");
+BENCH_TRIG_USERMODE(uprobe_multi_nop, nop, "uprobe-multi-nop");
+BENCH_TRIG_USERMODE(uprobe_multi_push, push, "uprobe-multi-push");
+BENCH_TRIG_USERMODE(uprobe_multi_ret, ret, "uprobe-multi-ret");
+BENCH_TRIG_USERMODE(uretprobe_multi_nop, nop, "uretprobe-multi-nop");
+BENCH_TRIG_USERMODE(uretprobe_multi_push, push, "uretprobe-multi-push");
+BENCH_TRIG_USERMODE(uretprobe_multi_ret, ret, "uretprobe-multi-ret");
diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c
index 2619ed193c65..044a6d78923e 100644
--- a/tools/testing/selftests/bpf/progs/trigger_bench.c
+++ b/tools/testing/selftests/bpf/progs/trigger_bench.c
@@ -32,6 +32,13 @@  int bench_trigger_uprobe(void *ctx)
 	return 0;
 }
 
+SEC("?uprobe.multi")
+int bench_trigger_uprobe_multi(void *ctx)
+{
+	inc_counter();
+	return 0;
+}
+
 const volatile int batch_iters = 0;
 
 SEC("?raw_tp")