
[v2,bpf-next,2/2] selftests/bpf: add fast mostly in-kernel BPF triggering benchmarks

Message ID 20240315051813.1320559-2-andrii@kernel.org (mailing list archive)
State Superseded
Delegated to: BPF
Series [v2,bpf-next,1/2] selftests/bpf: scale benchmark counting by using per-CPU counters

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail PR summary
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 8 this patch: 8
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 11 maintainers not CCed: haoluo@google.com linux-kselftest@vger.kernel.org john.fastabend@gmail.com eddyz87@gmail.com sdf@google.com song@kernel.org kpsingh@kernel.org yonghong.song@linux.dev shuah@kernel.org martin.lau@linux.dev mykolal@fb.com
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success net selftest script(s) already in Makefile
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 8 this patch: 8
netdev/checkpatch warning WARNING: Use of volatile is usually wrong: see Documentation/process/volatile-considered-harmful.rst WARNING: externs should be avoided in .c files
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 fail Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 fail Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc

Commit Message

Andrii Nakryiko March 15, 2024, 5:18 a.m. UTC
Existing kprobe/fentry triggering benchmarks have a 1-to-1 mapping
between one syscall execution and one BPF program run. While we use a
fast get_pgid() syscall, syscall overhead can still be non-trivial.

This patch adds a set of kprobe/fentry benchmarks that significantly
amortizes the syscall cost relative to the actual BPF triggering
overhead. We do this by using the BPF_PROG_TEST_RUN command to trigger
a "driver" raw_tp program, which runs a tight parameterized loop
calling a cheap BPF helper (bpf_get_smp_processor_id()), to which the
benchmarked kprobe/fentry programs are attached.

This way one bpf() syscall causes N executions of the BPF program
being benchmarked. N defaults to 100, but can be adjusted with the
--trig-batch-iters CLI argument.
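
For example, to run a single benchmark with a larger batch size (using
the same flags as the run_bench_trigger.sh script below; 500 is just
an illustrative value):

$ sudo ./bench -w2 -d5 -a --trig-batch-iters 500 trig-fentry-fast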

Results speak for themselves:

$ ./run_bench_trigger.sh
uprobe-base         :  138.054 ± 0.556M/s
base                :   16.650 ± 0.123M/s
tp                  :   11.068 ± 0.100M/s
rawtp               :   14.087 ± 0.511M/s
kprobe              :    9.641 ± 0.027M/s
kprobe-multi        :   10.263 ± 0.061M/s
kretprobe           :    5.475 ± 0.028M/s
kretprobe-multi     :    5.703 ± 0.036M/s
fentry              :   14.544 ± 0.112M/s
fexit               :   10.637 ± 0.073M/s
fmodret             :   11.357 ± 0.061M/s
kprobe-fast         :   14.286 ± 0.377M/s
kprobe-multi-fast   :   14.999 ± 0.204M/s
kretprobe-fast      :    7.646 ± 0.084M/s
kretprobe-multi-fast:    4.354 ± 0.066M/s
fentry-fast         :   31.475 ± 0.254M/s
fexit-fast          :   17.379 ± 0.195M/s

Note how the xxx-fast variants show significantly higher throughput,
even though the in-kernel overhead is exactly the same:

fentry              :   14.544 ± 0.112M/s
fentry-fast         :   31.475 ± 0.254M/s

kprobe-multi        :   10.263 ± 0.061M/s
kprobe-multi-fast   :   14.999 ± 0.204M/s

One huge and not yet explained deviation is the slowdown of
kretprobe-multi; we should look into that separately.

kretprobe           :    5.475 ± 0.028M/s
kretprobe-multi     :    5.703 ± 0.036M/s
kretprobe-fast      :    7.646 ± 0.084M/s
kretprobe-multi-fast:    4.354 ± 0.066M/s

Kprobe cases don't seem to have this illogical slowdown:

kprobe              :    9.641 ± 0.027M/s
kprobe-multi        :   10.263 ± 0.061M/s
kprobe-fast         :   14.286 ± 0.377M/s
kprobe-multi-fast   :   14.999 ± 0.204M/s

Cc: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 tools/testing/selftests/bpf/bench.c           |  18 +++
 .../selftests/bpf/benchs/bench_trigger.c      | 123 +++++++++++++++++-
 .../selftests/bpf/benchs/run_bench_trigger.sh |   8 +-
 .../selftests/bpf/progs/trigger_bench.c       |  56 +++++++-
 4 files changed, 201 insertions(+), 4 deletions(-)

Comments

Jiri Olsa March 15, 2024, 9:10 a.m. UTC | #1
On Thu, Mar 14, 2024 at 10:18:13PM -0700, Andrii Nakryiko wrote:
> Existing kprobe/fentry triggering benchmarks have 1-to-1 mapping between
> one syscall execution and BPF program run. While we use a fast
> get_pgid() syscall, syscall overhead can still be non-trivial.
> 
> This patch adds kprobe/fentry set of benchmarks significantly amortizing
> the cost of syscall vs actual BPF triggering overhead. We do this by
> employing BPF_PROG_TEST_RUN command to trigger "driver" raw_tp program
> which does a tight parameterized loop calling cheap BPF helper
> (bpf_get_smp_processor_id()), to which kprobe/fentry programs are
> attached for benchmarking.
> 
> This way 1 bpf() syscall causes N executions of BPF program being
> benchmarked. N defaults to 100, but can be adjusted with
> --trig-batch-iters CLI argument.
> 
> Results speak for themselves:
> 
> $ ./run_bench_trigger.sh
> uprobe-base         :  138.054 ± 0.556M/s
> base                :   16.650 ± 0.123M/s
> tp                  :   11.068 ± 0.100M/s
> rawtp               :   14.087 ± 0.511M/s
> kprobe              :    9.641 ± 0.027M/s
> kprobe-multi        :   10.263 ± 0.061M/s
> kretprobe           :    5.475 ± 0.028M/s
> kretprobe-multi     :    5.703 ± 0.036M/s
> fentry              :   14.544 ± 0.112M/s
> fexit               :   10.637 ± 0.073M/s
> fmodret             :   11.357 ± 0.061M/s
> kprobe-fast         :   14.286 ± 0.377M/s
> kprobe-multi-fast   :   14.999 ± 0.204M/s
> kretprobe-fast      :    7.646 ± 0.084M/s
> kretprobe-multi-fast:    4.354 ± 0.066M/s
> fentry-fast         :   31.475 ± 0.254M/s
> fexit-fast          :   17.379 ± 0.195M/s
> 
> Note how xxx-fast variants are measured with significantly higher
> throughput, even though it's exactly the same in-kernel overhead:
> 
> fentry              :   14.544 ± 0.112M/s
> fentry-fast         :   31.475 ± 0.254M/s
> 
> kprobe-multi        :   10.263 ± 0.061M/s
> kprobe-multi-fast   :   14.999 ± 0.204M/s
> 
> One huge and not yet explained deviation is a slowdown of
> kretprobe-multi, we should look into that separately.
> 
> kretprobe           :    5.475 ± 0.028M/s
> kretprobe-multi     :    5.703 ± 0.036M/s
> kretprobe-fast      :    7.646 ± 0.084M/s
> kretprobe-multi-fast:    4.354 ± 0.066M/s
> 
> Kprobe cases don't seem to have this illogical slowdown:
> 
> kprobe              :    9.641 ± 0.027M/s
> kprobe-multi        :   10.263 ± 0.061M/s
> kprobe-fast         :   14.286 ± 0.377M/s
> kprobe-multi-fast   :   14.999 ± 0.204M/s

hum, I see that as well:

uprobe-base         :  230.624 ± 0.527M/s 
base                :   16.320 ± 0.087M/s 
tp                  :   10.057 ± 0.122M/s 
rawtp               :   14.851 ± 0.300M/s 
kprobe              :   10.993 ± 0.104M/s 
kprobe-multi        :   11.053 ± 0.038M/s 
kretprobe           :    6.679 ± 0.015M/s 
kretprobe-multi     :    6.466 ± 0.015M/s 
fentry              :   14.949 ± 0.064M/s 
fexit               :   10.530 ± 1.275M/s 
fmodret             :   11.145 ± 0.245M/s 
kprobe-fast         :   20.080 ± 0.468M/s 
kprobe-multi-fast   :   17.603 ± 0.102M/s 
kretprobe-fast      :    9.943 ± 0.056M/s 
kretprobe-multi-fast:    5.185 ± 0.022M/s 
fentry-fast         :   46.697 ± 0.260M/s 
fexit-fast          :   19.250 ± 0.108M/s 


I even see decline in kprobe-multi-fast:

  kprobe-fast         :   20.080 ± 0.468M/s 
  kprobe-multi-fast   :   17.603 ± 0.102M/s 

  kretprobe-fast      :    9.943 ± 0.056M/s 
  kretprobe-multi-fast:    5.185 ± 0.022M/s 


I've got some IBT-related code showing up in the perf profile, and
when I disabled it I got better results for kprobe-multi-fast, but
kretprobe-multi-fast is still bad.


uprobe-base         :  234.024 ± 0.225M/s 
base                :   16.383 ± 0.029M/s 
tp                  :    9.973 ± 0.017M/s 
rawtp               :   14.889 ± 0.047M/s 
kprobe              :   10.970 ± 0.011M/s 
kprobe-multi        :   11.640 ± 0.009M/s 
kretprobe           :    6.667 ± 0.005M/s 
kretprobe-multi     :    6.704 ± 0.005M/s 
fentry              :   14.968 ± 0.024M/s 
fexit               :   11.860 ± 0.012M/s 
fmodret             :   12.656 ± 0.036M/s 
kprobe-fast         :   20.340 ± 0.043M/s 
kprobe-multi-fast   :   21.203 ± 0.019M/s 
kretprobe-fast      :    9.956 ± 0.021M/s 
kretprobe-multi-fast:    5.611 ± 0.006M/s 
fentry-fast         :   46.825 ± 0.041M/s 
fexit-fast          :   19.746 ± 0.024M/s 

slightly better:

  kprobe-fast         :   20.340 ± 0.043M/s 
  kprobe-multi-fast   :   21.203 ± 0.019M/s 

still almost half perf:
  kretprobe-fast      :    9.956 ± 0.021M/s 
  kretprobe-multi-fast:    5.611 ± 0.006M/s 

jirka

> 
> Cc: Jiri Olsa <jolsa@kernel.org>
> Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> ---
>  tools/testing/selftests/bpf/bench.c           |  18 +++
>  .../selftests/bpf/benchs/bench_trigger.c      | 123 +++++++++++++++++-
>  .../selftests/bpf/benchs/run_bench_trigger.sh |   8 +-
>  .../selftests/bpf/progs/trigger_bench.c       |  56 +++++++-
>  4 files changed, 201 insertions(+), 4 deletions(-)
> 
> diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
> index b2b4c391eb0a..67212b89f876 100644
> --- a/tools/testing/selftests/bpf/bench.c
> +++ b/tools/testing/selftests/bpf/bench.c
> @@ -280,6 +280,7 @@ extern struct argp bench_strncmp_argp;
>  extern struct argp bench_hashmap_lookup_argp;
>  extern struct argp bench_local_storage_create_argp;
>  extern struct argp bench_htab_mem_argp;
> +extern struct argp bench_trigger_fast_argp;
>  
>  static const struct argp_child bench_parsers[] = {
>  	{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
> @@ -292,6 +293,7 @@ static const struct argp_child bench_parsers[] = {
>  	{ &bench_hashmap_lookup_argp, 0, "Hashmap lookup benchmark", 0 },
>  	{ &bench_local_storage_create_argp, 0, "local-storage-create benchmark", 0 },
>  	{ &bench_htab_mem_argp, 0, "hash map memory benchmark", 0 },
> +	{ &bench_trigger_fast_argp, 0, "BPF triggering benchmark", 0 },
>  	{},
>  };
>  
> @@ -502,6 +504,12 @@ extern const struct bench bench_trig_fentry;
>  extern const struct bench bench_trig_fexit;
>  extern const struct bench bench_trig_fentry_sleep;
>  extern const struct bench bench_trig_fmodret;
> +extern const struct bench bench_trig_kprobe_fast;
> +extern const struct bench bench_trig_kretprobe_fast;
> +extern const struct bench bench_trig_kprobe_multi_fast;
> +extern const struct bench bench_trig_kretprobe_multi_fast;
> +extern const struct bench bench_trig_fentry_fast;
> +extern const struct bench bench_trig_fexit_fast;
>  extern const struct bench bench_trig_uprobe_base;
>  extern const struct bench bench_trig_uprobe_nop;
>  extern const struct bench bench_trig_uretprobe_nop;
> @@ -539,6 +547,7 @@ static const struct bench *benchs[] = {
>  	&bench_rename_rawtp,
>  	&bench_rename_fentry,
>  	&bench_rename_fexit,
> +	/* syscall-driven triggering benchmarks */
>  	&bench_trig_base,
>  	&bench_trig_tp,
>  	&bench_trig_rawtp,
> @@ -550,6 +559,14 @@ static const struct bench *benchs[] = {
>  	&bench_trig_fexit,
>  	&bench_trig_fentry_sleep,
>  	&bench_trig_fmodret,
> +	/* fast, mostly in-kernel triggers */
> +	&bench_trig_kprobe_fast,
> +	&bench_trig_kretprobe_fast,
> +	&bench_trig_kprobe_multi_fast,
> +	&bench_trig_kretprobe_multi_fast,
> +	&bench_trig_fentry_fast,
> +	&bench_trig_fexit_fast,
> +	/* uprobes */
>  	&bench_trig_uprobe_base,
>  	&bench_trig_uprobe_nop,
>  	&bench_trig_uretprobe_nop,
> @@ -557,6 +574,7 @@ static const struct bench *benchs[] = {
>  	&bench_trig_uretprobe_push,
>  	&bench_trig_uprobe_ret,
>  	&bench_trig_uretprobe_ret,
> +	/* ringbuf/perfbuf benchmarks */
>  	&bench_rb_libbpf,
>  	&bench_rb_custom,
>  	&bench_pb_libbpf,
> diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
> index 8fbc78d5f8a4..d6c87180c887 100644
> --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
> +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
> @@ -1,11 +1,54 @@
>  // SPDX-License-Identifier: GPL-2.0
>  /* Copyright (c) 2020 Facebook */
>  #define _GNU_SOURCE
> +#include <argp.h>
>  #include <unistd.h>
> +#include <stdint.h>
>  #include "bench.h"
>  #include "trigger_bench.skel.h"
>  #include "trace_helpers.h"
>  
> +static struct {
> +	__u32 batch_iters;
> +} args = {
> +	.batch_iters = 100,
> +};
> +
> +enum {
> +	ARG_TRIG_BATCH_ITERS = 7000,
> +};
> +
> +static const struct argp_option opts[] = {
> +	{ "trig-batch-iters", ARG_TRIG_BATCH_ITERS, "BATCH_ITER_CNT", 0,
> +		"Number of in-kernel iterations per one driver test run"},
> +	{},
> +};
> +
> +static error_t parse_arg(int key, char *arg, struct argp_state *state)
> +{
> +	long ret;
> +
> +	switch (key) {
> +	case ARG_TRIG_BATCH_ITERS:
> +		ret = strtol(arg, NULL, 10);
> +		if (ret < 1 || ret > UINT_MAX) {
> +			fprintf(stderr, "invalid --trig-batch-iters value");
> +			argp_usage(state);
> +		}
> +		args.batch_iters = ret;
> +		break;
> +	default:
> +		return ARGP_ERR_UNKNOWN;
> +	}
> +
> +	return 0;
> +}
> +
> +const struct argp bench_trigger_fast_argp = {
> +	.options = opts,
> +	.parser = parse_arg,
> +};
> +
>  /* adjust slot shift in inc_hits() if changing */
>  #define MAX_BUCKETS 256
>  
> @@ -70,6 +113,16 @@ static void *trigger_producer(void *input)
>  	return NULL;
>  }
>  
> +static void *trigger_producer_fast(void *input)
> +{
> +	int fd = bpf_program__fd(ctx.skel->progs.trigger_driver);
> +
> +	while (true)
> +		bpf_prog_test_run_opts(fd, NULL);
> +
> +	return NULL;
> +}
> +
>  static void trigger_measure(struct bench_res *res)
>  {
>  	res->hits = sum_and_reset_counters(ctx.skel->bss->hits);
> @@ -77,13 +130,23 @@ static void trigger_measure(struct bench_res *res)
>  
>  static void setup_ctx(void)
>  {
> +	int err;
> +
>  	setup_libbpf();
>  
> -	ctx.skel = trigger_bench__open_and_load();
> +	ctx.skel = trigger_bench__open();
>  	if (!ctx.skel) {
>  		fprintf(stderr, "failed to open skeleton\n");
>  		exit(1);
>  	}
> +
> +	ctx.skel->rodata->batch_iters = args.batch_iters;
> +
> +	err = trigger_bench__load(ctx.skel);
> +	if (err) {
> +		fprintf(stderr, "failed to open skeleton\n");
> +		exit(1);
> +	}
>  }
>  
>  static void attach_bpf(struct bpf_program *prog)
> @@ -157,6 +220,44 @@ static void trigger_fmodret_setup(void)
>  	attach_bpf(ctx.skel->progs.bench_trigger_fmodret);
>  }
>  
> +/* Fast, mostly in-kernel triggering setups */
> +
> +static void trigger_kprobe_fast_setup(void)
> +{
> +	setup_ctx();
> +	attach_bpf(ctx.skel->progs.bench_trigger_kprobe_fast);
> +}
> +
> +static void trigger_kretprobe_fast_setup(void)
> +{
> +	setup_ctx();
> +	attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_fast);
> +}
> +
> +static void trigger_kprobe_multi_fast_setup(void)
> +{
> +	setup_ctx();
> +	attach_bpf(ctx.skel->progs.bench_trigger_kprobe_multi_fast);
> +}
> +
> +static void trigger_kretprobe_multi_fast_setup(void)
> +{
> +	setup_ctx();
> +	attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_multi_fast);
> +}
> +
> +static void trigger_fentry_fast_setup(void)
> +{
> +	setup_ctx();
> +	attach_bpf(ctx.skel->progs.bench_trigger_fentry_fast);
> +}
> +
> +static void trigger_fexit_fast_setup(void)
> +{
> +	setup_ctx();
> +	attach_bpf(ctx.skel->progs.bench_trigger_fexit_fast);
> +}
> +
>  /* make sure call is not inlined and not avoided by compiler, so __weak and
>   * inline asm volatile in the body of the function
>   *
> @@ -385,6 +486,26 @@ const struct bench bench_trig_fmodret = {
>  	.report_final = hits_drops_report_final,
>  };
>  
> +/* fast (staying mostly in kernel) kprobe/fentry benchmarks */
> +#define BENCH_TRIG_FAST(KIND, NAME)					\
> +const struct bench bench_trig_##KIND = {				\
> +	.name = "trig-" NAME,						\
> +	.setup = trigger_##KIND##_setup,				\
> +	.producer_thread = trigger_producer_fast,			\
> +	.measure = trigger_measure,					\
> +	.report_progress = hits_drops_report_progress,			\
> +	.report_final = hits_drops_report_final,			\
> +	.argp = &bench_trigger_fast_argp,				\
> +}
> +
> +BENCH_TRIG_FAST(kprobe_fast, "kprobe-fast");
> +BENCH_TRIG_FAST(kretprobe_fast, "kretprobe-fast");
> +BENCH_TRIG_FAST(kprobe_multi_fast, "kprobe-multi-fast");
> +BENCH_TRIG_FAST(kretprobe_multi_fast, "kretprobe-multi-fast");
> +BENCH_TRIG_FAST(fentry_fast, "fentry-fast");
> +BENCH_TRIG_FAST(fexit_fast, "fexit-fast");
> +
> +/* uprobe benchmarks */
>  const struct bench bench_trig_uprobe_base = {
>  	.name = "trig-uprobe-base",
>  	.setup = NULL, /* no uprobe/uretprobe is attached */
> diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
> index 78e83f243294..fee069ac930b 100755
> --- a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
> +++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
> @@ -2,8 +2,12 @@
>  
>  set -eufo pipefail
>  
> -for i in base tp rawtp kprobe fentry fmodret
> +for i in uprobe-base base tp rawtp \
> +	 kprobe kprobe-multi kretprobe kretprobe-multi \
> +	 fentry fexit fmodret \
> +	 kprobe-fast kprobe-multi-fast kretprobe-fast kretprobe-multi-fast \
> +	 fentry-fast fexit-fast
>  do
>  	summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)
> -	printf "%-10s: %s\n" $i "$summary"
> +	printf "%-20s: %s\n" $i "$summary"
>  done
> diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c
> index 42ec202015ed..2886c2cb3570 100644
> --- a/tools/testing/selftests/bpf/progs/trigger_bench.c
> +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c
> @@ -1,6 +1,5 @@
>  // SPDX-License-Identifier: GPL-2.0
>  // Copyright (c) 2020 Facebook
> -
>  #include <linux/bpf.h>
>  #include <asm/unistd.h>
>  #include <bpf/bpf_helpers.h>
> @@ -103,3 +102,58 @@ int bench_trigger_uprobe(void *ctx)
>  	inc_counter();
>  	return 0;
>  }
> +
> +const volatile int batch_iters = 0;
> +
> +SEC("raw_tp")
> +int trigger_driver(void *ctx)
> +{
> +	int i;
> +
> +	for (i = 0; i < batch_iters; i++)
> +		(void)bpf_get_smp_processor_id(); /* attach here to benchmark */
> +
> +	return 0;
> +}
> +
> +SEC("kprobe/bpf_get_smp_processor_id")
> +int bench_trigger_kprobe_fast(void *ctx)
> +{
> +	inc_counter();
> +	return 0;
> +}
> +
> +SEC("kretprobe/bpf_get_smp_processor_id")
> +int bench_trigger_kretprobe_fast(void *ctx)
> +{
> +	inc_counter();
> +	return 0;
> +}
> +
> +SEC("kprobe.multi/bpf_get_smp_processor_id")
> +int bench_trigger_kprobe_multi_fast(void *ctx)
> +{
> +	inc_counter();
> +	return 0;
> +}
> +
> +SEC("kretprobe.multi/bpf_get_smp_processor_id")
> +int bench_trigger_kretprobe_multi_fast(void *ctx)
> +{
> +	inc_counter();
> +	return 0;
> +}
> +
> +SEC("fentry/bpf_get_smp_processor_id")
> +int bench_trigger_fentry_fast(void *ctx)
> +{
> +	inc_counter();
> +	return 0;
> +}
> +
> +SEC("fexit/bpf_get_smp_processor_id")
> +int bench_trigger_fexit_fast(void *ctx)
> +{
> +	inc_counter();
> +	return 0;
> +}
> -- 
> 2.43.0
>
Alexei Starovoitov March 15, 2024, 4:03 p.m. UTC | #2
On Thu, Mar 14, 2024 at 10:18 PM Andrii Nakryiko <andrii@kernel.org> wrote:
>
> Existing kprobe/fentry triggering benchmarks have 1-to-1 mapping between
> one syscall execution and BPF program run. While we use a fast
> get_pgid() syscall, syscall overhead can still be non-trivial.
>
> This patch adds kprobe/fentry set of benchmarks significantly amortizing
> the cost of syscall vs actual BPF triggering overhead. We do this by
> employing BPF_PROG_TEST_RUN command to trigger "driver" raw_tp program
> which does a tight parameterized loop calling cheap BPF helper
> (bpf_get_smp_processor_id()), to which kprobe/fentry programs are
> attached for benchmarking.
>
> This way 1 bpf() syscall causes N executions of BPF program being
> benchmarked. N defaults to 100, but can be adjusted with
> --trig-batch-iters CLI argument.
>
> Results speak for themselves:
>
> $ ./run_bench_trigger.sh
> uprobe-base         :  138.054 ± 0.556M/s
> base                :   16.650 ± 0.123M/s

What's going on here? Why are the two bases so different?
I thought the "base" is what all other benchmarks
should be compared against.
The "base" is the theoretical maximum for all benchs.
Or should uprobe* benches be compared with uprobe-base
while all other benchs are compared with 'base'?
Probably not anymore due to this new approach.
The 'base' has kinda lost its value then.

> tp                  :   11.068 ± 0.100M/s
> rawtp               :   14.087 ± 0.511M/s
> kprobe              :    9.641 ± 0.027M/s
> kprobe-multi        :   10.263 ± 0.061M/s
> kretprobe           :    5.475 ± 0.028M/s
> kretprobe-multi     :    5.703 ± 0.036M/s
> fentry              :   14.544 ± 0.112M/s
> fexit               :   10.637 ± 0.073M/s
> fmodret             :   11.357 ± 0.061M/s
> kprobe-fast         :   14.286 ± 0.377M/s
> kprobe-multi-fast   :   14.999 ± 0.204M/s
> kretprobe-fast      :    7.646 ± 0.084M/s
> kretprobe-multi-fast:    4.354 ± 0.066M/s
> fentry-fast         :   31.475 ± 0.254M/s
> fexit-fast          :   17.379 ± 0.195M/s

I think the "-fast" suffix doesn't really fit here.
It's a different way to benchmark fexit vs kprobe overhead.

I think the old approach should be replaced with the new one.
There is no value in keeping the old stuff around when
now it's clear that it's measuring much more than it should.

> One huge and not yet explained deviation is a slowdown of
> kretprobe-multi, we should look into that separately.
>
> kretprobe           :    5.475 ± 0.028M/s
> kretprobe-multi     :    5.703 ± 0.036M/s
> kretprobe-fast      :    7.646 ± 0.084M/s
> kretprobe-multi-fast:    4.354 ± 0.066M/s

That is weird indeed. Since it's a single cpu run
it cannot be due to contention.
My gut feel is that something odd is here:
#ifdef CONFIG_X86_KERNEL_IBT
static unsigned long get_entry_ip(unsigned long fentry_ip)

Did you have it in your config?

> +SEC("raw_tp")
> +int trigger_driver(void *ctx)
> +{
> +       int i;
> +
> +       for (i = 0; i < batch_iters; i++)
> +               (void)bpf_get_smp_processor_id(); /* attach here to benchmark */

bpf_get_numa_node_id() is probably even faster and
not subject to DEBUG configs.
Andrii Nakryiko March 15, 2024, 4:31 p.m. UTC | #3
On Fri, Mar 15, 2024 at 9:03 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Mar 14, 2024 at 10:18 PM Andrii Nakryiko <andrii@kernel.org> wrote:
> >
> > Existing kprobe/fentry triggering benchmarks have 1-to-1 mapping between
> > one syscall execution and BPF program run. While we use a fast
> > get_pgid() syscall, syscall overhead can still be non-trivial.
> >
> > This patch adds kprobe/fentry set of benchmarks significantly amortizing
> > the cost of syscall vs actual BPF triggering overhead. We do this by
> > employing BPF_PROG_TEST_RUN command to trigger "driver" raw_tp program
> > which does a tight parameterized loop calling cheap BPF helper
> > (bpf_get_smp_processor_id()), to which kprobe/fentry programs are
> > attached for benchmarking.
> >
> > This way 1 bpf() syscall causes N executions of BPF program being
> > benchmarked. N defaults to 100, but can be adjusted with
> > --trig-batch-iters CLI argument.
> >
> > Results speak for themselves:
> >
> > $ ./run_bench_trigger.sh
> > uprobe-base         :  138.054 ± 0.556M/s
> > base                :   16.650 ± 0.123M/s
>
> What's going on here? Why two bases are so different?
> I thought the "base" is what all other benchmarks
> should be compared against.
> The "base" is the theoretical maximum for all benchs.
> Or uprobe* benches should be compared with uprobe-base
> while all other benchs compared with 'base' ?
> Probably not anymore due to this new approach.
> The 'base' is kinda lost its value then.

naming is hard. base is doing syscall(get_pgid) in a tight loop. It's
base compared to previous trigger benchmarks where we used syscall to
trigger kprobe/fentry programs. uprobe-base is just a user-space loop
that does atomic_inc() in a tight loop. So uprobe-base is basically
the measure of how fast the CPU is, but it's unrealistic to expect either
fentry/kprobe to get close, and especially it's unrealistic to expect
uprobes to get close to it.
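
In rough code terms the two baselines boil down to something like this
(a simplified sketch, not the exact bench code; needs <unistd.h>,
<sys/syscall.h>, <stdbool.h> and _GNU_SOURCE for syscall()):

static long hits; /* stand-in for the per-CPU counters from patch 1/2 */

static void *base_producer(void *input)        /* "base" */
{
	while (true) {
		(void)syscall(__NR_getpgid);   /* one cheap syscall per hit */
		__atomic_add_fetch(&hits, 1, __ATOMIC_RELAXED);
	}
	return NULL;
}

static void *uprobe_base_producer(void *input) /* "uprobe-base" */
{
	while (true)                           /* never enters the kernel */
		__atomic_add_fetch(&hits, 1, __ATOMIC_RELAXED);
	return NULL;
}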

Naming suggestions are welcome, though.

I'm not sure what the "base" should be for xxx-fast benchmarks? Doing
a counter loop in driver BPF program, perhaps? Would you like me to
add base-fast benchmark doing just that?


>
> > tp                  :   11.068 ± 0.100M/s
> > rawtp               :   14.087 ± 0.511M/s
> > kprobe              :    9.641 ± 0.027M/s
> > kprobe-multi        :   10.263 ± 0.061M/s
> > kretprobe           :    5.475 ± 0.028M/s
> > kretprobe-multi     :    5.703 ± 0.036M/s
> > fentry              :   14.544 ± 0.112M/s
> > fexit               :   10.637 ± 0.073M/s
> > fmodret             :   11.357 ± 0.061M/s
> > kprobe-fast         :   14.286 ± 0.377M/s
> > kprobe-multi-fast   :   14.999 ± 0.204M/s
> > kretprobe-fast      :    7.646 ± 0.084M/s
> > kretprobe-multi-fast:    4.354 ± 0.066M/s
> > fentry-fast         :   31.475 ± 0.254M/s
> > fexit-fast          :   17.379 ± 0.195M/s
>
> I think the "-fast" suffix doesn't really fit here.
> It's a different way to benchmark fexit vs kprobe overhead.
>
> I think the old approach should be replaced with the new one.
> There is no value in keeping the old stuff around when
> now it's clear that it's measuring much more than it should.

I'd say we should keep them. Note that fmod_ret, tp, and raw_tp can't
be benchmarked the same way. So we don't have -fast variants for them.
If we get rid of pre-existing fentry/kprobe benchmarks, we won't be
able to compare raw_tp/fmod_ret/fexit.s to them.

And to be clear, in practice I'm not comparing xxx-fast with xxx at
all. I just put all the numbers in this patch summary to show that we
indeed have a significant performance lost on syscall.

In practice when comparing kprobe vs fentry, it should be kprobe-fast
vs fentry-fast.

>
> > One huge and not yet explained deviation is a slowdown of
> > kretprobe-multi, we should look into that separately.
> >
> > kretprobe           :    5.475 ± 0.028M/s
> > kretprobe-multi     :    5.703 ± 0.036M/s
> > kretprobe-fast      :    7.646 ± 0.084M/s
> > kretprobe-multi-fast:    4.354 ± 0.066M/s
>
> That is weird indeed. Since it's a single cpu run
> it cannot be due to contention.
> My gut feel is that something odd is here:
> #ifdef CONFIG_X86_KERNEL_IBT
> static unsigned long get_entry_ip(unsigned long fentry_ip)
>
> Did you have it in your config?

No, this was without IBT. I do have an upcoming small patch to
eliminate unnecessary copy_kernel_nofault() in IBT path, but I don't
think this is it. It should be pretty clear with profiling where we
are losing CPU, I just didn't have time/desire to look into it, as I
have tons of small things being worked on in parallel and didn't want
to add another one.

>
> > +SEC("raw_tp")
> > +int trigger_driver(void *ctx)
> > +{
> > +       int i;
> > +
> > +       for (i = 0; i < batch_iters; i++)
> > +               (void)bpf_get_smp_processor_id(); /* attach here to benchmark */
>
> bpf_get_numa_node_id() is probably even faster and
> not subject to DEBUG configs.

Sure, I can switch to that, np.
Andrii Nakryiko March 15, 2024, 4:58 p.m. UTC | #4
On Fri, Mar 15, 2024 at 9:31 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Fri, Mar 15, 2024 at 9:03 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Thu, Mar 14, 2024 at 10:18 PM Andrii Nakryiko <andrii@kernel.org> wrote:
> > >
> > > Existing kprobe/fentry triggering benchmarks have 1-to-1 mapping between
> > > one syscall execution and BPF program run. While we use a fast
> > > get_pgid() syscall, syscall overhead can still be non-trivial.
> > >
> > > This patch adds kprobe/fentry set of benchmarks significantly amortizing
> > > the cost of syscall vs actual BPF triggering overhead. We do this by
> > > employing BPF_PROG_TEST_RUN command to trigger "driver" raw_tp program
> > > which does a tight parameterized loop calling cheap BPF helper
> > > (bpf_get_smp_processor_id()), to which kprobe/fentry programs are
> > > attached for benchmarking.
> > >
> > > This way 1 bpf() syscall causes N executions of BPF program being
> > > benchmarked. N defaults to 100, but can be adjusted with
> > > --trig-batch-iters CLI argument.
> > >
> > > Results speak for themselves:
> > >
> > > $ ./run_bench_trigger.sh
> > > uprobe-base         :  138.054 ± 0.556M/s
> > > base                :   16.650 ± 0.123M/s
> >
> > What's going on here? Why two bases are so different?
> > I thought the "base" is what all other benchmarks
> > should be compared against.
> > The "base" is the theoretical maximum for all benchs.
> > Or uprobe* benches should be compared with uprobe-base
> > while all other benchs compared with 'base' ?
> > Probably not anymore due to this new approach.
> > The 'base' is kinda lost its value then.
>
> naming is hard. base is doing syscall(get_pgid) in a tight loop. It's
> base compared to previous trigger benchmarks where we used syscall to
> trigger kprobe/fentry programs. uprobe-base is just a user-space loop
> that does atomic_inc() in a tight loop. So uprobe-base is basically
> the measure of how fast CPU is, but it's unrealistic to expect either
> fentry/kprobe to get close, and especially it's unrealistic to expect
> uprobes to get close to it.
>
> Naming suggestions are welcome, though.
>
> I'm not sure what the "base" should be for xxx-fast benchmarks? Doing
> a counter loop in driver BPF program, perhaps? Would you like me to
> add base-fast benchmark doing just that?
>

How about this.

base -> base-syscall (i.e., "syscall-calling baseline")
uprobe-base -> base-user-loop (i.e., "user space-only baseline")
and then for "fast" baseline we add "base-kernel-loop" for
"kernel-side looping baseline"

Or we can use some naming based on "counting": base-syscall-count,
base-user-count, base-kernel-count?

Any preferences, anyone?

>
> >
> > > tp                  :   11.068 ± 0.100M/s
> > > rawtp               :   14.087 ± 0.511M/s
> > > kprobe              :    9.641 ± 0.027M/s
> > > kprobe-multi        :   10.263 ± 0.061M/s
> > > kretprobe           :    5.475 ± 0.028M/s
> > > kretprobe-multi     :    5.703 ± 0.036M/s
> > > fentry              :   14.544 ± 0.112M/s
> > > fexit               :   10.637 ± 0.073M/s
> > > fmodret             :   11.357 ± 0.061M/s
> > > kprobe-fast         :   14.286 ± 0.377M/s
> > > kprobe-multi-fast   :   14.999 ± 0.204M/s
> > > kretprobe-fast      :    7.646 ± 0.084M/s
> > > kretprobe-multi-fast:    4.354 ± 0.066M/s
> > > fentry-fast         :   31.475 ± 0.254M/s
> > > fexit-fast          :   17.379 ± 0.195M/s
> >
> > I think the "-fast" suffix doesn't really fit here.
> > It's a different way to benchmark fexit vs kprobe overhead.
> >
> > I think the old approach should be replaced with the new one.
> > There is no value in keeping the old stuff around when
> > now it's clear that it's measuring much more than it should.
>
> I'd say we should keep them. Note that fmod_ret, tp, and raw_tp can't
> be benchmarked the same way. So we don't have -fast variants for them.
> If we get rid of pre-existing fentry/kprobe benchmarks, we won't be
> able to compare raw_tp/fmod_ret/fexit.s to them.
>
> And to be clear, in practice I'm not comparing xxx-fast with xxx at
> all. I just put all the numbers in this patch summary to show that we
> indeed have a significant performance lost on syscall.
>
> In practice when comparing kprobe vs fentry, it should be kprobe-fast
> vs fentry-fast.
>
> >
> > > One huge and not yet explained deviation is a slowdown of
> > > kretprobe-multi, we should look into that separately.
> > >
> > > kretprobe           :    5.475 ± 0.028M/s
> > > kretprobe-multi     :    5.703 ± 0.036M/s
> > > kretprobe-fast      :    7.646 ± 0.084M/s
> > > kretprobe-multi-fast:    4.354 ± 0.066M/s
> >
> > That is weird indeed. Since it's a single cpu run
> > it cannot be due to contention.
> > My gut feel is that something odd is here:
> > #ifdef CONFIG_X86_KERNEL_IBT
> > static unsigned long get_entry_ip(unsigned long fentry_ip)
> >
> > Did you have it in your config?
>
> No, this was without IBT. I do have an upcoming small patch to
> eliminate unnecessary copy_kernel_nofault() in IBT path, but I don't
> think this is it. It should be pretty clear with profiling where we
> are losing CPU, I just didn't have time/desire to look into it, as I
> have tons of small things being worked on in parallel and didn't want
> to add another one.
>
> >
> > > +SEC("raw_tp")
> > > +int trigger_driver(void *ctx)
> > > +{
> > > +       int i;
> > > +
> > > +       for (i = 0; i < batch_iters; i++)
> > > +               (void)bpf_get_smp_processor_id(); /* attach here to benchmark */
> >
> > bpf_get_numa_node_id() is probably even faster and
> > not subject to DEBUG configs.
>
> Sure, I can switch to that, np.
Alexei Starovoitov March 15, 2024, 6:47 p.m. UTC | #5
On Fri, Mar 15, 2024 at 9:59 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Fri, Mar 15, 2024 at 9:31 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Fri, Mar 15, 2024 at 9:03 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Thu, Mar 14, 2024 at 10:18 PM Andrii Nakryiko <andrii@kernel.org> wrote:
> > > >
> > > > Existing kprobe/fentry triggering benchmarks have 1-to-1 mapping between
> > > > one syscall execution and BPF program run. While we use a fast
> > > > get_pgid() syscall, syscall overhead can still be non-trivial.
> > > >
> > > > This patch adds kprobe/fentry set of benchmarks significantly amortizing
> > > > the cost of syscall vs actual BPF triggering overhead. We do this by
> > > > employing BPF_PROG_TEST_RUN command to trigger "driver" raw_tp program
> > > > which does a tight parameterized loop calling cheap BPF helper
> > > > (bpf_get_smp_processor_id()), to which kprobe/fentry programs are
> > > > attached for benchmarking.
> > > >
> > > > This way 1 bpf() syscall causes N executions of BPF program being
> > > > benchmarked. N defaults to 100, but can be adjusted with
> > > > --trig-batch-iters CLI argument.
> > > >
> > > > Results speak for themselves:
> > > >
> > > > $ ./run_bench_trigger.sh
> > > > uprobe-base         :  138.054 ± 0.556M/s
> > > > base                :   16.650 ± 0.123M/s
> > >
> > > What's going on here? Why two bases are so different?
> > > I thought the "base" is what all other benchmarks
> > > should be compared against.
> > > The "base" is the theoretical maximum for all benchs.
> > > Or uprobe* benches should be compared with uprobe-base
> > > while all other benchs compared with 'base' ?
> > > Probably not anymore due to this new approach.
> > > The 'base' is kinda lost its value then.
> >
> > naming is hard. base is doing syscall(get_pgid) in a tight loop. It's
> > base compared to previous trigger benchmarks where we used syscall to
> > trigger kprobe/fentry programs. uprobe-base is just a user-space loop
> > that does atomic_inc() in a tight loop. So uprobe-base is basically
> > the measure of how fast CPU is, but it's unrealistic to expect either
> > fentry/kprobe to get close, and especially it's unrealistic to expect
> > uprobes to get close to it.
> >
> > Naming suggestions are welcome, though.
> >
> > I'm not sure what the "base" should be for xxx-fast benchmarks? Doing
> > a counter loop in driver BPF program, perhaps? Would you like me to
> > add base-fast benchmark doing just that?
> >
>
> How about this.
>
> base -> base-syscall (i.e., "syscall-calling baseline")
> uprobe-base -> base-user-loop (i.e., "user space-only baseline")
> and then for "fast" baseline we add "base-kernel-loop" for
> "kernel-side looping baseline"

I think "base" part in both doesn't fit.
Maybe
base -> syscall_loop
uprobe-base - user_space_loop

since the first is only doing syscall in a loop
and the 2nd doesn't even go to the kernel.

>
> Or we can use some naming based on "counting": base-syscall-count,
> base-user-count, base-kernel-count?

Instead of fentry-fast -> fentry-batch
sounds more accurate.

But in general it doesn't feel right to keep the existing
benchs, since we've discovered that syscall overhead
affects the measurement so much.
I think we should bite the bullet and use bpf_testmod in bench tool.
Then we can add a tracepoint there and call it in proper kernel loop,
then add fmodret-able empty function and call it in another loop.

Then bench tool can trigger them via bpf->kfunc->that_looping_helper
and capture as accurate as possibly overhead of
tp, raw_tp, kprobe, fentry, fmodret.
And all of them will be comparable to each other.
Right now fmodret vs fentry are comparable, but both have
the syscall time added on, which makes the comparison questionable.
Especially consider the kernels with mitigations.
Overhead of syscall is even higher and the delta is smaller,
to the point that fentry vs fexit might measure equal.
A benchmarking tool should be a gold standard and accurate tool
for performance measurement.

The trick:
+       for (i = 0; i < batch_iters; i++)
+               (void)bpf_get_smp_processor_id(); /* attach here to benchmark */

is neat, but since it works for fentry/kprobe only
we need testmod to generalize it for fmodret and tracepoints.
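
Roughly something like this in bpf_testmod (hypothetical names, just
to illustrate the shape; needs <linux/error-injection.h>, kfunc
registration boilerplate omitted):

noinline int bpf_testmod_bench_target(void)
{
	return 0;
}
ALLOW_ERROR_INJECTION(bpf_testmod_bench_target, ERRNO); /* fmodret attach point */

__bpf_kfunc void bpf_testmod_bench_loop(u32 iters)
{
	u32 i;

	for (i = 0; i < iters; i++)
		(void)bpf_testmod_bench_target(); /* attach kprobe/fentry/fmodret here */
}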
Andrii Nakryiko March 15, 2024, 9:07 p.m. UTC | #6
On Fri, Mar 15, 2024 at 11:48 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, Mar 15, 2024 at 9:59 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Fri, Mar 15, 2024 at 9:31 AM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Fri, Mar 15, 2024 at 9:03 AM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Thu, Mar 14, 2024 at 10:18 PM Andrii Nakryiko <andrii@kernel.org> wrote:
> > > > >
> > > > > Existing kprobe/fentry triggering benchmarks have 1-to-1 mapping between
> > > > > one syscall execution and BPF program run. While we use a fast
> > > > > get_pgid() syscall, syscall overhead can still be non-trivial.
> > > > >
> > > > > This patch adds kprobe/fentry set of benchmarks significantly amortizing
> > > > > the cost of syscall vs actual BPF triggering overhead. We do this by
> > > > > employing BPF_PROG_TEST_RUN command to trigger "driver" raw_tp program
> > > > > which does a tight parameterized loop calling cheap BPF helper
> > > > > (bpf_get_smp_processor_id()), to which kprobe/fentry programs are
> > > > > attached for benchmarking.
> > > > >
> > > > > This way 1 bpf() syscall causes N executions of BPF program being
> > > > > benchmarked. N defaults to 100, but can be adjusted with
> > > > > --trig-batch-iters CLI argument.
> > > > >
> > > > > Results speak for themselves:
> > > > >
> > > > > $ ./run_bench_trigger.sh
> > > > > uprobe-base         :  138.054 ± 0.556M/s
> > > > > base                :   16.650 ± 0.123M/s
> > > >
> > > > What's going on here? Why two bases are so different?
> > > > I thought the "base" is what all other benchmarks
> > > > should be compared against.
> > > > The "base" is the theoretical maximum for all benchs.
> > > > Or uprobe* benches should be compared with uprobe-base
> > > > while all other benchs compared with 'base' ?
> > > > Probably not anymore due to this new approach.
> > > > The 'base' is kinda lost its value then.
> > >
> > > naming is hard. base is doing syscall(get_pgid) in a tight loop. It's
> > > base compared to previous trigger benchmarks where we used syscall to
> > > trigger kprobe/fentry programs. uprobe-base is just a user-space loop
> > > that does atomic_inc() in a tight loop. So uprobe-base is basically
> > > the measure of how fast CPU is, but it's unrealistic to expect either
> > > fentry/kprobe to get close, and especially it's unrealistic to expect
> > > uprobes to get close to it.
> > >
> > > Naming suggestions are welcome, though.
> > >
> > > I'm not sure what the "base" should be for xxx-fast benchmarks? Doing
> > > a counter loop in driver BPF program, perhaps? Would you like me to
> > > add base-fast benchmark doing just that?
> > >
> >
> > How about this.
> >
> > base -> base-syscall (i.e., "syscall-calling baseline")
> > uprobe-base -> base-user-loop (i.e., "user space-only baseline")
> > and then for "fast" baseline we add "base-kernel-loop" for
> > "kernel-side looping baseline"
>
> I think "base" part in both doesn't fit.
> Maybe
> base -> syscall_loop
> uprobe-base - user_space_loop

ok

>
> since the first is only doing syscall in a loop
> and the 2nd doesn't even go to the kernel.
>
> >
> > Or we can use some naming based on "counting": base-syscall-count,
> > base-user-count, base-kernel-count?
>
> Instead of fenty-fast -> fentry-batch
> sounds more accurate.
>

ok, -batch it is

> But in general it doesn't feel right to keep the existing
> benchs, since we've discovered that syscall overhead
> affects the measurement so much.
> I think we should bite the bullet and use bpf_testmod in bench tool.

This will worsen the logistics of using it for any benchmark so much
that it will be pretty much a useless tool.

I can try to find some other way to trigger tracepoint and fmod_ret
without relying on custom modules, but I'd rather just remove those
benchmarks altogether than add dependency on bpf_testmod.

> Then we can add a tracepoint there and call it in proper kernel loop,
> then add fmodret-able empty function and call it in another loop.
>
> Then bench tool can trigger them via bpf->kfunc->that_looping_helper
> and capture as accurate as possibly overhead of
> tp, raw_tp, kprobe, fentry, fmodret.
> And all of them will be comparable to each other.
> Right now fmodret vs fentry are comparable, but both have
> syscall time addon which makes the comparison questionable.
> Especially consider the kernels with mitigations.
> Overhead of syscall is even higher and the delta is smaller,
> to the point that fentry vs fexit might measure equal.

It's still useful to be able to do this, even if syscall overhead is
very measurable. Because when profiling it's easy to filter out
syscall overhead and compare purely fentry vs fexit overheads.

Even with mitigations on, the differences between fentry and fexit are
measurable. And bench is a nice tool to generate non-stop work to be
able to capture profiling data conveniently. While a custom test
module will make it much harder.


> A benchmarking tool should be a gold standard and accurate tool
> for performance measurement.
>

This was neither a goal, nor do I sign up for that aspiration :) It's
first and foremost a small and useful tool for doing local performance
optimization work when it comes to BPF-related functionality in the
kernel. And its portability is a huge part of this.

> The trick:
> +       for (i = 0; i < batch_iters; i++)
> +               (void)bpf_get_smp_processor_id(); /* attach here to benchmark */
>
> is neat, but since it works for fentry/kprobe only
> we need testmod to generalize it for fmodret and tracepoints.

see above, I agree it would be nice to have (and I'll look for some
other ways to achieve this), but is not really a required property to
make benchmark useful

For now, I'll drop this patch and re-submit the first one with
__always_inline removed just to get those prerequisites landed.
Andrii Nakryiko March 15, 2024, 10:22 p.m. UTC | #7
On Fri, Mar 15, 2024 at 2:10 AM Jiri Olsa <olsajiri@gmail.com> wrote:
>
> On Thu, Mar 14, 2024 at 10:18:13PM -0700, Andrii Nakryiko wrote:
> > Existing kprobe/fentry triggering benchmarks have 1-to-1 mapping between
> > one syscall execution and BPF program run. While we use a fast
> > get_pgid() syscall, syscall overhead can still be non-trivial.
> >
> > This patch adds kprobe/fentry set of benchmarks significantly amortizing
> > the cost of syscall vs actual BPF triggering overhead. We do this by
> > employing BPF_PROG_TEST_RUN command to trigger "driver" raw_tp program
> > which does a tight parameterized loop calling cheap BPF helper
> > (bpf_get_smp_processor_id()), to which kprobe/fentry programs are
> > attached for benchmarking.
> >
> > This way 1 bpf() syscall causes N executions of BPF program being
> > benchmarked. N defaults to 100, but can be adjusted with
> > --trig-batch-iters CLI argument.
> >
> > Results speak for themselves:
> >
> > $ ./run_bench_trigger.sh
> > uprobe-base         :  138.054 ± 0.556M/s
> > base                :   16.650 ± 0.123M/s
> > tp                  :   11.068 ± 0.100M/s
> > rawtp               :   14.087 ± 0.511M/s
> > kprobe              :    9.641 ± 0.027M/s
> > kprobe-multi        :   10.263 ± 0.061M/s
> > kretprobe           :    5.475 ± 0.028M/s
> > kretprobe-multi     :    5.703 ± 0.036M/s
> > fentry              :   14.544 ± 0.112M/s
> > fexit               :   10.637 ± 0.073M/s
> > fmodret             :   11.357 ± 0.061M/s
> > kprobe-fast         :   14.286 ± 0.377M/s
> > kprobe-multi-fast   :   14.999 ± 0.204M/s
> > kretprobe-fast      :    7.646 ± 0.084M/s
> > kretprobe-multi-fast:    4.354 ± 0.066M/s
> > fentry-fast         :   31.475 ± 0.254M/s
> > fexit-fast          :   17.379 ± 0.195M/s
> >
> > Note how xxx-fast variants are measured with significantly higher
> > throughput, even though it's exactly the same in-kernel overhead:
> >
> > fentry              :   14.544 ± 0.112M/s
> > fentry-fast         :   31.475 ± 0.254M/s
> >
> > kprobe-multi        :   10.263 ± 0.061M/s
> > kprobe-multi-fast   :   14.999 ± 0.204M/s
> >
> > One huge and not yet explained deviation is a slowdown of
> > kretprobe-multi, we should look into that separately.
> >
> > kretprobe           :    5.475 ± 0.028M/s
> > kretprobe-multi     :    5.703 ± 0.036M/s
> > kretprobe-fast      :    7.646 ± 0.084M/s
> > kretprobe-multi-fast:    4.354 ± 0.066M/s
> >
> > Kprobe cases don't seem to have this illogical slowdown:
> >
> > kprobe              :    9.641 ± 0.027M/s
> > kprobe-multi        :   10.263 ± 0.061M/s
> > kprobe-fast         :   14.286 ± 0.377M/s
> > kprobe-multi-fast   :   14.999 ± 0.204M/s
>
> hum, I see that as well:
>
> uprobe-base         :  230.624 ± 0.527M/s
> base                :   16.320 ± 0.087M/s
> tp                  :   10.057 ± 0.122M/s
> rawtp               :   14.851 ± 0.300M/s
> kprobe              :   10.993 ± 0.104M/s
> kprobe-multi        :   11.053 ± 0.038M/s
> kretprobe           :    6.679 ± 0.015M/s
> kretprobe-multi     :    6.466 ± 0.015M/s
> fentry              :   14.949 ± 0.064M/s
> fexit               :   10.530 ± 1.275M/s
> fmodret             :   11.145 ± 0.245M/s
> kprobe-fast         :   20.080 ± 0.468M/s
> kprobe-multi-fast   :   17.603 ± 0.102M/s
> kretprobe-fast      :    9.943 ± 0.056M/s
> kretprobe-multi-fast:    5.185 ± 0.022M/s
> fentry-fast         :   46.697 ± 0.260M/s
> fexit-fast          :   19.250 ± 0.108M/s
>
>
> I even see decline in kprobe-multi-fast:
>
>   kprobe-fast         :   20.080 ± 0.468M/s
>   kprobe-multi-fast   :   17.603 ± 0.102M/s
>
>   kretprobe-fast      :    9.943 ± 0.056M/s
>   kretprobe-multi-fast:    5.185 ± 0.022M/s
>
>
> I've got some IBT related code showing up in the perf profile
> and when I disabled it I got better results for kprobe-multi-fast
> but kretprobe-multi-fast is still bad

ok, I think it's a self-inflicted false alarm. Note how this benchmark
is using bpf_get_smp_processor_id() as an attach point? Well, guess
what, each triggered program is then calling
bpf_get_smp_processor_id() again to get the per-CPU counter to
increment. So this benchmark was effectively heavily testing recursion
protection, and I guess kretprobe hurts the most.

I realized that when I moved to bpf_get_numa_node_id() and saw much
higher numbers:

BEFORE
======
$ benchs/run_bench_trigger.sh kprobe-batch fentry-batch
kprobe-batch        :   14.018 ± 0.333M/s
fentry-batch        :   31.470 ± 0.791M/s

AFTER
=====
$ benchs/run_bench_trigger.sh kprobe-batch fentry-batch
kprobe-batch         :   19.391 ± 0.868M/s
fentry-batch         :   48.785 ± 0.932M/s

That's when I realized something was off here. Now the results make a
bit more sense (the machine on which I test is different, so absolute
values differ from before):

$ benchs/run_bench_trigger.sh kprobe kprobe-multi kprobe-batch
kprobe-multi-batch kretprobe kretprobe-multi kretprobe-batch
kretprobe-multi-batch
kprobe               :    8.114 ± 0.199M/s
kprobe-multi         :    8.739 ± 0.052M/s
kprobe-batch         :   19.379 ± 0.709M/s
kprobe-multi-batch   :   24.969 ± 0.226M/s
kretprobe            :    4.859 ± 0.027M/s
kretprobe-multi      :    4.940 ± 0.083M/s
kretprobe-batch      :    8.460 ± 0.044M/s
kretprobe-multi-batch:    8.761 ± 0.282M/s
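
For reference, the adjusted driver loop is roughly this (a minimal
sketch, assuming only the helper call changes relative to the code in
this patch):

SEC("raw_tp")
int trigger_driver(void *ctx)
{
	int i;

	for (i = 0; i < batch_iters; i++)
		/* attach point; unlike bpf_get_smp_processor_id(), this
		 * helper is not also called by the per-CPU counter update,
		 * so the benchmarked program no longer re-enters its own
		 * attach point
		 */
		(void)bpf_get_numa_node_id();

	return 0;
}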


>
>
> uprobe-base         :  234.024 ± 0.225M/s
> base                :   16.383 ± 0.029M/s
> tp                  :    9.973 ± 0.017M/s
> rawtp               :   14.889 ± 0.047M/s
> kprobe              :   10.970 ± 0.011M/s
> kprobe-multi        :   11.640 ± 0.009M/s
> kretprobe           :    6.667 ± 0.005M/s
> kretprobe-multi     :    6.704 ± 0.005M/s
> fentry              :   14.968 ± 0.024M/s
> fexit               :   11.860 ± 0.012M/s
> fmodret             :   12.656 ± 0.036M/s
> kprobe-fast         :   20.340 ± 0.043M/s
> kprobe-multi-fast   :   21.203 ± 0.019M/s
> kretprobe-fast      :    9.956 ± 0.021M/s
> kretprobe-multi-fast:    5.611 ± 0.006M/s
> fentry-fast         :   46.825 ± 0.041M/s
> fexit-fast          :   19.746 ± 0.024M/s
>
> slightly better:
>
>   kprobe-fast         :   20.340 ± 0.043M/s
>   kprobe-multi-fast   :   21.203 ± 0.019M/s
>
> still almost half perf:
>   kretprobe-fast      :    9.956 ± 0.021M/s
>   kretprobe-multi-fast:    5.611 ± 0.006M/s
>
> jirka
>
> >
> > Cc: Jiri Olsa <jolsa@kernel.org>
> > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > ---
> >  tools/testing/selftests/bpf/bench.c           |  18 +++
> >  .../selftests/bpf/benchs/bench_trigger.c      | 123 +++++++++++++++++-
> >  .../selftests/bpf/benchs/run_bench_trigger.sh |   8 +-
> >  .../selftests/bpf/progs/trigger_bench.c       |  56 +++++++-
> >  4 files changed, 201 insertions(+), 4 deletions(-)
> >

[...]
Jiri Olsa March 17, 2024, 8:38 p.m. UTC | #8
On Fri, Mar 15, 2024 at 03:22:29PM -0700, Andrii Nakryiko wrote:
> On Fri, Mar 15, 2024 at 2:10 AM Jiri Olsa <olsajiri@gmail.com> wrote:
> >
> > On Thu, Mar 14, 2024 at 10:18:13PM -0700, Andrii Nakryiko wrote:
> > > Existing kprobe/fentry triggering benchmarks have 1-to-1 mapping between
> > > one syscall execution and BPF program run. While we use a fast
> > > get_pgid() syscall, syscall overhead can still be non-trivial.
> > >
> > > This patch adds kprobe/fentry set of benchmarks significantly amortizing
> > > the cost of syscall vs actual BPF triggering overhead. We do this by
> > > employing BPF_PROG_TEST_RUN command to trigger "driver" raw_tp program
> > > which does a tight parameterized loop calling cheap BPF helper
> > > (bpf_get_smp_processor_id()), to which kprobe/fentry programs are
> > > attached for benchmarking.
> > >
> > > This way 1 bpf() syscall causes N executions of BPF program being
> > > benchmarked. N defaults to 100, but can be adjusted with
> > > --trig-batch-iters CLI argument.
> > >
> > > Results speak for themselves:
> > >
> > > $ ./run_bench_trigger.sh
> > > uprobe-base         :  138.054 ± 0.556M/s
> > > base                :   16.650 ± 0.123M/s
> > > tp                  :   11.068 ± 0.100M/s
> > > rawtp               :   14.087 ± 0.511M/s
> > > kprobe              :    9.641 ± 0.027M/s
> > > kprobe-multi        :   10.263 ± 0.061M/s
> > > kretprobe           :    5.475 ± 0.028M/s
> > > kretprobe-multi     :    5.703 ± 0.036M/s
> > > fentry              :   14.544 ± 0.112M/s
> > > fexit               :   10.637 ± 0.073M/s
> > > fmodret             :   11.357 ± 0.061M/s
> > > kprobe-fast         :   14.286 ± 0.377M/s
> > > kprobe-multi-fast   :   14.999 ± 0.204M/s
> > > kretprobe-fast      :    7.646 ± 0.084M/s
> > > kretprobe-multi-fast:    4.354 ± 0.066M/s
> > > fentry-fast         :   31.475 ± 0.254M/s
> > > fexit-fast          :   17.379 ± 0.195M/s
> > >
> > > Note how the xxx-fast variants are measured at significantly higher
> > > throughput, even though the in-kernel overhead is exactly the same:
> > >
> > > fentry              :   14.544 ± 0.112M/s
> > > fentry-fast         :   31.475 ± 0.254M/s
> > >
> > > kprobe-multi        :   10.263 ± 0.061M/s
> > > kprobe-multi-fast   :   14.999 ± 0.204M/s
> > >
> > > One huge and not yet explained deviation is the slowdown of
> > > kretprobe-multi; we should look into that separately.
> > >
> > > kretprobe           :    5.475 ± 0.028M/s
> > > kretprobe-multi     :    5.703 ± 0.036M/s
> > > kretprobe-fast      :    7.646 ± 0.084M/s
> > > kretprobe-multi-fast:    4.354 ± 0.066M/s
> > >
> > > Kprobe cases don't seem to have this illogical slowdown:
> > >
> > > kprobe              :    9.641 ± 0.027M/s
> > > kprobe-multi        :   10.263 ± 0.061M/s
> > > kprobe-fast         :   14.286 ± 0.377M/s
> > > kprobe-multi-fast   :   14.999 ± 0.204M/s
> >
> > hum, I see that as well:
> >
> > uprobe-base         :  230.624 ± 0.527M/s
> > base                :   16.320 ± 0.087M/s
> > tp                  :   10.057 ± 0.122M/s
> > rawtp               :   14.851 ± 0.300M/s
> > kprobe              :   10.993 ± 0.104M/s
> > kprobe-multi        :   11.053 ± 0.038M/s
> > kretprobe           :    6.679 ± 0.015M/s
> > kretprobe-multi     :    6.466 ± 0.015M/s
> > fentry              :   14.949 ± 0.064M/s
> > fexit               :   10.530 ± 1.275M/s
> > fmodret             :   11.145 ± 0.245M/s
> > kprobe-fast         :   20.080 ± 0.468M/s
> > kprobe-multi-fast   :   17.603 ± 0.102M/s
> > kretprobe-fast      :    9.943 ± 0.056M/s
> > kretprobe-multi-fast:    5.185 ± 0.022M/s
> > fentry-fast         :   46.697 ± 0.260M/s
> > fexit-fast          :   19.250 ± 0.108M/s
> >
> >
> > I even see decline in kprobe-multi-fast:
> >
> >   kprobe-fast         :   20.080 ± 0.468M/s
> >   kprobe-multi-fast   :   17.603 ± 0.102M/s
> >
> >   kretprobe-fast      :    9.943 ± 0.056M/s
> >   kretprobe-multi-fast:    5.185 ± 0.022M/s
> >
> >
> > I've got some IBT-related code showing up in the perf profile, and when
> > I disabled it I got better results for kprobe-multi-fast, but
> > kretprobe-multi-fast is still bad.
> 
> ok, I think it's a self-inflicted false alarm. Note how this benchmark
> is using bpf_get_smp_processor_id() as an attach point? Well, guess
> what, each triggered program is then calling
> bpf_get_smp_processor_id() again to get the per-CPU counter to
> increment. So this benchmark was effectively heavily testing recursion
> protection, and I guess kretprobe hurts the most.
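
[Note: the self-recursion described above is easiest to see in BPF C form. A
minimal sketch follows; the counter bookkeeping is paraphrased from the
per-CPU counting scheme of patch 1/2 and is illustrative only, the exact
layout there differs.]

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define MAX_BUCKETS 256 /* matches bench_trigger.c in the patch below */

/* simplified stand-in for the per-CPU counters introduced in patch 1/2 */
static long hits[MAX_BUCKETS];

static __always_inline void inc_counter(void)
{
	/* picking this CPU's slot calls bpf_get_smp_processor_id(), the very
	 * function the program below is attached to, so (as described above)
	 * BPF recursion protection suppresses the nested invocation and bumps
	 * the prog's misses counter (bpf_prog_inc_misses_counter in the
	 * profile below) instead of running it */
	int cpu = bpf_get_smp_processor_id();

	__sync_fetch_and_add(&hits[cpu & (MAX_BUCKETS - 1)], 1);
}

SEC("kprobe/bpf_get_smp_processor_id")
int bench_trigger_kprobe_fast(void *ctx)
{
	inc_counter();
	return 0;
}

char _license[] SEC("license") = "GPL";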

heh, nice :-) that explains the bpf_prog_inc_misses_counter hits
I see in perf profile

     1.46%  bench    [kernel.vmlinux]                                              [k] bpf_prog_inc_misses_counter
            |
            ---bpf_prog_inc_misses_counter
               kprobe_multi_link_prog_run.isra.0
               fprobe_exit_handler
               rethook_trampoline_handler
               arch_rethook_trampoline_callback
               arch_rethook_trampoline
               bpf_prog_7790468e40a289ea_bench_trigger_kretprobe_multi_fast
               kprobe_multi_link_prog_run.isra.0
               fprobe_exit_handler
               rethook_trampoline_handler
               arch_rethook_trampoline_callback
               arch_rethook_trampoline
               bpf_prog_db14b3b08bc9e952_trigger_driver
               __bpf_prog_test_run_raw_tp
               bpf_prog_test_run_raw_tp
               __sys_bpf
               __x64_sys_bpf
               do_syscall_64
               entry_SYSCALL_64
               syscall
               start_thread
               clone3

jirka

> 
> I realized that when I moved to bpf_get_numa_node_id() and saw much
> higher numbers:
> 
> BEFORE
> ======
> $ benchs/run_bench_trigger.sh kprobe-batch fentry-batch
> kprobe-batch        :   14.018 ± 0.333M/s
> fentry-batch        :   31.470 ± 0.791M/s
> 
> AFTER
> =====
> $ benchs/run_bench_trigger.sh kprobe-batch fentry-batch
> kprobe-batch         :   19.391 ± 0.868M/s
> fentry-batch         :   48.785 ± 0.932M/s
> 
> That's when I realized something was off here. Now the results make a bit
> more sense (the machine on which I test is different, so absolute values
> differ from before):
> 
> $ benchs/run_bench_trigger.sh kprobe kprobe-multi kprobe-batch
> kprobe-multi-batch kretprobe kretprobe-multi kretprobe-batch
> kretprobe-multi-batch
> kprobe               :    8.114 ± 0.199M/s
> kprobe-multi         :    8.739 ± 0.052M/s
> kprobe-batch         :   19.379 ± 0.709M/s
> kprobe-multi-batch   :   24.969 ± 0.226M/s
> kretprobe            :    4.859 ± 0.027M/s
> kretprobe-multi      :    4.940 ± 0.083M/s
> kretprobe-batch      :    8.460 ± 0.044M/s
> kretprobe-multi-batch:    8.761 ± 0.282M/s
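
[Note: the fix hinted at above, reflected in the renamed *-batch results, is
to make the driver loop spin on a different cheap helper than the one the
counting code uses, so the benchmarked programs no longer recurse into their
own attach point. A minimal sketch of the adjusted BPF side follows, assuming
the rest of trigger_bench.c (inc_counter() and friends) stays as in the patch
below; the kprobe-batch program name is a guess based on the benchmark names
shown above.]

const volatile int batch_iters = 0;

SEC("raw_tp")
int trigger_driver(void *ctx)
{
	int i;

	/* loop on bpf_get_numa_node_id() so that the attached programs'
	 * internal bpf_get_smp_processor_id() call no longer hits their
	 * own attach point */
	for (i = 0; i < batch_iters; i++)
		(void)bpf_get_numa_node_id(); /* attach here to benchmark */

	return 0;
}

SEC("kprobe/bpf_get_numa_node_id")
int bench_trigger_kprobe_batch(void *ctx)
{
	inc_counter();
	return 0;
}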
> 
> 
> >
> >
> > uprobe-base         :  234.024 ± 0.225M/s
> > base                :   16.383 ± 0.029M/s
> > tp                  :    9.973 ± 0.017M/s
> > rawtp               :   14.889 ± 0.047M/s
> > kprobe              :   10.970 ± 0.011M/s
> > kprobe-multi        :   11.640 ± 0.009M/s
> > kretprobe           :    6.667 ± 0.005M/s
> > kretprobe-multi     :    6.704 ± 0.005M/s
> > fentry              :   14.968 ± 0.024M/s
> > fexit               :   11.860 ± 0.012M/s
> > fmodret             :   12.656 ± 0.036M/s
> > kprobe-fast         :   20.340 ± 0.043M/s
> > kprobe-multi-fast   :   21.203 ± 0.019M/s
> > kretprobe-fast      :    9.956 ± 0.021M/s
> > kretprobe-multi-fast:    5.611 ± 0.006M/s
> > fentry-fast         :   46.825 ± 0.041M/s
> > fexit-fast          :   19.746 ± 0.024M/s
> >
> > slightly better:
> >
> >   kprobe-fast         :   20.340 ± 0.043M/s
> >   kprobe-multi-fast   :   21.203 ± 0.019M/s
> >
> > still almost half perf:
> >   kretprobe-fast      :    9.956 ± 0.021M/s
> >   kretprobe-multi-fast:    5.611 ± 0.006M/s
> >
> > jirka
> >
> > >
> > > Cc: Jiri Olsa <jolsa@kernel.org>
> > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > > ---
> > >  tools/testing/selftests/bpf/bench.c           |  18 +++
> > >  .../selftests/bpf/benchs/bench_trigger.c      | 123 +++++++++++++++++-
> > >  .../selftests/bpf/benchs/run_bench_trigger.sh |   8 +-
> > >  .../selftests/bpf/progs/trigger_bench.c       |  56 +++++++-
> > >  4 files changed, 201 insertions(+), 4 deletions(-)
> > >
> 
> [...]

Patch

diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index b2b4c391eb0a..67212b89f876 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -280,6 +280,7 @@  extern struct argp bench_strncmp_argp;
 extern struct argp bench_hashmap_lookup_argp;
 extern struct argp bench_local_storage_create_argp;
 extern struct argp bench_htab_mem_argp;
+extern struct argp bench_trigger_fast_argp;
 
 static const struct argp_child bench_parsers[] = {
 	{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
@@ -292,6 +293,7 @@  static const struct argp_child bench_parsers[] = {
 	{ &bench_hashmap_lookup_argp, 0, "Hashmap lookup benchmark", 0 },
 	{ &bench_local_storage_create_argp, 0, "local-storage-create benchmark", 0 },
 	{ &bench_htab_mem_argp, 0, "hash map memory benchmark", 0 },
+	{ &bench_trigger_fast_argp, 0, "BPF triggering benchmark", 0 },
 	{},
 };
 
@@ -502,6 +504,12 @@  extern const struct bench bench_trig_fentry;
 extern const struct bench bench_trig_fexit;
 extern const struct bench bench_trig_fentry_sleep;
 extern const struct bench bench_trig_fmodret;
+extern const struct bench bench_trig_kprobe_fast;
+extern const struct bench bench_trig_kretprobe_fast;
+extern const struct bench bench_trig_kprobe_multi_fast;
+extern const struct bench bench_trig_kretprobe_multi_fast;
+extern const struct bench bench_trig_fentry_fast;
+extern const struct bench bench_trig_fexit_fast;
 extern const struct bench bench_trig_uprobe_base;
 extern const struct bench bench_trig_uprobe_nop;
 extern const struct bench bench_trig_uretprobe_nop;
@@ -539,6 +547,7 @@  static const struct bench *benchs[] = {
 	&bench_rename_rawtp,
 	&bench_rename_fentry,
 	&bench_rename_fexit,
+	/* syscall-driven triggering benchmarks */
 	&bench_trig_base,
 	&bench_trig_tp,
 	&bench_trig_rawtp,
@@ -550,6 +559,14 @@  static const struct bench *benchs[] = {
 	&bench_trig_fexit,
 	&bench_trig_fentry_sleep,
 	&bench_trig_fmodret,
+	/* fast, mostly in-kernel triggers */
+	&bench_trig_kprobe_fast,
+	&bench_trig_kretprobe_fast,
+	&bench_trig_kprobe_multi_fast,
+	&bench_trig_kretprobe_multi_fast,
+	&bench_trig_fentry_fast,
+	&bench_trig_fexit_fast,
+	/* uprobes */
 	&bench_trig_uprobe_base,
 	&bench_trig_uprobe_nop,
 	&bench_trig_uretprobe_nop,
@@ -557,6 +574,7 @@  static const struct bench *benchs[] = {
 	&bench_trig_uretprobe_push,
 	&bench_trig_uprobe_ret,
 	&bench_trig_uretprobe_ret,
+	/* ringbuf/perfbuf benchmarks */
 	&bench_rb_libbpf,
 	&bench_rb_custom,
 	&bench_pb_libbpf,
diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
index 8fbc78d5f8a4..d6c87180c887 100644
--- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -1,11 +1,54 @@ 
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
 #define _GNU_SOURCE
+#include <argp.h>
 #include <unistd.h>
+#include <stdint.h>
 #include "bench.h"
 #include "trigger_bench.skel.h"
 #include "trace_helpers.h"
 
+static struct {
+	__u32 batch_iters;
+} args = {
+	.batch_iters = 100,
+};
+
+enum {
+	ARG_TRIG_BATCH_ITERS = 7000,
+};
+
+static const struct argp_option opts[] = {
+	{ "trig-batch-iters", ARG_TRIG_BATCH_ITERS, "BATCH_ITER_CNT", 0,
+		"Number of in-kernel iterations per one driver test run"},
+	{},
+};
+
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	long ret;
+
+	switch (key) {
+	case ARG_TRIG_BATCH_ITERS:
+		ret = strtol(arg, NULL, 10);
+		if (ret < 1 || ret > UINT_MAX) {
+			fprintf(stderr, "invalid --trig-batch-iters value");
+			argp_usage(state);
+		}
+		args.batch_iters = ret;
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+
+	return 0;
+}
+
+const struct argp bench_trigger_fast_argp = {
+	.options = opts,
+	.parser = parse_arg,
+};
+
 /* adjust slot shift in inc_hits() if changing */
 #define MAX_BUCKETS 256
 
@@ -70,6 +113,16 @@  static void *trigger_producer(void *input)
 	return NULL;
 }
 
+static void *trigger_producer_fast(void *input)
+{
+	int fd = bpf_program__fd(ctx.skel->progs.trigger_driver);
+
+	while (true)
+		bpf_prog_test_run_opts(fd, NULL);
+
+	return NULL;
+}
+
 static void trigger_measure(struct bench_res *res)
 {
 	res->hits = sum_and_reset_counters(ctx.skel->bss->hits);
@@ -77,13 +130,23 @@  static void trigger_measure(struct bench_res *res)
 
 static void setup_ctx(void)
 {
+	int err;
+
 	setup_libbpf();
 
-	ctx.skel = trigger_bench__open_and_load();
+	ctx.skel = trigger_bench__open();
 	if (!ctx.skel) {
 		fprintf(stderr, "failed to open skeleton\n");
 		exit(1);
 	}
+
+	ctx.skel->rodata->batch_iters = args.batch_iters;
+
+	err = trigger_bench__load(ctx.skel);
+	if (err) {
+		fprintf(stderr, "failed to load skeleton\n");
+		exit(1);
+	}
 }
 
 static void attach_bpf(struct bpf_program *prog)
@@ -157,6 +220,44 @@  static void trigger_fmodret_setup(void)
 	attach_bpf(ctx.skel->progs.bench_trigger_fmodret);
 }
 
+/* Fast, mostly in-kernel triggering setups */
+
+static void trigger_kprobe_fast_setup(void)
+{
+	setup_ctx();
+	attach_bpf(ctx.skel->progs.bench_trigger_kprobe_fast);
+}
+
+static void trigger_kretprobe_fast_setup(void)
+{
+	setup_ctx();
+	attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_fast);
+}
+
+static void trigger_kprobe_multi_fast_setup(void)
+{
+	setup_ctx();
+	attach_bpf(ctx.skel->progs.bench_trigger_kprobe_multi_fast);
+}
+
+static void trigger_kretprobe_multi_fast_setup(void)
+{
+	setup_ctx();
+	attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_multi_fast);
+}
+
+static void trigger_fentry_fast_setup(void)
+{
+	setup_ctx();
+	attach_bpf(ctx.skel->progs.bench_trigger_fentry_fast);
+}
+
+static void trigger_fexit_fast_setup(void)
+{
+	setup_ctx();
+	attach_bpf(ctx.skel->progs.bench_trigger_fexit_fast);
+}
+
 /* make sure call is not inlined and not avoided by compiler, so __weak and
  * inline asm volatile in the body of the function
  *
@@ -385,6 +486,26 @@  const struct bench bench_trig_fmodret = {
 	.report_final = hits_drops_report_final,
 };
 
+/* fast (staying mostly in kernel) kprobe/fentry benchmarks */
+#define BENCH_TRIG_FAST(KIND, NAME)					\
+const struct bench bench_trig_##KIND = {				\
+	.name = "trig-" NAME,						\
+	.setup = trigger_##KIND##_setup,				\
+	.producer_thread = trigger_producer_fast,			\
+	.measure = trigger_measure,					\
+	.report_progress = hits_drops_report_progress,			\
+	.report_final = hits_drops_report_final,			\
+	.argp = &bench_trigger_fast_argp,				\
+}
+
+BENCH_TRIG_FAST(kprobe_fast, "kprobe-fast");
+BENCH_TRIG_FAST(kretprobe_fast, "kretprobe-fast");
+BENCH_TRIG_FAST(kprobe_multi_fast, "kprobe-multi-fast");
+BENCH_TRIG_FAST(kretprobe_multi_fast, "kretprobe-multi-fast");
+BENCH_TRIG_FAST(fentry_fast, "fentry-fast");
+BENCH_TRIG_FAST(fexit_fast, "fexit-fast");
+
+/* uprobe benchmarks */
 const struct bench bench_trig_uprobe_base = {
 	.name = "trig-uprobe-base",
 	.setup = NULL, /* no uprobe/uretprobe is attached */
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
index 78e83f243294..fee069ac930b 100755
--- a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
+++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
@@ -2,8 +2,12 @@ 
 
 set -eufo pipefail
 
-for i in base tp rawtp kprobe fentry fmodret
+for i in uprobe-base base tp rawtp \
+	 kprobe kprobe-multi kretprobe kretprobe-multi \
+	 fentry fexit fmodret \
+	 kprobe-fast kprobe-multi-fast kretprobe-fast kretprobe-multi-fast \
+	 fentry-fast fexit-fast
 do
 	summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)
-	printf "%-10s: %s\n" $i "$summary"
+	printf "%-20s: %s\n" $i "$summary"
 done
diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c
index 42ec202015ed..2886c2cb3570 100644
--- a/tools/testing/selftests/bpf/progs/trigger_bench.c
+++ b/tools/testing/selftests/bpf/progs/trigger_bench.c
@@ -1,6 +1,5 @@ 
 // SPDX-License-Identifier: GPL-2.0
 // Copyright (c) 2020 Facebook
-
 #include <linux/bpf.h>
 #include <asm/unistd.h>
 #include <bpf/bpf_helpers.h>
@@ -103,3 +102,58 @@  int bench_trigger_uprobe(void *ctx)
 	inc_counter();
 	return 0;
 }
+
+const volatile int batch_iters = 0;
+
+SEC("raw_tp")
+int trigger_driver(void *ctx)
+{
+	int i;
+
+	for (i = 0; i < batch_iters; i++)
+		(void)bpf_get_smp_processor_id(); /* attach here to benchmark */
+
+	return 0;
+}
+
+SEC("kprobe/bpf_get_smp_processor_id")
+int bench_trigger_kprobe_fast(void *ctx)
+{
+	inc_counter();
+	return 0;
+}
+
+SEC("kretprobe/bpf_get_smp_processor_id")
+int bench_trigger_kretprobe_fast(void *ctx)
+{
+	inc_counter();
+	return 0;
+}
+
+SEC("kprobe.multi/bpf_get_smp_processor_id")
+int bench_trigger_kprobe_multi_fast(void *ctx)
+{
+	inc_counter();
+	return 0;
+}
+
+SEC("kretprobe.multi/bpf_get_smp_processor_id")
+int bench_trigger_kretprobe_multi_fast(void *ctx)
+{
+	inc_counter();
+	return 0;
+}
+
+SEC("fentry/bpf_get_smp_processor_id")
+int bench_trigger_fentry_fast(void *ctx)
+{
+	inc_counter();
+	return 0;
+}
+
+SEC("fexit/bpf_get_smp_processor_id")
+int bench_trigger_fexit_fast(void *ctx)
+{
+	inc_counter();
+	return 0;
+}