
[v2,bpf-next,1/2] selftests/bpf: scale benchmark counting by using per-CPU counters

Message ID 20240315051813.1320559-1-andrii@kernel.org (mailing list archive)
State Superseded
Delegated to: BPF
Series [v2,bpf-next,1/2] selftests/bpf: scale benchmark counting by using per-CPU counters

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail PR summary
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 8 this patch: 8
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 12 maintainers not CCed: haoluo@google.com linux-kselftest@vger.kernel.org john.fastabend@gmail.com eddyz87@gmail.com sdf@google.com song@kernel.org kpsingh@kernel.org yonghong.song@linux.dev shuah@kernel.org martin.lau@linux.dev jolsa@kernel.org mykolal@fb.com
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 8 this patch: 8
netdev/checkpatch fail ERROR: do not initialise statics to 0 WARNING: Prefer 'unsigned int' to bare use of 'unsigned' WARNING: Prefer __aligned(128) over __attribute__((aligned(128))) WARNING: line length of 88 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 fail Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 fail Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc

Commit Message

Andrii Nakryiko March 15, 2024, 5:18 a.m. UTC
When benchmarking with multiple threads (-pN, where N>1), we start
contending on a single atomic counter that both the BPF trigger benchmarks
and the "baseline" user-space tests (the trig-base and trig-uprobe-base
benchmarks) are using. As such, we start bottlenecking on something
completely irrelevant to the benchmark at hand.

Scale counting up by using per-CPU counters on the BPF side. On the
user-space side we do the next best thing: hash the thread ID to
approximate per-CPU behavior. It seems to work quite well in practice.
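
The user-space trick boils down to the following minimal sketch. The
NUM_BUCKETS name, the padded counter struct, and the __sync builtin here
are illustrative stand-ins for the MAX_BUCKETS / struct counter /
atomic_inc() helpers that the actual patch below uses, and gettid()
assumes glibc 2.30+ or an equivalent syscall wrapper:

  #define _GNU_SOURCE
  #include <unistd.h>

  #define NUM_BUCKETS 256  /* power of two; top byte of the hash is the index */

  struct padded_counter {
  	long value;
  } __attribute__((aligned(128)));  /* pad to avoid false sharing between buckets */

  static struct padded_counter counters[NUM_BUCKETS];

  static void inc_counter(void)
  {
  	static __thread int tid;
  	unsigned int slot;

  	if (!tid)
  		tid = gettid();  /* cache the thread ID once per thread */

  	/* multiplicative (Fibonacci) hashing of the thread ID */
  	slot = (2654435769U * (unsigned int)tid) >> 24;
  	__sync_fetch_and_add(&counters[slot].value, 1);
  }

On the BPF side no hashing is needed: the program keeps a similar array of
padded counters and indexes it directly by bpf_get_smp_processor_id()
(masked to the array size), as shown in the patch below.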

To demonstrate the difference, I ran three benchmarks with 1, 2, 4, 8,
16, and 32 threads:
  - trig-uprobe-base (no syscalls, pure tight counting loop in user-space);
  - trig-base (get_pgid() syscall, atomic counter in user-space);
  - trig-fentry (syscall to trigger fentry program, atomic uncontended per-CPU
    counter on BPF side).

Command used:

  for b in uprobe-base base fentry; do \
    for p in 1 2 4 8 16 32; do \
      printf "%-11s %2d: %s\n" $b $p \
        "$(sudo ./bench -w2 -d5 -a -p$p trig-$b | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)"; \
    done; \
  done

Before these changes, aggregate throughput across all threads doesn't
scale well with the number of threads; it even falls sharply for
uprobe-base due to very high contention:

  uprobe-base  1:  138.998 ± 0.650M/s
  uprobe-base  2:   70.526 ± 1.147M/s
  uprobe-base  4:   63.114 ± 0.302M/s
  uprobe-base  8:   54.177 ± 0.138M/s
  uprobe-base 16:   45.439 ± 0.057M/s
  uprobe-base 32:   37.163 ± 0.242M/s
  base         1:   16.940 ± 0.182M/s
  base         2:   19.231 ± 0.105M/s
  base         4:   21.479 ± 0.038M/s
  base         8:   23.030 ± 0.037M/s
  base        16:   22.034 ± 0.004M/s
  base        32:   18.152 ± 0.013M/s
  fentry       1:   14.794 ± 0.054M/s
  fentry       2:   17.341 ± 0.055M/s
  fentry       4:   23.792 ± 0.024M/s
  fentry       8:   21.557 ± 0.047M/s
  fentry      16:   21.121 ± 0.004M/s
  fentry      32:   17.067 ± 0.023M/s

After these changes, we see almost perfect linear scaling, as expected.
The sub-linear scaling when going from 8 to 16 threads is interesting
and consistent on my test machine, but I haven't investigated what is
causing this peculiar slowdown (it shows up across all benchmarks and
could be due to hyperthreading effects, I'm not sure).

  uprobe-base  1:  139.980 ± 0.648M/s
  uprobe-base  2:  270.244 ± 0.379M/s
  uprobe-base  4:  532.044 ± 1.519M/s
  uprobe-base  8: 1004.571 ± 3.174M/s
  uprobe-base 16: 1720.098 ± 0.744M/s
  uprobe-base 32: 3506.659 ± 8.549M/s
  base         1:   16.869 ± 0.071M/s
  base         2:   33.007 ± 0.092M/s
  base         4:   64.670 ± 0.203M/s
  base         8:  121.969 ± 0.210M/s
  base        16:  207.832 ± 0.112M/s
  base        32:  424.227 ± 1.477M/s
  fentry       1:   14.777 ± 0.087M/s
  fentry       2:   28.575 ± 0.146M/s
  fentry       4:   56.234 ± 0.176M/s
  fentry       8:  106.095 ± 0.385M/s
  fentry      16:  181.440 ± 0.032M/s
  fentry      32:  369.131 ± 0.693M/s

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 .../selftests/bpf/benchs/bench_trigger.c      | 40 ++++++++++++++++---
 .../selftests/bpf/progs/trigger_bench.c       | 39 ++++++++++++------
 2 files changed, 62 insertions(+), 17 deletions(-)

Comments

Alexei Starovoitov March 15, 2024, 4:08 p.m. UTC | #1
On Thu, Mar 14, 2024 at 10:18 PM Andrii Nakryiko <andrii@kernel.org> wrote:
>
> When benchmarking with multiple threads (-pN, where N>1), we start
> contending on single atomic counter that both BPF trigger benchmarks are
> using, as well as "baseline" tests in user space (trig-base and
> trig-uprobe-base benchmarks). As such, we start bottlenecking on
> something completely irrelevant to benchmark at hand.
>
> Scale counting up by using per-CPU counters on BPF side. On use space
> side we do the next best thing: hash thread ID to approximate per-CPU
> behavior. It seems to work quite well in practice.
>
> To demonstrate the difference, I ran three benchmarks with 1, 2, 4, 8,
> 16, and 32 threads:
>   - trig-uprobe-base (no syscalls, pure tight counting loop in user-space);
>   - trig-base (get_pgid() syscall, atomic counter in user-space);
>   - trig-fentry (syscall to trigger fentry program, atomic uncontended per-CPU
>     counter on BPF side).
>
> Command used:
>
>   for b in uprobe-base base fentry; do \
>     for p in 1 2 4 8 16 32; do \
>       printf "%-11s %2d: %s\n" $b $p \
>         "$(sudo ./bench -w2 -d5 -a -p$p trig-$b | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)"; \
>     done; \
>   done
>
> Before these changes, aggregate throughput across all threads doesn't
> scale well with number of threads, it actually even falls sharply for
> uprobe-base due to a very high contention:
>
>   uprobe-base  1:  138.998 ± 0.650M/s
>   uprobe-base  2:   70.526 ± 1.147M/s
>   uprobe-base  4:   63.114 ± 0.302M/s
>   uprobe-base  8:   54.177 ± 0.138M/s
>   uprobe-base 16:   45.439 ± 0.057M/s
>   uprobe-base 32:   37.163 ± 0.242M/s
>   base         1:   16.940 ± 0.182M/s
>   base         2:   19.231 ± 0.105M/s
>   base         4:   21.479 ± 0.038M/s
>   base         8:   23.030 ± 0.037M/s
>   base        16:   22.034 ± 0.004M/s
>   base        32:   18.152 ± 0.013M/s
>   fentry       1:   14.794 ± 0.054M/s
>   fentry       2:   17.341 ± 0.055M/s
>   fentry       4:   23.792 ± 0.024M/s
>   fentry       8:   21.557 ± 0.047M/s
>   fentry      16:   21.121 ± 0.004M/s
>   fentry      32:   17.067 ± 0.023M/s
>
> After these changes, we see almost perfect linear scaling, as expected.
> The sub-linear scaling when going from 8 to 16 threads is interesting
> and consistent on my test machine, but I haven't investigated what is
> causing it this peculiar slowdown (across all benchmarks, could be due
> to hyperthreading effects, not sure).
>
>   uprobe-base  1:  139.980 ± 0.648M/s
>   uprobe-base  2:  270.244 ± 0.379M/s
>   uprobe-base  4:  532.044 ± 1.519M/s
>   uprobe-base  8: 1004.571 ± 3.174M/s
>   uprobe-base 16: 1720.098 ± 0.744M/s
>   uprobe-base 32: 3506.659 ± 8.549M/s
>   base         1:   16.869 ± 0.071M/s
>   base         2:   33.007 ± 0.092M/s
>   base         4:   64.670 ± 0.203M/s
>   base         8:  121.969 ± 0.210M/s
>   base        16:  207.832 ± 0.112M/s
>   base        32:  424.227 ± 1.477M/s
>   fentry       1:   14.777 ± 0.087M/s
>   fentry       2:   28.575 ± 0.146M/s
>   fentry       4:   56.234 ± 0.176M/s
>   fentry       8:  106.095 ± 0.385M/s
>   fentry      16:  181.440 ± 0.032M/s
>   fentry      32:  369.131 ± 0.693M/s
>
> Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> ---
>  .../selftests/bpf/benchs/bench_trigger.c      | 40 ++++++++++++++++---
>  .../selftests/bpf/progs/trigger_bench.c       | 39 ++++++++++++------
>  2 files changed, 62 insertions(+), 17 deletions(-)
>
> diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
> index ace0d1011a8e..8fbc78d5f8a4 100644
> --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
> +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
> @@ -1,15 +1,45 @@
>  // SPDX-License-Identifier: GPL-2.0
>  /* Copyright (c) 2020 Facebook */
> +#define _GNU_SOURCE
> +#include <unistd.h>
>  #include "bench.h"
>  #include "trigger_bench.skel.h"
>  #include "trace_helpers.h"
>
> +/* adjust slot shift in inc_hits() if changing */
> +#define MAX_BUCKETS 256
> +
>  /* BPF triggering benchmarks */
>  static struct trigger_ctx {
>         struct trigger_bench *skel;
>  } ctx;
>
> -static struct counter base_hits;
> +static struct counter base_hits[MAX_BUCKETS];
> +
> +static __always_inline void inc_counter(struct counter *counters)
> +{
> +       static __thread int tid = 0;
> +       unsigned slot;
> +
> +       if (unlikely(tid == 0))
> +               tid = gettid();
> +
> +       /* multiplicative hashing, it's fast */
> +       slot = 2654435769U * tid;
> +       slot >>= 24;
> +
> +       atomic_inc(&base_hits[slot].value); /* use highest byte as an index */
> +}
> +
> +static __always_inline long sum_and_reset_counters(struct counter *counters)

Why __always_inline ?
It's a slow path. Inlining or not shouldn't make any difference.

> +{
> +       int i;
> +       long sum = 0;
> +
> +       for (i = 0; i < MAX_BUCKETS; i++)
> +               sum += atomic_swap(&counters[i].value, 0);
> +       return sum;
> +}
Andrii Nakryiko March 15, 2024, 4:22 p.m. UTC | #2
On Fri, Mar 15, 2024 at 9:08 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Mar 14, 2024 at 10:18 PM Andrii Nakryiko <andrii@kernel.org> wrote:
> >
> > When benchmarking with multiple threads (-pN, where N>1), we start
> > contending on single atomic counter that both BPF trigger benchmarks are
> > using, as well as "baseline" tests in user space (trig-base and
> > trig-uprobe-base benchmarks). As such, we start bottlenecking on
> > something completely irrelevant to benchmark at hand.
> >
> > Scale counting up by using per-CPU counters on BPF side. On use space
> > side we do the next best thing: hash thread ID to approximate per-CPU
> > behavior. It seems to work quite well in practice.
> >
> > To demonstrate the difference, I ran three benchmarks with 1, 2, 4, 8,
> > 16, and 32 threads:
> >   - trig-uprobe-base (no syscalls, pure tight counting loop in user-space);
> >   - trig-base (get_pgid() syscall, atomic counter in user-space);
> >   - trig-fentry (syscall to trigger fentry program, atomic uncontended per-CPU
> >     counter on BPF side).
> >
> > Command used:
> >
> >   for b in uprobe-base base fentry; do \
> >     for p in 1 2 4 8 16 32; do \
> >       printf "%-11s %2d: %s\n" $b $p \
> >         "$(sudo ./bench -w2 -d5 -a -p$p trig-$b | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)"; \
> >     done; \
> >   done
> >
> > Before these changes, aggregate throughput across all threads doesn't
> > scale well with number of threads, it actually even falls sharply for
> > uprobe-base due to a very high contention:
> >
> >   uprobe-base  1:  138.998 ± 0.650M/s
> >   uprobe-base  2:   70.526 ± 1.147M/s
> >   uprobe-base  4:   63.114 ± 0.302M/s
> >   uprobe-base  8:   54.177 ± 0.138M/s
> >   uprobe-base 16:   45.439 ± 0.057M/s
> >   uprobe-base 32:   37.163 ± 0.242M/s
> >   base         1:   16.940 ± 0.182M/s
> >   base         2:   19.231 ± 0.105M/s
> >   base         4:   21.479 ± 0.038M/s
> >   base         8:   23.030 ± 0.037M/s
> >   base        16:   22.034 ± 0.004M/s
> >   base        32:   18.152 ± 0.013M/s
> >   fentry       1:   14.794 ± 0.054M/s
> >   fentry       2:   17.341 ± 0.055M/s
> >   fentry       4:   23.792 ± 0.024M/s
> >   fentry       8:   21.557 ± 0.047M/s
> >   fentry      16:   21.121 ± 0.004M/s
> >   fentry      32:   17.067 ± 0.023M/s
> >
> > After these changes, we see almost perfect linear scaling, as expected.
> > The sub-linear scaling when going from 8 to 16 threads is interesting
> > and consistent on my test machine, but I haven't investigated what is
> > causing it this peculiar slowdown (across all benchmarks, could be due
> > to hyperthreading effects, not sure).
> >
> >   uprobe-base  1:  139.980 ± 0.648M/s
> >   uprobe-base  2:  270.244 ± 0.379M/s
> >   uprobe-base  4:  532.044 ± 1.519M/s
> >   uprobe-base  8: 1004.571 ± 3.174M/s
> >   uprobe-base 16: 1720.098 ± 0.744M/s
> >   uprobe-base 32: 3506.659 ± 8.549M/s
> >   base         1:   16.869 ± 0.071M/s
> >   base         2:   33.007 ± 0.092M/s
> >   base         4:   64.670 ± 0.203M/s
> >   base         8:  121.969 ± 0.210M/s
> >   base        16:  207.832 ± 0.112M/s
> >   base        32:  424.227 ± 1.477M/s
> >   fentry       1:   14.777 ± 0.087M/s
> >   fentry       2:   28.575 ± 0.146M/s
> >   fentry       4:   56.234 ± 0.176M/s
> >   fentry       8:  106.095 ± 0.385M/s
> >   fentry      16:  181.440 ± 0.032M/s
> >   fentry      32:  369.131 ± 0.693M/s
> >
> > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > ---
> >  .../selftests/bpf/benchs/bench_trigger.c      | 40 ++++++++++++++++---
> >  .../selftests/bpf/progs/trigger_bench.c       | 39 ++++++++++++------
> >  2 files changed, 62 insertions(+), 17 deletions(-)
> >
> > diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
> > index ace0d1011a8e..8fbc78d5f8a4 100644
> > --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
> > +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
> > @@ -1,15 +1,45 @@
> >  // SPDX-License-Identifier: GPL-2.0
> >  /* Copyright (c) 2020 Facebook */
> > +#define _GNU_SOURCE
> > +#include <unistd.h>
> >  #include "bench.h"
> >  #include "trigger_bench.skel.h"
> >  #include "trace_helpers.h"
> >
> > +/* adjust slot shift in inc_hits() if changing */
> > +#define MAX_BUCKETS 256
> > +
> >  /* BPF triggering benchmarks */
> >  static struct trigger_ctx {
> >         struct trigger_bench *skel;
> >  } ctx;
> >
> > -static struct counter base_hits;
> > +static struct counter base_hits[MAX_BUCKETS];
> > +
> > +static __always_inline void inc_counter(struct counter *counters)
> > +{
> > +       static __thread int tid = 0;
> > +       unsigned slot;
> > +
> > +       if (unlikely(tid == 0))
> > +               tid = gettid();
> > +
> > +       /* multiplicative hashing, it's fast */
> > +       slot = 2654435769U * tid;
> > +       slot >>= 24;
> > +
> > +       atomic_inc(&base_hits[slot].value); /* use highest byte as an index */
> > +}
> > +
> > +static __always_inline long sum_and_reset_counters(struct counter *counters)
>
> Why __always_inline ?
> It's a slow path. Inlining or not shouldn't make any difference.

That was copy/pasted from the inc_counter() signature originally; you are
right, it's not necessary.

>
> > +{
> > +       int i;
> > +       long sum = 0;
> > +
> > +       for (i = 0; i < MAX_BUCKETS; i++)
> > +               sum += atomic_swap(&counters[i].value, 0);
> > +       return sum;
> > +}

Patch

diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
index ace0d1011a8e..8fbc78d5f8a4 100644
--- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -1,15 +1,45 @@ 
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
+#define _GNU_SOURCE
+#include <unistd.h>
 #include "bench.h"
 #include "trigger_bench.skel.h"
 #include "trace_helpers.h"
 
+/* adjust slot shift in inc_hits() if changing */
+#define MAX_BUCKETS 256
+
 /* BPF triggering benchmarks */
 static struct trigger_ctx {
 	struct trigger_bench *skel;
 } ctx;
 
-static struct counter base_hits;
+static struct counter base_hits[MAX_BUCKETS];
+
+static __always_inline void inc_counter(struct counter *counters)
+{
+	static __thread int tid = 0;
+	unsigned slot;
+
+	if (unlikely(tid == 0))
+		tid = gettid();
+
+	/* multiplicative hashing, it's fast */
+	slot = 2654435769U * tid;
+	slot >>= 24;
+
+	atomic_inc(&base_hits[slot].value); /* use highest byte as an index */
+}
+
+static __always_inline long sum_and_reset_counters(struct counter *counters)
+{
+	int i;
+	long sum = 0;
+
+	for (i = 0; i < MAX_BUCKETS; i++)
+		sum += atomic_swap(&counters[i].value, 0);
+	return sum;
+}
 
 static void trigger_validate(void)
 {
@@ -23,14 +53,14 @@  static void *trigger_base_producer(void *input)
 {
 	while (true) {
 		(void)syscall(__NR_getpgid);
-		atomic_inc(&base_hits.value);
+		inc_counter(base_hits);
 	}
 	return NULL;
 }
 
 static void trigger_base_measure(struct bench_res *res)
 {
-	res->hits = atomic_swap(&base_hits.value, 0);
+	res->hits = sum_and_reset_counters(base_hits);
 }
 
 static void *trigger_producer(void *input)
@@ -42,7 +72,7 @@  static void *trigger_producer(void *input)
 
 static void trigger_measure(struct bench_res *res)
 {
-	res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
+	res->hits = sum_and_reset_counters(ctx.skel->bss->hits);
 }
 
 static void setup_ctx(void)
@@ -164,7 +194,7 @@  static void *uprobe_base_producer(void *input)
 {
 	while (true) {
 		uprobe_target_nop();
-		atomic_inc(&base_hits.value);
+		inc_counter(base_hits);
 	}
 	return NULL;
 }
diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c
index 5fda43901033..42ec202015ed 100644
--- a/tools/testing/selftests/bpf/progs/trigger_bench.c
+++ b/tools/testing/selftests/bpf/progs/trigger_bench.c
@@ -9,12 +9,27 @@ 
 
 char _license[] SEC("license") = "GPL";
 
-long hits = 0;
+#define CPU_MASK 255
+#define MAX_CPUS (CPU_MASK + 1) /* should match MAX_BUCKETS in benchs/bench_trigger.c */
+
+/* matches struct counter in bench.h */
+struct counter {
+	long value;
+} __attribute__((aligned(128)));
+
+struct counter hits[MAX_CPUS];
+
+static __always_inline void inc_counter(void)
+{
+	int cpu = bpf_get_smp_processor_id();
+
+	__sync_add_and_fetch(&hits[cpu & CPU_MASK].value, 1);
+}
 
 SEC("tp/syscalls/sys_enter_getpgid")
 int bench_trigger_tp(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
@@ -22,69 +37,69 @@  SEC("raw_tp/sys_enter")
 int BPF_PROG(bench_trigger_raw_tp, struct pt_regs *regs, long id)
 {
 	if (id == __NR_getpgid)
-		__sync_add_and_fetch(&hits, 1);
+		inc_counter();
 	return 0;
 }
 
 SEC("kprobe/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_kprobe(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
 SEC("kretprobe/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_kretprobe(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
 SEC("kprobe.multi/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_kprobe_multi(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
 SEC("kretprobe.multi/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_kretprobe_multi(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
 SEC("fentry/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_fentry(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
 SEC("fexit/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_fexit(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
 SEC("fentry.s/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_fentry_sleep(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
 SEC("fmod_ret/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_fmodret(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return -22;
 }
 
 SEC("uprobe")
 int bench_trigger_uprobe(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }