[bpf-next,6/6] selftest/bpf/benchs: Add benchmark for hashmap lookups

Message ID 20230127181457.21389-7-aspsk@isovalent.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series New benchmark for hashmap lookups

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success
netdev/cover_letter success Series has a cover letter
netdev/patch_count success
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 9 maintainers not CCed: sdf@google.com kpsingh@kernel.org jolsa@kernel.org mykolal@fb.com song@kernel.org linux-kselftest@vger.kernel.org shuah@kernel.org haoluo@google.com yhs@fb.com
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch warning CHECK: Prefer using the BIT macro CHECK: spaces preferred around that '*' (ctx:VxV) CHECK: spaces preferred around that '/' (ctx:VxV) CHECK: spaces preferred around that '<<' (ctx:VxV) WARNING: Prefer __aligned(256) over __attribute__((__aligned__(256))) WARNING: Prefer __aligned(8) over __attribute__((__aligned__(8))) WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: externs should be avoided in .c files WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 97 exceeds 80 columns WARNING: quoted string split across lines
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline fail Was 0 now: 3
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-14 fail Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for test_progs on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32 on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-37 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_progs_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-2 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ${{ matrix.test }} on ${{ matrix.arch }} with ${{ matrix.toolchain }}
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 fail Logs for build for aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-8 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-9 success Logs for set-matrix

Commit Message

Anton Protopopov Jan. 27, 2023, 6:14 p.m. UTC
Add a new benchmark which measures the speed of hashmap lookup operations. A
user can control the following parameters of the benchmark:

    * key_size (max 1024): the key size to use
    * max_entries: the hashmap max entries
    * nr_entries: the number of entries to insert/lookup
    * nr_loops: the number of loops for the benchmark
    * map_flags: the hashmap flags passed to BPF_MAP_CREATE

The BPF program performing the benchmark executes two nested bpf_loop calls:

    bpf_loop(nr_loops/nr_entries)
            bpf_loop(nr_entries)
                     bpf_map_lookup()

So nr_loops determines the number of actual map lookups (rounded down to a
multiple of nr_entries, since the division above is integer). All lookups are
successful.
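
In plain C terms, the lookup pattern is roughly equivalent to the following
sketch (illustrative only; the actual implementation uses bpf_loop callbacks,
see the BPF program in the patch below):

    /* Illustrative sketch, not the actual BPF code; keys[] stands in
     * for the in-place key patching done by the real program. All keys
     * were inserted beforehand, so every lookup hits. */
    for (u32 i = 0; i < nr_loops / nr_entries; i++)     /* outer bpf_loop */
            for (u32 j = 0; j < nr_entries; j++)        /* inner bpf_loop */
                    bpf_map_lookup_elem(&hash_map_bench, &keys[j]);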

Example (the output is generated on an AMD Ryzen 9 3950X machine):

    for nr_entries in `seq 4096 4096 65536`; do
            echo -n "$((nr_entries*100/65536))% full: "
            sudo ./bench -d2 -a bpf-hashmap-lookup --key_size=4 \
                    --nr_entries=$nr_entries --max_entries=65536 \
                    --nr_loops=1000000 --map_flags=0x40 | grep cpu
    done
    6% full: cpu01: lookup 50.739M ± 0.018M events/sec (approximated from 32 samples of ~19ms)
    12% full: cpu01: lookup 47.751M ± 0.015M events/sec (approximated from 32 samples of ~20ms)
    18% full: cpu01: lookup 45.153M ± 0.013M events/sec (approximated from 32 samples of ~22ms)
    25% full: cpu01: lookup 43.826M ± 0.014M events/sec (approximated from 32 samples of ~22ms)
    31% full: cpu01: lookup 41.971M ± 0.012M events/sec (approximated from 32 samples of ~23ms)
    37% full: cpu01: lookup 41.034M ± 0.015M events/sec (approximated from 32 samples of ~24ms)
    43% full: cpu01: lookup 39.946M ± 0.012M events/sec (approximated from 32 samples of ~25ms)
    50% full: cpu01: lookup 38.256M ± 0.014M events/sec (approximated from 32 samples of ~26ms)
    56% full: cpu01: lookup 36.580M ± 0.018M events/sec (approximated from 32 samples of ~27ms)
    62% full: cpu01: lookup 36.252M ± 0.012M events/sec (approximated from 32 samples of ~27ms)
    68% full: cpu01: lookup 35.200M ± 0.012M events/sec (approximated from 32 samples of ~28ms)
    75% full: cpu01: lookup 34.061M ± 0.009M events/sec (approximated from 32 samples of ~29ms)
    81% full: cpu01: lookup 34.374M ± 0.010M events/sec (approximated from 32 samples of ~29ms)
    87% full: cpu01: lookup 33.244M ± 0.011M events/sec (approximated from 32 samples of ~30ms)
    93% full: cpu01: lookup 32.182M ± 0.013M events/sec (approximated from 32 samples of ~31ms)
    100% full: cpu01: lookup 31.497M ± 0.016M events/sec (approximated from 32 samples of ~31ms)

Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
---
 tools/testing/selftests/bpf/Makefile          |   5 +-
 tools/testing/selftests/bpf/bench.c           |   7 +
 tools/testing/selftests/bpf/bench.h           |   3 +
 .../bpf/benchs/bench_bpf_hashmap_lookup.c     | 277 ++++++++++++++++++
 .../selftests/bpf/progs/bpf_hashmap_lookup.c  |  61 ++++
 5 files changed, 352 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_lookup.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_hashmap_lookup.c

Comments

Andrii Nakryiko Jan. 31, 2023, 12:16 a.m. UTC | #1
On Fri, Jan 27, 2023 at 10:14 AM Anton Protopopov <aspsk@isovalent.com> wrote:
>

[...]

> +static error_t parse_arg(int key, char *arg, struct argp_state *state)
> +{
> +       long ret;
> +
> +       switch (key) {
> +       case ARG_KEY_SIZE:
> +               ret = strtol(arg, NULL, 10);
> +               if (ret < 1 || ret > MAX_KEY_SIZE) {
> +                       fprintf(stderr, "invalid key_size");
> +                       argp_usage(state);
> +               }
> +               args.key_size = ret;
> +               break;
> +       case ARG_MAP_FLAGS:
> +               if (!strncasecmp(arg, "0x", 2))
> +                       ret = strtol(arg, NULL, 0x10);
> +               else
> +                       ret = strtol(arg, NULL, 10);

if you pass base as zero, strtol() will do this for you
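
For illustration (a minimal sketch, not part of the patch): with base 0,
strtol() picks the base from the prefix, so the explicit "0x" check above
becomes unnecessary. Note that base 0 also parses a leading "0" as octal:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            /* base 0: "0x" prefix -> hex, leading "0" -> octal, else decimal */
            printf("%ld\n", strtol("0x40", NULL, 0)); /* 64 */
            printf("%ld\n", strtol("64", NULL, 0));   /* 64 */
            printf("%ld\n", strtol("0100", NULL, 0)); /* 64 (octal!) */
            return 0;
    }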

> +               if (ret < 0 || ret > UINT_MAX) {
> +                       fprintf(stderr, "invalid map_flags");
> +                       argp_usage(state);
> +               }
> +               args.map_flags = ret;
> +               break;

[...]
Martin KaFai Lau Jan. 31, 2023, 12:22 a.m. UTC | #2
On 1/27/23 10:14 AM, Anton Protopopov wrote:
> +/* The number of slots to store times */
> +#define NR_SLOTS 32
> +
> +/* Configured by userspace */
> +u64 nr_entries;
> +u64 nr_loops;
> +u32 __attribute__((__aligned__(8))) key[256];
> +
> +/* Filled by us */
> +u64 __attribute__((__aligned__(256))) percpu_times_index[NR_SLOTS];
> +u64 __attribute__((__aligned__(256))) percpu_times[256][NR_SLOTS];
> +
> +static inline void patch_key(u32 i)
> +{
> +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
> +	key[0] = i + 1;
> +#else
> +	key[0] = __builtin_bswap32(i + 1);
> +#endif
> +	/* the rest of key is random and is configured by userspace */
> +}
> +
> +static int lookup_callback(__u32 index, u32 *unused)
> +{
> +	patch_key(index);
> +	return bpf_map_lookup_elem(&hash_map_bench, key) ? 0 : 1;
> +}
> +
> +static int loop_lookup_callback(__u32 index, u32 *unused)
> +{
> +	return bpf_loop(nr_entries, lookup_callback, NULL, 0) ? 0 : 1;
> +}
> +
> +SEC("fentry/" SYS_PREFIX "sys_getpgid")
> +int benchmark(void *ctx)
> +{
> +	u32 cpu = bpf_get_smp_processor_id();
> +	u32 times_index;
> +	u64 start_time;
> +
> +	times_index = percpu_times_index[cpu & 255] % NR_SLOTS;

percpu_times_index only has NR_SLOTS (32) elements?

> +	start_time = bpf_ktime_get_ns();
> +	bpf_loop(nr_loops, loop_lookup_callback, NULL, 0);
> +	percpu_times[cpu & 255][times_index] = bpf_ktime_get_ns() - start_time;
> +	percpu_times_index[cpu & 255] += 1;
> +	return 0;
> +}
Anton Protopopov Jan. 31, 2023, 11:01 a.m. UTC | #3
On 23/01/30 04:16, Andrii Nakryiko wrote:
> On Fri, Jan 27, 2023 at 10:14 AM Anton Protopopov <aspsk@isovalent.com> wrote:
> >
> [...]
> > +
> > +       switch (key) {
> > +       case ARG_KEY_SIZE:
> > +               ret = strtol(arg, NULL, 10);
> > +               if (ret < 1 || ret > MAX_KEY_SIZE) {
> > +                       fprintf(stderr, "invalid key_size");
> > +                       argp_usage(state);
> > +               }
> > +               args.key_size = ret;
> > +               break;
> > +       case ARG_MAP_FLAGS:
> > +               if (!strncasecmp(arg, "0x", 2))
> > +                       ret = strtol(arg, NULL, 0x10);
> > +               else
> > +                       ret = strtol(arg, NULL, 10);
> 
> if you pass base as zero, strtol() will do this for you

Thanks, I didn't remember this!

> > +               if (ret < 0 || ret > UINT_MAX) {
> > +                       fprintf(stderr, "invalid map_flags");
> > +                       argp_usage(state);
> > +               }
> > +               args.map_flags = ret;
> > +               break;
> 
> [...]
Anton Protopopov Jan. 31, 2023, 11:05 a.m. UTC | #4
On 23/01/30 04:22, Martin KaFai Lau wrote:
> On 1/27/23 10:14 AM, Anton Protopopov wrote:
> > +/* The number of slots to store times */
> > +#define NR_SLOTS 32
> > +
> > +/* Configured by userspace */
> > +u64 nr_entries;
> > +u64 nr_loops;
> > +u32 __attribute__((__aligned__(8))) key[256];
> > +
> > +/* Filled by us */
> > +u64 __attribute__((__aligned__(256))) percpu_times_index[NR_SLOTS];
> > +u64 __attribute__((__aligned__(256))) percpu_times[256][NR_SLOTS];
> > [...]
> > +	times_index = percpu_times_index[cpu & 255] % NR_SLOTS;
> 
> percpu_times_index only has NR_SLOTS (32) elements?

Yes, the idea was the following. One measurement (bpf prog execution) takes
about 20-80 ms (depending on the key/map size), so in 2-3 seconds we can
collect about NR_SLOTS elements. To me, 32 looked like enough to get stats
for this benchmark. Do you think it would be better to make NR_SLOTS
bigger/configurable?

> > +	start_time = bpf_ktime_get_ns();
> > +	bpf_loop(nr_loops, loop_lookup_callback, NULL, 0);
> > +	percpu_times[cpu & 255][times_index] = bpf_ktime_get_ns() - start_time;
> > +	percpu_times_index[cpu & 255] += 1;
> > +	return 0;
> > +}
>
Martin KaFai Lau Jan. 31, 2023, 10:50 p.m. UTC | #5
On 1/31/23 3:05 AM, Anton Protopopov wrote:
> On 23/01/30 04:22, Martin KaFai Lau wrote:
>> On 1/27/23 10:14 AM, Anton Protopopov wrote:
>>> +/* The number of slots to store times */
>>> +#define NR_SLOTS 32
>>> +
>>> +/* Configured by userspace */
>>> +u64 nr_entries;
>>> +u64 nr_loops;
>>> +u32 __attribute__((__aligned__(8))) key[256];
>>> +
>>> +/* Filled by us */
>>> +u64 __attribute__((__aligned__(256))) percpu_times_index[NR_SLOTS];
>>> +u64 __attribute__((__aligned__(256))) percpu_times[256][NR_SLOTS];
>>> [...]
>>> +	times_index = percpu_times_index[cpu & 255] % NR_SLOTS;
>>
>> percpu_times_index only has NR_SLOTS (32) elements?
> 
> Yes, the idea was the following. One measurement (bpf prog execution) takes
> about 20-80 ms (depending on the key/map size), so in 2-3 seconds we can
> collect about NR_SLOTS elements. To me, 32 looked like enough to get stats
> for this benchmark. Do you think it would be better to make NR_SLOTS
> bigger/configurable?

I thought percpu_times_index[] holds the next slot to use for a particular cpu
in percpu_times[256][NR_SLOTS]. Is 256 the max number of cpus supported? It is
doing "cpu & 255" as well. Should it be sized as percpu_times_index[256]
instead, then?

Maybe #define what 256 is here so that it has a self-describing name.

> 
>>> +	start_time = bpf_ktime_get_ns();
>>> +	bpf_loop(nr_loops, loop_lookup_callback, NULL, 0);
>>> +	percpu_times[cpu & 255][times_index] = bpf_ktime_get_ns() - start_time;
>>> +	percpu_times_index[cpu & 255] += 1;
>>> +	return 0;
>>> +}
>>
Anton Protopopov Feb. 1, 2023, 9:12 a.m. UTC | #6
On 23/01/31 02:50, Martin KaFai Lau wrote:
> On 1/31/23 3:05 AM, Anton Protopopov wrote:
> > On 23/01/30 04:22, Martin KaFai Lau wrote:
> > > On 1/27/23 10:14 AM, Anton Protopopov wrote:
> > > > +/* The number of slots to store times */
> > > > +#define NR_SLOTS 32
> > > > +
> > > > +/* Configured by userspace */
> > > > +u64 nr_entries;
> > > > +u64 nr_loops;
> > > > +u32 __attribute__((__aligned__(8))) key[256];
> > > > +
> > > > +/* Filled by us */
> > > > +u64 __attribute__((__aligned__(256))) percpu_times_index[NR_SLOTS];
> > > > +u64 __attribute__((__aligned__(256))) percpu_times[256][NR_SLOTS];
> > > > [...]
> > > > +	times_index = percpu_times_index[cpu & 255] % NR_SLOTS;
> > > 
> > > percpu_times_index only has NR_SLOTS (32) elements?
> > 
> > Yes, the idea was the following. One measurement (bpf prog execution) takes
> > about 20-80 ms (depending on the key/map size), so in 2-3 seconds we can
> > collect about NR_SLOTS elements. To me, 32 looked like enough to get stats
> > for this benchmark. Do you think it would be better to make NR_SLOTS
> > bigger/configurable?
> 
> I thought percpu_times_index[] holds the next slot to use for a particular
> cpu in percpu_times[256][NR_SLOTS]. Is 256 the max number of cpus supported?
> It is doing "cpu & 255" as well. Should it be sized as
> percpu_times_index[256] instead, then?
> 
> Maybe #define what 256 is here so that it has a self-describing name.

Oh, thanks! Of course, percpu_times_index is of the wrong size.
I will fix this and use names instead of numbers.
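
(A minimal sketch of such a fix; MAX_CPUS is a hypothetical name for the
former magic 256:)

    #define NR_SLOTS 32
    #define MAX_CPUS 256

    u64 __attribute__((__aligned__(256))) percpu_times_index[MAX_CPUS];
    u64 __attribute__((__aligned__(256))) percpu_times[MAX_CPUS][NR_SLOTS];

    /* ...; cpu % MAX_CPUS is equivalent to cpu & 255 for a power of two */
    times_index = percpu_times_index[cpu % MAX_CPUS] % NR_SLOTS;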

> > 
> > > > +	start_time = bpf_ktime_get_ns();
> > > > +	bpf_loop(nr_loops, loop_lookup_callback, NULL, 0);
> > > > +	percpu_times[cpu & 255][times_index] = bpf_ktime_get_ns() - start_time;
> > > > +	percpu_times_index[cpu & 255] += 1;
> > > > +	return 0;
> > > > +}
> > > 
>

Patch

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index a554b41fe872..cf08587b8639 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -604,6 +604,7 @@  $(OUTPUT)/bench_strncmp.o: $(OUTPUT)/strncmp_bench.skel.h
 $(OUTPUT)/bench_bpf_hashmap_full_update.o: $(OUTPUT)/bpf_hashmap_full_update_bench.skel.h
 $(OUTPUT)/bench_local_storage.o: $(OUTPUT)/local_storage_bench.skel.h
 $(OUTPUT)/bench_local_storage_rcu_tasks_trace.o: $(OUTPUT)/local_storage_rcu_tasks_trace_bench.skel.h
+$(OUTPUT)/bench_bpf_hashmap_lookup.o: $(OUTPUT)/bpf_hashmap_lookup.skel.h
 $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
 $(OUTPUT)/bench: LDLIBS += -lm
 $(OUTPUT)/bench: $(OUTPUT)/bench.o \
@@ -618,7 +619,9 @@  $(OUTPUT)/bench: $(OUTPUT)/bench.o \
 		 $(OUTPUT)/bench_strncmp.o \
 		 $(OUTPUT)/bench_bpf_hashmap_full_update.o \
 		 $(OUTPUT)/bench_local_storage.o \
-		 $(OUTPUT)/bench_local_storage_rcu_tasks_trace.o
+		 $(OUTPUT)/bench_local_storage_rcu_tasks_trace.o \
+		 $(OUTPUT)/bench_bpf_hashmap_lookup.o \
+		 #
 	$(call msg,BINARY,,$@)
 	$(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@
 
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index 2f34db60f819..568a0d12197d 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -272,6 +272,7 @@  extern struct argp bench_bpf_loop_argp;
 extern struct argp bench_local_storage_argp;
 extern struct argp bench_local_storage_rcu_tasks_trace_argp;
 extern struct argp bench_strncmp_argp;
+extern struct argp bench_hashmap_lookup_argp;
 
 static struct argp_child bench_parsers[] = {
 	{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
@@ -281,6 +282,7 @@  static struct argp_child bench_parsers[] = {
 	{ &bench_strncmp_argp, 0, "bpf_strncmp helper benchmark", 0 },
 	{ &bench_local_storage_rcu_tasks_trace_argp, 0,
 		"local_storage RCU Tasks Trace slowdown benchmark", 0 },
+	{ &bench_hashmap_lookup_argp, 0, "Hashmap lookup benchmark", 0 },
 	{},
 };
 
@@ -403,6 +405,9 @@  struct argp *bench_name_to_argp(const char *bench_name)
 	if (_SCMP("bpf-loop"))
 		return &bench_bpf_loop_argp;
 
+	if (_SCMP("bpf-hashmap-lookup"))
+		return &bench_hashmap_lookup_argp;
+
 	/* no extra arguments */
 	if (_SCMP("count-global") ||
 	    _SCMP("count-local") ||
@@ -587,6 +592,7 @@  extern const struct bench bench_local_storage_cache_seq_get;
 extern const struct bench bench_local_storage_cache_interleaved_get;
 extern const struct bench bench_local_storage_cache_hashmap_control;
 extern const struct bench bench_local_storage_tasks_trace;
+extern const struct bench bench_bpf_hashmap_lookup;
 
 static const struct bench *benchs[] = {
 	&bench_count_global,
@@ -626,6 +632,7 @@  static const struct bench *benchs[] = {
 	&bench_local_storage_cache_interleaved_get,
 	&bench_local_storage_cache_hashmap_control,
 	&bench_local_storage_tasks_trace,
+	&bench_bpf_hashmap_lookup,
 };
 
 static void setup_benchmark()
diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h
index e322654b5e8a..f0e477518ab9 100644
--- a/tools/testing/selftests/bpf/bench.h
+++ b/tools/testing/selftests/bpf/bench.h
@@ -121,4 +121,7 @@  enum {
 	ARG_NR_PROCS,
 	ARG_KTHREAD_PID,
 	ARG_QUIET,
+	ARG_KEY_SIZE,
+	ARG_MAP_FLAGS,
+	ARG_MAX_ENTRIES,
 };
diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_lookup.c b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_lookup.c
new file mode 100644
index 000000000000..79805d008125
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_lookup.c
@@ -0,0 +1,277 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Isovalent */
+
+#include <sys/random.h>
+#include <argp.h>
+#include "bench.h"
+#include "bpf_hashmap_lookup.skel.h"
+#include "bpf_util.h"
+
+/* BPF triggering benchmarks */
+static struct ctx {
+	struct bpf_hashmap_lookup *skel;
+} ctx;
+
+/* only available to kernel, so define it here */
+#define BPF_MAX_LOOPS (1<<23)
+
+#define MAX_KEY_SIZE 1024 /* the size of the key map */
+
+static struct {
+	__u32 key_size;
+	__u32 map_flags;
+	__u32 max_entries;
+	__u32 nr_entries;
+	__u32 nr_loops;
+} args = {
+	.key_size = 4,
+	.map_flags = 0,
+	.max_entries = 1000,
+	.nr_entries = 500,
+	.nr_loops = 1000000,
+};
+
+static const struct argp_option opts[] = {
+	{ "key_size", ARG_KEY_SIZE, "KEY_SIZE", 0,
+	  "The hashmap key size (max 1024)"},
+	{ "map_flags", ARG_MAP_FLAGS, "MAP_FLAGS", 0,
+	  "The hashmap flags passed to BPF_MAP_CREATE"},
+	{ "max_entries", ARG_MAX_ENTRIES, "MAX_ENTRIES", 0,
+	  "The hashmap max entries"},
+	{ "nr_entries", ARG_NR_ENTRIES, "NR_ENTRIES", 0,
+	  "The number of entries to insert/lookup"},
+	{ "nr_loops", ARG_NR_LOOPS, "NR_LOOPS", 0,
+	  "The number of loops for the benchmark"},
+	{},
+};
+
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	long ret;
+
+	switch (key) {
+	case ARG_KEY_SIZE:
+		ret = strtol(arg, NULL, 10);
+		if (ret < 1 || ret > MAX_KEY_SIZE) {
+			fprintf(stderr, "invalid key_size");
+			argp_usage(state);
+		}
+		args.key_size = ret;
+		break;
+	case ARG_MAP_FLAGS:
+		if (!strncasecmp(arg, "0x", 2))
+			ret = strtol(arg, NULL, 0x10);
+		else
+			ret = strtol(arg, NULL, 10);
+		if (ret < 0 || ret > UINT_MAX) {
+			fprintf(stderr, "invalid map_flags");
+			argp_usage(state);
+		}
+		args.map_flags = ret;
+		break;
+	case ARG_MAX_ENTRIES:
+		ret = strtol(arg, NULL, 10);
+		if (ret < 1 || ret > UINT_MAX) {
+			fprintf(stderr, "invalid max_entries");
+			argp_usage(state);
+		}
+		args.max_entries = ret;
+		break;
+	case ARG_NR_ENTRIES:
+		ret = strtol(arg, NULL, 10);
+		if (ret < 1 || ret > UINT_MAX) {
+			fprintf(stderr, "invalid nr_entries");
+			argp_usage(state);
+		}
+		args.nr_entries = ret;
+		break;
+	case ARG_NR_LOOPS:
+		ret = strtol(arg, NULL, 10);
+		if (ret < 1 || ret > BPF_MAX_LOOPS) {
+			fprintf(stderr, "invalid nr_loops: %ld (min=1 max=%u)\n",
+				ret, BPF_MAX_LOOPS);
+			argp_usage(state);
+		}
+		args.nr_loops = ret;
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+
+	return 0;
+}
+
+const struct argp bench_hashmap_lookup_argp = {
+	.options = opts,
+	.parser = parse_arg,
+};
+
+static void validate(void)
+{
+	if (env.consumer_cnt != 1) {
+		fprintf(stderr, "benchmark doesn't support multi-consumer!\n");
+		exit(1);
+	}
+
+	if (args.nr_entries > args.max_entries) {
+		fprintf(stderr, "args.nr_entries is too big! (max %u, got %u)\n",
+			args.max_entries, args.nr_entries);
+		exit(1);
+	}
+}
+
+static void *producer(void *input)
+{
+	while (true) {
+		/* trigger the bpf program */
+		syscall(__NR_getpgid);
+	}
+	return NULL;
+}
+
+static void *consumer(void *input)
+{
+	return NULL;
+}
+
+static void measure(struct bench_res *res)
+{
+}
+
+static inline void patch_key(u32 i, u32 *key)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	*key = i + 1;
+#else
+	*key = __builtin_bswap32(i + 1);
+#endif
+	/* the rest of key is random */
+}
+
+static void setup(void)
+{
+	struct bpf_link *link;
+	int map_fd;
+	int ret;
+	int i;
+
+	setup_libbpf();
+
+	ctx.skel = bpf_hashmap_lookup__open();
+	if (!ctx.skel) {
+		fprintf(stderr, "failed to open skeleton\n");
+		exit(1);
+	}
+
+	bpf_map__set_max_entries(ctx.skel->maps.hash_map_bench, args.max_entries);
+	bpf_map__set_key_size(ctx.skel->maps.hash_map_bench, args.key_size);
+	bpf_map__set_value_size(ctx.skel->maps.hash_map_bench, 8);
+	bpf_map__set_map_flags(ctx.skel->maps.hash_map_bench, args.map_flags);
+
+	ctx.skel->bss->nr_entries = args.nr_entries;
+	ctx.skel->bss->nr_loops = args.nr_loops / args.nr_entries;
+
+	if (args.key_size > 4) {
+		for (i = 1; i < args.key_size/4; i++)
+			ctx.skel->bss->key[i] = 2654435761 * i;
+	}
+
+	ret = bpf_hashmap_lookup__load(ctx.skel);
+	if (ret) {
+		bpf_hashmap_lookup__destroy(ctx.skel);
+		fprintf(stderr, "failed to load map: %s", strerror(-ret));
+		exit(1);
+	}
+
+	/* fill in the hash_map */
+	map_fd = bpf_map__fd(ctx.skel->maps.hash_map_bench);
+	for (u64 i = 0; i < args.nr_entries; i++) {
+		patch_key(i, ctx.skel->bss->key);
+		bpf_map_update_elem(map_fd, ctx.skel->bss->key, &i, BPF_ANY);
+	}
+
+	link = bpf_program__attach(ctx.skel->progs.benchmark);
+	if (!link) {
+		fprintf(stderr, "failed to attach program!\n");
+		exit(1);
+	}
+}
+
+static inline double events_from_time(u64 time)
+{
+	if (time)
+		return args.nr_loops * 1000000000llu / time / 1000000.0L;
+
+	return 0;
+}
+
+static int compute_events(u64 *times, double *events_mean, double *events_stddev, u64 *mean_time)
+{
+	int i, n = 0;
+
+	*events_mean = 0;
+	*events_stddev = 0;
+	*mean_time = 0;
+
+	for (i = 0; i < 32; i++) {
+		if (!times[i])
+			break;
+		*mean_time += times[i];
+		*events_mean += events_from_time(times[i]);
+		n += 1;
+	}
+	if (!n)
+		return 0;
+
+	*mean_time /= n;
+	*events_mean /= n;
+
+	if (n > 1) {
+		for (i = 0; i < n; i++) {
+			double events_i = *events_mean - events_from_time(times[i]);
+			*events_stddev += events_i * events_i / (n - 1);
+		}
+		*events_stddev = sqrt(*events_stddev);
+	}
+
+	return n;
+}
+
+static void hashmap_report_final(struct bench_res res[], int res_cnt)
+{
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	double events_mean, events_stddev;
+	u64 mean_time;
+	int i, n;
+
+	for (i = 0; i < nr_cpus; i++) {
+		n = compute_events(ctx.skel->bss->percpu_times[i], &events_mean,
+				   &events_stddev, &mean_time);
+		if (n == 0)
+			continue;
+
+		if (env.quiet) {
+			/* we expect only one cpu to be present */
+			if (env.affinity)
+				printf("%.3lf\n", events_mean);
+			else
+				printf("cpu%02d %.3lf\n", i, events_mean);
+		} else {
+			printf("cpu%02d: lookup %.3lfM ± %.3lfM events/sec"
+			       " (approximated from %d samples of ~%lums)\n",
+			       i, events_mean, 2*events_stddev,
+			       n, mean_time / 1000000);
+		}
+	}
+}
+
+const struct bench bench_bpf_hashmap_lookup = {
+	.name = "bpf-hashmap-lookup",
+	.validate = validate,
+	.setup = setup,
+	.producer_thread = producer,
+	.consumer_thread = consumer,
+	.measure = measure,
+	.report_progress = NULL,
+	.report_final = hashmap_report_final,
+};
diff --git a/tools/testing/selftests/bpf/progs/bpf_hashmap_lookup.c b/tools/testing/selftests/bpf/progs/bpf_hashmap_lookup.c
new file mode 100644
index 000000000000..79e41e3fb1bf
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_hashmap_lookup.c
@@ -0,0 +1,61 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Isovalent */
+
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+} hash_map_bench SEC(".maps");
+
+/* The number of slots to store times */
+#define NR_SLOTS 32
+
+/* Configured by userspace */
+u64 nr_entries;
+u64 nr_loops;
+u32 __attribute__((__aligned__(8))) key[256];
+
+/* Filled by us */
+u64 __attribute__((__aligned__(256))) percpu_times_index[NR_SLOTS];
+u64 __attribute__((__aligned__(256))) percpu_times[256][NR_SLOTS];
+
+static inline void patch_key(u32 i)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	key[0] = i + 1;
+#else
+	key[0] = __builtin_bswap32(i + 1);
+#endif
+	/* the rest of key is random and is configured by userspace */
+}
+
+static int lookup_callback(__u32 index, u32 *unused)
+{
+	patch_key(index);
+	return bpf_map_lookup_elem(&hash_map_bench, key) ? 0 : 1;
+}
+
+static int loop_lookup_callback(__u32 index, u32 *unused)
+{
+	return bpf_loop(nr_entries, lookup_callback, NULL, 0) ? 0 : 1;
+}
+
+SEC("fentry/" SYS_PREFIX "sys_getpgid")
+int benchmark(void *ctx)
+{
+	u32 cpu = bpf_get_smp_processor_id();
+	u32 times_index;
+	u64 start_time;
+
+	times_index = percpu_times_index[cpu & 255] % NR_SLOTS;
+	start_time = bpf_ktime_get_ns();
+	bpf_loop(nr_loops, loop_lookup_callback, NULL, 0);
+	percpu_times[cpu & 255][times_index] = bpf_ktime_get_ns() - start_time;
+	percpu_times_index[cpu & 255] += 1;
+	return 0;
+}