[bpf-next,2/2] selftests/bpf: Add benchmark for bpf memory allocator

Message ID 20231221141501.3588586-3-houtao@huaweicloud.com (mailing list archive)
State New, archived
Delegated to: BPF
Series bpf: Add benchmark for bpf memory allocator

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success SINGLE THREAD; Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 8 this patch: 8
netdev/cc_maintainers warning 3 maintainers not CCed: shuah@kernel.org linux-kselftest@vger.kernel.org mykolal@fb.com
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 8 this patch: 8
netdev/checkpatch warning WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: externs should be avoided in .c files WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns WARNING: line length of 95 exceeds 80 columns WARNING: line length of 97 exceeds 80 columns WARNING: quoted string split across lines
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc

Commit Message

Hou Tao Dec. 21, 2023, 2:15 p.m. UTC
From: Hou Tao <houtao1@huawei.com>

Add a benchmark to measure the performance and memory usage of
bpf_obj_new()/bpf_obj_drop() and their percpu variants. The benchmark
measures the performance of bpf_obj_new() or bpf_percpu_obj_new()
through the following steps:
1) find the inner array by using the CPU number as the key
2) allocate at most 64 objects of 128 bytes each through bpf_obj_new()
3) stash these objects into the inner array through bpf_kptr_xchg()
4) account the time used in steps 1)~3)
5) calculate the performance in M/s as alloc_cnt * 1000 / alloc_ns
   (allocations per nanosecond times 1000 gives millions per second)
6) calculate the memory usage by reading the slab (or percpu) field in
   the memory.stat file and subtracting the base value sampled during
   setup.

The performance test for bpf_obj_drop() or bpf_percpu_obj_drop() is
similar. For simplicity, the number of objects allocated in each batch
and the size of each object are fixed at 64 and 128 bytes respectively.
Increasing the batch size degrades the performance a bit, but not by
much. Increasing the object size degrades the allocation performance a
lot and also increases the total memory used by per-cpu allocation, but
the free performance doesn't change much.

The following are the test results on an 8-CPU VM with 16GB of memory:

$ for i in 1 4 8; do ./bench -w3 -d10 bpf_ma -p${i} -a; done |grep Summary
Summary: per-prod alloc 11.29 ± 0.14M/s free 33.76 ± 0.33M/s, total memory usage    0.01 ± 0.00MiB
Summary: per-prod alloc  7.49 ± 0.12M/s free 34.42 ± 0.56M/s, total memory usage    0.03 ± 0.00MiB
Summary: per-prod alloc  6.66 ± 0.08M/s free 34.27 ± 0.41M/s, total memory usage    0.06 ± 0.00MiB

$ for i in 1 4 8; do ./bench -w3 -d10 bpf_ma -p${i} -a --percpu; done |grep Summary
Summary: per-prod alloc 14.64 ± 0.60M/s free 36.94 ± 0.35M/s, total memory usage  188.02 ± 7.43MiB
Summary: per-prod alloc 12.39 ± 1.32M/s free 36.40 ± 0.38M/s, total memory usage  808.90 ± 25.56MiB
Summary: per-prod alloc 10.80 ± 0.17M/s free 35.45 ± 0.25M/s, total memory usage 2330.24 ± 480.56MiB

Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 tools/testing/selftests/bpf/Makefile          |   2 +
 tools/testing/selftests/bpf/bench.c           |   4 +
 tools/testing/selftests/bpf/bench.h           |   7 +
 .../selftests/bpf/benchs/bench_bpf_ma.c       | 273 ++++++++++++++++++
 .../selftests/bpf/progs/bench_bpf_ma.c        | 222 ++++++++++++++
 5 files changed, 508 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/benchs/bench_bpf_ma.c
 create mode 100644 tools/testing/selftests/bpf/progs/bench_bpf_ma.c

Comments

Song Liu Jan. 8, 2024, 9:45 p.m. UTC | #1
On Thu, Dec 21, 2023 at 6:14 AM Hou Tao <houtao@huaweicloud.com> wrote:
>
> From: Hou Tao <houtao1@huawei.com>
>
[...]
>
> The following are the test results on an 8-CPU VM with 16GB of memory:
>
> $ for i in 1 4 8; do ./bench -w3 -d10 bpf_ma -p${i} -a; done |grep Summary
> Summary: per-prod alloc 11.29 ± 0.14M/s free 33.76 ± 0.33M/s, total memory usage    0.01 ± 0.00MiB
> Summary: per-prod alloc  7.49 ± 0.12M/s free 34.42 ± 0.56M/s, total memory usage    0.03 ± 0.00MiB
> Summary: per-prod alloc  6.66 ± 0.08M/s free 34.27 ± 0.41M/s, total memory usage    0.06 ± 0.00MiB
>
> $ for i in 1 4 8; do ./bench -w3 -d10 bpf_ma -p${i} -a --percpu; done |grep Summary
> Summary: per-prod alloc 14.64 ± 0.60M/s free 36.94 ± 0.35M/s, total memory usage  188.02 ± 7.43MiB
> Summary: per-prod alloc 12.39 ± 1.32M/s free 36.40 ± 0.38M/s, total memory usage  808.90 ± 25.56MiB
> Summary: per-prod alloc 10.80 ± 0.17M/s free 35.45 ± 0.25M/s, total memory usage 2330.24 ± 480.56MiB

This is not likely related to this patch, but do we expect this much
memory usage?
I guess the 2.3GiB number is from bigger ALLOC_OBJ_SIZE and
ALLOC_BATCH_CNT? I am getting 0 MiB with this test on my VM.

>
> Signed-off-by: Hou Tao <houtao1@huawei.com>
> ---
>  tools/testing/selftests/bpf/Makefile          |   2 +
>  tools/testing/selftests/bpf/bench.c           |   4 +
>  tools/testing/selftests/bpf/bench.h           |   7 +
>  .../selftests/bpf/benchs/bench_bpf_ma.c       | 273 ++++++++++++++++++
>  .../selftests/bpf/progs/bench_bpf_ma.c        | 222 ++++++++++++++

Maybe add a run_bench_bpf_ma.sh script in selftests/bpf/benchs?

>  5 files changed, 508 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/benchs/bench_bpf_ma.c
>  create mode 100644 tools/testing/selftests/bpf/progs/bench_bpf_ma.c
>
[...]
> diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h
> index a6fcf111221f..206cf3de5df2 100644
> --- a/tools/testing/selftests/bpf/bench.h
> +++ b/tools/testing/selftests/bpf/bench.h
> @@ -53,6 +53,13 @@ struct bench_res {
>                         unsigned long gp_ct;
>                         unsigned int stime;
>                 } rcu;
> +               struct {
> +                       unsigned long alloc;
> +                       unsigned long free;

nit: maybe add _ct or _cnt postfix to match "rcu" above or the skel?

> +                       unsigned long alloc_ns;
> +                       unsigned long free_ns;
> +                       unsigned long mem_bytes;
> +               } ma;
>         };
>  };
>
[...]
> +
> +static void bpf_ma_validate(void)
> +{
> +}

Empty validate() function seems not necessary.

[...]

> +
> +static void bpf_ma_report_final(struct bench_res res[], int res_cnt)
> +{
> +       double mem_mean = 0.0, mem_stddev = 0.0;
> +       double alloc_mean = 0.0, alloc_stddev = 0.0;
> +       double free_mean = 0.0, free_stddev = 0.0;
> +       double alloc_ns = 0.0, free_ns = 0.0;
> +       int i;
> +
> +       for (i = 0; i < res_cnt; i++) {
> +               alloc_ns += res[i].ma.alloc_ns;
> +               free_ns += res[i].ma.free_ns;
> +       }
> +       for (i = 0; i < res_cnt; i++) {
> +               if (alloc_ns)
> +                       alloc_mean += res[i].ma.alloc * 1000.0 / alloc_ns;
> +               if (free_ns)
> +                       free_mean += res[i].ma.free * 1000.0 / free_ns;
> +               mem_mean += res[i].ma.mem_bytes / 1048576.0 / (0.0 + res_cnt);
> +       }
> +       if (res_cnt > 1) {
> +               for (i = 0; i < res_cnt; i++) {
> +                       double sample;
> +
> +                       sample = res[i].ma.alloc_ns ? res[i].ma.alloc * 1000.0 /
> +                                                     res[i].ma.alloc_ns : 0.0;
> +                       alloc_stddev += (alloc_mean - sample) * (alloc_mean - sample) /
> +                                       (res_cnt - 1.0);
> +
> +                       sample = res[i].ma.free_ns ? res[i].ma.free * 1000.0 /
> +                                                    res[i].ma.free_ns : 0.0;
> +                       free_stddev += (free_mean - sample) * (free_mean - sample) /
> +                                      (res_cnt - 1.0);
> +
> +                       sample = res[i].ma.mem_bytes / 1048576.0;
> +                       mem_stddev += (mem_mean - sample) * (mem_mean - sample) /
> +                                     (res_cnt - 1.0);
> +               }

nit: We can probably refactor common code for stddev calculation into
some helpers.

> +               alloc_stddev = sqrt(alloc_stddev);
> +               free_stddev = sqrt(free_stddev);
> +               mem_stddev = sqrt(mem_stddev);
> +       }
> +
> +       printf("Summary: per-prod alloc %7.2lf \u00B1 %3.2lfM/s free %7.2lf \u00B1 %3.2lfM/s, "
> +              "total memory usage %7.2lf \u00B1 %3.2lfMiB\n",
> +              alloc_mean, alloc_stddev, free_mean, free_stddev,
> +              mem_mean, mem_stddev);
> +}
> +
> +const struct bench bench_bpf_mem_alloc = {
> +       .name = "bpf_ma",
> +       .argp = &bench_bpf_mem_alloc_argp,
> +       .validate = bpf_ma_validate,
> +       .setup = bpf_ma_setup,
> +       .producer_thread = bpf_ma_producer,
> +       .measure = bpf_ma_measure,
> +       .report_progress = bpf_ma_report_progress,
> +       .report_final = bpf_ma_report_final,
> +};
> diff --git a/tools/testing/selftests/bpf/progs/bench_bpf_ma.c b/tools/testing/selftests/bpf/progs/bench_bpf_ma.c

[...]

> +
> +/* Return the number of allocated objects */
> +static __always_inline unsigned int batch_alloc(struct bpf_map *map)
> +{
> +       struct bin_data *old, *new;
> +       struct map_value *value;
> +       unsigned int i, key;
> +
> +       for (i = 0; i < ALLOC_BATCH_CNT; i++) {
> +               key = i;
> +               value = bpf_map_lookup_elem(map, &key);
> +               if (!value)
> +                       return i;
> +
> +               new = bpf_obj_new(typeof(*new));
> +               if (!new)
> +                       return i;
> +
> +               old = bpf_kptr_xchg(&value->data, new);
> +               if (old)
> +                       bpf_obj_drop(old);
> +       }
> +
> +       return ALLOC_BATCH_CNT;
> +}
> +
> +/* Return the number of freed objects */
> +static __always_inline unsigned int batch_free(struct bpf_map *map)
> +{
> +       struct map_value *value;
> +       unsigned int i, key;
> +       void *old;
> +
> +       for (i = 0; i < ALLOC_BATCH_CNT; i++) {
> +               key = i;
> +               value = bpf_map_lookup_elem(map, &key);
> +               if (!value)
> +                       return i;
> +
> +               old = bpf_kptr_xchg(&value->data, NULL);
> +               if (!old)
> +                       return i;
> +               bpf_obj_drop(old);
> +       }
> +
> +       return ALLOC_BATCH_CNT;
> +}
> +
> +/* Return the number of allocated objects */
> +static __always_inline unsigned int batch_percpu_alloc(struct bpf_map *map)
> +{
> +       struct percpu_bin_data *old, *new;
> +       struct percpu_map_value *value;
> +       unsigned int i, key;
> +
> +       for (i = 0; i < ALLOC_BATCH_CNT; i++) {
> +               key = i;
> +               value = bpf_map_lookup_elem(map, &key);
> +               if (!value)
> +                       return i;
> +
> +               new = bpf_percpu_obj_new(typeof(*new));
> +               if (!new)
> +                       return i;
> +
> +               old = bpf_kptr_xchg(&value->data, new);
> +               if (old)
> +                       bpf_percpu_obj_drop(old);
> +       }
> +
> +       return ALLOC_BATCH_CNT;
> +}
> +
> +/* Return the number of freed objects */
> +static __always_inline unsigned int batch_percpu_free(struct bpf_map *map)
> +{
> +       struct percpu_map_value *value;
> +       unsigned int i, key;
> +       void *old;
> +
> +       for (i = 0; i < ALLOC_BATCH_CNT; i++) {
> +               key = i;
> +               value = bpf_map_lookup_elem(map, &key);
> +               if (!value)
> +                       return i;
> +
> +               old = bpf_kptr_xchg(&value->data, NULL);
> +               if (!old)
> +                       return i;
> +               bpf_percpu_obj_drop(old);
> +       }
> +
> +       return ALLOC_BATCH_CNT;
> +}

nit: These four functions have quite duplicated code. We can probably
refactor them a bit.

> +
> +SEC("?fentry/" SYS_PREFIX "sys_getpgid")
> +int bench_batch_alloc_free(void *ctx)
> +{
> +       u64 start, delta;
> +       unsigned int cnt;
> +       void *map;

s/void */struct bpf_map */?

> +       int key;
> +
> +       key = bpf_get_smp_processor_id();
> +       map = bpf_map_lookup_elem((void *)&outer_array, &key);
> +       if (!map)
> +               return 0;
> +
> +       start = bpf_ktime_get_boot_ns();
> +       cnt = batch_alloc(map);
> +       delta = bpf_ktime_get_boot_ns() - start;
> +       __sync_fetch_and_add(&alloc_cnt, cnt);
> +       __sync_fetch_and_add(&alloc_ns, delta);
> +
> +       start = bpf_ktime_get_boot_ns();
> +       cnt = batch_free(map);
> +       delta = bpf_ktime_get_boot_ns() - start;
> +       __sync_fetch_and_add(&free_cnt, cnt);
> +       __sync_fetch_and_add(&free_ns, delta);
> +
> +       return 0;
> +}
> +
> +SEC("?fentry/" SYS_PREFIX "sys_getpgid")
> +int bench_batch_percpu_alloc_free(void *ctx)
> +{
> +       u64 start, delta;
> +       unsigned int cnt;
> +       void *map;

ditto

> +       int key;
> +
> +       key = bpf_get_smp_processor_id();
> +       map = bpf_map_lookup_elem((void *)&percpu_outer_array, &key);
> +       if (!map)
> +               return 0;
> +
> +       start = bpf_ktime_get_boot_ns();
> +       cnt = batch_percpu_alloc(map);
> +       delta = bpf_ktime_get_boot_ns() - start;
> +       __sync_fetch_and_add(&alloc_cnt, cnt);
> +       __sync_fetch_and_add(&alloc_ns, delta);
> +
> +       start = bpf_ktime_get_boot_ns();
> +       cnt = batch_percpu_free(map);
> +       delta = bpf_ktime_get_boot_ns() - start;
> +       __sync_fetch_and_add(&free_cnt, cnt);
> +       __sync_fetch_and_add(&free_ns, delta);
> +
> +       return 0;
> +}

nit: ditto duplicated code.

Hou Tao Jan. 9, 2024, 1:16 a.m. UTC | #2
On 1/9/2024 5:45 AM, Song Liu wrote:
> On Thu, Dec 21, 2023 at 6:14 AM Hou Tao <houtao@huaweicloud.com> wrote:
>> From: Hou Tao <houtao1@huawei.com>
>>
> [...]
>> The following are the test results on an 8-CPU VM with 16GB of memory:
>>
>> $ for i in 1 4 8; do ./bench -w3 -d10 bpf_ma -p${i} -a; done |grep Summary
>> Summary: per-prod alloc 11.29 ± 0.14M/s free 33.76 ± 0.33M/s, total memory usage    0.01 ± 0.00MiB
>> Summary: per-prod alloc  7.49 ± 0.12M/s free 34.42 ± 0.56M/s, total memory usage    0.03 ± 0.00MiB
>> Summary: per-prod alloc  6.66 ± 0.08M/s free 34.27 ± 0.41M/s, total memory usage    0.06 ± 0.00MiB
>>
>> $ for i in 1 4 8; do ./bench -w3 -d10 bpf_ma -p${i} -a --percpu; done |grep Summary
>> Summary: per-prod alloc 14.64 ± 0.60M/s free 36.94 ± 0.35M/s, total memory usage  188.02 ± 7.43MiB
>> Summary: per-prod alloc 12.39 ± 1.32M/s free 36.40 ± 0.38M/s, total memory usage  808.90 ± 25.56MiB
>> Summary: per-prod alloc 10.80 ± 0.17M/s free 35.45 ± 0.25M/s, total memory usage 2330.24 ± 480.56MiB
> This is not likely related to this patch, but do we expect this much
> memory usage?
> I guess the 2.3GiB number is from bigger ALLOC_OBJ_SIZE and
> ALLOC_BATCH_CNT? I am getting 0 MiB with this test on my VM.

I think the reduction in memory usage is due to the merge of the patch
set "bpf: Reduce memory usage for bpf_global_percpu_ma"; I got a
similar result to yours after applying that patch set [1].

1:
https://lore.kernel.org/bpf/cb8edf4b-f585-4e3e-9bed-10f5b36e427c@huaweicloud.com/

>
>> Signed-off-by: Hou Tao <houtao1@huawei.com>
>> ---
>>  tools/testing/selftests/bpf/Makefile          |   2 +
>>  tools/testing/selftests/bpf/bench.c           |   4 +
>>  tools/testing/selftests/bpf/bench.h           |   7 +
>>  .../selftests/bpf/benchs/bench_bpf_ma.c       | 273 ++++++++++++++++++
>>  .../selftests/bpf/progs/bench_bpf_ma.c        | 222 ++++++++++++++
> Maybe add a run_bench_bpf_ma.sh script in selftests/bpf/benchs?

Forgot that. Will do in v2.
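
A minimal first cut could simply mirror the commands from the commit
message (untested sketch; the final script may instead follow the
run_common.sh conventions used by the other run_bench_*.sh scripts):

#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
set -eufo pipefail

# non-percpu and percpu variants with 1, 4 and 8 producers
for p in 1 4 8; do
	./bench -w3 -d10 bpf_ma -p$p -a | grep Summary
done
for p in 1 4 8; do
	./bench -w3 -d10 bpf_ma -p$p -a --percpu | grep Summary
done
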
>
>>  5 files changed, 508 insertions(+)
>>  create mode 100644 tools/testing/selftests/bpf/benchs/bench_bpf_ma.c
>>  create mode 100644 tools/testing/selftests/bpf/progs/bench_bpf_ma.c
>>
> [...]
>> diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h
>> index a6fcf111221f..206cf3de5df2 100644
>> --- a/tools/testing/selftests/bpf/bench.h
>> +++ b/tools/testing/selftests/bpf/bench.h
>> @@ -53,6 +53,13 @@ struct bench_res {
>>                         unsigned long gp_ct;
>>                         unsigned int stime;
>>                 } rcu;
>> +               struct {
>> +                       unsigned long alloc;
>> +                       unsigned long free;
> nit: maybe add _ct or _cnt postfix to match "rcu" above or the skel?

Will do in v2.
>
>> +                       unsigned long alloc_ns;
>> +                       unsigned long free_ns;
>> +                       unsigned long mem_bytes;
>> +               } ma;
>>         };
>>  };
>>
> [...]
>> +
>> +static void bpf_ma_validate(void)
>> +{
>> +}
> Empty validate() function seems not necessary.

Yes. Will remove it.
>
> [...]
>
>> +
>> +static void bpf_ma_report_final(struct bench_res res[], int res_cnt)
>> +{
>> +       double mem_mean = 0.0, mem_stddev = 0.0;
>> +       double alloc_mean = 0.0, alloc_stddev = 0.0;
>> +       double free_mean = 0.0, free_stddev = 0.0;
>> +       double alloc_ns = 0.0, free_ns = 0.0;
>> +       int i;
>> +
>> +       for (i = 0; i < res_cnt; i++) {
>> +               alloc_ns += res[i].ma.alloc_ns;
>> +               free_ns += res[i].ma.free_ns;
>> +       }
>> +       for (i = 0; i < res_cnt; i++) {
>> +               if (alloc_ns)
>> +                       alloc_mean += res[i].ma.alloc * 1000.0 / alloc_ns;
>> +               if (free_ns)
>> +                       free_mean += res[i].ma.free * 1000.0 / free_ns;
>> +               mem_mean += res[i].ma.mem_bytes / 1048576.0 / (0.0 + res_cnt);
>> +       }
>> +       if (res_cnt > 1) {
>> +               for (i = 0; i < res_cnt; i++) {
>> +                       double sample;
>> +
>> +                       sample = res[i].ma.alloc_ns ? res[i].ma.alloc * 1000.0 /
>> +                                                     res[i].ma.alloc_ns : 0.0;
>> +                       alloc_stddev += (alloc_mean - sample) * (alloc_mean - sample) /
>> +                                       (res_cnt - 1.0);
>> +
>> +                       sample = res[i].ma.free_ns ? res[i].ma.free * 1000.0 /
>> +                                                    res[i].ma.free_ns : 0.0;
>> +                       free_stddev += (free_mean - sample) * (free_mean - sample) /
>> +                                      (res_cnt - 1.0);
>> +
>> +                       sample = res[i].ma.mem_bytes / 1048576.0;
>> +                       mem_stddev += (mem_mean - sample) * (mem_mean - sample) /
>> +                                     (res_cnt - 1.0);
>> +               }
> nit: We can probably refactor common code for stddev calculation into
> some helpers.

Will try. The calculations for the alloc and free rates have the same
form, but the names of the related fields are different. Maybe we need
to define alloc_cnt/free_cnt as an array first.
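
For instance (untested sketch; the helper names are illustrative, and
bench already links with -lm for sqrt()):

static double rate_mps(unsigned long cnt, unsigned long ns)
{
	/* allocations (or frees) per ns times 1000 gives M/s */
	return ns ? cnt * 1000.0 / ns : 0.0;
}

/* sample standard deviation; the caller ensures cnt > 1 */
static double stddev_of(const double *samples, int cnt, double mean)
{
	double var = 0.0;
	int i;

	for (i = 0; i < cnt; i++)
		var += (mean - samples[i]) * (mean - samples[i]) / (cnt - 1.0);
	return sqrt(var);
}
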
>
>> +               alloc_stddev = sqrt(alloc_stddev);
>> +               free_stddev = sqrt(free_stddev);
>> +               mem_stddev = sqrt(mem_stddev);
>> +       }
>> +
>> +       printf("Summary: per-prod alloc %7.2lf \u00B1 %3.2lfM/s free %7.2lf \u00B1 %3.2lfM/s, "
>> +              "total memory usage %7.2lf \u00B1 %3.2lfMiB\n",
>> +              alloc_mean, alloc_stddev, free_mean, free_stddev,
>> +              mem_mean, mem_stddev);
>> +}
>> +
>> +const struct bench bench_bpf_mem_alloc = {
>> +       .name = "bpf_ma",
>> +       .argp = &bench_bpf_mem_alloc_argp,
>> +       .validate = bpf_ma_validate,
>> +       .setup = bpf_ma_setup,
>> +       .producer_thread = bpf_ma_producer,
>> +       .measure = bpf_ma_measure,
>> +       .report_progress = bpf_ma_report_progress,
>> +       .report_final = bpf_ma_report_final,
>> +};
>> diff --git a/tools/testing/selftests/bpf/progs/bench_bpf_ma.c b/tools/testing/selftests/bpf/progs/bench_bpf_ma.c
> [...]
>
>> +

SNIP
>> +/* Return the number of freed objects */
>> +static __always_inline unsigned int batch_percpu_free(struct bpf_map *map)
>> +{
>> +       struct percpu_map_value *value;
>> +       unsigned int i, key;
>> +       void *old;
>> +
>> +       for (i = 0; i < ALLOC_BATCH_CNT; i++) {
>> +               key = i;
>> +               value = bpf_map_lookup_elem(map, &key);
>> +               if (!value)
>> +                       return i;
>> +
>> +               old = bpf_kptr_xchg(&value->data, NULL);
>> +               if (!old)
>> +                       return i;
>> +               bpf_percpu_obj_drop(old);
>> +       }
>> +
>> +       return ALLOC_BATCH_CNT;
>> +}
> nit: These four functions have quite duplicated code. We can probably
> refactor them a bit.

Will do. The main difference is that these functions use different
helpers to allocate and free memory. I think we could pass a bool to the
common allocation and free inline functions.
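
Something like this untested sketch, with batch_free() shown and the
alloc side analogous (with __always_inline and a constant @percpu, the
compiler specializes both call sites, so the verifier still sees
concrete kptr types):

static __always_inline unsigned int batch_free(struct bpf_map *map, bool percpu)
{
	unsigned int i, key;
	void *old;

	for (i = 0; i < ALLOC_BATCH_CNT; i++) {
		key = i;
		if (percpu) {
			struct percpu_map_value *value;

			value = bpf_map_lookup_elem(map, &key);
			if (!value)
				return i;
			old = bpf_kptr_xchg(&value->data, NULL);
			if (!old)
				return i;
			bpf_percpu_obj_drop(old);
		} else {
			struct map_value *value;

			value = bpf_map_lookup_elem(map, &key);
			if (!value)
				return i;
			old = bpf_kptr_xchg(&value->data, NULL);
			if (!old)
				return i;
			bpf_obj_drop(old);
		}
	}

	return ALLOC_BATCH_CNT;
}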

>
>> +
>> +SEC("?fentry/" SYS_PREFIX "sys_getpgid")
>> +int bench_batch_alloc_free(void *ctx)
>> +{
>> +       u64 start, delta;
>> +       unsigned int cnt;
>> +       void *map;
> s/void */struct bpf_map */?

Will fix it in v2.
>
>> +       int key;
>> +
>> +       key = bpf_get_smp_processor_id();
>> +       map = bpf_map_lookup_elem((void *)&outer_array, &key);
>> +       if (!map)
>> +               return 0;
>> +
>> +       start = bpf_ktime_get_boot_ns();
>> +       cnt = batch_alloc(map);
>> +       delta = bpf_ktime_get_boot_ns() - start;
>> +       __sync_fetch_and_add(&alloc_cnt, cnt);
>> +       __sync_fetch_and_add(&alloc_ns, delta);
>> +
>> +       start = bpf_ktime_get_boot_ns();
>> +       cnt = batch_free(map);
>> +       delta = bpf_ktime_get_boot_ns() - start;
>> +       __sync_fetch_and_add(&free_cnt, cnt);
>> +       __sync_fetch_and_add(&free_ns, delta);
>> +
>> +       return 0;
>> +}
>> +
>> +SEC("?fentry/" SYS_PREFIX "sys_getpgid")
>> +int bench_batch_percpu_alloc_free(void *ctx)
>> +{
>> +       u64 start, delta;
>> +       unsigned int cnt;
>> +       void *map;
> ditto

Will update in v2.
>
>> +       int key;
>> +
>> +       key = bpf_get_smp_processor_id();
>> +       map = bpf_map_lookup_elem((void *)&percpu_outer_array, &key);
>> +       if (!map)
>> +               return 0;
>> +
>> +       start = bpf_ktime_get_boot_ns();
>> +       cnt = batch_percpu_alloc(map);
>> +       delta = bpf_ktime_get_boot_ns() - start;
>> +       __sync_fetch_and_add(&alloc_cnt, cnt);
>> +       __sync_fetch_and_add(&alloc_ns, delta);
>> +
>> +       start = bpf_ktime_get_boot_ns();
>> +       cnt = batch_percpu_free(map);
>> +       delta = bpf_ktime_get_boot_ns() - start;
>> +       __sync_fetch_and_add(&free_cnt, cnt);
>> +       __sync_fetch_and_add(&free_ns, delta);
>> +
>> +       return 0;
>> +}
> nit: ditto duplicated code.

Will factor it out as a common function; see the sketch below. Thanks
for all these suggestions.
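
An untested sketch of the common body, with the struct bpf_map * fix
folded in (a constant @percpu again lets the compiler specialize the
two entry points):

static __always_inline int bench_alloc_free(void *outer, bool percpu)
{
	u64 start, delta;
	unsigned int cnt;
	struct bpf_map *map;
	int key;

	key = bpf_get_smp_processor_id();
	map = bpf_map_lookup_elem(outer, &key);
	if (!map)
		return 0;

	start = bpf_ktime_get_boot_ns();
	cnt = percpu ? batch_percpu_alloc(map) : batch_alloc(map);
	delta = bpf_ktime_get_boot_ns() - start;
	__sync_fetch_and_add(&alloc_cnt, cnt);
	__sync_fetch_and_add(&alloc_ns, delta);

	start = bpf_ktime_get_boot_ns();
	cnt = percpu ? batch_percpu_free(map) : batch_free(map);
	delta = bpf_ktime_get_boot_ns() - start;
	__sync_fetch_and_add(&free_cnt, cnt);
	__sync_fetch_and_add(&free_ns, delta);

	return 0;
}

SEC("?fentry/" SYS_PREFIX "sys_getpgid")
int bench_batch_alloc_free(void *ctx)
{
	return bench_alloc_free((void *)&outer_array, false);
}

SEC("?fentry/" SYS_PREFIX "sys_getpgid")
int bench_batch_percpu_alloc_free(void *ctx)
{
	return bench_alloc_free((void *)&percpu_outer_array, true);
}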

Patch

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index d5d781f5427a..05e079f2f7ee 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -692,6 +692,7 @@  $(OUTPUT)/bench_local_storage_rcu_tasks_trace.o: $(OUTPUT)/local_storage_rcu_tas
 $(OUTPUT)/bench_local_storage_create.o: $(OUTPUT)/bench_local_storage_create.skel.h
 $(OUTPUT)/bench_bpf_hashmap_lookup.o: $(OUTPUT)/bpf_hashmap_lookup.skel.h
 $(OUTPUT)/bench_htab_mem.o: $(OUTPUT)/htab_mem_bench.skel.h
+$(OUTPUT)/bench_bpf_ma.o: $(OUTPUT)/bench_bpf_ma.skel.h
 $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
 $(OUTPUT)/bench: LDLIBS += -lm
 $(OUTPUT)/bench: $(OUTPUT)/bench.o \
@@ -711,6 +712,7 @@  $(OUTPUT)/bench: $(OUTPUT)/bench.o \
 		 $(OUTPUT)/bench_bpf_hashmap_lookup.o \
 		 $(OUTPUT)/bench_local_storage_create.o \
 		 $(OUTPUT)/bench_htab_mem.o \
+		 $(OUTPUT)/bench_bpf_ma.o \
 		 #
 	$(call msg,BINARY,,$@)
 	$(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index 4832cd4b1c3d..3bb19b719ac3 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -281,6 +281,7 @@  extern struct argp bench_strncmp_argp;
 extern struct argp bench_hashmap_lookup_argp;
 extern struct argp bench_local_storage_create_argp;
 extern struct argp bench_htab_mem_argp;
+extern struct argp bench_bpf_mem_alloc_argp;
 
 static const struct argp_child bench_parsers[] = {
 	{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
@@ -293,6 +294,7 @@  static const struct argp_child bench_parsers[] = {
 	{ &bench_hashmap_lookup_argp, 0, "Hashmap lookup benchmark", 0 },
 	{ &bench_local_storage_create_argp, 0, "local-storage-create benchmark", 0 },
 	{ &bench_htab_mem_argp, 0, "hash map memory benchmark", 0 },
+	{ &bench_bpf_mem_alloc_argp, 0, "bpf memory allocator benchmark", 0 },
 	{},
 };
 
@@ -524,6 +526,7 @@  extern const struct bench bench_local_storage_tasks_trace;
 extern const struct bench bench_bpf_hashmap_lookup;
 extern const struct bench bench_local_storage_create;
 extern const struct bench bench_htab_mem;
+extern const struct bench bench_bpf_mem_alloc;
 
 static const struct bench *benchs[] = {
 	&bench_count_global,
@@ -566,6 +569,7 @@  static const struct bench *benchs[] = {
 	&bench_bpf_hashmap_lookup,
 	&bench_local_storage_create,
 	&bench_htab_mem,
+	&bench_bpf_mem_alloc,
 };
 
 static void find_benchmark(void)
diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h
index a6fcf111221f..206cf3de5df2 100644
--- a/tools/testing/selftests/bpf/bench.h
+++ b/tools/testing/selftests/bpf/bench.h
@@ -53,6 +53,13 @@  struct bench_res {
 			unsigned long gp_ct;
 			unsigned int stime;
 		} rcu;
+		struct {
+			unsigned long alloc;
+			unsigned long free;
+			unsigned long alloc_ns;
+			unsigned long free_ns;
+			unsigned long mem_bytes;
+		} ma;
 	};
 };
 
diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_ma.c b/tools/testing/selftests/bpf/benchs/bench_bpf_ma.c
new file mode 100644
index 000000000000..35d3a5c80cda
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/bench_bpf_ma.c
@@ -0,0 +1,273 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2023. Huawei Technologies Co., Ltd */
+#include <argp.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/param.h>
+#include <fcntl.h>
+
+#include "bench.h"
+#include "bpf_util.h"
+#include "bench_bpf_ma.skel.h"
+
+static struct bpf_ma_ctx {
+	struct bench_bpf_ma *skel;
+	u64 base_bytes;
+} ctx;
+
+static struct bpf_ma_args {
+	bool percpu;
+} args = {
+	.percpu = false,
+};
+
+enum {
+	ARG_PERCPU = 20000,
+};
+
+static const struct argp_option opts[] = {
+	{ "percpu", ARG_PERCPU, NULL, 0, "percpu alloc/free" },
+	{},
+};
+
+static error_t bpf_ma_parse_arg(int key, char *arg, struct argp_state *state)
+{
+	switch (key) {
+	case ARG_PERCPU:
+		args.percpu = true;
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+
+	return 0;
+}
+
+const struct argp bench_bpf_mem_alloc_argp = {
+	.options = opts,
+	.parser = bpf_ma_parse_arg,
+};
+
+static void read_field_in_mem_stat(const char *field, u64 *value)
+{
+	size_t field_len;
+	char line[256];
+	FILE *file;
+
+	*value = 0;
+
+	file = fopen("/sys/fs/cgroup/memory.stat", "r");
+	if (!file) {
+		/* cgroup v1 ? */
+		return;
+	}
+
+	field_len = strlen(field);
+	while (fgets(line, sizeof(line), file)) {
+		if (!strncmp(line, field, field_len)) {
+			*value = strtoull(line + field_len, NULL, 0);
+			break;
+		}
+	}
+
+	fclose(file);
+}
+
+static void bpf_ma_validate(void)
+{
+}
+
+static int bpf_ma_update_outer_map(void)
+{
+	LIBBPF_OPTS(bpf_map_create_opts, opts);
+	struct bpf_map *outer_map, *inner_map;
+	unsigned int i, ksize, vsize, max_nr;
+	int fd, err;
+
+	if (env.nr_cpus <= 1)
+		return 0;
+
+	fd = bpf_object__btf_fd(ctx.skel->obj);
+	if (fd < 0) {
+		fprintf(stderr, "no btf_fd error %d\n", fd);
+		return -1;
+	}
+	opts.btf_fd = fd;
+
+	inner_map = args.percpu ? ctx.skel->maps.percpu_inner_array : ctx.skel->maps.inner_array;
+	opts.btf_key_type_id = bpf_map__btf_key_type_id(inner_map);
+	opts.btf_value_type_id = bpf_map__btf_value_type_id(inner_map);
+
+	ksize = bpf_map__key_size(inner_map);
+	vsize = bpf_map__value_size(inner_map);
+	max_nr = bpf_map__max_entries(inner_map);
+
+	outer_map = args.percpu ? ctx.skel->maps.percpu_outer_array : ctx.skel->maps.outer_array;
+	for (i = 1; i < env.nr_cpus; i++) {
+		char name[32];
+
+		snprintf(name, sizeof(name), "inner_array_%u", i);
+		fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, name, ksize, vsize, max_nr, &opts);
+		if (fd < 0) {
+			fprintf(stderr, "create #%d array error %d\n", i, fd);
+			return -1;
+		}
+
+		err = bpf_map_update_elem(bpf_map__fd(outer_map), &i, &fd, 0);
+		if (err) {
+			fprintf(stderr, "add #%d array error %d\n", i, err);
+			close(fd);
+			return -1;
+		}
+		close(fd);
+	}
+
+	return 0;
+}
+
+static void bpf_ma_setup(void)
+{
+	struct bpf_program *prog;
+	struct bpf_map *outer_map;
+	int err;
+
+	setup_libbpf();
+
+	ctx.skel = bench_bpf_ma__open();
+	if (!ctx.skel) {
+		fprintf(stderr, "failed to open skeleton\n");
+		goto cleanup;
+	}
+
+	outer_map = args.percpu ? ctx.skel->maps.percpu_outer_array : ctx.skel->maps.outer_array;
+	bpf_map__set_max_entries(outer_map, env.nr_cpus);
+
+	prog = args.percpu ? ctx.skel->progs.bench_batch_percpu_alloc_free :
+			     ctx.skel->progs.bench_batch_alloc_free;
+	bpf_program__set_autoload(prog, true);
+
+	err = bench_bpf_ma__load(ctx.skel);
+	if (err) {
+		fprintf(stderr, "failed to load skeleton\n");
+		goto cleanup;
+	}
+
+	if (bpf_ma_update_outer_map())
+		goto cleanup;
+
+	err = bench_bpf_ma__attach(ctx.skel);
+	if (err) {
+		fprintf(stderr, "failed to attach skeleton\n");
+		goto cleanup;
+	}
+
+	read_field_in_mem_stat(args.percpu ? "percpu " : "slab ", &ctx.base_bytes);
+	return;
+
+cleanup:
+	bench_bpf_ma__destroy(ctx.skel);
+	exit(1);
+}
+
+static void *bpf_ma_producer(void *arg)
+{
+	while (true)
+		(void)syscall(__NR_getpgid, 0);
+	return NULL;
+}
+
+static void bpf_ma_measure(struct bench_res *res)
+{
+	u64 bytes;
+
+	res->ma.alloc = atomic_swap(&ctx.skel->bss->alloc_cnt, 0);
+	res->ma.alloc_ns = atomic_swap(&ctx.skel->bss->alloc_ns, 0);
+	res->ma.free = atomic_swap(&ctx.skel->bss->free_cnt, 0);
+	res->ma.free_ns = atomic_swap(&ctx.skel->bss->free_ns, 0);
+
+	if (args.percpu)
+		read_field_in_mem_stat("percpu ", &bytes);
+	else
+		read_field_in_mem_stat("slab ", &bytes);
+	/* Is memory reclamation in-progress ? */
+	if (bytes < ctx.base_bytes)
+		ctx.base_bytes = bytes;
+	res->ma.mem_bytes = bytes - ctx.base_bytes;
+}
+
+static void bpf_ma_report_progress(int iter, struct bench_res *res, long delta_ns)
+{
+	double alloc = 0.0, free = 0.0, mem;
+
+	if (res->ma.alloc_ns)
+		alloc = res->ma.alloc * 1000.0 / res->ma.alloc_ns;
+	if (res->ma.free_ns)
+		free = res->ma.free * 1000.0 / res->ma.free_ns;
+	mem = res->ma.mem_bytes / 1048576.0;
+
+	printf("Iter %3d (%7.3lfus): ", iter, (delta_ns - 1000000000) / 1000.0);
+	printf("per-prod alloc %7.2lfM/s free %7.2lfM/s, total memory usage %7.2lfMiB\n",
+	       alloc, free, mem);
+}
+
+static void bpf_ma_report_final(struct bench_res res[], int res_cnt)
+{
+	double mem_mean = 0.0, mem_stddev = 0.0;
+	double alloc_mean = 0.0, alloc_stddev = 0.0;
+	double free_mean = 0.0, free_stddev = 0.0;
+	double alloc_ns = 0.0, free_ns = 0.0;
+	int i;
+
+	for (i = 0; i < res_cnt; i++) {
+		alloc_ns += res[i].ma.alloc_ns;
+		free_ns += res[i].ma.free_ns;
+	}
+	for (i = 0; i < res_cnt; i++) {
+		if (alloc_ns)
+			alloc_mean += res[i].ma.alloc * 1000.0 / alloc_ns;
+		if (free_ns)
+			free_mean += res[i].ma.free * 1000.0 / free_ns;
+		mem_mean += res[i].ma.mem_bytes / 1048576.0 / (0.0 + res_cnt);
+	}
+	if (res_cnt > 1) {
+		for (i = 0; i < res_cnt; i++) {
+			double sample;
+
+			sample = res[i].ma.alloc_ns ? res[i].ma.alloc * 1000.0 /
+						      res[i].ma.alloc_ns : 0.0;
+			alloc_stddev += (alloc_mean - sample) * (alloc_mean - sample) /
+					(res_cnt - 1.0);
+
+			sample = res[i].ma.free_ns ? res[i].ma.free * 1000.0 /
+						     res[i].ma.free_ns : 0.0;
+			free_stddev += (free_mean - sample) * (free_mean - sample) /
+				       (res_cnt - 1.0);
+
+			sample = res[i].ma.mem_bytes / 1048576.0;
+			mem_stddev += (mem_mean - sample) * (mem_mean - sample) /
+				      (res_cnt - 1.0);
+		}
+		alloc_stddev = sqrt(alloc_stddev);
+		free_stddev = sqrt(free_stddev);
+		mem_stddev = sqrt(mem_stddev);
+	}
+
+	printf("Summary: per-prod alloc %7.2lf \u00B1 %3.2lfM/s free %7.2lf \u00B1 %3.2lfM/s, "
+	       "total memory usage %7.2lf \u00B1 %3.2lfMiB\n",
+	       alloc_mean, alloc_stddev, free_mean, free_stddev,
+	       mem_mean, mem_stddev);
+}
+
+const struct bench bench_bpf_mem_alloc = {
+	.name = "bpf_ma",
+	.argp = &bench_bpf_mem_alloc_argp,
+	.validate = bpf_ma_validate,
+	.setup = bpf_ma_setup,
+	.producer_thread = bpf_ma_producer,
+	.measure = bpf_ma_measure,
+	.report_progress = bpf_ma_report_progress,
+	.report_final = bpf_ma_report_final,
+};
diff --git a/tools/testing/selftests/bpf/progs/bench_bpf_ma.c b/tools/testing/selftests/bpf/progs/bench_bpf_ma.c
new file mode 100644
index 000000000000..d936fd6a76b8
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bench_bpf_ma.c
@@ -0,0 +1,222 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2023. Huawei Technologies Co., Ltd */
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+
+#include "bpf_experimental.h"
+#include "bpf_misc.h"
+
+#define ALLOC_OBJ_SIZE 128
+#define ALLOC_BATCH_CNT 64
+
+char _license[] SEC("license") = "GPL";
+
+long alloc_cnt = 0, free_cnt = 0;
+long alloc_ns = 0, free_ns = 0;
+
+struct bin_data {
+	char data[ALLOC_OBJ_SIZE - sizeof(void *)];
+};
+
+struct percpu_bin_data {
+	char data[ALLOC_OBJ_SIZE - sizeof(void *)];
+};
+
+struct percpu_map_value {
+	struct percpu_bin_data __percpu_kptr * data;
+};
+
+struct map_value {
+	struct bin_data __kptr * data;
+};
+
+struct inner_array_type {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, int);
+	__type(value, struct map_value);
+	__uint(max_entries, ALLOC_BATCH_CNT);
+} inner_array SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+	__uint(key_size, 4);
+	__uint(value_size, 4);
+	/* benchmark will update max_entries accordingly */
+	__uint(max_entries, 1);
+	__array(values, struct inner_array_type);
+} outer_array SEC(".maps") = {
+	.values = {
+		[0] = &inner_array,
+	},
+};
+
+struct percpu_inner_array_type {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, int);
+	__type(value, struct percpu_map_value);
+	__uint(max_entries, ALLOC_BATCH_CNT);
+} percpu_inner_array SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+	__uint(key_size, 4);
+	__uint(value_size, 4);
+	/* benchmark will update max_entries accordingly */
+	__uint(max_entries, 1);
+	__array(values, struct percpu_inner_array_type);
+} percpu_outer_array SEC(".maps") = {
+	.values = {
+		[0] = &percpu_inner_array,
+	},
+};
+
+/* Return the number of allocated objects */
+static __always_inline unsigned int batch_alloc(struct bpf_map *map)
+{
+	struct bin_data *old, *new;
+	struct map_value *value;
+	unsigned int i, key;
+
+	for (i = 0; i < ALLOC_BATCH_CNT; i++) {
+		key = i;
+		value = bpf_map_lookup_elem(map, &key);
+		if (!value)
+			return i;
+
+		new = bpf_obj_new(typeof(*new));
+		if (!new)
+			return i;
+
+		old = bpf_kptr_xchg(&value->data, new);
+		if (old)
+			bpf_obj_drop(old);
+	}
+
+	return ALLOC_BATCH_CNT;
+}
+
+/* Return the number of freed objects */
+static __always_inline unsigned int batch_free(struct bpf_map *map)
+{
+	struct map_value *value;
+	unsigned int i, key;
+	void *old;
+
+	for (i = 0; i < ALLOC_BATCH_CNT; i++) {
+		key = i;
+		value = bpf_map_lookup_elem(map, &key);
+		if (!value)
+			return i;
+
+		old = bpf_kptr_xchg(&value->data, NULL);
+		if (!old)
+			return i;
+		bpf_obj_drop(old);
+	}
+
+	return ALLOC_BATCH_CNT;
+}
+
+/* Return the number of allocated objects */
+static __always_inline unsigned int batch_percpu_alloc(struct bpf_map *map)
+{
+	struct percpu_bin_data *old, *new;
+	struct percpu_map_value *value;
+	unsigned int i, key;
+
+	for (i = 0; i < ALLOC_BATCH_CNT; i++) {
+		key = i;
+		value = bpf_map_lookup_elem(map, &key);
+		if (!value)
+			return i;
+
+		new = bpf_percpu_obj_new(typeof(*new));
+		if (!new)
+			return i;
+
+		old = bpf_kptr_xchg(&value->data, new);
+		if (old)
+			bpf_percpu_obj_drop(old);
+	}
+
+	return ALLOC_BATCH_CNT;
+}
+
+/* Return the number of freed objects */
+static __always_inline unsigned int batch_percpu_free(struct bpf_map *map)
+{
+	struct percpu_map_value *value;
+	unsigned int i, key;
+	void *old;
+
+	for (i = 0; i < ALLOC_BATCH_CNT; i++) {
+		key = i;
+		value = bpf_map_lookup_elem(map, &key);
+		if (!value)
+			return i;
+
+		old = bpf_kptr_xchg(&value->data, NULL);
+		if (!old)
+			return i;
+		bpf_percpu_obj_drop(old);
+	}
+
+	return ALLOC_BATCH_CNT;
+}
+
+SEC("?fentry/" SYS_PREFIX "sys_getpgid")
+int bench_batch_alloc_free(void *ctx)
+{
+	u64 start, delta;
+	unsigned int cnt;
+	void *map;
+	int key;
+
+	key = bpf_get_smp_processor_id();
+	map = bpf_map_lookup_elem((void *)&outer_array, &key);
+	if (!map)
+		return 0;
+
+	start = bpf_ktime_get_boot_ns();
+	cnt = batch_alloc(map);
+	delta = bpf_ktime_get_boot_ns() - start;
+	__sync_fetch_and_add(&alloc_cnt, cnt);
+	__sync_fetch_and_add(&alloc_ns, delta);
+
+	start = bpf_ktime_get_boot_ns();
+	cnt = batch_free(map);
+	delta = bpf_ktime_get_boot_ns() - start;
+	__sync_fetch_and_add(&free_cnt, cnt);
+	__sync_fetch_and_add(&free_ns, delta);
+
+	return 0;
+}
+
+SEC("?fentry/" SYS_PREFIX "sys_getpgid")
+int bench_batch_percpu_alloc_free(void *ctx)
+{
+	u64 start, delta;
+	unsigned int cnt;
+	void *map;
+	int key;
+
+	key = bpf_get_smp_processor_id();
+	map = bpf_map_lookup_elem((void *)&percpu_outer_array, &key);
+	if (!map)
+		return 0;
+
+	start = bpf_ktime_get_boot_ns();
+	cnt = batch_percpu_alloc(map);
+	delta = bpf_ktime_get_boot_ns() - start;
+	__sync_fetch_and_add(&alloc_cnt, cnt);
+	__sync_fetch_and_add(&alloc_ns, delta);
+
+	start = bpf_ktime_get_boot_ns();
+	cnt = batch_percpu_free(map);
+	delta = bpf_ktime_get_boot_ns() - start;
+	__sync_fetch_and_add(&free_cnt, cnt);
+	__sync_fetch_and_add(&free_ns, delta);
+
+	return 0;
+}