
[v2,bpf-next,2/2] bpf: inline bpf_get_branch_snapshot() helper

Message ID 20240402190542.757858-3-andrii@kernel.org (mailing list archive)
State Superseded
Delegated to: BPF
Series Inline bpf_get_branch_snapshot() BPF helper

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 955 this patch: 955
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 9 maintainers not CCed: john.fastabend@gmail.com sdf@google.com kpsingh@kernel.org martin.lau@linux.dev yonghong.song@linux.dev haoluo@google.com jolsa@kernel.org song@kernel.org eddyz87@gmail.com
netdev/build_clang success Errors and warnings before: 955 this patch: 955
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 966 this patch: 966
netdev/checkpatch warning CHECK: multiple assignments should be avoided
    WARNING: line length of 102 exceeds 80 columns
    WARNING: line length of 81 exceeds 80 columns
    WARNING: line length of 82 exceeds 80 columns
    WARNING: line length of 83 exceeds 80 columns
    WARNING: line length of 84 exceeds 80 columns
    WARNING: line length of 86 exceeds 80 columns
    WARNING: line length of 87 exceeds 80 columns
    WARNING: line length of 89 exceeds 80 columns
    WARNING: line length of 90 exceeds 80 columns
    WARNING: line length of 91 exceeds 80 columns
    WARNING: line length of 93 exceeds 80 columns
    WARNING: line length of 99 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Andrii Nakryiko April 2, 2024, 7:05 p.m. UTC
Inline bpf_get_branch_snapshot() helper using architecture-agnostic
inline BPF code which calls directly into underlying callback of
perf_snapshot_branch_stack static call. This callback is set early
during kernel initialization and is never updated or reset, so it's ok
to fetch actual implementation using static_call_query() and call
directly into it.

This change eliminates a full function call and saves one LBR entry
in PERF_SAMPLE_BRANCH_ANY LBR mode.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 kernel/bpf/verifier.c | 55 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
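
[Editor's note] For readers following the inlined sequence, here is the
equivalent logic expressed as plain C. This is a minimal standalone sketch,
not kernel code; snapshot_fn() is a hypothetical stand-in for whatever
implementation static_call_query(perf_snapshot_branch_stack) returns.

#include <errno.h>
#include <stdint.h>

typedef uint32_t u32;
typedef uint64_t u64;

/* hypothetical stand-in for the queried static call target;
 * returns the number of branch entries actually captured
 */
extern u32 snapshot_fn(void *entries, u32 cnt);

static long inlined_get_branch_snapshot(void *buf, u32 size, u64 flags)
{
	u32 cnt;

	if (flags)				/* insn_buf[0], [8] */
		return -EINVAL;
	/* cnt = size / 24 via multiply + 36-bit shift (insn_buf[1..3]) */
	cnt = ((u64)size * 0xaaaaaaabULL) >> 36;
	cnt = snapshot_fn(buf, cnt);		/* insn_buf[4] */
	if (cnt == 0)				/* insn_buf[5], [10] */
		return -ENOENT;
	return cnt * 24;			/* insn_buf[6]: bytes filled */
}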

Comments

Andrii Nakryiko April 2, 2024, 11:22 p.m. UTC | #1
On Tue, Apr 2, 2024 at 12:05 PM Andrii Nakryiko <andrii@kernel.org> wrote:
>
> Inline bpf_get_branch_snapshot() helper using architecture-agnostic
> inline BPF code which calls directly into underlying callback of
> perf_snapshot_branch_stack static call. This callback is set early
> during kernel initialization and is never updated or reset, so it's ok
> to fetch actual implementation using static_call_query() and call
> directly into it.
>
> This change eliminates a full function call and saves one LBR entry
> in PERF_SAMPLE_BRANCH_ANY LBR mode.
>
> Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> ---
>  kernel/bpf/verifier.c | 55 +++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 55 insertions(+)
>
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index fcb62300f407..49789da56f4b 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -20157,6 +20157,61 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>                         goto next_insn;
>                 }
>
> +               /* Implement bpf_get_branch_snapshot inline. */
> +               if (prog->jit_requested && BITS_PER_LONG == 64 &&
> +                   insn->imm == BPF_FUNC_get_branch_snapshot) {
> +                       /* We are dealing with the following func protos:
> +                        * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags);
> +                        * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt);
> +                        */
> +                       const u32 br_entry_size = sizeof(struct perf_branch_entry);
> +
> +                       /* struct perf_branch_entry is part of UAPI and is
> +                        * used as an array element, so extremely unlikely to
> +                        * ever grow or shrink
> +                        */
> +                       BUILD_BUG_ON(br_entry_size != 24);
> +
> +                       /* if (unlikely(flags)) return -EINVAL */
> +                       insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7);
> +
> +                       /* Transform size (bytes) into number of entries (cnt = size / 24).
> +                        * But to avoid expensive division instruction, we implement
> +                        * divide-by-3 through multiplication, followed by further
> +                        * division by 8 through 3-bit right shift.
> +                        * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr.,
> +                        * p. 227, chapter "Unsigned Division by 3" for details and proofs.
> +                        *
> +                        * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab.
> +                        */
> +                       insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab);
> +                       insn_buf[2] = BPF_ALU64_REG(BPF_REG_2, BPF_REG_0, 0);

Doh, this should be:

insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0);

I'll wait a bit for any other feedback, will retest everything on real
hardware again, and will submit v2 tomorrow.
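
[Editor's note] For context on why the argument order matters:
BPF_ALU64_REG() takes the ALU opcode as its first argument, as defined in
include/linux/filter.h:

#define BPF_ALU64_REG(OP, DST, SRC)				\
	((struct bpf_insn) {					\
		.code  = BPF_ALU64 | BPF_OP(OP) | BPF_X,	\
		.dst_reg = DST,					\
		.src_reg = SRC,					\
		.off   = 0,					\
		.imm   = 0 })

With BPF_REG_2 (value 2) in the OP slot, BPF_OP(2) masks to 0, which is
BPF_ADD, so the buggy line encodes "r0 += r0" instead of the intended
64-bit multiply into r2.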

pw-bot: cr


> +                       insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36);
> +
> +                       /* call perf_snapshot_branch_stack implementation */
> +                       insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack));
> +                       /* if (entry_cnt == 0) return -ENOENT */
> +                       insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4);
> +                       /* return entry_cnt * sizeof(struct perf_branch_entry) */
> +                       insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size);
> +                       insn_buf[7] = BPF_JMP_A(3);
> +                       /* return -EINVAL; */
> +                       insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
> +                       insn_buf[9] = BPF_JMP_A(1);
> +                       /* return -ENOENT; */
> +                       insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT);
> +                       cnt = 11;
> +
> +                       new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
> +                       if (!new_prog)
> +                               return -ENOMEM;
> +
> +                       delta    += cnt - 1;
> +                       env->prog = prog = new_prog;
> +                       insn      = new_prog->insnsi + i + delta;
> +                       continue;
> +               }
> +
>                 /* Implement bpf_kptr_xchg inline */
>                 if (prog->jit_requested && BITS_PER_LONG == 64 &&
>                     insn->imm == BPF_FUNC_kptr_xchg &&
> --
> 2.43.0
>
Alexei Starovoitov April 3, 2024, 5:51 p.m. UTC | #2
On Tue, Apr 2, 2024 at 12:05 PM Andrii Nakryiko <andrii@kernel.org> wrote:
>
> Inline bpf_get_branch_snapshot() helper using architecture-agnostic
> inline BPF code which calls directly into underlying callback of
> perf_snapshot_branch_stack static call. This callback is set early
> during kernel initialization and is never updated or reset, so it's ok
> to fetch actual implementation using static_call_query() and call
> directly into it.
>
> This change eliminates a full function call and saves one LBR entry
> in PERF_SAMPLE_BRANCH_ANY LBR mode.
>
> Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> ---
>  kernel/bpf/verifier.c | 55 +++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 55 insertions(+)
>
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index fcb62300f407..49789da56f4b 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -20157,6 +20157,61 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>                         goto next_insn;
>                 }
>
> +               /* Implement bpf_get_branch_snapshot inline. */
> +               if (prog->jit_requested && BITS_PER_LONG == 64 &&
> +                   insn->imm == BPF_FUNC_get_branch_snapshot) {
> +                       /* We are dealing with the following func protos:
> +                        * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags);
> +                        * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt);
> +                        */
> +                       const u32 br_entry_size = sizeof(struct perf_branch_entry);
> +
> +                       /* struct perf_branch_entry is part of UAPI and is
> +                        * used as an array element, so extremely unlikely to
> +                        * ever grow or shrink
> +                        */
> +                       BUILD_BUG_ON(br_entry_size != 24);
> +
> +                       /* if (unlikely(flags)) return -EINVAL */
> +                       insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7);

optimizing this out with is_reg_const(R3) is probably overkill?
Just mentioning in case you need to skip this branch.
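
[Editor's note] A rough sketch of what that could look like. This is
hypothetical: it assumes the verifier recorded, in a made-up insn_aux_data
field, whether R3 was a known constant zero at this call site; the field
name here is illustrative, not existing API.

	/* hypothetical: set during verification if R3 == const 0 */
	bool flags_known_zero = env->insn_aux_data[i + delta].flags_known_zero;
	int n = 0;

	/* emit the flags check only when R3 isn't provably zero */
	if (!flags_known_zero)
		insn_buf[n++] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7);
	/* ... emit the rest of the sequence, adjusting jump offsets by n ... */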

> +
> +                       /* Transform size (bytes) into number of entries (cnt = size / 24).
> +                        * But to avoid expensive division instruction, we implement
> +                        * divide-by-3 through multiplication, followed by further
> +                        * division by 8 through 3-bit right shift.
> +                        * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr.,
> +                        * p. 227, chapter "Unsigned Division by 3" for details and proofs.
> +                        *
> +                        * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab.
> +                        */
> +                       insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab);
> +                       insn_buf[2] = BPF_ALU64_REG(BPF_REG_2, BPF_REG_0, 0);
> +                       insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36);
> +
> +                       /* call perf_snapshot_branch_stack implementation */
> +                       insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack));
> +                       /* if (entry_cnt == 0) return -ENOENT */
> +                       insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4);
> +                       /* return entry_cnt * sizeof(struct perf_branch_entry) */
> +                       insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size);
> +                       insn_buf[7] = BPF_JMP_A(3);
> +                       /* return -EINVAL; */
> +                       insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
> +                       insn_buf[9] = BPF_JMP_A(1);
> +                       /* return -ENOENT; */
> +                       insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT);
> +                       cnt = 11;
> +
> +                       new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
> +                       if (!new_prog)
> +                               return -ENOMEM;
> +
> +                       delta    += cnt - 1;
> +                       env->prog = prog = new_prog;
> +                       insn      = new_prog->insnsi + i + delta;
> +                       continue;
> +               }
> +
>                 /* Implement bpf_kptr_xchg inline */
>                 if (prog->jit_requested && BITS_PER_LONG == 64 &&
>                     insn->imm == BPF_FUNC_kptr_xchg &&
> --
> 2.43.0
>
Andrii Nakryiko April 3, 2024, 6:09 p.m. UTC | #3
On Wed, Apr 3, 2024 at 10:51 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Apr 2, 2024 at 12:05 PM Andrii Nakryiko <andrii@kernel.org> wrote:
> >
> > Inline bpf_get_branch_snapshot() helper using architecture-agnostic
> > inline BPF code which calls directly into underlying callback of
> > perf_snapshot_branch_stack static call. This callback is set early
> > during kernel initialization and is never updated or reset, so it's ok
> > to fetch actual implementation using static_call_query() and call
> > directly into it.
> >
> > This change eliminates a full function call and saves one LBR entry
> > in PERF_SAMPLE_BRANCH_ANY LBR mode.
> >
> > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > ---
> >  kernel/bpf/verifier.c | 55 +++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 55 insertions(+)
> >
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index fcb62300f407..49789da56f4b 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -20157,6 +20157,61 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
> >                         goto next_insn;
> >                 }
> >
> > +               /* Implement bpf_get_branch_snapshot inline. */
> > +               if (prog->jit_requested && BITS_PER_LONG == 64 &&
> > +                   insn->imm == BPF_FUNC_get_branch_snapshot) {
> > +                       /* We are dealing with the following func protos:
> > +                        * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags);
> > +                        * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt);
> > +                        */
> > +                       const u32 br_entry_size = sizeof(struct perf_branch_entry);
> > +
> > +                       /* struct perf_branch_entry is part of UAPI and is
> > +                        * used as an array element, so extremely unlikely to
> > +                        * ever grow or shrink
> > +                        */
> > +                       BUILD_BUG_ON(br_entry_size != 24);
> > +
> > +                       /* if (unlikely(flags)) return -EINVAL */
> > +                       insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7);
>
> optimizing this out with is_reg_const(R3) is probably overkill?
> Just mentioning in case you need to skip this branch.

probably overkill; we'd need to make sure that every code path has R3
as a constant zero.

Also note that these not-taken branches don't pollute even the most
detailed PERF_SAMPLE_BRANCH_ANY mode: the CPU only records non-linear
code execution changes, so it's fine from my perspective.

>
> > +
> > +                       /* Transform size (bytes) into number of entries (cnt = size / 24).
> > +                        * But to avoid expensive division instruction, we implement
> > +                        * divide-by-3 through multiplication, followed by further
> > +                        * division by 8 through 3-bit right shift.
> > +                        * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr.,
> > +                        * p. 227, chapter "Unsigned Division by 3" for details and proofs.
> > +                        *
> > +                        * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab.
> > +                        */
> > +                       insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab);
> > +                       insn_buf[2] = BPF_ALU64_REG(BPF_REG_2, BPF_REG_0, 0);
> > +                       insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36);
> > +
> > +                       /* call perf_snapshot_branch_stack implementation */
> > +                       insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack));
> > +                       /* if (entry_cnt == 0) return -ENOENT */
> > +                       insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4);
> > +                       /* return entry_cnt * sizeof(struct perf_branch_entry) */
> > +                       insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size);
> > +                       insn_buf[7] = BPF_JMP_A(3);
> > +                       /* return -EINVAL; */
> > +                       insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
> > +                       insn_buf[9] = BPF_JMP_A(1);
> > +                       /* return -ENOENT; */
> > +                       insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT);
> > +                       cnt = 11;
> > +
> > +                       new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
> > +                       if (!new_prog)
> > +                               return -ENOMEM;
> > +
> > +                       delta    += cnt - 1;
> > +                       env->prog = prog = new_prog;
> > +                       insn      = new_prog->insnsi + i + delta;
> > +                       continue;
> > +               }
> > +
> >                 /* Implement bpf_kptr_xchg inline */
> >                 if (prog->jit_requested && BITS_PER_LONG == 64 &&
> >                     insn->imm == BPF_FUNC_kptr_xchg &&
> > --
> > 2.43.0
> >
John Fastabend April 3, 2024, 10:10 p.m. UTC | #4
Andrii Nakryiko wrote:
> On Tue, Apr 2, 2024 at 12:05 PM Andrii Nakryiko <andrii@kernel.org> wrote:
> >
> > Inline bpf_get_branch_snapshot() helper using architecture-agnostic
> > inline BPF code which calls directly into underlying callback of
> > perf_snapshot_branch_stack static call. This callback is set early
> > during kernel initialization and is never updated or reset, so it's ok
> > to fetch actual implementation using static_call_query() and call
> > directly into it.
> >
> > This change eliminates a full function call and saves one LBR entry
> > in PERF_SAMPLE_BRANCH_ANY LBR mode.
> >
> > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > ---
> >  kernel/bpf/verifier.c | 55 +++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 55 insertions(+)
> >
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index fcb62300f407..49789da56f4b 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -20157,6 +20157,61 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
> >                         goto next_insn;
> >                 }
> >
> > +               /* Implement bpf_get_branch_snapshot inline. */
> > +               if (prog->jit_requested && BITS_PER_LONG == 64 &&
> > +                   insn->imm == BPF_FUNC_get_branch_snapshot) {
> > +                       /* We are dealing with the following func protos:
> > +                        * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags);
> > +                        * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt);
> > +                        */
> > +                       const u32 br_entry_size = sizeof(struct perf_branch_entry);
> > +
> > +                       /* struct perf_branch_entry is part of UAPI and is
> > +                        * used as an array element, so extremely unlikely to
> > +                        * ever grow or shrink
> > +                        */
> > +                       BUILD_BUG_ON(br_entry_size != 24);
> > +
> > +                       /* if (unlikely(flags)) return -EINVAL */
> > +                       insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7);
> > +
> > +                       /* Transform size (bytes) into number of entries (cnt = size / 24).
> > +                        * But to avoid expensive division instruction, we implement
> > +                        * divide-by-3 through multiplication, followed by further
> > +                        * division by 8 through 3-bit right shift.
> > +                        * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr.,
> > +                        * p. 227, chapter "Unsigned Division by 3" for details and proofs.
> > +                        *
> > +                        * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab.
> > +                        */

Nice bit of magic. Thanks for the reference.
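
[Editor's note] A worked example of the arithmetic: for size = 192 bytes,
192 * 0xaaaaaaab = 549755813952, and 549755813952 >> 36 = 8, matching
192 / 24 = 8 entries.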

> > +                       insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab);
> > +                       insn_buf[2] = BPF_ALU64_REG(BPF_REG_2, BPF_REG_0, 0);
> 
> Doh, this should be:
> 
> insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0);
> 
> I'll wait a bit for any other feedback, will retest everything on real
> hardware again, and will submit v2 tomorrow.
> 
> pw-bot: cr
> 

LGTM. With above fix,

Acked-by: John Fastabend <john.fastabend@gmail.com>

> 
> > +                       insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36);
> > +
> > +                       /* call perf_snapshot_branch_stack implementation */
> > +                       insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack));
> > +                       /* if (entry_cnt == 0) return -ENOENT */
> > +                       insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4);
> > +                       /* return entry_cnt * sizeof(struct perf_branch_entry) */
> > +                       insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size);
> > +                       insn_buf[7] = BPF_JMP_A(3);
> > +                       /* return -EINVAL; */
> > +                       insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
> > +                       insn_buf[9] = BPF_JMP_A(1);
> > +                       /* return -ENOENT; */
> > +                       insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT);
> > +                       cnt = 11;
> > +
> > +                       new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
> > +                       if (!new_prog)
> > +                               return -ENOMEM;
> > +
> > +                       delta    += cnt - 1;
> > +                       env->prog = prog = new_prog;
> > +                       insn      = new_prog->insnsi + i + delta;
> > +                       continue;
> > +               }
> > +
> >                 /* Implement bpf_kptr_xchg inline */
> >                 if (prog->jit_requested && BITS_PER_LONG == 64 &&
> >                     insn->imm == BPF_FUNC_kptr_xchg &&
> > --
> > 2.43.0
> >
>

Patch

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index fcb62300f407..49789da56f4b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -20157,6 +20157,61 @@  static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto next_insn;
 		}
 
+		/* Implement bpf_get_branch_snapshot inline. */
+		if (prog->jit_requested && BITS_PER_LONG == 64 &&
+		    insn->imm == BPF_FUNC_get_branch_snapshot) {
+			/* We are dealing with the following func protos:
+			 * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags);
+			 * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt);
+			 */
+			const u32 br_entry_size = sizeof(struct perf_branch_entry);
+
+			/* struct perf_branch_entry is part of UAPI and is
+			 * used as an array element, so extremely unlikely to
+			 * ever grow or shrink
+			 */
+			BUILD_BUG_ON(br_entry_size != 24);
+
+			/* if (unlikely(flags)) return -EINVAL */
+			insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7);
+
+			/* Transform size (bytes) into number of entries (cnt = size / 24).
+			 * But to avoid expensive division instruction, we implement
+			 * divide-by-3 through multiplication, followed by further
+			 * division by 8 through 3-bit right shift.
+			 * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr.,
+			 * p. 227, chapter "Unsigned Division by 3" for details and proofs.
+			 *
+			 * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab.
+			 */
+			insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab);
+			insn_buf[2] = BPF_ALU64_REG(BPF_REG_2, BPF_REG_0, 0);
+			insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36);
+
+			/* call perf_snapshot_branch_stack implementation */
+			insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack));
+			/* if (entry_cnt == 0) return -ENOENT */
+			insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4);
+			/* return entry_cnt * sizeof(struct perf_branch_entry) */
+			insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size);
+			insn_buf[7] = BPF_JMP_A(3);
+			/* return -EINVAL; */
+			insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
+			insn_buf[9] = BPF_JMP_A(1);
+			/* return -ENOENT; */
+			insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT);
+			cnt = 11;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta    += cnt - 1;
+			env->prog = prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			continue;
+		}
+
 		/* Implement bpf_kptr_xchg inline */
 		if (prog->jit_requested && BITS_PER_LONG == 64 &&
 		    insn->imm == BPF_FUNC_kptr_xchg &&
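
[Editor's note] As a sanity check on the magic-number arithmetic above, the
identity (n * 0xaaaaaaab) >> 36 == n / 24 can be verified exhaustively for
all 32-bit n with a standalone userspace program (a sketch, not kernel code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t n;

	/* check the multiply+shift against real division by 24
	 * (sizeof(struct perf_branch_entry)) for every u32 value
	 */
	for (n = 0; n <= UINT32_MAX; n++)
		assert((n * 0xaaaaaaabULL) >> 36 == n / 24);
	printf("divide-by-24 via multiply+shift: OK for all u32\n");
	return 0;
}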