[bpf-next,v4,09/10] bpf, x86: Jit support for nested bpf_prog_call

Message ID 20241010175638.1899406-1-yonghong.song@linux.dev (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series bpf: Support private stack for bpf progs | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 fail Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 fail Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 fail Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-15 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-16 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 fail Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-PR fail PR summary
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 206 this patch: 206
netdev/build_tools success Errors and warnings before: 0 (+1) this patch: 0 (+1)
netdev/cc_maintainers warning 16 maintainers not CCed: dave.hansen@linux.intel.com john.fastabend@gmail.com dsahern@kernel.org jolsa@kernel.org x86@kernel.org kpsingh@kernel.org song@kernel.org haoluo@google.com bp@alien8.de netdev@vger.kernel.org sdf@fomichev.me martin.lau@linux.dev hpa@zytor.com tglx@linutronix.de eddyz87@gmail.com mingo@redhat.com
netdev/build_clang success Errors and warnings before: 257 this patch: 257
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 6964 this patch: 6964
netdev/checkpatch warning CHECK: No space is necessary after a cast; CHECK: Unnecessary parentheses around prog->active; WARNING: line length of 84 exceeds 80 columns; WARNING: line length of 87 exceeds 80 columns; WARNING: line length of 88 exceeds 80 columns; WARNING: line length of 89 exceeds 80 columns; WARNING: line length of 90 exceeds 80 columns; WARNING: line length of 95 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 6 this patch: 6
netdev/source_inline success Was 0 now: 0

Commit Message

Yonghong Song Oct. 10, 2024, 5:56 p.m. UTC
Two functions are added to the kernel:
  - int notrace __bpf_prog_enter_recur_limited(struct bpf_prog *prog)
  - void notrace __bpf_prog_exit_recur_limited(struct bpf_prog *prog)
They are called from bpf progs through jited code.

__bpf_prog_enter_recur_limited() returns 0 if the maximum recursion
level has been reached, in which case the bpf prog returns to its caller
directly. Otherwise, it returns the current recursion level. The jit uses
the recursion level to calculate the proper frame pointer for that
recursion level.
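
In C-like terms, the frame-pointer derivation the jit performs from the
returned level amounts to the sketch below. priv_stack_base() is a
hypothetical stand-in for however the per-cpu private stack is looked up;
stack_depth and subtree_stack_depth are the fields used in this series.

	/* Sketch only: compute r9 for recursion level cnt (cnt >= 1). */
	static u8 *priv_frame_ptr(struct bpf_prog *prog, int cnt)
	{
		u32 depth = round_up(prog->aux->stack_depth, 8);
		/* each outer recursion level owns one subtree's worth of stack */
		u64 accum = (u64)(cnt - 1) * prog->aux->subtree_stack_depth;

		return priv_stack_base(prog) + depth + accum;
	}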

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
 arch/x86/net/bpf_jit_comp.c | 94 +++++++++++++++++++++++++++++++++----
 include/linux/bpf.h         |  2 +
 kernel/bpf/trampoline.c     | 16 +++++++
 3 files changed, 104 insertions(+), 8 deletions(-)

Comments

Alexei Starovoitov Oct. 10, 2024, 8:53 p.m. UTC | #1
On Thu, Oct 10, 2024 at 10:59 AM Yonghong Song <yonghong.song@linux.dev> wrote:
>
>  static void emit_priv_frame_ptr(u8 **pprog, struct bpf_prog *bpf_prog,
> -                               enum bpf_priv_stack_mode priv_stack_mode)
> +                               enum bpf_priv_stack_mode priv_stack_mode,
> +                               bool is_subprog, u8 *image, u8 *temp)
>  {
>         u32 orig_stack_depth = round_up(bpf_prog->aux->stack_depth, 8);
>         u8 *prog = *pprog;
>
> -       if (priv_stack_mode == PRIV_STACK_ROOT_PROG)
> -               emit_root_priv_frame_ptr(&prog, bpf_prog, orig_stack_depth);
> -       else if (priv_stack_mode == PRIV_STACK_SUB_PROG && orig_stack_depth)
> +       if (priv_stack_mode == PRIV_STACK_ROOT_PROG) {
> +               int offs;
> +               u8 *func;
> +
> +               if (!bpf_prog->aux->has_prog_call) {
> +                       emit_root_priv_frame_ptr(&prog, bpf_prog, orig_stack_depth);
> +               } else {
> +                       EMIT1(0x57);            /* push rdi */
> +                       if (is_subprog) {
> +                               /* subprog may have up to 5 arguments */
> +                               EMIT1(0x56);            /* push rsi */
> +                               EMIT1(0x52);            /* push rdx */
> +                               EMIT1(0x51);            /* push rcx */
> +                               EMIT2(0x41, 0x50);      /* push r8 */
> +                       }
> +                       emit_mov_imm64(&prog, BPF_REG_1, (long) bpf_prog >> 32,
> +                                      (u32) (long) bpf_prog);
> +                       func = (u8 *)__bpf_prog_enter_recur_limited;
> +                       offs = prog - temp;
> +                       offs += x86_call_depth_emit_accounting(&prog, func, image + offs);
> +                       emit_call(&prog, func, image + offs);
> +                       if (is_subprog) {
> +                               EMIT2(0x41, 0x58);      /* pop r8 */
> +                               EMIT1(0x59);            /* pop rcx */
> +                               EMIT1(0x5a);            /* pop rdx */
> +                               EMIT1(0x5e);            /* pop rsi */
> +                       }
> +                       EMIT1(0x5f);            /* pop rdi */
> +
> +                       EMIT4(0x48, 0x83, 0xf8, 0x0);   /* cmp rax,0x0 */
> +                       EMIT2(X86_JNE, num_bytes_of_emit_return() + 1);
> +
> +                       /* return if stack recursion has been reached */
> +                       EMIT1(0xC9);    /* leave */
> +                       emit_return(&prog, image + (prog - temp));
> +
> +                       /* cnt -= 1 */
> +                       emit_alu_helper_1(&prog, BPF_ALU64 | BPF_SUB | BPF_K,
> +                                         BPF_REG_0, 1);
> +
> +                       /* accum_stack_depth = cnt * subtree_stack_depth */
> +                       emit_alu_helper_3(&prog, BPF_ALU64 | BPF_MUL | BPF_K, BPF_REG_0,
> +                                         bpf_prog->aux->subtree_stack_depth);
> +
> +                       emit_root_priv_frame_ptr(&prog, bpf_prog, orig_stack_depth);
> +
> +                       /* r9 += accum_stack_depth */
> +                       emit_alu_helper_2(&prog, BPF_ALU64 | BPF_ADD | BPF_X, X86_REG_R9,
> +                                         BPF_REG_0);

That's way too much asm for logic that can stay in C.

bpf_trampoline_enter() should select __bpf_prog_enter_recur_limited()
for appropriate prog_type/attach_type/etc.

JITs don't need to change.
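
[For reference, a rough sketch of this suggestion, not code from the
series: prog_uses_priv_stack() is a hypothetical predicate, and the
returned handler would need to match the bpf_trampoline_enter_t
signature shown in the patch below.]

	bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog)
	{
		/* select the recursion-limited handler at trampoline setup
		 * time, keyed off prog type/attach type, so jits stay as-is */
		if (prog_uses_priv_stack(prog))
			return __bpf_prog_enter_recur_limited_tramp;	/* hypothetical wrapper */
		/* ... existing recur/sleepable handler selection ... */
	}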
Yonghong Song Oct. 11, 2024, 4:20 a.m. UTC | #2
On 10/10/24 1:53 PM, Alexei Starovoitov wrote:
> On Thu, Oct 10, 2024 at 10:59 AM Yonghong Song <yonghong.song@linux.dev> wrote:
>> [ ... quoted emit_priv_frame_ptr() hunk trimmed; see the full patch below ... ]
> That's way too much asm for logic that can stay in C.
>
> bpf_trampoline_enter() should select __bpf_prog_enter_recur_limited()
> for appropriate prog_type/attach_type/etc.

The above jit code is emitted not just for the main prog, but also for
callback fn's, since a callback fn could call a bpf prog as well. So putting
the logic in the bpf trampoline is not enough.

But I can improve the above by moving most of the logic
    cnt -= 1; accum_stack_depth = cnt * subtree_stack_depth; r9 += accum_stack_depth
inside __bpf_prog_enter_recur_limited(), as sketched below.
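
[A sketch of that refactoring, assuming the helper is changed to return the
accumulated stack offset, with a negative value signalling the nesting
limit, so a plain sign test replaces the cmp/mul sequence in the jit:]

	int notrace __bpf_prog_enter_recur_limited(struct bpf_prog *prog)
	{
		int cnt = this_cpu_inc_return(*(prog->active));

		if (cnt > BPF_MAX_PRIV_STACK_NEST_LEVEL) {
			bpf_prog_inc_misses_counter(prog);
			return -1;	/* jited code returns to the caller */
		}
		/* stack already consumed by outer recursion levels;
		 * the jit just adds this to r9 */
		return (cnt - 1) * prog->aux->subtree_stack_depth;
	}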

>
> JITs don't need to change.
Alexei Starovoitov Oct. 11, 2024, 4:29 a.m. UTC | #3
On Thu, Oct 10, 2024 at 9:21 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
>
> On 10/10/24 1:53 PM, Alexei Starovoitov wrote:
> > On Thu, Oct 10, 2024 at 10:59 AM Yonghong Song <yonghong.song@linux.dev> wrote:
> >> [ ... quoted emit_priv_frame_ptr() hunk trimmed; see the full patch below ... ]
> > That's way too much asm for logic that can stay in C.
> >
> > bpf_trampoline_enter() should select __bpf_prog_enter_recur_limited()
> > for appropriate prog_type/attach_type/etc.
>
> The above jit code is emitted not just for the main prog, but also for
> callback fn's, since a callback fn could call a bpf prog as well. So putting
> the logic in the bpf trampoline is not enough.

callback can call the prog only if bpf_call_prog() kfunc exists
and that's one more reason to avoid going that direction.
Yonghong Song Oct. 11, 2024, 3:38 p.m. UTC | #4
On 10/10/24 9:29 PM, Alexei Starovoitov wrote:
> On Thu, Oct 10, 2024 at 9:21 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>
>> On 10/10/24 1:53 PM, Alexei Starovoitov wrote:
>>> On Thu, Oct 10, 2024 at 10:59 AM Yonghong Song <yonghong.song@linux.dev> wrote:
>>>> [ ... quoted emit_priv_frame_ptr() hunk trimmed; see the full patch below ... ]
>>> That's way too much asm for logic that can stay in C.
>>>
>>> bpf_trampoline_enter() should select __bpf_prog_enter_recur_limited()
>>> for appropriate prog_type/attach_type/etc.
>> The above jit code is emitted not just for the main prog, but also for
>> callback fn's, since a callback fn could call a bpf prog as well. So putting
>> the logic in the bpf trampoline is not enough.
> callback can call the prog only if bpf_call_prog() kfunc exists
> and that's one more reason to avoid going that direction.

Okay, I will add a verifier check to prevent bpf_call_prog() in callback functions.
Alexei Starovoitov Oct. 11, 2024, 3:40 p.m. UTC | #5
On Fri, Oct 11, 2024 at 8:39 AM Yonghong Song <yonghong.song@linux.dev> wrote:
>
>
> On 10/10/24 9:29 PM, Alexei Starovoitov wrote:
> > On Thu, Oct 10, 2024 at 9:21 PM Yonghong Song <yonghong.song@linux.dev> wrote:
> >>
> >> On 10/10/24 1:53 PM, Alexei Starovoitov wrote:
> >>> On Thu, Oct 10, 2024 at 10:59 AM Yonghong Song <yonghong.song@linux.dev> wrote:
> >>>> [ ... quoted emit_priv_frame_ptr() hunk trimmed; see the full patch below ... ]
> >>> That's way too much asm for logic that can stay in C.
> >>>
> >>> bpf_trampoline_enter() should select __bpf_prog_enter_recur_limited()
> >>> for appropriate prog_type/attach_type/etc.
> >> The above jit code is emitted not just for the main prog, but also for
> >> callback fn's, since a callback fn could call a bpf prog as well. So putting
> >> the logic in the bpf trampoline is not enough.
> > callback can call the prog only if bpf_call_prog() kfunc exists
> > and that's one more reason to avoid going that direction.
>
> Okay, I will add a verifier check to prevent bpf_call_prog() in callback functions.

We're talking past each other.
It's a nack to introduce bpf_call_prog kfunc.
Yonghong Song Oct. 11, 2024, 4:14 p.m. UTC | #6
On 10/11/24 8:40 AM, Alexei Starovoitov wrote:
> On Fri, Oct 11, 2024 at 8:39 AM Yonghong Song <yonghong.song@linux.dev> wrote:
>>
>> On 10/10/24 9:29 PM, Alexei Starovoitov wrote:
>>> On Thu, Oct 10, 2024 at 9:21 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>>> On 10/10/24 1:53 PM, Alexei Starovoitov wrote:
>>>>> On Thu, Oct 10, 2024 at 10:59 AM Yonghong Song <yonghong.song@linux.dev> wrote:
>>>>>> [ ... quoted emit_priv_frame_ptr() hunk trimmed; see the full patch below ... ]
>>>>> That's way too much asm for logic that can stay in C.
>>>>>
>>>>> bpf_trampoline_enter() should select __bpf_prog_enter_recur_limited()
>>>>> for appropriate prog_type/attach_type/etc.
>>>> The above jit code is emitted not just for the main prog, but also for
>>>> callback fn's, since a callback fn could call a bpf prog as well. So putting
>>>> the logic in the bpf trampoline is not enough.
>>> callback can call the prog only if bpf_call_prog() kfunc exists
>>> and that's one more reason to avoid going that direction.
>> Okay, I will add a verifier check to prevent bpf_call_prog() in callback functions.
> We're talking past each other.
> It's a nack to introduce bpf_call_prog kfunc.

Okay. Will remove it in the next revision.

Patch

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 297dd64f4b6a..a763e018e87f 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -501,7 +501,8 @@  static void emit_prologue_tail_call(u8 **pprog, bool is_subprog)
 }
 
 static void emit_priv_frame_ptr(u8 **pprog, struct bpf_prog *bpf_prog,
-				enum bpf_priv_stack_mode priv_stack_mode);
+				enum bpf_priv_stack_mode priv_stack_mode,
+				bool is_subprog, u8 *image, u8 *temp);
 
 /*
  * Emit x86-64 prologue code for BPF program.
@@ -510,7 +511,8 @@  static void emit_priv_frame_ptr(u8 **pprog, struct bpf_prog *bpf_prog,
  */
 static void emit_prologue(u8 **pprog, u32 stack_depth, struct bpf_prog *bpf_prog,
 			  bool tail_call_reachable,
-			  enum bpf_priv_stack_mode priv_stack_mode)
+			  enum bpf_priv_stack_mode priv_stack_mode, u8 *image,
+			  u8 *temp)
 {
 	bool ebpf_from_cbpf = bpf_prog_was_classic(bpf_prog);
 	bool is_exception_cb = bpf_prog->aux->exception_cb;
@@ -554,7 +556,7 @@  static void emit_prologue(u8 **pprog, u32 stack_depth, struct bpf_prog *bpf_prog
 	/* X86_TAIL_CALL_OFFSET is here */
 	EMIT_ENDBR();
 
-	emit_priv_frame_ptr(&prog, bpf_prog, priv_stack_mode);
+	emit_priv_frame_ptr(&prog, bpf_prog, priv_stack_mode, is_subprog, image, temp);
 
 	/* sub rsp, rounded_stack_depth */
 	if (stack_depth)
@@ -696,6 +698,15 @@  static void emit_return(u8 **pprog, u8 *ip)
 	*pprog = prog;
 }
 
+static int num_bytes_of_emit_return(void)
+{
+	if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+		return 5;
+	if (IS_ENABLED(CONFIG_MITIGATION_SLS))
+		return 2;
+	return 1;
+}
+
 #define BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack)	(-16 - round_up(stack, 8))
 
 /*
@@ -1527,17 +1538,67 @@  static void emit_root_priv_frame_ptr(u8 **pprog, struct bpf_prog *bpf_prog,
 }
 
 static void emit_priv_frame_ptr(u8 **pprog, struct bpf_prog *bpf_prog,
-				enum bpf_priv_stack_mode priv_stack_mode)
+				enum bpf_priv_stack_mode priv_stack_mode,
+				bool is_subprog, u8 *image, u8 *temp)
 {
 	u32 orig_stack_depth = round_up(bpf_prog->aux->stack_depth, 8);
 	u8 *prog = *pprog;
 
-	if (priv_stack_mode == PRIV_STACK_ROOT_PROG)
-		emit_root_priv_frame_ptr(&prog, bpf_prog, orig_stack_depth);
-	else if (priv_stack_mode == PRIV_STACK_SUB_PROG && orig_stack_depth)
+	if (priv_stack_mode == PRIV_STACK_ROOT_PROG) {
+		int offs;
+		u8 *func;
+
+		if (!bpf_prog->aux->has_prog_call) {
+			emit_root_priv_frame_ptr(&prog, bpf_prog, orig_stack_depth);
+		} else {
+			EMIT1(0x57);		/* push rdi */
+			if (is_subprog) {
+				/* subprog may have up to 5 arguments */
+				EMIT1(0x56);		/* push rsi */
+				EMIT1(0x52);		/* push rdx */
+				EMIT1(0x51);		/* push rcx */
+				EMIT2(0x41, 0x50);	/* push r8 */
+			}
+			emit_mov_imm64(&prog, BPF_REG_1, (long) bpf_prog >> 32,
+				       (u32) (long) bpf_prog);
+			func = (u8 *)__bpf_prog_enter_recur_limited;
+			offs = prog - temp;
+			offs += x86_call_depth_emit_accounting(&prog, func, image + offs);
+			emit_call(&prog, func, image + offs);
+			if (is_subprog) {
+				EMIT2(0x41, 0x58);	/* pop r8 */
+				EMIT1(0x59);		/* pop rcx */
+				EMIT1(0x5a);		/* pop rdx */
+				EMIT1(0x5e);		/* pop rsi */
+			}
+			EMIT1(0x5f);		/* pop rdi */
+
+			EMIT4(0x48, 0x83, 0xf8, 0x0);   /* cmp rax,0x0 */
+			EMIT2(X86_JNE, num_bytes_of_emit_return() + 1);
+
+			/* return if stack recursion has been reached */
+			EMIT1(0xC9);    /* leave */
+			emit_return(&prog, image + (prog - temp));
+
+			/* cnt -= 1 */
+			emit_alu_helper_1(&prog, BPF_ALU64 | BPF_SUB | BPF_K,
+					  BPF_REG_0, 1);
+
+			/* accum_stack_depth = cnt * subtree_stack_depth */
+			emit_alu_helper_3(&prog, BPF_ALU64 | BPF_MUL | BPF_K, BPF_REG_0,
+					  bpf_prog->aux->subtree_stack_depth);
+
+			emit_root_priv_frame_ptr(&prog, bpf_prog, orig_stack_depth);
+
+			/* r9 += accum_stack_depth */
+			emit_alu_helper_2(&prog, BPF_ALU64 | BPF_ADD | BPF_X, X86_REG_R9,
+					  BPF_REG_0);
+		}
+	} else if (priv_stack_mode == PRIV_STACK_SUB_PROG && orig_stack_depth) {
 		/* r9 += orig_stack_depth */
 		emit_alu_helper_1(&prog, BPF_ALU64 | BPF_ADD | BPF_K, X86_REG_R9,
 				  orig_stack_depth);
+	}
 
 	*pprog = prog;
 }
@@ -1578,7 +1639,7 @@  static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 	detect_reg_usage(insn, insn_cnt, callee_regs_used);
 
 	emit_prologue(&prog, stack_depth, bpf_prog, tail_call_reachable,
-		      priv_stack_mode);
+		      priv_stack_mode, image, temp);
 	/* Exception callback will clobber callee regs for its own use, and
 	 * restore the original callee regs from main prog's stack frame.
 	 */
@@ -2519,6 +2580,23 @@  st:			if (is_imm8(insn->off))
 				if (arena_vm_start)
 					pop_r12(&prog);
 			}
+
+			if (bpf_prog->aux->has_prog_call) {
+				u8 *func, *ip;
+				int offs;
+
+				ip = image + addrs[i - 1];
+				/* save and restore the return value */
+				EMIT1(0x50);    /* push rax */
+				emit_mov_imm64(&prog, BPF_REG_1, (long) bpf_prog >> 32,
+					       (u32) (long) bpf_prog);
+				func = (u8 *)__bpf_prog_exit_recur_limited;
+				offs = prog - temp;
+				offs += x86_call_depth_emit_accounting(&prog, func, ip + offs);
+				emit_call(&prog, func, ip + offs);
+				EMIT1(0x58);    /* pop rax */
+			}
+
 			EMIT1(0xC9);         /* leave */
 			emit_return(&prog, image + addrs[i - 1] + (prog - temp));
 			break;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 952cb398eb30..605004cba9f7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1148,6 +1148,8 @@  u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
 					     struct bpf_tramp_run_ctx *run_ctx);
 void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
 					     struct bpf_tramp_run_ctx *run_ctx);
+int notrace __bpf_prog_enter_recur_limited(struct bpf_prog *prog);
+void notrace __bpf_prog_exit_recur_limited(struct bpf_prog *prog);
 void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr);
 void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr);
 typedef u64 (*bpf_trampoline_enter_t)(struct bpf_prog *prog,
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index f8302a5ca400..d9e7260e4b39 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -960,6 +960,22 @@  void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
 	rcu_read_unlock_trace();
 }
 
+int notrace __bpf_prog_enter_recur_limited(struct bpf_prog *prog)
+{
+	int cnt = this_cpu_inc_return(*(prog->active));
+
+	if (cnt > BPF_MAX_PRIV_STACK_NEST_LEVEL) {
+		bpf_prog_inc_misses_counter(prog);
+		return 0;
+	}
+	return cnt;
+}
+
+void notrace __bpf_prog_exit_recur_limited(struct bpf_prog *prog)
+{
+	this_cpu_dec(*(prog->active));
+}
+
 static u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog,
 					      struct bpf_tramp_run_ctx *run_ctx)
 {
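
[In C terms, the enter/exit pairing the jited prologue and epilogue in this
patch implement around each invocation is roughly the following sketch;
bpf_func_body() stands in for the jited program body:]

	cnt = __bpf_prog_enter_recur_limited(prog);
	if (cnt == 0)
		return;		/* nesting limit reached, return immediately */
	/* body runs with r9 pointing at this level's private stack frame */
	ret = bpf_func_body(ctx);
	__bpf_prog_exit_recur_limited(prog);	/* decrements prog->active */
	return ret;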