Message ID | 20220508032117.2783209-3-kuifeng@fb.com (mailing list archive)
---|---
State | Superseded
Delegated to: | BPF
Series | Attach a cookie to a tracing program.
Context | Check | Description |
---|---|---
netdev/tree_selection | success | Clearly marked for bpf-next, async |
netdev/apply | fail | Patch does not apply to bpf-next |
bpf/vmtest-bpf-next-PR | fail | merge-conflict |
On Sat, May 7, 2022 at 8:21 PM Kui-Feng Lee <kuifeng@fb.com> wrote:
>
> BPF trampolines will create a bpf_tramp_run_ctx, a bpf_run_ctx, on
> stacks and set/reset the current bpf_run_ctx before/after calling a
> bpf_prog.
>
> Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> ---

Please preserve received Acked-by/Reviewed-by/etc tags that you got on
previous iterations, unless you feel you made some major changes that
might invalidate the reviewer's "approval".

Still looks good to me:

Acked-by: Andrii Nakryiko <andrii@kernel.org>

>  arch/x86/net/bpf_jit_comp.c | 38 +++++++++++++++++++++++++++++++++++++
>  include/linux/bpf.h         | 17 +++++++++++++----
>  kernel/bpf/syscall.c        |  7 +++++--
>  kernel/bpf/trampoline.c     | 20 +++++++++++++++----
>  4 files changed, 72 insertions(+), 10 deletions(-)
>
[...]
On Sat, May 07, 2022 at 08:21:14PM -0700, Kui-Feng Lee wrote:
>
> +	/* Prepare struct bpf_tramp_run_ctx.
> +	 * sub rsp, sizeof(struct bpf_tramp_run_ctx)
> +	 */
> +	EMIT4(0x48, 0x83, 0xEC, sizeof(struct bpf_tramp_run_ctx));
> +
> 	if (fentry->nr_links)
> 		if (invoke_bpf(m, &prog, fentry, regs_off,
> 			       flags & BPF_TRAMP_F_RET_FENTRY_RET))
> @@ -2098,6 +2121,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
> 	}
>
> 	if (flags & BPF_TRAMP_F_CALL_ORIG) {
> +		/* pop struct bpf_tramp_run_ctx
> +		 * add rsp, sizeof(struct bpf_tramp_run_ctx)
> +		 */
> +		EMIT4(0x48, 0x83, 0xC4, sizeof(struct bpf_tramp_run_ctx));
> +
> 		restore_regs(m, &prog, nr_args, regs_off);
>
> 		/* call original function */
> @@ -2110,6 +2138,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
> 		im->ip_after_call = prog;
> 		memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
> 		prog += X86_PATCH_SIZE;
> +
> +		/* Prepare struct bpf_tramp_run_ctx.
> +		 * sub rsp, sizeof(struct bpf_tramp_run_ctx)
> +		 */
> +		EMIT4(0x48, 0x83, 0xEC, sizeof(struct bpf_tramp_run_ctx));
> 	}
>
> 	if (fmod_ret->nr_links) {
> @@ -2133,6 +2166,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
> 			goto cleanup;
> 	}
>
> +	/* pop struct bpf_tramp_run_ctx
> +	 * add rsp, sizeof(struct bpf_tramp_run_ctx)
> +	 */
> +	EMIT4(0x48, 0x83, 0xC4, sizeof(struct bpf_tramp_run_ctx));
> +

What is the point of all of these additional sub/add rsp?
It seems unconditionally increasing stack_size by
sizeof(struct bpf_tramp_run_ctx) will achieve the same and the above
4 extra insns won't be needed.
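For reference when reading the raw EMIT4() bytes being discussed, a decoding note (not code from the patch):

```c
/*
 * x86-64 decoding of the emitted bytes (reference only):
 *
 *   0x48 0x83 0xEC imm8   ->   sub rsp, imm8
 *   0x48 0x83 0xC4 imm8   ->   add rsp, imm8
 *
 * 0x48 0x83 is REX.W plus the group-1 opcode taking a sign-extended 8-bit
 * immediate, so this encoding only works while
 * sizeof(struct bpf_tramp_run_ctx) stays below 128 bytes.
 */
```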
On Mon, 2022-05-09 at 14:04 -0700, Alexei Starovoitov wrote:
> On Sat, May 07, 2022 at 08:21:14PM -0700, Kui-Feng Lee wrote:
>
> [...]
>
> What is the point of all of these additional sub/add rsp?
> It seems unconditionally increasing stack_size by
> sizeof(struct bpf_tramp_run_ctx) will achieve the same and the above
> 4 extra insns won't be needed.

I think you are right.
On Tue, 2022-05-10 at 01:29 +0000, Kui-Feng Lee wrote:
> On Mon, 2022-05-09 at 14:04 -0700, Alexei Starovoitov wrote:
> > On Sat, May 07, 2022 at 08:21:14PM -0700, Kui-Feng Lee wrote:
> >
> > [...]
> >
> > What is the point of all of these additional sub/add rsp?
> > It seems unconditionally increasing stack_size by
> > sizeof(struct bpf_tramp_run_ctx) will achieve the same and the above
> > 4 extra insns won't be needed.
>
> I think you are right.
>

The reason that I don't change stack_size is that we compute the offsets
of arguments and saved registers from stack_size. Once stack_size
changes, all of those offsets would have to change too.
On Mon, May 9, 2022 at 6:43 PM Kui-Feng Lee <kuifeng@fb.com> wrote:
>
> On Tue, 2022-05-10 at 01:29 +0000, Kui-Feng Lee wrote:
> > On Mon, 2022-05-09 at 14:04 -0700, Alexei Starovoitov wrote:
> > >
> > > [...]
> > >
> > > What is the point of all of these additional sub/add rsp?
> > > It seems unconditionally increasing stack_size by
> > > sizeof(struct bpf_tramp_run_ctx) will achieve the same and the above
> > > 4 extra insns won't be needed.
> >
> > I think you are right.
>
> The reason that I don't change stack_size is that we compute the offsets
> of arguments and saved registers from stack_size. Once stack_size
> changes, all of those offsets would have to change too.

That should be trivial.
Keep regs_off = stack_size; and increase stack_size right after,
or do some other math.
Maybe it is worth introducing another _off in addition to
int regs_off, ip_off, args_off.
Definitely update the 'Generated trampoline stack layout' comment and
explain where bpf_tramp_run_ctx sits in relation to regs_off.
Maybe keeping regs_off (without a new _off) is enough.
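A minimal sketch of the bookkeeping being suggested here, following the existing regs_off/ip_off conventions; run_ctx_off is a hypothetical name and this is not code from the patch:

```c
/*
 * Sketch only: reserve the run context inside stack_size so the prologue's
 * single "sub rsp, stack_size" covers it and no extra sub/add rsp pairs are
 * emitted.  run_ctx_off is a hypothetical name, mirroring regs_off/ip_off.
 */
stack_size = nr_args * 8;

/* ... existing regs_off / args_off / ip_off bookkeeping stays as-is ... */

stack_size += round_up(sizeof(struct bpf_tramp_run_ctx), 8);
run_ctx_off = stack_size;	/* run context lives at RBP - run_ctx_off */

/*
 * invoke_bpf_prog() would then form the run-context pointer relative to
 * RBP, e.g. "lea rsi, [rbp - run_ctx_off]", instead of "mov rsi, rsp".
 */
```

This mirrors how the other offsets are already tracked, so the offsets computed before the run context is added stay valid.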
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 4dcc0b1ac770..bf4576a6938c 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1766,10 +1766,26 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
 {
 	u8 *prog = *pprog;
 	u8 *jmp_insn;
+	int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie);
 	struct bpf_prog *p = l->link.prog;
 
+	/* mov rdi, 0 */
+	emit_mov_imm64(&prog, BPF_REG_1, 0, 0);
+
+	/* Prepare struct bpf_tramp_run_ctx.
+	 *
+	 * bpf_tramp_run_ctx is already preserved by
+	 * arch_prepare_bpf_trampoline().
+	 *
+	 * mov QWORD PTR [rsp + ctx_cookie_off], rdi
+	 */
+	EMIT4(0x48, 0x89, 0x7C, 0x24); EMIT1(ctx_cookie_off);
+
 	/* arg1: mov rdi, progs[i] */
 	emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
+	/* arg2: mov rsi, rsp (struct bpf_run_ctx *) */
+	EMIT3(0x48, 0x89, 0xE6);
+
 	if (emit_call(&prog,
 		      p->aux->sleepable ? __bpf_prog_enter_sleepable :
 		      __bpf_prog_enter, prog))
@@ -1815,6 +1831,8 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
 	emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
 	/* arg2: mov rsi, rbx <- start time in nsec */
 	emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
+	/* arg3: mov rdx, rsp (struct bpf_run_ctx *) */
+	EMIT3(0x48, 0x89, 0xE2);
 	if (emit_call(&prog,
 		      p->aux->sleepable ? __bpf_prog_exit_sleepable :
 		      __bpf_prog_exit, prog))
@@ -2079,6 +2097,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 		}
 	}
 
+	/* Prepare struct bpf_tramp_run_ctx.
+	 * sub rsp, sizeof(struct bpf_tramp_run_ctx)
+	 */
+	EMIT4(0x48, 0x83, 0xEC, sizeof(struct bpf_tramp_run_ctx));
+
 	if (fentry->nr_links)
 		if (invoke_bpf(m, &prog, fentry, regs_off,
 			       flags & BPF_TRAMP_F_RET_FENTRY_RET))
@@ -2098,6 +2121,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 	}
 
 	if (flags & BPF_TRAMP_F_CALL_ORIG) {
+		/* pop struct bpf_tramp_run_ctx
+		 * add rsp, sizeof(struct bpf_tramp_run_ctx)
+		 */
+		EMIT4(0x48, 0x83, 0xC4, sizeof(struct bpf_tramp_run_ctx));
+
 		restore_regs(m, &prog, nr_args, regs_off);
 
 		/* call original function */
@@ -2110,6 +2138,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 		im->ip_after_call = prog;
 		memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
 		prog += X86_PATCH_SIZE;
+
+		/* Prepare struct bpf_tramp_run_ctx.
+		 * sub rsp, sizeof(struct bpf_tramp_run_ctx)
+		 */
+		EMIT4(0x48, 0x83, 0xEC, sizeof(struct bpf_tramp_run_ctx));
 	}
 
 	if (fmod_ret->nr_links) {
@@ -2133,6 +2166,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 			goto cleanup;
 	}
 
+	/* pop struct bpf_tramp_run_ctx
+	 * add rsp, sizeof(struct bpf_tramp_run_ctx)
+	 */
+	EMIT4(0x48, 0x83, 0xC4, sizeof(struct bpf_tramp_run_ctx));
+
 	if (flags & BPF_TRAMP_F_RESTORE_REGS)
 		restore_regs(m, &prog, nr_args, regs_off);
 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 77258a34ec20..29c3188195a6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -730,6 +730,8 @@ struct bpf_tramp_links {
 	int nr_links;
 };
 
+struct bpf_tramp_run_ctx;
+
 /* Different use cases for BPF trampoline:
  * 1. replace nop at the function entry (kprobe equivalent)
  *    flags = BPF_TRAMP_F_RESTORE_REGS
@@ -756,10 +758,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *i
 				struct bpf_tramp_links *tlinks,
 				void *orig_call);
 /* these two functions are called from generated trampoline */
-u64 notrace __bpf_prog_enter(struct bpf_prog *prog);
-void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);
-u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog);
-void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start);
+u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx);
+void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx);
+u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx);
+void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
+				       struct bpf_tramp_run_ctx *run_ctx);
 void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr);
 void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr);
 
@@ -1351,6 +1354,12 @@ struct bpf_trace_run_ctx {
 	u64 bpf_cookie;
 };
 
+struct bpf_tramp_run_ctx {
+	struct bpf_run_ctx run_ctx;
+	u64 bpf_cookie;
+	struct bpf_run_ctx *saved_run_ctx;
+};
+
 static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx)
 {
 	struct bpf_run_ctx *old_ctx = NULL;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3c1853e1c715..5ed9a15daaee 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5001,6 +5001,7 @@ static bool syscall_prog_is_valid_access(int off, int size,
 BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
 {
 	struct bpf_prog * __maybe_unused prog;
+	struct bpf_tramp_run_ctx __maybe_unused run_ctx;
 
 	switch (cmd) {
 	case BPF_MAP_CREATE:
@@ -5028,13 +5029,15 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
 			return -EINVAL;
 		}
 
-		if (!__bpf_prog_enter_sleepable(prog)) {
+		run_ctx.bpf_cookie = 0;
+		run_ctx.saved_run_ctx = NULL;
+		if (!__bpf_prog_enter_sleepable(prog, &run_ctx)) {
 			/* recursion detected */
 			bpf_prog_put(prog);
 			return -EBUSY;
 		}
 		attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
-		__bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */);
+		__bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx);
 		bpf_prog_put(prog);
 		return 0;
 #endif
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index d5e6bc5517cb..baf1b65d523e 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -568,11 +568,14 @@ static void notrace inc_misses_counter(struct bpf_prog *prog)
  * [2..MAX_U64] - execute bpf prog and record execution time.
  *     This is start time.
  */
-u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
+u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
 	__acquires(RCU)
 {
 	rcu_read_lock();
 	migrate_disable();
+
+	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
 	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
 		inc_misses_counter(prog);
 		return 0;
@@ -602,29 +605,38 @@ static void notrace update_prog_stats(struct bpf_prog *prog,
 	}
 }
 
-void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
+void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx)
 	__releases(RCU)
 {
+	bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
 	update_prog_stats(prog, start);
 	__this_cpu_dec(*(prog->active));
 	migrate_enable();
 	rcu_read_unlock();
 }
 
-u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog)
+u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
 {
 	rcu_read_lock_trace();
 	migrate_disable();
 	might_fault();
+
 	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
 		inc_misses_counter(prog);
 		return 0;
 	}
+
+	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
 	return bpf_prog_start_time();
 }
 
-void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start)
+void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
+				       struct bpf_tramp_run_ctx *run_ctx)
 {
+	bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
 	update_prog_stats(prog, start);
 	__this_cpu_dec(*(prog->active));
 	migrate_enable();
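For context on how the on-stack run context gets consumed once current->bpf_ctx points at it, a sketch of a cookie-reading helper along the lines of what a later patch in the series could add (the helper name and shape are assumptions, not part of this patch):

```c
/* Sketch of a cookie-reading helper for tracing programs; the helper name
 * and exact shape are assumptions, not code from this patch.
 */
static u64 tracing_prog_attach_cookie(void)
{
	struct bpf_tramp_run_ctx *run_ctx;

	/* current->bpf_ctx points at the bpf_run_ctx embedded in the
	 * trampoline's on-stack bpf_tramp_run_ctx while the program runs.
	 */
	run_ctx = container_of(current->bpf_ctx, struct bpf_tramp_run_ctx,
			       run_ctx);
	return run_ctx->bpf_cookie;
}
```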
BPF trampolines will create a bpf_tramp_run_ctx, a bpf_run_ctx, on
stacks and set/reset the current bpf_run_ctx before/after calling a
bpf_prog.

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
---
 arch/x86/net/bpf_jit_comp.c | 38 +++++++++++++++++++++++++++++++++++++
 include/linux/bpf.h         | 17 +++++++++++++----
 kernel/bpf/syscall.c        |  7 +++++--
 kernel/bpf/trampoline.c     | 20 +++++++++++++++----
 4 files changed, 72 insertions(+), 10 deletions(-)
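Putting the pieces together, the code emitted by invoke_bpf_prog() after this patch behaves roughly like the following per-program C sketch (semantics only; the real sequence is JITed x86-64 and also handles recursion counting, stats, and the sleepable variants):

```c
/* Sketch of the generated trampoline's per-program sequence, not literal
 * kernel source.
 */
struct bpf_tramp_run_ctx run_ctx;	/* lives in the trampoline stack frame */
u64 start;

run_ctx.bpf_cookie = 0;			/* later patches can fill in a per-link cookie */

/* __bpf_prog_enter() saves current->bpf_ctx into run_ctx.saved_run_ctx and
 * points current->bpf_ctx at &run_ctx.run_ctx.
 */
start = __bpf_prog_enter(prog, &run_ctx);
if (start)
	prog->bpf_func(ctx, prog->insnsi);

/* __bpf_prog_exit() restores current->bpf_ctx from run_ctx.saved_run_ctx. */
__bpf_prog_exit(prog, start, &run_ctx);
```

The cookie slot is written before __bpf_prog_enter() runs, so the program can reach it through current->bpf_ctx for its whole execution.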