
[RFC,bpf-next,4/6] bpf: Add bpf runtime hooks for tracking runtime acquire/release

Message ID AM6PR03MB5080FFF4113C70F7862AAA5D99FE2@AM6PR03MB5080.eurprd03.prod.outlook.com (mailing list archive)
State RFC
Delegated to: BPF
Series bpf: BPF runtime hooks: low-overhead non-intrusive tracking of runtime acquire/release (for watchdog)

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-18 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-20 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-21 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 fail Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 fail Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 fail Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 fail Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-46 fail Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-47 fail Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-48 fail Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-49 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-50 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-51 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-7 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-10 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-11 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-12 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-15 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-17 success Logs for x86_64-gcc / build-release
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 193 this patch: 193
netdev/build_tools success Errors and warnings before: 26 (+1) this patch: 26 (+1)
netdev/cc_maintainers success CCed 13 of 13 maintainers
netdev/build_clang success Errors and warnings before: 9361 this patch: 9361
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 7017 this patch: 7017
netdev/checkpatch warning WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Juntong Deng Feb. 14, 2025, 12:26 a.m. UTC
This patch adds runtime hooks for tracking runtime acquire/release.

According to the BPF instruction set, a function can have at most
5 arguments. The first 5 arguments of a BPF runtime hook function
are used to pass the original arguments of the kfunc, and the 6th
argument is used to pass the memory address of the kfunc. All BPF
runtime hook functions must follow this calling convention.
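
For example (illustration only, not part of this patch), a 1-argument
acquire kfunc such as bpf_task_acquire() routed through the hook would
look roughly like this, with the unused argument slots passing NULL and
the kfunc address passed last:

	/* before: acquired = bpf_task_acquire(p);
	 * after redirection through the runtime hook:
	 */
	acquired = (struct task_struct *)
		bpf_runtime_acquire_hook(p, NULL, NULL, NULL, NULL,
					 (void *)bpf_task_acquire);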

bpf_runtime_acquire_hook records the information of the referenced
object into a reference node, including the memory address of the
object and the btf id of its data structure. It first calls the
original kfunc; if that returns NULL, the reference acquisition failed
and nothing needs to be recorded. The btf id is looked up in
acquire_kfunc_tab by the memory address of the kfunc. A reference node
is taken from the free reference list and, after the information is
recorded, inserted into the active reference hash table.

bpf_runtime_release_hook removes the reference node from the active
reference hash table, keyed by the memory address of the object, and
puts it back on the free reference list. Since release kfuncs
currently have no return value, bpf_runtime_release_hook does not need
one either.
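
The bpf_ref_node bookkeeping structure is not part of this patch; as a
rough sketch (fields inferred only from how the hooks use them), it
looks roughly like:

	struct bpf_ref_node {
		struct hlist_node hnode;  /* entry in the active reference hash table */
		struct list_head lnode;   /* entry in the free reference list */
		unsigned long obj_addr;   /* address of the acquired object */
		u32 struct_btf_id;        /* btf id of the object's struct type */
	};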

print_bpf_active_refs is used only for demonstration purposes to print
all entries in the active reference hash table.

Signed-off-by: Juntong Deng <juntong.deng@outlook.com>
---
 include/linux/btf.h |   4 ++
 kernel/bpf/btf.c    | 108 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 112 insertions(+)

Comments

Alexei Starovoitov Feb. 25, 2025, 10:12 p.m. UTC | #1
On Thu, Feb 13, 2025 at 4:36 PM Juntong Deng <juntong.deng@outlook.com> wrote:
>
> +void *bpf_runtime_acquire_hook(void *arg1, void *arg2, void *arg3,
> +                              void *arg4, void *arg5, void *arg6 /* kfunc addr */)
> +{
> +       struct btf_struct_kfunc *struct_kfunc, dummy_key;
> +       struct btf_struct_kfunc_tab *tab;
> +       struct bpf_run_ctx *bpf_ctx;
> +       struct bpf_ref_node *node;
> +       bpf_kfunc_t kfunc;
> +       struct btf *btf;
> +       void *kfunc_ret;
> +
> +       kfunc = (bpf_kfunc_t)arg6;
> +       kfunc_ret = kfunc(arg1, arg2, arg3, arg4, arg5);
> +
> +       if (!kfunc_ret)
> +               return kfunc_ret;
> +
> +       bpf_ctx = current->bpf_ctx;
> +       btf = bpf_get_btf_vmlinux();
> +
> +       tab = btf->acquire_kfunc_tab;
> +       if (!tab)
> +               return kfunc_ret;
> +
> +       dummy_key.kfunc_addr = (unsigned long)arg6;
> +       struct_kfunc = bsearch(&dummy_key, tab->set, tab->cnt,
> +                              sizeof(struct btf_struct_kfunc),
> +                              btf_kfunc_addr_cmp_func);
> +
> +       node = list_first_entry(&bpf_ctx->free_ref_list, struct bpf_ref_node, lnode);
> +       node->obj_addr = (unsigned long)kfunc_ret;
> +       node->struct_btf_id = struct_kfunc->struct_btf_id;
> +
> +       list_del(&node->lnode);
> +       hash_add(bpf_ctx->active_ref_list, &node->hnode, node->obj_addr);
> +
> +       pr_info("bpf prog acquire obj addr = %lx, btf id = %d\n",
> +               node->obj_addr, node->struct_btf_id);
> +       print_bpf_active_refs();
> +
> +       return kfunc_ret;
> +}
> +
> +void bpf_runtime_release_hook(void *arg1, void *arg2, void *arg3,
> +                             void *arg4, void *arg5, void *arg6 /* kfunc addr */)
> +{
> +       struct bpf_run_ctx *bpf_ctx;
> +       struct bpf_ref_node *node;
> +       bpf_kfunc_t kfunc;
> +
> +       kfunc = (bpf_kfunc_t)arg6;
> +       kfunc(arg1, arg2, arg3, arg4, arg5);
> +
> +       bpf_ctx = current->bpf_ctx;
> +
> +       hash_for_each_possible(bpf_ctx->active_ref_list, node, hnode, (unsigned long)arg1) {
> +               if (node->obj_addr == (unsigned long)arg1) {
> +                       hash_del(&node->hnode);
> +                       list_add(&node->lnode, &bpf_ctx->free_ref_list);
> +
> +                       pr_info("bpf prog release obj addr = %lx, btf id = %d\n",
> +                               node->obj_addr, node->struct_btf_id);
> +               }
> +       }
> +
> +       print_bpf_active_refs();
> +}

So for every acq/rel the above two functions will be called
and you call this:
"
perhaps we can use some low overhead runtime solution first as a
not too bad alternative
"

low overhead ?!

acq/rel kfuncs can be very hot.
To the level that a single atomic_inc() is a noticeable overhead.
Doing above is an obvious no-go in any production setup.

> Before the bpf program actually runs, we can allocate the maximum
> possible number of reference nodes to record reference information.

This is an incorrect assumption.
Look at register_btf_id_dtor_kfuncs()
that patch 1 is sort-of trying to reinvent.
Acquired objects can be stashed with single xchg instruction and
people are not happy with performance either.
An acquire kfunc plus inlined bpf_kptr_xchg is too slow in some cases.
A bunch of bpf progs operate under constraints where nanoseconds count.
That's why we rely on static verification where possible.
Everytime we introduce run-time safety checks (like bpf_arena) we
sacrifice some use cases.
So, no, this proposal is not a solution.
Juntong Deng Feb. 25, 2025, 11:34 p.m. UTC | #2
On 2025/2/25 22:12, Alexei Starovoitov wrote:
> On Thu, Feb 13, 2025 at 4:36 PM Juntong Deng <juntong.deng@outlook.com> wrote:
>>
>> +void *bpf_runtime_acquire_hook(void *arg1, void *arg2, void *arg3,
>> +                              void *arg4, void *arg5, void *arg6 /* kfunc addr */)
>> +{
>> +       struct btf_struct_kfunc *struct_kfunc, dummy_key;
>> +       struct btf_struct_kfunc_tab *tab;
>> +       struct bpf_run_ctx *bpf_ctx;
>> +       struct bpf_ref_node *node;
>> +       bpf_kfunc_t kfunc;
>> +       struct btf *btf;
>> +       void *kfunc_ret;
>> +
>> +       kfunc = (bpf_kfunc_t)arg6;
>> +       kfunc_ret = kfunc(arg1, arg2, arg3, arg4, arg5);
>> +
>> +       if (!kfunc_ret)
>> +               return kfunc_ret;
>> +
>> +       bpf_ctx = current->bpf_ctx;
>> +       btf = bpf_get_btf_vmlinux();
>> +
>> +       tab = btf->acquire_kfunc_tab;
>> +       if (!tab)
>> +               return kfunc_ret;
>> +
>> +       dummy_key.kfunc_addr = (unsigned long)arg6;
>> +       struct_kfunc = bsearch(&dummy_key, tab->set, tab->cnt,
>> +                              sizeof(struct btf_struct_kfunc),
>> +                              btf_kfunc_addr_cmp_func);
>> +
>> +       node = list_first_entry(&bpf_ctx->free_ref_list, struct bpf_ref_node, lnode);
>> +       node->obj_addr = (unsigned long)kfunc_ret;
>> +       node->struct_btf_id = struct_kfunc->struct_btf_id;
>> +
>> +       list_del(&node->lnode);
>> +       hash_add(bpf_ctx->active_ref_list, &node->hnode, node->obj_addr);
>> +
>> +       pr_info("bpf prog acquire obj addr = %lx, btf id = %d\n",
>> +               node->obj_addr, node->struct_btf_id);
>> +       print_bpf_active_refs();
>> +
>> +       return kfunc_ret;
>> +}
>> +
>> +void bpf_runtime_release_hook(void *arg1, void *arg2, void *arg3,
>> +                             void *arg4, void *arg5, void *arg6 /* kfunc addr */)
>> +{
>> +       struct bpf_run_ctx *bpf_ctx;
>> +       struct bpf_ref_node *node;
>> +       bpf_kfunc_t kfunc;
>> +
>> +       kfunc = (bpf_kfunc_t)arg6;
>> +       kfunc(arg1, arg2, arg3, arg4, arg5);
>> +
>> +       bpf_ctx = current->bpf_ctx;
>> +
>> +       hash_for_each_possible(bpf_ctx->active_ref_list, node, hnode, (unsigned long)arg1) {
>> +               if (node->obj_addr == (unsigned long)arg1) {
>> +                       hash_del(&node->hnode);
>> +                       list_add(&node->lnode, &bpf_ctx->free_ref_list);
>> +
>> +                       pr_info("bpf prog release obj addr = %lx, btf id = %d\n",
>> +                               node->obj_addr, node->struct_btf_id);
>> +               }
>> +       }
>> +
>> +       print_bpf_active_refs();
>> +}
> 
> So for every acq/rel the above two function will be called
> and you call this:
> "
> perhaps we can use some low overhead runtime solution first as a
> not too bad alternative
> "
> 
> low overhead ?!
> 
> acq/rel kfuncs can be very hot.
> To the level that single atomic_inc() is a noticeable overhead.
> Doing above is an obvious no-go in any production setup.
> 
>> Before the bpf program actually runs, we can allocate the maximum
>> possible number of reference nodes to record reference information.
> 
> This is an incorrect assumption.
> Look at register_btf_id_dtor_kfuncs()
> that patch 1 is sort-of trying to reinvent.
> Acquired objects can be stashed with single xchg instruction and
> people are not happy with performance either.
> An acquire kfunc plus inlined bpf_kptr_xchg is too slow in some cases.
> A bunch of bpf progs operate under constraints where nanoseconds count.
> That's why we rely on static verification where possible.
> Everytime we introduce run-time safety checks (like bpf_arena) we
> sacrifice some use cases.
> So, no, this proposal is not a solution.

OK, I agree. If a single atomic_inc() is a noticeable overhead, then
no runtime solution is applicable.

(I had thought about passing the btf id as another argument to
eliminate the O(log n) overhead of the binary search, but that is
clearly not enough now)


I am not sure, but BPF runtime hooks seem to be a general feature;
maybe they could be used in other scenarios?

Do you think there would be value in non-intrusive bpf program behavior
tracking if it is only enabled in certain modes?

For example, to help us debug/diagnose bpf programs.
Alexei Starovoitov Feb. 26, 2025, 1:57 a.m. UTC | #3
On Tue, Feb 25, 2025 at 3:34 PM Juntong Deng <juntong.deng@outlook.com> wrote:
>
> On 2025/2/25 22:12, Alexei Starovoitov wrote:
> > On Thu, Feb 13, 2025 at 4:36 PM Juntong Deng <juntong.deng@outlook.com> wrote:
> >>
> >> +void *bpf_runtime_acquire_hook(void *arg1, void *arg2, void *arg3,
> >> +                              void *arg4, void *arg5, void *arg6 /* kfunc addr */)
> >> +{
> >> +       struct btf_struct_kfunc *struct_kfunc, dummy_key;
> >> +       struct btf_struct_kfunc_tab *tab;
> >> +       struct bpf_run_ctx *bpf_ctx;
> >> +       struct bpf_ref_node *node;
> >> +       bpf_kfunc_t kfunc;
> >> +       struct btf *btf;
> >> +       void *kfunc_ret;
> >> +
> >> +       kfunc = (bpf_kfunc_t)arg6;
> >> +       kfunc_ret = kfunc(arg1, arg2, arg3, arg4, arg5);
> >> +
> >> +       if (!kfunc_ret)
> >> +               return kfunc_ret;
> >> +
> >> +       bpf_ctx = current->bpf_ctx;
> >> +       btf = bpf_get_btf_vmlinux();
> >> +
> >> +       tab = btf->acquire_kfunc_tab;
> >> +       if (!tab)
> >> +               return kfunc_ret;
> >> +
> >> +       dummy_key.kfunc_addr = (unsigned long)arg6;
> >> +       struct_kfunc = bsearch(&dummy_key, tab->set, tab->cnt,
> >> +                              sizeof(struct btf_struct_kfunc),
> >> +                              btf_kfunc_addr_cmp_func);
> >> +
> >> +       node = list_first_entry(&bpf_ctx->free_ref_list, struct bpf_ref_node, lnode);
> >> +       node->obj_addr = (unsigned long)kfunc_ret;
> >> +       node->struct_btf_id = struct_kfunc->struct_btf_id;
> >> +
> >> +       list_del(&node->lnode);
> >> +       hash_add(bpf_ctx->active_ref_list, &node->hnode, node->obj_addr);
> >> +
> >> +       pr_info("bpf prog acquire obj addr = %lx, btf id = %d\n",
> >> +               node->obj_addr, node->struct_btf_id);
> >> +       print_bpf_active_refs();
> >> +
> >> +       return kfunc_ret;
> >> +}
> >> +
> >> +void bpf_runtime_release_hook(void *arg1, void *arg2, void *arg3,
> >> +                             void *arg4, void *arg5, void *arg6 /* kfunc addr */)
> >> +{
> >> +       struct bpf_run_ctx *bpf_ctx;
> >> +       struct bpf_ref_node *node;
> >> +       bpf_kfunc_t kfunc;
> >> +
> >> +       kfunc = (bpf_kfunc_t)arg6;
> >> +       kfunc(arg1, arg2, arg3, arg4, arg5);
> >> +
> >> +       bpf_ctx = current->bpf_ctx;
> >> +
> >> +       hash_for_each_possible(bpf_ctx->active_ref_list, node, hnode, (unsigned long)arg1) {
> >> +               if (node->obj_addr == (unsigned long)arg1) {
> >> +                       hash_del(&node->hnode);
> >> +                       list_add(&node->lnode, &bpf_ctx->free_ref_list);
> >> +
> >> +                       pr_info("bpf prog release obj addr = %lx, btf id = %d\n",
> >> +                               node->obj_addr, node->struct_btf_id);
> >> +               }
> >> +       }
> >> +
> >> +       print_bpf_active_refs();
> >> +}
> >
> > So for every acq/rel the above two function will be called
> > and you call this:
> > "
> > perhaps we can use some low overhead runtime solution first as a
> > not too bad alternative
> > "
> >
> > low overhead ?!
> >
> > acq/rel kfuncs can be very hot.
> > To the level that single atomic_inc() is a noticeable overhead.
> > Doing above is an obvious no-go in any production setup.
> >
> >> Before the bpf program actually runs, we can allocate the maximum
> >> possible number of reference nodes to record reference information.
> >
> > This is an incorrect assumption.
> > Look at register_btf_id_dtor_kfuncs()
> > that patch 1 is sort-of trying to reinvent.
> > Acquired objects can be stashed with single xchg instruction and
> > people are not happy with performance either.
> > An acquire kfunc plus inlined bpf_kptr_xchg is too slow in some cases.
> > A bunch of bpf progs operate under constraints where nanoseconds count.
> > That's why we rely on static verification where possible.
> > Everytime we introduce run-time safety checks (like bpf_arena) we
> > sacrifice some use cases.
> > So, no, this proposal is not a solution.
>
> OK, I agree, if single atomic_inc() is a noticeable overhead, then any
> runtime solution is not applicable.
>
> (I had thought about using btf id as another argument to further
> eliminate the O(log n) overhead of binary search, but now it is
> obviously not enough)
>
>
> I am not sure, BPF runtime hooks seem to be a general feature, maybe it
> can be used in other scenarios?
>
> Do you think there would be value in non-intrusive bpf program behavior
> tracking if it is only enabled in certain modes?
>
> For example, to help us debug/diagnose bpf programs.

Unlikely. In general we don't add debug code to the kernel.
There are exceptions like lockdep and kasan that are there to
debug the kernel itself and they're not enabled in prod.
I don't see a use case for "let's replace all kfuncs with a wrapper".
perf record/report works on bpf progs, so profiling/perf analysis
part of bpf prog is solved.
Debugging of bpf prog for correctness is a different story.
It's an interesting challenge on its own, but
wrapping kfuncs won't help.
Debugging a bpf prog is not much different than debugging normal kernel code.
Sprinkling printk is typically the answer.
Juntong Deng Feb. 27, 2025, 9:55 p.m. UTC | #4
On 2025/2/26 01:57, Alexei Starovoitov wrote:
> On Tue, Feb 25, 2025 at 3:34 PM Juntong Deng <juntong.deng@outlook.com> wrote:
>>
>> On 2025/2/25 22:12, Alexei Starovoitov wrote:
>>> On Thu, Feb 13, 2025 at 4:36 PM Juntong Deng <juntong.deng@outlook.com> wrote:
>>>>
>>>> +void *bpf_runtime_acquire_hook(void *arg1, void *arg2, void *arg3,
>>>> +                              void *arg4, void *arg5, void *arg6 /* kfunc addr */)
>>>> +{
>>>> +       struct btf_struct_kfunc *struct_kfunc, dummy_key;
>>>> +       struct btf_struct_kfunc_tab *tab;
>>>> +       struct bpf_run_ctx *bpf_ctx;
>>>> +       struct bpf_ref_node *node;
>>>> +       bpf_kfunc_t kfunc;
>>>> +       struct btf *btf;
>>>> +       void *kfunc_ret;
>>>> +
>>>> +       kfunc = (bpf_kfunc_t)arg6;
>>>> +       kfunc_ret = kfunc(arg1, arg2, arg3, arg4, arg5);
>>>> +
>>>> +       if (!kfunc_ret)
>>>> +               return kfunc_ret;
>>>> +
>>>> +       bpf_ctx = current->bpf_ctx;
>>>> +       btf = bpf_get_btf_vmlinux();
>>>> +
>>>> +       tab = btf->acquire_kfunc_tab;
>>>> +       if (!tab)
>>>> +               return kfunc_ret;
>>>> +
>>>> +       dummy_key.kfunc_addr = (unsigned long)arg6;
>>>> +       struct_kfunc = bsearch(&dummy_key, tab->set, tab->cnt,
>>>> +                              sizeof(struct btf_struct_kfunc),
>>>> +                              btf_kfunc_addr_cmp_func);
>>>> +
>>>> +       node = list_first_entry(&bpf_ctx->free_ref_list, struct bpf_ref_node, lnode);
>>>> +       node->obj_addr = (unsigned long)kfunc_ret;
>>>> +       node->struct_btf_id = struct_kfunc->struct_btf_id;
>>>> +
>>>> +       list_del(&node->lnode);
>>>> +       hash_add(bpf_ctx->active_ref_list, &node->hnode, node->obj_addr);
>>>> +
>>>> +       pr_info("bpf prog acquire obj addr = %lx, btf id = %d\n",
>>>> +               node->obj_addr, node->struct_btf_id);
>>>> +       print_bpf_active_refs();
>>>> +
>>>> +       return kfunc_ret;
>>>> +}
>>>> +
>>>> +void bpf_runtime_release_hook(void *arg1, void *arg2, void *arg3,
>>>> +                             void *arg4, void *arg5, void *arg6 /* kfunc addr */)
>>>> +{
>>>> +       struct bpf_run_ctx *bpf_ctx;
>>>> +       struct bpf_ref_node *node;
>>>> +       bpf_kfunc_t kfunc;
>>>> +
>>>> +       kfunc = (bpf_kfunc_t)arg6;
>>>> +       kfunc(arg1, arg2, arg3, arg4, arg5);
>>>> +
>>>> +       bpf_ctx = current->bpf_ctx;
>>>> +
>>>> +       hash_for_each_possible(bpf_ctx->active_ref_list, node, hnode, (unsigned long)arg1) {
>>>> +               if (node->obj_addr == (unsigned long)arg1) {
>>>> +                       hash_del(&node->hnode);
>>>> +                       list_add(&node->lnode, &bpf_ctx->free_ref_list);
>>>> +
>>>> +                       pr_info("bpf prog release obj addr = %lx, btf id = %d\n",
>>>> +                               node->obj_addr, node->struct_btf_id);
>>>> +               }
>>>> +       }
>>>> +
>>>> +       print_bpf_active_refs();
>>>> +}
>>>
>>> So for every acq/rel the above two function will be called
>>> and you call this:
>>> "
>>> perhaps we can use some low overhead runtime solution first as a
>>> not too bad alternative
>>> "
>>>
>>> low overhead ?!
>>>
>>> acq/rel kfuncs can be very hot.
>>> To the level that single atomic_inc() is a noticeable overhead.
>>> Doing above is an obvious no-go in any production setup.
>>>
>>>> Before the bpf program actually runs, we can allocate the maximum
>>>> possible number of reference nodes to record reference information.
>>>
>>> This is an incorrect assumption.
>>> Look at register_btf_id_dtor_kfuncs()
>>> that patch 1 is sort-of trying to reinvent.
>>> Acquired objects can be stashed with single xchg instruction and
>>> people are not happy with performance either.
>>> An acquire kfunc plus inlined bpf_kptr_xchg is too slow in some cases.
>>> A bunch of bpf progs operate under constraints where nanoseconds count.
>>> That's why we rely on static verification where possible.
>>> Everytime we introduce run-time safety checks (like bpf_arena) we
>>> sacrifice some use cases.
>>> So, no, this proposal is not a solution.
>>
>> OK, I agree, if single atomic_inc() is a noticeable overhead, then any
>> runtime solution is not applicable.
>>
>> (I had thought about using btf id as another argument to further
>> eliminate the O(log n) overhead of binary search, but now it is
>> obviously not enough)
>>
>>
>> I am not sure, BPF runtime hooks seem to be a general feature, maybe it
>> can be used in other scenarios?
>>
>> Do you think there would be value in non-intrusive bpf program behavior
>> tracking if it is only enabled in certain modes?
>>
>> For example, to help us debug/diagnose bpf programs.
> 
> Unlikely. In general we don't add debug code to the kernel.
> There are exceptions like lockdep and kasan that are there to
> debug the kernel itself and they're not enabled in prod.
> I don't see a use case for "let's replace all kfuncs with a wrapper".
> perf record/report works on bpf progs, so profiling/perf analysis
> part of bpf prog is solved.
> Debugging of bpf prog for correctness is a different story.
> It's an interesting challenge on its own, but
> wrapping kfuncs won't help.
> Debugging bpf prog is not much different than debugging normal kernel code.
> Sprinkle printk is typically the answer.

I have an idea, though not sure if it is helpful.

(This idea is for the previous problem of holding references too long)

My idea is to add a new KF_FLAG, like KF_ACQUIRE_EPHEMERAL, as a
special reference that can only be held for a short time.
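
As a rough sketch of what I mean (the bit value is arbitrary, the kfunc
name below is only a placeholder, and the existing KF_* flags live in
include/linux/btf.h):

	/* hypothetical: acquired reference may only be held briefly */
	#define KF_ACQUIRE_EPHEMERAL (1 << 13)

	/* e.g. tagging an acquire kfunc (placeholder name): */
	BTF_ID_FLAGS(func, bpf_some_acquire_kfunc,
		     KF_ACQUIRE | KF_RET_NULL | KF_ACQUIRE_EPHEMERAL)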

When a bpf program holds such a reference, the bpf program will not be
allowed to enter any new logic with uncertain runtime, such as bpf_loop
and the bpf open coded iterator.

(If the bpf program is already in a loop, then no problem, as long as
the bpf program doesn't enter a new nested loop, since the bpf verifier
guarantees that references must be released in the loop body)

In addition, such references can only be acquired and released within a
limited number of instructions, e.g., 300 instructions.

This way the bpf verifier can force bpf programs to hold such references
only for a short time to do simple get-or-set information operations.

This special reference might solve the problem Christian mentioned
in the file iterator that bpf programs might hold file references for
too long.

Do you think this is a feasible solution?
Alexei Starovoitov Feb. 28, 2025, 3:34 a.m. UTC | #5
On Thu, Feb 27, 2025 at 1:55 PM Juntong Deng <juntong.deng@outlook.com> wrote:
>
> I have an idea, though not sure if it is helpful.
>
> (This idea is for the previous problem of holding references too long)
>
> My idea is to add a new KF_FLAG, like KF_ACQUIRE_EPHEMERAL, as a
> special reference that can only be held for a short time.
>
> When a bpf program holds such a reference, the bpf program will not be
> allowed to enter any new logic with uncertain runtime, such as bpf_loop
> and the bpf open coded iterator.
>
> (If the bpf program is already in a loop, then no problem, as long as
> the bpf program doesn't enter a new nested loop, since the bpf verifier
> guarantees that references must be released in the loop body)
>
> In addition, such references can only be acquired and released between a
> limited number of instructions, e.g., 300 instructions.

Not much can be done with few instructions.
Number of insns is a coarse indicator of time. If there are calls
they can take a non-trivial amount of time.
People didn't like CRIB as a concept. Holding a _regular_ file refcnt for
the duration of the program is not a problem.
Holding special files might be, since they're not supposed to be held.
Like, is it safe to get_file() userfaultfd ? It needs in-depth
analysis and your patch didn't provide any confidence that
such analysis was done.

Speaking of more in-depth analysis of the problem.
In the cover letter you mentioned bpf_throw and exceptions as
one of the way to terminate the program, but there was another
proposal:
https://lpc.events/event/17/contributions/1610/

aka accelerated execution or fast-execute.
After the talk at LPC there were more discussions and follow ups.

Roughly the idea is the following,
during verification determine all kfuncs, helpers that
can be "speed up" and replace them with faster alternatives.
Like bpf_map_lookup_elem() can return NULL in the fast-execution version.
All KF_ACQUIRE | KF_RET_NULL kfuncs can return NULL too.
bpf_loop() can end sooner.
bpf_*_iter_next() can return NULL,
etc

Then at verification time create such a fast-execute
version of the program with 1-1 mapping of IPs / instructions.
When a prog needs to be cancelled, replace the return IP
with the IP in the fast-execute version.
Since all regs are the same, continuing in the fast-execute
version will release all currently held resources
and no need to have either run-time (like this patch set)
or exception style (resource descriptor collection of resources)
bookkeeping to release.
The program itself is going to release whatever it acquired.
bpf_throw does manual stack unwind right now.
No need for that either. Fast-execute will return back
all the way to the kernel hook via normal execution path.

Instead of patching return IP in the stack,
we can text_poke_bp() the code of the original bpf prog to
jump to the fast-execute version at corresponding IP/insn.

The key insight is that cancellation doesn't mean
that the prog stops completely. It continues, but with
an intent to finish as quickly as possible.
In practice it might be faster to do that
than walk your acquired hash table and call destructors.

Another important bit is that control flow is unchanged.
Introducing new edge in a graph is tricky and error prone.

All details need to be figured out, but so far it looks
to be the cleanest and least intrusive solution to program
cancellation.
Would you be interested in helping us design/implement it?
Juntong Deng Feb. 28, 2025, 6:59 p.m. UTC | #6
On 2025/2/28 03:34, Alexei Starovoitov wrote:
> On Thu, Feb 27, 2025 at 1:55 PM Juntong Deng <juntong.deng@outlook.com> wrote:
>>
>> I have an idea, though not sure if it is helpful.
>>
>> (This idea is for the previous problem of holding references too long)
>>
>> My idea is to add a new KF_FLAG, like KF_ACQUIRE_EPHEMERAL, as a
>> special reference that can only be held for a short time.
>>
>> When a bpf program holds such a reference, the bpf program will not be
>> allowed to enter any new logic with uncertain runtime, such as bpf_loop
>> and the bpf open coded iterator.
>>
>> (If the bpf program is already in a loop, then no problem, as long as
>> the bpf program doesn't enter a new nested loop, since the bpf verifier
>> guarantees that references must be released in the loop body)
>>
>> In addition, such references can only be acquired and released between a
>> limited number of instructions, e.g., 300 instructions.
> 
> Not much can be done with few instructions.
> Number of insns is a coarse indicator of time. If there are calls
> they can take a non-trivial amount of time.

Yes, you are right, limiting the number of instructions is not
a good idea.

> People didn't like CRIB as a concept. Holding a _regular_ file refcnt for
> the duration of the program is not a problem.
> Holding special files might be, since they're not supposed to be held.
> Like, is it safe to get_file() userfaultfd ? It needs in-depth
> analysis and your patch didn't provide any confidence that
> such analysis was done.
> 

I understand, I will try to analyze it in depth.

> Speaking of more in-depth analysis of the problem.
> In the cover letter you mentioned bpf_throw and exceptions as
> one of the way to terminate the program, but there was another
> proposal:
> https://lpc.events/event/17/contributions/1610/
> 
> aka accelerated execution or fast-execute.
> After the talk at LPC there were more discussions and follow ups.
> 
> Roughly the idea is the following,
> during verification determine all kfuncs, helpers that
> can be "speed up" and replace them with faster alternatives.
> Like bpf_map_lookup_elem() can return NULL in the fast-execution version.
> All KF_ACQUIRE | KF_RET_NULL can return NULL to.
> bpf_loop() can end sooner.
> bpf_*_iter_next() can return NULL,
> etc
> 
> Then at verification time create such a fast-execute
> version of the program with 1-1 mapping of IPs / instructions.
> When a prog needs to be cancelled replace return IP
> to IP in fast-execute version.
> Since all regs are the same, continuing in the fast-execute
> version will release all currently held resources
> and no need to have either run-time (like this patch set)
> or exception style (resource descriptor collection of resources)
> bookkeeping to release.
> The program itself is going to release whatever it acquired.
> bpf_throw does manual stack unwind right now.
> No need for that either. Fast-execute will return back
> all the way to the kernel hook via normal execution path.
> 
> Instead of patching return IP in the stack,
> we can text_poke_bp() the code of the original bpf prog to
> jump to the fast-execute version at corresponding IP/insn.
> 
> The key insight is that cancellation doesn't mean
> that the prog stops completely. It continues, but with
> an intent to finish as quickly as possible.
> In practice it might be faster to do that
> than walk your acquired hash table and call destructors.
> 
> Another important bit is that control flow is unchanged.
> Introducing new edge in a graph is tricky and error prone.
> 
> All details need to be figured out, but so far it looks
> to be the cleanest and least intrusive solution to program
> cancellation.
> Would you be interested in helping us design/implement it?

This is an amazing idea.

I am very interested in this.

But I think we may not need a fast-execute version of the bpf program
with 1-1 mapping.

Since we are going to modify the code of the bpf program through
text_poke_bp, we can directly modify all relevant CALL instructions in
the bpf program, just like the BPF runtime hook does.

For example, when we need to cancel the execution of a bpf program,
we can "live patch" the bpf program and replace the target address
in all CALL instructions that call KF_ACQUIRE and bpf_*_iter_next()
with the address of a stub function that always returns NULL.
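
Something like this (just a sketch, the name is made up):

	/* stub that every live-patched CALL is redirected to */
	static void *bpf_cancel_null_stub(void *a1, void *a2, void *a3,
					  void *a4, void *a5)
	{
		/* KF_ACQUIRE | KF_RET_NULL callers and bpf_*_iter_next()
		 * callers already handle NULL, so the program winds down
		 * through its existing NULL-checked paths.
		 */
		return NULL;
	}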

During the JIT process, we can record the locations of all CALL
instructions that may potentially be "live patched".

This seems not difficult to do. The location (ip) of the CALL
instruction can be obtained by image + addrs[i - 1].
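
Roughly (just a sketch, the aux field and the helper are made-up names):

	/* in the JIT loop, after emitting a BPF_CALL instruction */
	if (bpf_call_may_be_live_patched(insn))	/* made-up helper */
		prog->aux->patch_site_ips[n++] = image + addrs[i - 1];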

BPF_CALL ip = ffffffffc00195f1, kfunc name = bpf_task_from_pid
bpf_task_from_pid return address = ffffffffc00195f6

I did a simple experiment to verify the feasibility of this method.
In the above results, the return address of bpf_task_from_pid is
the location after the CALL instruction (ip), which means that the
ip recorded during the JIT process is correct.

After I complete a full proof of concept, I will send out the patch
series and let's see what happens.

But it may take some time as I am busy with my university
stuff recently.
Kumar Kartikeya Dwivedi March 1, 2025, 1:23 a.m. UTC | #7
On Fri, 28 Feb 2025 at 20:00, Juntong Deng <juntong.deng@outlook.com> wrote:
>
> On 2025/2/28 03:34, Alexei Starovoitov wrote:
> > On Thu, Feb 27, 2025 at 1:55 PM Juntong Deng <juntong.deng@outlook.com> wrote:
> >>
> >> I have an idea, though not sure if it is helpful.
> >>
> >> (This idea is for the previous problem of holding references too long)
> >>
> >> My idea is to add a new KF_FLAG, like KF_ACQUIRE_EPHEMERAL, as a
> >> special reference that can only be held for a short time.
> >>
> >> When a bpf program holds such a reference, the bpf program will not be
> >> allowed to enter any new logic with uncertain runtime, such as bpf_loop
> >> and the bpf open coded iterator.
> >>
> >> (If the bpf program is already in a loop, then no problem, as long as
> >> the bpf program doesn't enter a new nested loop, since the bpf verifier
> >> guarantees that references must be released in the loop body)
> >>
> >> In addition, such references can only be acquired and released between a
> >> limited number of instructions, e.g., 300 instructions.
> >
> > Not much can be done with few instructions.
> > Number of insns is a coarse indicator of time. If there are calls
> > they can take a non-trivial amount of time.
>
> Yes, you are right, limiting the number of instructions is not
> a good idea.
>
> > People didn't like CRIB as a concept. Holding a _regular_ file refcnt for
> > the duration of the program is not a problem.
> > Holding special files might be, since they're not supposed to be held.
> > Like, is it safe to get_file() userfaultfd ? It needs in-depth
> > analysis and your patch didn't provide any confidence that
> > such analysis was done.
> >
>
> I understand, I will try to analyze it in depth.
>
> > Speaking of more in-depth analysis of the problem.
> > In the cover letter you mentioned bpf_throw and exceptions as
> > one of the way to terminate the program, but there was another
> > proposal:
> > https://lpc.events/event/17/contributions/1610/
> >
> > aka accelerated execution or fast-execute.
> > After the talk at LPC there were more discussions and follow ups.
> >
> > Roughly the idea is the following,
> > during verification determine all kfuncs, helpers that
> > can be "speed up" and replace them with faster alternatives.
> > Like bpf_map_lookup_elem() can return NULL in the fast-execution version.
> > All KF_ACQUIRE | KF_RET_NULL can return NULL to.
> > bpf_loop() can end sooner.
> > bpf_*_iter_next() can return NULL,
> > etc
> >
> > Then at verification time create such a fast-execute
> > version of the program with 1-1 mapping of IPs / instructions.
> > When a prog needs to be cancelled replace return IP
> > to IP in fast-execute version.
> > Since all regs are the same, continuing in the fast-execute
> > version will release all currently held resources
> > and no need to have either run-time (like this patch set)
> > or exception style (resource descriptor collection of resources)
> > bookkeeping to release.
> > The program itself is going to release whatever it acquired.
> > bpf_throw does manual stack unwind right now.
> > No need for that either. Fast-execute will return back
> > all the way to the kernel hook via normal execution path.
> >
> > Instead of patching return IP in the stack,
> > we can text_poke_bp() the code of the original bpf prog to
> > jump to the fast-execute version at corresponding IP/insn.
> >
> > The key insight is that cancellation doesn't mean
> > that the prog stops completely. It continues, but with
> > an intent to finish as quickly as possible.
> > In practice it might be faster to do that
> > than walk your acquired hash table and call destructors.
> >
> > Another important bit is that control flow is unchanged.
> > Introducing new edge in a graph is tricky and error prone.
> >
> > All details need to be figured out, but so far it looks
> > to be the cleanest and least intrusive solution to program
> > cancellation.
> > Would you be interested in helping us design/implement it?
>
> This is an amazing idea.
>
> I am very interested in this.
>
> But I think we may not need a fast-execute version of the bpf program
> with 1-1 mapping.
>
> Since we are going to modify the code of the bpf program through
> text_poke_bp, we can directly modify all relevant CALL instructions in
> the bpf program, just like the BPF runtime hook does.

Cloning the text allows you to not make the modifications globally
visible, in case we want to support cancellations local to a CPU.
So there is a material difference.

You can argue for and against local/global cancellations, so it
seems we should not bind early to one specific choice and should keep
our options open.
It is tied to how one views BPF program execution.
Whether a single execution of the program constitutes an isolated
invocation, or whether all invocations in parallel should be affected
due to a cancellation event.
The answer may lie in how the cancellation was triggered.

Here's an anecdote:
At least when I was (or am) using this, and when I have assertions in
the program (to prove some verification property, some runtime
condition, or simply for my program logic), it was better if the
cancellation was local (and triggered synchronously on a throw). In
comparison, when I did cancellations on page faults into arena/heap
loads, the program is clearly broken, so it seemed better to rip it
out (but in my case I still chose to do that as a separate step, to
not mess with parallel invocations of the program that may still be
functioning correctly).

Unlike user space which has a clear boundary against the kernel, BPF
programs have side effects and can influence the kernel's control
flow, so "crashing" them has a semantic implication for the kernel.

>
> For example, when we need to cancel the execution of a bpf program,
> we can "live patch" the bpf program and replace the target address
> in all CALL instructions that call KF_ACQUIRE and bpf_*_iter_next()
> with the address of a stub function that always returns NULL.
>
> During the JIT process, we can record the locations of all CALL
> instructions that may potentially be "live patched".
>
> This seems not difficult to do. The location (ip) of the CALL
> instruction can be obtained by image + addrs[i - 1].
>
> BPF_CALL ip = ffffffffc00195f1, kfunc name = bpf_task_from_pid
> bpf_task_from_pid return address = ffffffffc00195f6
>
> I did a simple experiment to verify the feasibility of this method.
> In the above results, the return address of bpf_task_from_pid is
> the location after the CALL instruction (ip), which means that the
> ip recorded during the JIT process is correct.
>
> After I complete a full proof of concept, I will send out the patch
> series and let's see what happens.

We should also think about whether removing the exceptions support makes sense.
Since it's not complete upstream (in terms of releasing held resources), it
hasn't found much use (except whatever I tried to use it for).
There would be some exotic use cases (like using it to prove to the
verifier some precondition on some kernel resource), but that wouldn't
be a justification to keep it around.

One of the original use cases was asserting that a map return value is not NULL.
The most pressing case is already solved by making the verifier
smarter for array maps.

As such there may not be much value, so it might be better to just
drop that code altogether and simplify the verifier if this approach
seems viable and lands.
Since it's all exposed through kfuncs, there's no UAPI constraint.


>
> But it may take some time as I am busy with my university
> stuff recently.
Juntong Deng March 3, 2025, 2:41 p.m. UTC | #8
On 2025/3/1 01:23, Kumar Kartikeya Dwivedi wrote:
> On Fri, 28 Feb 2025 at 20:00, Juntong Deng <juntong.deng@outlook.com> wrote:
>>
>> On 2025/2/28 03:34, Alexei Starovoitov wrote:
>>> On Thu, Feb 27, 2025 at 1:55 PM Juntong Deng <juntong.deng@outlook.com> wrote:
>>>>
>>>> I have an idea, though not sure if it is helpful.
>>>>
>>>> (This idea is for the previous problem of holding references too long)
>>>>
>>>> My idea is to add a new KF_FLAG, like KF_ACQUIRE_EPHEMERAL, as a
>>>> special reference that can only be held for a short time.
>>>>
>>>> When a bpf program holds such a reference, the bpf program will not be
>>>> allowed to enter any new logic with uncertain runtime, such as bpf_loop
>>>> and the bpf open coded iterator.
>>>>
>>>> (If the bpf program is already in a loop, then no problem, as long as
>>>> the bpf program doesn't enter a new nested loop, since the bpf verifier
>>>> guarantees that references must be released in the loop body)
>>>>
>>>> In addition, such references can only be acquired and released between a
>>>> limited number of instructions, e.g., 300 instructions.
>>>
>>> Not much can be done with few instructions.
>>> Number of insns is a coarse indicator of time. If there are calls
>>> they can take a non-trivial amount of time.
>>
>> Yes, you are right, limiting the number of instructions is not
>> a good idea.
>>
>>> People didn't like CRIB as a concept. Holding a _regular_ file refcnt for
>>> the duration of the program is not a problem.
>>> Holding special files might be, since they're not supposed to be held.
>>> Like, is it safe to get_file() userfaultfd ? It needs in-depth
>>> analysis and your patch didn't provide any confidence that
>>> such analysis was done.
>>>
>>
>> I understand, I will try to analyze it in depth.
>>
>>> Speaking of more in-depth analysis of the problem.
>>> In the cover letter you mentioned bpf_throw and exceptions as
>>> one of the way to terminate the program, but there was another
>>> proposal:
>>> https://lpc.events/event/17/contributions/1610/
>>>
>>> aka accelerated execution or fast-execute.
>>> After the talk at LPC there were more discussions and follow ups.
>>>
>>> Roughly the idea is the following,
>>> during verification determine all kfuncs, helpers that
>>> can be "speed up" and replace them with faster alternatives.
>>> Like bpf_map_lookup_elem() can return NULL in the fast-execution version.
>>> All KF_ACQUIRE | KF_RET_NULL can return NULL to.
>>> bpf_loop() can end sooner.
>>> bpf_*_iter_next() can return NULL,
>>> etc
>>>
>>> Then at verification time create such a fast-execute
>>> version of the program with 1-1 mapping of IPs / instructions.
>>> When a prog needs to be cancelled replace return IP
>>> to IP in fast-execute version.
>>> Since all regs are the same, continuing in the fast-execute
>>> version will release all currently held resources
>>> and no need to have either run-time (like this patch set)
>>> or exception style (resource descriptor collection of resources)
>>> bookkeeping to release.
>>> The program itself is going to release whatever it acquired.
>>> bpf_throw does manual stack unwind right now.
>>> No need for that either. Fast-execute will return back
>>> all the way to the kernel hook via normal execution path.
>>>
>>> Instead of patching return IP in the stack,
>>> we can text_poke_bp() the code of the original bpf prog to
>>> jump to the fast-execute version at corresponding IP/insn.
>>>
>>> The key insight is that cancellation doesn't mean
>>> that the prog stops completely. It continues, but with
>>> an intent to finish as quickly as possible.
>>> In practice it might be faster to do that
>>> than walk your acquired hash table and call destructors.
>>>
>>> Another important bit is that control flow is unchanged.
>>> Introducing new edge in a graph is tricky and error prone.
>>>
>>> All details need to be figured out, but so far it looks
>>> to be the cleanest and least intrusive solution to program
>>> cancellation.
>>> Would you be interested in helping us design/implement it?
>>
>> This is an amazing idea.
>>
>> I am very interested in this.
>>
>> But I think we may not need a fast-execute version of the bpf program
>> with 1-1 mapping.
>>
>> Since we are going to modify the code of the bpf program through
>> text_poke_bp, we can directly modify all relevant CALL instructions in
>> the bpf program, just like the BPF runtime hook does.
> 
> Cloning the text allows you to not make the modifications globally
> visible, in case we want to support cancellations local to a CPU.
> So there is a material difference
> 
> You can argue for and against local/global cancellations, therefore it
> seems we should not bind early to one specific choice and keep options
> open.
> It is tied to how one views BPF program execution.
> Whether a single execution of the program constitutes an isolated
> invocation, or whether all invocations in parallel should be affected
> due to a cancellation event.
> The answer may lie in how the cancellation was triggered.
> 
> Here's an anecdote:
> At least when I was (or am) using this, and when I have assertions in
> the program (to prove some verification property, some runtime
> condition, or simply for my program logic), it was better if the
> cancellation was local (and triggered synchronously on a throw). In
> comparison, when I did cancellations on page faults into arena/heap
> loads, the program is clearly broken, so it seemed better to rip it
> out (but in my case I still chose to do that as a separate step, to
> not mess with parallel invocations of the program that may still be
> functioning correctly).
> 
> Unlike user space which has a clear boundary against the kernel, BPF
> programs have side effects and can influence the kernel's control
> flow, so "crashing" them has a semantic implication for the kernel.
> 

Thanks for your reply.

Yes, I agree, we should keep the options open.

My earlier thinking was simply that if a bpf program meets the
conditions for cancellation, then we can assume it is a potentially
"malicious" bpf program that may be cancelled again in the future, so
we should just rip it out of the kernel as a whole, like an immune
system would.

But I ignored the assertion case before and this should be considered.

If we do not modify the bpf program code (which is globally visible), it
seems that the feasible solution comes back to modifying the return
address in the runtime stack of the bpf program, since only the stack is
not shared. I am not sure, but accurately modifying the stack of a
running bpf program does not seem easy to implement?

>>
>> For example, when we need to cancel the execution of a bpf program,
>> we can "live patch" the bpf program and replace the target address
>> in all CALL instructions that call KF_ACQUIRE and bpf_*_iter_next()
>> with the address of a stub function that always returns NULL.
>>
>> During the JIT process, we can record the locations of all CALL
>> instructions that may potentially be "live patched".
>>
>> This seems not difficult to do. The location (ip) of the CALL
>> instruction can be obtained by image + addrs[i - 1].
>>
>> BPF_CALL ip = ffffffffc00195f1, kfunc name = bpf_task_from_pid
>> bpf_task_from_pid return address = ffffffffc00195f6
>>
>> I did a simple experiment to verify the feasibility of this method.
>> In the above results, the return address of bpf_task_from_pid is
>> the location after the CALL instruction (ip), which means that the
>> ip recorded during the JIT process is correct.
>>
>> After I complete a full proof of concept, I will send out the patch
>> series and let's see what happens.
> 
> We should also think about whether removing the exceptions support makes sense.
> Since it's not complete upstream (in terms of releasing held resources), it
> hasn't found much use (except whatever I tried to use it for).
> There would be some exotic use cases (like using it to prove to the
> verifier some precondition on some kernel resource), but that wouldn't
> be a justification to keep it around.
> 
> One of the original use cases was asserting that a map return value is not NULL.
> The most pressing case is already solved by making the verifier
> smarter for array maps.
> 
> As such there may not be much value, so it might be better to just
> drop that code altogether and simplify the verifier if this approach
> seems viable and lands.
> Since it's all exposed through kfuncs, there's no UAPI constraint.
> 

In my opinion, even if we have a watchdog, BPF exceptions still serve a
different purpose.

The purpose of the watchdog is to force the cancellation of the
execution of an out-of-control bpf program, whereas the purpose of the
BPF exception is for the bpf program to proactively choose to cancel
the execution itself.

For example when a bpf program realizes that something is not right,
that a fatal error has been detected, and that there may be a
significant risk in continuing execution, then it can choose to
proactively and immediately cancel its own execution.

From this perspective, BPF exceptions are more of a panic() mechanism
for bpf programs than exceptions in other programming languages.

So I think the key to whether BPF exceptions are valuable is whether we
think that exceptions may occur during the execution of a bpf program,
and whether it is the responsibility of the bpf program developer to
handle these exceptions (e.g., ensuring runtime correctness via
assertions).

This seems to be a controversial topic because we seem to keep assuming
that bpf programs are safe.

> 
>>
>> But it may take some time as I am busy with my university
>> stuff recently.

Patch

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 2bd7fc996756..39f12d101809 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -567,6 +567,10 @@  struct btf_field_iter {
 };
 
 #ifdef CONFIG_BPF_SYSCALL
+void *bpf_runtime_acquire_hook(void *arg1, void *arg2, void *arg3,
+			       void *arg4, void *arg5, void *arg6);
+void bpf_runtime_release_hook(void *arg1, void *arg2, void *arg3,
+			      void *arg4, void *arg5, void *arg6);
 const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
 void btf_set_base_btf(struct btf *btf, const struct btf *base_btf);
 int btf_relocate(struct btf *btf, const struct btf *base_btf, __u32 **map_ids);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 3548b52ca9c2..93ca804d52e3 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -9532,3 +9532,111 @@  bool btf_param_match_suffix(const struct btf *btf,
 	param_name += len - suffix_len;
 	return !strncmp(param_name, suffix, suffix_len);
 }
+
+static void print_bpf_active_refs(void)
+{
+	/* This function is only used to demonstrate the idea */
+	struct btf_struct_kfunc_tab *tab;
+	struct btf_struct_kfunc *kfunc;
+	struct bpf_run_ctx *bpf_ctx;
+	struct bpf_ref_node *node;
+	char sym[KSYM_SYMBOL_LEN];
+	int bkt, num = 0;
+	struct btf *btf;
+
+	btf = bpf_get_btf_vmlinux();
+	bpf_ctx = current->bpf_ctx;
+	tab = btf->release_kfunc_btf_tab;
+
+	pr_info("bpf prog current release table:\n");
+
+	if (hash_empty(bpf_ctx->active_ref_list)) {
+		pr_info("table empty\n");
+		return;
+	}
+
+	hash_for_each(bpf_ctx->active_ref_list, bkt, node, hnode) {
+		kfunc = bsearch(&node->struct_btf_id, tab->set, tab->cnt,
+				sizeof(struct btf_struct_kfunc), btf_id_cmp_func);
+		sprint_symbol(sym, kfunc->kfunc_addr);
+		pr_info("obj %d, obj addr = %lx, btf id = %d, can be released by %s\n",
+			num, node->obj_addr, node->struct_btf_id, sym);
+		num++;
+		/*
+		 * If we want to release this object, we can use
+		 * void (*release_kfunc)(void *) = (void (*)(void *))kfunc->kfunc_addr;
+		 * release_kfunc((void *)node->obj_addr);
+		 */
+	}
+}
+
+typedef void *(*bpf_kfunc_t)(void *arg1, void *arg2, void *arg3,
+			     void *arg4, void *arg5);
+
+void *bpf_runtime_acquire_hook(void *arg1, void *arg2, void *arg3,
+			       void *arg4, void *arg5, void *arg6 /* kfunc addr */)
+{
+	struct btf_struct_kfunc *struct_kfunc, dummy_key;
+	struct btf_struct_kfunc_tab *tab;
+	struct bpf_run_ctx *bpf_ctx;
+	struct bpf_ref_node *node;
+	bpf_kfunc_t kfunc;
+	struct btf *btf;
+	void *kfunc_ret;
+
+	kfunc = (bpf_kfunc_t)arg6;
+	kfunc_ret = kfunc(arg1, arg2, arg3, arg4, arg5);
+
+	if (!kfunc_ret)
+		return kfunc_ret;
+
+	bpf_ctx = current->bpf_ctx;
+	btf = bpf_get_btf_vmlinux();
+
+	tab = btf->acquire_kfunc_tab;
+	if (!tab)
+		return kfunc_ret;
+
+	dummy_key.kfunc_addr = (unsigned long)arg6;
+	struct_kfunc = bsearch(&dummy_key, tab->set, tab->cnt,
+			       sizeof(struct btf_struct_kfunc),
+			       btf_kfunc_addr_cmp_func);
+
+	node = list_first_entry(&bpf_ctx->free_ref_list, struct bpf_ref_node, lnode);
+	node->obj_addr = (unsigned long)kfunc_ret;
+	node->struct_btf_id = struct_kfunc->struct_btf_id;
+
+	list_del(&node->lnode);
+	hash_add(bpf_ctx->active_ref_list, &node->hnode, node->obj_addr);
+
+	pr_info("bpf prog acquire obj addr = %lx, btf id = %d\n",
+		node->obj_addr, node->struct_btf_id);
+	print_bpf_active_refs();
+
+	return kfunc_ret;
+}
+
+void bpf_runtime_release_hook(void *arg1, void *arg2, void *arg3,
+			      void *arg4, void *arg5, void *arg6 /* kfunc addr */)
+{
+	struct bpf_run_ctx *bpf_ctx;
+	struct bpf_ref_node *node;
+	bpf_kfunc_t kfunc;
+
+	kfunc = (bpf_kfunc_t)arg6;
+	kfunc(arg1, arg2, arg3, arg4, arg5);
+
+	bpf_ctx = current->bpf_ctx;
+
+	hash_for_each_possible(bpf_ctx->active_ref_list, node, hnode, (unsigned long)arg1) {
+		if (node->obj_addr == (unsigned long)arg1) {
+			hash_del(&node->hnode);
+			list_add(&node->lnode, &bpf_ctx->free_ref_list);
+
+			pr_info("bpf prog release obj addr = %lx, btf id = %d\n",
+				node->obj_addr, node->struct_btf_id);
+		}
+	}
+
+	print_bpf_active_refs();
+}