Message ID | 20231022154527.229117-3-zhouchuyi@bytedance.com (mailing list archive)
---|---
State | Superseded
Delegated to | BPF
Series | Relax allowlist for open-coded css_task iter
On Sun, Oct 22, 2023 at 8:45 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
> This patch adds a test which demonstrates how css_task iter can be combined
> with cgroup iter and it won't cause deadlock, though cgroup iter is not
> sleepable.
>
> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
> ---
[cut]
> +	BPF_SEQ_PRINTF(seq, "%8llu\n", cgroup_id(cgrp));
> +
> +	acquired = bpf_cgroup_from_id(cgroup_id(cgrp));

You're doing this dance because a plain cgrp pointer is not trusted?
Maybe let's add
	BTF_TYPE_SAFE_RCU_OR_NULL(struct bpf_iter__cgroup) {...}
to the verifier, similar to what we do for bpf_iter__task.
After the if (cgrp == NULL) check, the pointer is safe to iterate on.
On 2023/10/23 00:03, Alexei Starovoitov wrote:
> On Sun, Oct 22, 2023 at 8:45 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>>
>> This patch adds a test which demonstrates how css_task iter can be combined
>> with cgroup iter and it won't cause deadlock, though cgroup iter is not
>> sleepable.
[cut]
>> +	BPF_SEQ_PRINTF(seq, "%8llu\n", cgroup_id(cgrp));
>> +
>> +	acquired = bpf_cgroup_from_id(cgroup_id(cgrp));
>
> You're doing this dance because a plain cgrp pointer is not trusted?
> Maybe let's add
> 	BTF_TYPE_SAFE_RCU_OR_NULL(struct bpf_iter__cgroup) {...}
> to the verifier, similar to what we do for bpf_iter__task.

Yes, thanks for the suggestion. But it seems that currently
bpf_iter__task->task doesn't work either:

SEC("iter/task")
int dump_task(struct bpf_iter__task *ctx)
{
	struct task_struct *task = ctx->task;

	/* here task should be trusted, since we have
	 * BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) {...}
	 */
	if (task == NULL)
		return 0;
	bpf_task_acquire(task);
	bpf_task_release(task);
	return 0;
}

The log of the verifier is:

VERIFIER LOG:
=============
reg type unsupported for arg#0 function dump_task#7
0: R1=ctx(off=0,imm=0) R10=fp0
; struct task_struct *task = ctx->task;
0: (79) r6 = *(u64 *)(r1 +8)	; R1=ctx(off=0,imm=0) R6_w=ptr_or_null_task_struct(id=1,off=0,imm=0)
; if (task == NULL) {
1: (15) if r6 == 0x0 goto pc+4	; R6_w=ptr_task_struct(off=0,imm=0)
; bpf_task_acquire(task);
2: (bf) r1 = r6	; R1_w=ptr_task_struct(off=0,imm=0) R6_w=ptr_task_struct(off=0,imm=0)
3: (85) call bpf_task_acquire#26990
R1 must be a rcu pointer
processed 4 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0

From the above log, it seems 'task' is a normal pointer, not a trusted
pointer. Actually, it seems the BPF verifier didn't even call
check_ptr_to_btf_access()..

But bpf_iter__task->meta works well; the BPF verifier considers it a
trusted pointer:

SEC("iter/task")
int dump_task(struct bpf_iter__task *ctx)
{
	struct task_struct *task = ctx->task;
	struct bpf_iter_meta *meta = ctx->meta;
	struct seq_file *seq = meta->seq;

	if (task == NULL) {
		BPF_SEQ_PRINTF(seq, "%s\n", "NULL");
		return 0;
	}
	bpf_task_acquire(task);
	bpf_task_release(task);
	return 0;
}

VERIFIER LOG:
=============
reg type unsupported for arg#0 function dump_task#7
0: R1=ctx(off=0,imm=0) R10=fp0
; int dump_task(struct bpf_iter__task *ctx)
0: (bf) r8 = r1	; R1=ctx(off=0,imm=0) R8_w=ctx(off=0,imm=0)
; struct bpf_iter_meta *meta = ctx->meta;
1: (79) r1 = *(u64 *)(r8 +0)
func 'bpf_iter_task' arg0 has btf_id 22699 type STRUCT 'bpf_iter_meta'
2: R1_w=trusted_ptr_bpf_iter_meta(off=0,imm=0) R8_w=ctx(off=0,imm=0)
; struct seq_file *seq = meta->seq;
2: (79) r6 = *(u64 *)(r1 +0)	; R1_w=trusted_ptr_bpf_iter_meta(off=0,imm=0) R6_w=trusted_ptr_seq_file(off=0,imm=0)
; struct task_struct *task = ctx->task;
3: (79) r7 = *(u64 *)(r8 +8)	; R7_w=ptr_or_null_task_struct(id=1,off=0,imm=0) R8_w=ctx(off=0,imm=0)
; if (task == (void *)0) {
4: (55) if r7 != 0x0 goto pc+12
17: R1_w=trusted_ptr_bpf_iter_meta(off=0,imm=0) R6_w=trusted_ptr_seq_file(off=0,imm=0) R7_w=ptr_task_struct(off=0,imm=0) R8_w=ctx(off=0,imm=0) R10=fp0
; bpf_task_acquire(task);
17: (bf) r1 = r7	; R1_w=ptr_task_struct(off=0,imm=0) R7_w=ptr_task_struct(off=0,imm=0)
18: (85) call bpf_task_acquire#26990
R1 must be a rcu pointer

I will try to figure it out. Thanks.
On Mon, Oct 23, 2023 at 6:50 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
> 17: (bf) r1 = r7	; R1_w=ptr_task_struct(off=0,imm=0) R7_w=ptr_task_struct(off=0,imm=0)
> 18: (85) call bpf_task_acquire#26990
> R1 must be a rcu pointer
>
> I will try to figure it out.

Thanks. That would be great.
So far it looks like a regression.
I'm guessing __bpf_md_ptr wrapping is confusing the verifier.

Since it's more complicated than I thought, please respin
the current set with fixes to patch 1 and leave patch 2 as-is.
That can be another follow-up.
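For context, __bpf_md_ptr is the UAPI wrapper that pads pointer members of iterator context structs to 64 bits so their layout is the same on 32-bit and 64-bit kernels; this is the wrapping suspected of confusing the verifier here. Quoted from include/uapi/linux/bpf.h and kernel/bpf/task_iter.c from memory, so verify against the tree:

#define __bpf_md_ptr(type, name)	\
union {					\
	type name;			\
	__u64 :64;			\
} __attribute__((aligned(8)))

/* the iterator ctx struct is built from it: */
struct bpf_iter__task {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
};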
Hello,

On 2023/10/24 01:14, Alexei Starovoitov wrote:
> On Mon, Oct 23, 2023 at 6:50 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>>
>> I will try to figure it out.
>
> Thanks. That would be great.
> So far it looks like a regression.
> I'm guessing __bpf_md_ptr wrapping is confusing the verifier.
>
> Since it's more complicated than I thought, please respin
> the current set with fixes to patch 1 and leave patch 2 as-is.
> That can be another follow-up.

After adding this change:

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 15d71d2986d3..4565e5457754 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6071,6 +6071,8 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 			info->reg_type = ctx_arg_info->reg_type;
 			info->btf = btf_vmlinux;
 			info->btf_id = ctx_arg_info->btf_id;
+			if (prog_args_trusted(prog))
+				info->reg_type |= PTR_TRUSTED;
 			return true;
 		}
 	}

the task pointer is recognized as 'trusted_ptr_or_null_task_struct'.

The VERIFIER LOG:
=============
reg type unsupported for arg#0 function dump_task#7
0: R1=ctx(off=0,imm=0) R10=fp0
; struct task_struct *task = ctx->task;
0: (79) r1 = *(u64 *)(r1 +8)	; R1_w=trusted_ptr_or_null_task_struct(id=1,off=0,imm=0)

The following BPF prog works well:

SEC("iter/task")
int dump_task(struct bpf_iter__task *ctx)
{
	struct task_struct *task = ctx->task;
	struct task_struct *acq;

	if (task == NULL)
		return 0;
	acq = bpf_task_acquire(task);
	if (!acq)
		return 0;
	bpf_task_release(acq);
	return 0;
}

Do you think this change is correct? Or do you have better ideas?

Thanks.
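For context, prog_args_trusted() is the existing helper in kernel/bpf/btf.c that the proposed two-liner relies on; it decides whether a program type's ctx arguments are treated as trusted. Roughly, paraphrased from the kernel of this era, so treat the exact case list as an approximation:

static bool prog_args_trusted(const struct bpf_prog *prog)
{
	enum bpf_attach_type atype = prog->expected_attach_type;

	switch (prog->type) {
	case BPF_PROG_TYPE_TRACING:
		/* BPF_TRACE_ITER is in this list, which is why ctx->meta
		 * is already seen as trusted for iterator programs */
		return atype == BPF_TRACE_RAW_TP || atype == BPF_TRACE_ITER;
	case BPF_PROG_TYPE_LSM:
		return bpf_lsm_is_trusted(prog);
	case BPF_PROG_TYPE_STRUCT_OPS:
		return true;
	default:
		return false;
	}
}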
On 2023/10/30 22:37, Chuyi Zhou wrote:
> Hello,
>
> On 2023/10/24 01:14, Alexei Starovoitov wrote:
>> Since it's more complicated than I thought, please respin
>> the current set with fixes to patch 1 and leave patch 2 as-is.
>> That can be another follow-up.
>
> After adding this change:
>
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
[cut]
> +			if (prog_args_trusted(prog))
> +				info->reg_type |= PTR_TRUSTED;
[cut]
> the task pointer is recognized as 'trusted_ptr_or_null_task_struct'.

bpf_iter__task->meta is recognized as a trusted ptr because in
btf_ctx_access() we add the PTR_TRUSTED flag when prog_args_trusted(prog)
returns true. It seems that for BPF_TRACE_ITER, ctx access is always
'trusted'.

But I noticed that in task_iter.c we actually explicitly declare that the
type of bpf_iter__task->task is PTR_TO_BTF_ID_OR_NULL:

static struct bpf_iter_reg task_reg_info = {
	.target			= "task",
	.attach_target		= bpf_iter_attach_task,
	.feature		= BPF_ITER_RESCHED,
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__task, task),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &task_seq_info,
	.fill_link_info		= bpf_iter_fill_link_info,
	.show_fdinfo		= bpf_iter_task_show_fdinfo,
};

and btf_ctx_access() enforces that the reg_type for bpf_iter__task->task
is ctx_arg_info->reg_type (PTR_TO_BTF_ID_OR_NULL):

bool btf_ctx_access()
	....
	/* this is a pointer to another type */
	for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
		const struct bpf_ctx_arg_aux *ctx_arg_info =
					&prog->aux->ctx_arg_info[i];

		if (ctx_arg_info->offset == off) {
			...
			info->reg_type = ctx_arg_info->reg_type;
			info->btf = btf_vmlinux;
			info->btf_id = ctx_arg_info->btf_id;
			return true;
		}
	}

So, maybe another possible solution is:

diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
index 209e5135f9fb..72a6778e3fba 100644
--- a/kernel/bpf/cgroup_iter.c
+++ b/kernel/bpf/cgroup_iter.c
@@ -282,7 +282,7 @@ static struct bpf_iter_reg bpf_cgroup_reg_info = {
 	.ctx_arg_info_size	= 1,
 	.ctx_arg_info		= {
 		{ offsetof(struct bpf_iter__cgroup, cgroup),
-		  PTR_TO_BTF_ID_OR_NULL },
+		  PTR_TO_BTF_ID_OR_NULL | MEM_RCU },
 	},
 	.seq_info		= &cgroup_iter_seq_info,
 };
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 59e747938bdb..4fd3f734dffd 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -706,7 +706,7 @@ static struct bpf_iter_reg task_reg_info = {
 	.ctx_arg_info_size	= 1,
 	.ctx_arg_info		= {
 		{ offsetof(struct bpf_iter__task, task),
-		  PTR_TO_BTF_ID_OR_NULL },
+		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
 	},
 	.seq_info		= &task_seq_info,
 	.fill_link_info		= bpf_iter_fill_link_info,

Just some thoughts. Thanks.
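To make the payoff of this proposal concrete: with ctx->cgroup carrying a trust flag, a cgroup iter program could feed the context pointer straight into the css_task iterator instead of the bpf_cgroup_from_id()/bpf_cgroup_release() round-trip used in the test above. A hypothetical sketch, assuming the ctx_arg_info change lands; the program name is made up, and target_pid/css_task_cnt are the globals from iters_css_task.c:

SEC("?iter/cgroup")
int walk_cgroup_procs(struct bpf_iter__cgroup *ctx)
{
	struct cgroup *cgrp = ctx->cgroup;
	struct task_struct *task;

	/* NULL on the epilogue call, as in cgroup_id_printer() */
	if (!cgrp)
		return 0;

	/* the verifier would now accept ctx->cgroup as-is */
	bpf_for_each(css_task, task, &cgrp->self, CSS_TASK_ITER_PROCS) {
		if (task->pid == target_pid)
			css_task_cnt++;
	}
	return 0;
}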
On Tue, Oct 31, 2023 at 4:38 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
> So, maybe another possible solution is:
>
> diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
[cut]
> 		{ offsetof(struct bpf_iter__cgroup, cgroup),
> -		  PTR_TO_BTF_ID_OR_NULL },
> +		  PTR_TO_BTF_ID_OR_NULL | MEM_RCU },
[cut]
> 		{ offsetof(struct bpf_iter__task, task),
> -		  PTR_TO_BTF_ID_OR_NULL },
> +		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },

Yep. That looks good.
bpf_cgroup_reg_info -> cgroup is probably PTR_TRUSTED too.
Not sure... why did you go with MEM_RCU there?
Hello,

On 2023/11/1 06:06, Alexei Starovoitov wrote:
> On Tue, Oct 31, 2023 at 4:38 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>>
>> So, maybe another possible solution is:
[cut]
> Yep. That looks good.
> bpf_cgroup_reg_info -> cgroup is probably PTR_TRUSTED too.
> Not sure... why did you go with MEM_RCU there?

hmm...

That is because in our previous discussion you suggested we'd better add
	BTF_TYPE_SAFE_RCU_OR_NULL(struct bpf_iter__cgroup) {...}
I didn't think too much about it. But I noticed that we only use
cgroup_mutex to protect the iteration in cgroup_iter.c.

Looking at cgroup_kn_lock_live() in kernel/cgroup/cgroup.c, it uses
cgroup_tryget()/cgroup_is_dead() to check whether the cgrp is 'dead'.
cgroup_tryget() seems equivalent to bpf_cgroup_acquire(). So maybe we
should return an 'rcu' cgrp pointer, and if a BPF prog wants a stronger
guarantee on the cgrp, it can just use bpf_cgroup_acquire().

Just some thoughts. Thanks.
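For the equivalence being drawn here: cgroup_tryget() and bpf_cgroup_acquire() do boil down to the same refcount attempt. Roughly, paraphrasing include/linux/cgroup.h and kernel/bpf/helpers.c of this timeframe (verify against the tree):

/* include/linux/cgroup.h */
static inline bool cgroup_tryget(struct cgroup *cgrp)
{
	return css_tryget(&cgrp->self);
}

/* kernel/bpf/helpers.c: the kfunc returns NULL if the refcount is gone */
__bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
{
	return cgroup_tryget(cgrp) ? cgrp : NULL;
}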
On Tue, Oct 31, 2023 at 7:41 PM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
> Hello,
>
> On 2023/11/1 06:06, Alexei Starovoitov wrote:
>> Yep. That looks good.
>> bpf_cgroup_reg_info -> cgroup is probably PTR_TRUSTED too.
>> Not sure... why did you go with MEM_RCU there?
>
> hmm...
>
> That is because in our previous discussion you suggested we'd better add
> 	BTF_TYPE_SAFE_RCU_OR_NULL(struct bpf_iter__cgroup) {...}

I mentioned that because we don't have BTF_TYPE_SAFE_TRUSTED_OR_NULL,
and the cgroup pointer can be NULL; but since you found a cleaner way,
we can do PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED.

> I didn't think too much about it. But I noticed that we only use
> cgroup_mutex to protect the iteration in cgroup_iter.c.
>
> Looking at cgroup_kn_lock_live() in kernel/cgroup/cgroup.c, it uses
> cgroup_tryget()/cgroup_is_dead() to check whether the cgrp is 'dead'.
> cgroup_tryget() seems equivalent to bpf_cgroup_acquire(). So maybe we
> should return an 'rcu' cgrp pointer, and if a BPF prog wants a stronger
> guarantee on the cgrp, it can just use bpf_cgroup_acquire().

And that would be misleading.
MEM_RCU means that the pointer is valid, but it could have refcnt == 0,
while PTR_TRUSTED means that it's good to use as-is.

Here the cgroup pointer is trusted. It's not a dead cgroup.
See __cgroup_iter_seq_show() in kernel/bpf/cgroup_iter.c.
A bpf prog doesn't need to call bpf_cgroup_acquire().
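The check Alexei points at: __cgroup_iter_seq_show() skips dead cgroups before the program ever runs, which is what justifies handing the prog a trusted pointer. A condensed sketch of that function, abridged from memory of kernel/bpf/cgroup_iter.c (locking and seq bookkeeping elided, so treat the details as approximate):

static int __cgroup_iter_seq_show(struct seq_file *seq,
				  struct cgroup_subsys_state *css, int in_stop)
{
	struct bpf_iter__cgroup ctx;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	int ret = 0;

	/* cgroup is dead, skip this element: the prog never sees it */
	if (css && cgroup_is_dead(css->cgroup))
		return 0;

	ctx.meta = &meta;
	ctx.cgroup = css ? css->cgroup : NULL;
	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (prog)
		ret = bpf_iter_run_prog(prog, &ctx);

	return ret;
}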
On 2023/11/2 14:07, Alexei Starovoitov wrote:
> On Tue, Oct 31, 2023 at 7:41 PM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
[cut]
> And that would be misleading.
> MEM_RCU means that the pointer is valid, but it could have refcnt == 0,
> while PTR_TRUSTED means that it's good to use as-is.
>
> Here the cgroup pointer is trusted. It's not a dead cgroup.
> See __cgroup_iter_seq_show() in kernel/bpf/cgroup_iter.c.
> A bpf prog doesn't need to call bpf_cgroup_acquire().

Thanks for the detailed explanation. I will send a follow-up.
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c
index e02feb5fae97..3679687a6927 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c
@@ -4,6 +4,7 @@
 #include <test_progs.h>
 #include <bpf/libbpf.h>
 #include <bpf/btf.h>
+#include "iters_css_task.skel.h"
 #include "cgroup_iter.skel.h"
 #include "cgroup_helpers.h"
 
@@ -263,6 +264,35 @@ static void test_walk_dead_self_only(struct cgroup_iter *skel)
 	close(cgrp_fd);
 }
 
+static void test_walk_self_only_css_task(void)
+{
+	struct iters_css_task *skel = NULL;
+	int err;
+
+	skel = iters_css_task__open();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		return;
+
+	bpf_program__set_autoload(skel->progs.cgroup_id_printer, true);
+
+	err = iters_css_task__load(skel);
+	if (!ASSERT_OK(err, "skel_load"))
+		goto cleanup;
+
+	err = join_cgroup(cg_path[CHILD2]);
+	if (!ASSERT_OK(err, "join_cgroup"))
+		goto cleanup;
+
+	skel->bss->target_pid = getpid();
+	snprintf(expected_output, sizeof(expected_output),
+		 PROLOGUE "%8llu\n" EPILOGUE, cg_id[CHILD2]);
+	read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[CHILD2],
+			      BPF_CGROUP_ITER_SELF_ONLY, "test_walk_self_only_css_task");
+	ASSERT_EQ(skel->bss->css_task_cnt, 1, "css_task_cnt");
+cleanup:
+	iters_css_task__destroy(skel);
+}
+
 void test_cgroup_iter(void)
 {
 	struct cgroup_iter *skel = NULL;
@@ -293,6 +323,9 @@ void test_cgroup_iter(void)
 		test_walk_self_only(skel);
 	if (test__start_subtest("cgroup_iter__dead_self_only"))
 		test_walk_dead_self_only(skel);
+	if (test__start_subtest("cgroup_iter__self_only_css_task"))
+		test_walk_self_only_css_task();
+
 out:
 	cgroup_iter__destroy(skel);
 	cleanup_cgroups();
diff --git a/tools/testing/selftests/bpf/progs/iters_css_task.c b/tools/testing/selftests/bpf/progs/iters_css_task.c
index 5089ce384a1c..0974e6f44328 100644
--- a/tools/testing/selftests/bpf/progs/iters_css_task.c
+++ b/tools/testing/selftests/bpf/progs/iters_css_task.c
@@ -10,6 +10,7 @@
 
 char _license[] SEC("license") = "GPL";
 
+struct cgroup *bpf_cgroup_acquire(struct cgroup *p) __ksym;
 struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym;
 void bpf_cgroup_release(struct cgroup *p) __ksym;
 
@@ -45,3 +46,43 @@ int BPF_PROG(iter_css_task_for_each, struct vm_area_struct *vma,
 
 	return -EPERM;
 }
+
+static inline u64 cgroup_id(struct cgroup *cgrp)
+{
+	return cgrp->kn->id;
+}
+
+SEC("?iter/cgroup")
+int cgroup_id_printer(struct bpf_iter__cgroup *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	struct cgroup *cgrp, *acquired;
+	struct cgroup_subsys_state *css;
+	struct task_struct *task;
+
+	cgrp = ctx->cgroup;
+
+	/* epilogue */
+	if (cgrp == NULL) {
+		BPF_SEQ_PRINTF(seq, "epilogue\n");
+		return 0;
+	}
+
+	/* prologue */
+	if (ctx->meta->seq_num == 0)
+		BPF_SEQ_PRINTF(seq, "prologue\n");
+
+	BPF_SEQ_PRINTF(seq, "%8llu\n", cgroup_id(cgrp));
+
+	acquired = bpf_cgroup_from_id(cgroup_id(cgrp));
+	if (!acquired)
+		return 0;
+	css = &acquired->self;
+	css_task_cnt = 0;
+	bpf_for_each(css_task, task, css, CSS_TASK_ITER_PROCS) {
+		if (task->pid == target_pid)
+			css_task_cnt++;
+	}
+	bpf_cgroup_release(acquired);
+	return 0;
+}
This patch adds a test which demonstrates how css_task iter can be combined
with cgroup iter and it won't cause deadlock, though cgroup iter is not
sleepable.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
---
 .../selftests/bpf/prog_tests/cgroup_iter.c | 33 +++++++++++++++
 .../selftests/bpf/progs/iters_css_task.c   | 41 +++++++++++++++++++
 2 files changed, 74 insertions(+)
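To run just the new subtest, the usual selftests flow should work; the selector below assumes test_progs' standard "-t test/subtest" filter syntax:

	cd tools/testing/selftests/bpf
	make
	sudo ./test_progs -t cgroup_iter/cgroup_iter__self_only_css_task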