diff mbox series

[bpf-next,2/2] selftests/bpf: Add test for css_task iter combining with cgroup iter

Message ID 20231022154527.229117-3-zhouchuyi@bytedance.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series Relax allowlist for open-coded css_task iter | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 9 this patch: 9
netdev/cc_maintainers warning 12 maintainers not CCed: song@kernel.org mykolal@fb.com yonghong.song@linux.dev jolsa@kernel.org kpsingh@kernel.org john.fastabend@gmail.com shuah@kernel.org linux-kselftest@vger.kernel.org sdf@google.com houtao1@huawei.com haoluo@google.com martin.lau@linux.dev
netdev/build_clang success Errors and warnings before: 9 this patch: 9
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 9 this patch: 9
netdev/checkpatch warning CHECK: Alignment should match open parenthesis CHECK: Comparison to NULL could be written "!cgrp"
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline fail Was 0 now: 1
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-1 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-5 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-6 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-28 success Logs for veristat
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_verifier on x86_64 with llvm-16

Commit Message

Chuyi Zhou Oct. 22, 2023, 3:45 p.m. UTC
This patch adds a test which demonstrates how css_task iter can be combined
with cgroup iter and it won't cause deadlock, though cgroup iter is not
sleepable.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
---
 .../selftests/bpf/prog_tests/cgroup_iter.c    | 33 +++++++++++++++
 .../selftests/bpf/progs/iters_css_task.c      | 41 +++++++++++++++++++
 2 files changed, 74 insertions(+)

Comments

Alexei Starovoitov Oct. 22, 2023, 4:03 p.m. UTC | #1
On Sun, Oct 22, 2023 at 8:45 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
> This patch adds a test which demonstrates how css_task iter can be combined
> with cgroup iter and it won't cause deadlock, though cgroup iter is not
> sleepable.
>
> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
> ---
>  .../selftests/bpf/prog_tests/cgroup_iter.c    | 33 +++++++++++++++
>  .../selftests/bpf/progs/iters_css_task.c      | 41 +++++++++++++++++++
>  2 files changed, 74 insertions(+)
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c
> index e02feb5fae97..3679687a6927 100644
> --- a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c
> +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c
> @@ -4,6 +4,7 @@
>  #include <test_progs.h>
>  #include <bpf/libbpf.h>
>  #include <bpf/btf.h>
> +#include "iters_css_task.skel.h"
>  #include "cgroup_iter.skel.h"
>  #include "cgroup_helpers.h"
>
> @@ -263,6 +264,35 @@ static void test_walk_dead_self_only(struct cgroup_iter *skel)
>         close(cgrp_fd);
>  }
>
> +static void test_walk_self_only_css_task(void)
> +{
> +       struct iters_css_task *skel = NULL;
> +       int err;
> +
> +       skel = iters_css_task__open();
> +       if (!ASSERT_OK_PTR(skel, "skel_open"))
> +               return;
> +
> +       bpf_program__set_autoload(skel->progs.cgroup_id_printer, true);
> +
> +       err = iters_css_task__load(skel);
> +       if (!ASSERT_OK(err, "skel_load"))
> +               goto cleanup;
> +
> +       err = join_cgroup(cg_path[CHILD2]);
> +       if (!ASSERT_OK(err, "join_cgroup"))
> +               goto cleanup;
> +
> +       skel->bss->target_pid = getpid();
> +       snprintf(expected_output, sizeof(expected_output),
> +               PROLOGUE "%8llu\n" EPILOGUE, cg_id[CHILD2]);
> +       read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[CHILD2],
> +               BPF_CGROUP_ITER_SELF_ONLY, "test_walk_self_only_css_task");
> +       ASSERT_EQ(skel->bss->css_task_cnt, 1, "css_task_cnt");
> +cleanup:
> +       iters_css_task__destroy(skel);
> +}
> +
>  void test_cgroup_iter(void)
>  {
>         struct cgroup_iter *skel = NULL;
> @@ -293,6 +323,9 @@ void test_cgroup_iter(void)
>                 test_walk_self_only(skel);
>         if (test__start_subtest("cgroup_iter__dead_self_only"))
>                 test_walk_dead_self_only(skel);
> +       if (test__start_subtest("cgroup_iter__self_only_css_task"))
> +               test_walk_self_only_css_task();
> +
>  out:
>         cgroup_iter__destroy(skel);
>         cleanup_cgroups();
> diff --git a/tools/testing/selftests/bpf/progs/iters_css_task.c b/tools/testing/selftests/bpf/progs/iters_css_task.c
> index 5089ce384a1c..0974e6f44328 100644
> --- a/tools/testing/selftests/bpf/progs/iters_css_task.c
> +++ b/tools/testing/selftests/bpf/progs/iters_css_task.c
> @@ -10,6 +10,7 @@
>
>  char _license[] SEC("license") = "GPL";
>
> +struct cgroup *bpf_cgroup_acquire(struct cgroup *p) __ksym;
>  struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym;
>  void bpf_cgroup_release(struct cgroup *p) __ksym;
>
> @@ -45,3 +46,43 @@ int BPF_PROG(iter_css_task_for_each, struct vm_area_struct *vma,
>
>         return -EPERM;
>  }
> +
> +static inline u64 cgroup_id(struct cgroup *cgrp)
> +{
> +       return cgrp->kn->id;
> +}
> +
> +SEC("?iter/cgroup")
> +int cgroup_id_printer(struct bpf_iter__cgroup *ctx)
> +{
> +       struct seq_file *seq = ctx->meta->seq;
> +       struct cgroup *cgrp, *acquired;
> +       struct cgroup_subsys_state *css;
> +       struct task_struct *task;
> +
> +       cgrp = ctx->cgroup;
> +
> +       /* epilogue */
> +       if (cgrp == NULL) {
> +               BPF_SEQ_PRINTF(seq, "epilogue\n");
> +               return 0;
> +       }
> +
> +       /* prologue */
> +       if (ctx->meta->seq_num == 0)
> +               BPF_SEQ_PRINTF(seq, "prologue\n");
> +
> +       BPF_SEQ_PRINTF(seq, "%8llu\n", cgroup_id(cgrp));
> +
> +       acquired = bpf_cgroup_from_id(cgroup_id(cgrp));

You're doing this dance, because a plain cgrp pointer is not trusted?
Maybe let's add
BTF_TYPE_SAFE_RCU_OR_NULL(struct bpf_iter__cgroup) {...}
to the verifier similar to what we do for bpf_iter__task.

After if (cgrp == NULL) check the pointer is safe to iterate on.

> +       if (!acquired)
> +               return 0;
> +       css = &acquired->self;
> +       css_task_cnt = 0;
> +       bpf_for_each(css_task, task, css, CSS_TASK_ITER_PROCS) {
> +               if (task->pid == target_pid)
> +                       css_task_cnt++;
> +       }
> +       bpf_cgroup_release(acquired);
> +       return 0;
> +}
> --
> 2.20.1
>
Chuyi Zhou Oct. 23, 2023, 1:50 p.m. UTC | #2
在 2023/10/23 00:03, Alexei Starovoitov 写道:
> On Sun, Oct 22, 2023 at 8:45 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>>
>> This patch adds a test which demonstrates how css_task iter can be combined
>> with cgroup iter and it won't cause deadlock, though cgroup iter is not
>> sleepable.
>>
>> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
>> ---
>>   .../selftests/bpf/prog_tests/cgroup_iter.c    | 33 +++++++++++++++
>>   .../selftests/bpf/progs/iters_css_task.c      | 41 +++++++++++++++++++
>>   2 files changed, 74 insertions(+)
>>
>> diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c
>> index e02feb5fae97..3679687a6927 100644
>> --- a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c
>> +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c
>> @@ -4,6 +4,7 @@
>>   #include <test_progs.h>
>>   #include <bpf/libbpf.h>
>>   #include <bpf/btf.h>
>> +#include "iters_css_task.skel.h"
>>   #include "cgroup_iter.skel.h"
>>   #include "cgroup_helpers.h"
>>
>> @@ -263,6 +264,35 @@ static void test_walk_dead_self_only(struct cgroup_iter *skel)
>>          close(cgrp_fd);
>>   }
>>
>> +static void test_walk_self_only_css_task(void)
>> +{
>> +       struct iters_css_task *skel = NULL;
>> +       int err;
>> +
>> +       skel = iters_css_task__open();
>> +       if (!ASSERT_OK_PTR(skel, "skel_open"))
>> +               return;
>> +
>> +       bpf_program__set_autoload(skel->progs.cgroup_id_printer, true);
>> +
>> +       err = iters_css_task__load(skel);
>> +       if (!ASSERT_OK(err, "skel_load"))
>> +               goto cleanup;
>> +
>> +       err = join_cgroup(cg_path[CHILD2]);
>> +       if (!ASSERT_OK(err, "join_cgroup"))
>> +               goto cleanup;
>> +
>> +       skel->bss->target_pid = getpid();
>> +       snprintf(expected_output, sizeof(expected_output),
>> +               PROLOGUE "%8llu\n" EPILOGUE, cg_id[CHILD2]);
>> +       read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[CHILD2],
>> +               BPF_CGROUP_ITER_SELF_ONLY, "test_walk_self_only_css_task");
>> +       ASSERT_EQ(skel->bss->css_task_cnt, 1, "css_task_cnt");
>> +cleanup:
>> +       iters_css_task__destroy(skel);
>> +}
>> +
>>   void test_cgroup_iter(void)
>>   {
>>          struct cgroup_iter *skel = NULL;
>> @@ -293,6 +323,9 @@ void test_cgroup_iter(void)
>>                  test_walk_self_only(skel);
>>          if (test__start_subtest("cgroup_iter__dead_self_only"))
>>                  test_walk_dead_self_only(skel);
>> +       if (test__start_subtest("cgroup_iter__self_only_css_task"))
>> +               test_walk_self_only_css_task();
>> +
>>   out:
>>          cgroup_iter__destroy(skel);
>>          cleanup_cgroups();
>> diff --git a/tools/testing/selftests/bpf/progs/iters_css_task.c b/tools/testing/selftests/bpf/progs/iters_css_task.c
>> index 5089ce384a1c..0974e6f44328 100644
>> --- a/tools/testing/selftests/bpf/progs/iters_css_task.c
>> +++ b/tools/testing/selftests/bpf/progs/iters_css_task.c
>> @@ -10,6 +10,7 @@
>>
>>   char _license[] SEC("license") = "GPL";
>>
>> +struct cgroup *bpf_cgroup_acquire(struct cgroup *p) __ksym;
>>   struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym;
>>   void bpf_cgroup_release(struct cgroup *p) __ksym;
>>
>> @@ -45,3 +46,43 @@ int BPF_PROG(iter_css_task_for_each, struct vm_area_struct *vma,
>>
>>          return -EPERM;
>>   }
>> +
>> +static inline u64 cgroup_id(struct cgroup *cgrp)
>> +{
>> +       return cgrp->kn->id;
>> +}
>> +
>> +SEC("?iter/cgroup")
>> +int cgroup_id_printer(struct bpf_iter__cgroup *ctx)
>> +{
>> +       struct seq_file *seq = ctx->meta->seq;
>> +       struct cgroup *cgrp, *acquired;
>> +       struct cgroup_subsys_state *css;
>> +       struct task_struct *task;
>> +
>> +       cgrp = ctx->cgroup;
>> +
>> +       /* epilogue */
>> +       if (cgrp == NULL) {
>> +               BPF_SEQ_PRINTF(seq, "epilogue\n");
>> +               return 0;
>> +       }
>> +
>> +       /* prologue */
>> +       if (ctx->meta->seq_num == 0)
>> +               BPF_SEQ_PRINTF(seq, "prologue\n");
>> +
>> +       BPF_SEQ_PRINTF(seq, "%8llu\n", cgroup_id(cgrp));
>> +
>> +       acquired = bpf_cgroup_from_id(cgroup_id(cgrp));
> 
> You're doing this dance, because a plain cgrp pointer is not trusted?
> Maybe let's add
> BTF_TYPE_SAFE_RCU_OR_NULL(struct bpf_iter__cgroup) {...}
> to the verifier similar to what we do for bpf_iter__task.
> 

Yes, thanks for the suggestion.

But it seems that, currently, bpf_iter__task->task doesn't work well either.

SEC("iter/task")
int dump_task(struct bpf_iter__task *ctx)
{
	struct task_struct *task = ctx->task;
	// here task should be trusted since BTF_TYPE_SAFE_TRUSTED(struct 
bpf_iter__task) {...}
	if (task == NULL) {
		return 0;
	}
	bpf_task_acquire(task);
	
	bpf_task_release(task);
	return 0;
}

The log of verifier is :
VERIFIER LOG:
=============
reg type unsupported for arg#0 function dump_task#7
0: R1=ctx(off=0,imm=0) R10=fp0
; struct task_struct *task = ctx->task;
0: (79) r6 = *(u64 *)(r1 +8)          ; R1=ctx(off=0,imm=0) 
R6_w=ptr_or_null_task_struct(id=1,off=0,imm=0)
; if (task == NULL) {
1: (15) if r6 == 0x0 goto pc+4        ; R6_w=ptr_task_struct(off=0,imm=0)
; bpf_task_acquire(task);
2: (bf) r1 = r6                       ; 
R1_w=ptr_task_struct(off=0,imm=0) R6_w=ptr_task_struct(off=0,imm=0)
3: (85) call bpf_task_acquire#26990
R1 must be a rcu pointer
processed 4 insns (limit 1000000) max_states_per_insn 0 total_states 0 
peak_states 0 mark_read 0

 From the above log, it seems 'task' is a normal pointer, not a
trusted pointer.

Actually, it seems the BPF verifier didn't even call check_ptr_to_btf_access()...

But for bpf_iter__task->meta, it works well. BPF verifier would consider 
it as a trusted pointer.

SEC("iter/task")
int dump_task(struct bpf_iter__task *ctx)
{
         struct task_struct *task = ctx->task;
         struct bpf_iter_meta *meta = ctx->meta;
         struct seq_file *seq = meta->seq;
         if (task == NULL) {
                 BPF_SEQ_PRINTF(seq, "%s\n", "NULL");
                 return 0;
         }
         bpf_task_acquire(task);

         bpf_task_release(task);
         return 0;
}

VERIFIER LOG:
=============
reg type unsupported for arg#0 function dump_task#7
0: R1=ctx(off=0,imm=0) R10=fp0
; int dump_task(struct bpf_iter__task *ctx)
0: (bf) r8 = r1                       ; R1=ctx(off=0,imm=0) 
R8_w=ctx(off=0,imm=0)
; struct bpf_iter_meta *meta = ctx->meta;
1: (79) r1 = *(u64 *)(r8 +0)
func 'bpf_iter_task' arg0 has btf_id 22699 type STRUCT 'bpf_iter_meta'
2: R1_w=trusted_ptr_bpf_iter_meta(off=0,imm=0) R8_w=ctx(off=0,imm=0)
; struct seq_file *seq = meta->seq;
2: (79) r6 = *(u64 *)(r1 +0)          ; 
R1_w=trusted_ptr_bpf_iter_meta(off=0,imm=0) 
R6_w=trusted_ptr_seq_file(off=0,imm=0)
; struct task_struct *task = ctx->task;
3: (79) r7 = *(u64 *)(r8 +8)          ; 
R7_w=ptr_or_null_task_struct(id=1,off=0,imm=0) R8_w=ctx(off=0,imm=0)
; if (task == (void *)0) {
4: (55) if r7 != 0x0 goto pc+12 17: 
R1_w=trusted_ptr_bpf_iter_meta(off=0,imm=0) 
R6_w=trusted_ptr_seq_file(off=0,imm=0) R7_w=ptr_task_struct(off=0,imm=0) 
R8_w=ctx(off=0,imm=0) R10=fp0
; bpf_task_acquire(task);
17: (bf) r1 = r7                      ; 
R1_w=ptr_task_struct(off=0,imm=0) R7_w=ptr_task_struct(off=0,imm=0)
18: (85) call bpf_task_acquire#26990
R1 must be a rcu pointer

I will try to figure out it.

Thanks.
Alexei Starovoitov Oct. 23, 2023, 5:14 p.m. UTC | #3
On Mon, Oct 23, 2023 at 6:50 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
>
> R1_w=ptr_task_struct(off=0,imm=0) R7_w=ptr_task_struct(off=0,imm=0)
> 18: (85) call bpf_task_acquire#26990
> R1 must be a rcu pointer
>
> I will try to figure out it.

Thanks. That would be great.
So far it looks like a regression.
I'm guessing __bpf_md_ptr wrapping is confusing the verifier.

Since it's more complicated than I thought, please respin
the current set with fixes to patch 1 and leave the patch 2 as-is.
That can be another follow up.
Chuyi Zhou Oct. 30, 2023, 2:37 p.m. UTC | #4
Hello,

在 2023/10/24 01:14, Alexei Starovoitov 写道:
> On Mon, Oct 23, 2023 at 6:50 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>>
>>
>> R1_w=ptr_task_struct(off=0,imm=0) R7_w=ptr_task_struct(off=0,imm=0)
>> 18: (85) call bpf_task_acquire#26990
>> R1 must be a rcu pointer
>>
>> I will try to figure out it.
> 
> Thanks. That would be great.
> So far it looks like a regression.
> I'm guessing __bpf_md_ptr wrapping is confusing the verifier.
> 
> Since it's more complicated than I thought, please respin
> the current set with fixes to patch 1 and leave the patch 2 as-is.
> That can be another follow up.

After adding this change:

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 15d71d2986d3..4565e5457754 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6071,6 +6071,8 @@ bool btf_ctx_access(int off, int size, enum 
bpf_access_type type,
                         info->reg_type = ctx_arg_info->reg_type;
                         info->btf = btf_vmlinux;
                         info->btf_id = ctx_arg_info->btf_id;
+                       if (prog_args_trusted(prog))
+                               info->reg_type |= PTR_TRUSTED;
                         return true;
                 }
         }

the task pointer would be recognized as 'trusted_ptr_or_null_task_struct'.

The VERIFIER LOG:
=============
reg type unsupported for arg#0 function dump_task#7
0: R1=ctx(off=0,imm=0) R10=fp0
; struct task_struct *task = ctx->task;
0: (79) r1 = *(u64 *)(r1 +8)          ; 
R1_w=trusted_ptr_or_null_task_struct(id=1,off=0,imm=0)

The following BPF Prog works well.

SEC("iter/task")
int dump_task(struct bpf_iter__task *ctx)
{
	struct task_struct *task = ctx->task;
	struct task_struct *acq;
	if (task == NULL)
		return 0;
	acq = bpf_task_acquire(task);
	if (!acq)
		return 0;
	bpf_task_release(acq);
	
	return 0;
}

Do you think this change is correct ?
Or do you have better ideas ?

Thanks.
Chuyi Zhou Oct. 31, 2023, 11:37 a.m. UTC | #5
在 2023/10/30 22:37, Chuyi Zhou 写道:
> Hello,
> 
> 在 2023/10/24 01:14, Alexei Starovoitov 写道:
>> On Mon, Oct 23, 2023 at 6:50 AM Chuyi Zhou <zhouchuyi@bytedance.com> 
>> wrote:
>>>
>>>
>>> R1_w=ptr_task_struct(off=0,imm=0) R7_w=ptr_task_struct(off=0,imm=0)
>>> 18: (85) call bpf_task_acquire#26990
>>> R1 must be a rcu pointer
>>>
>>> I will try to figure out it.
>>
>> Thanks. That would be great.
>> So far it looks like a regression.
>> I'm guessing __bpf_md_ptr wrapping is confusing the verifier.
>>
>> Since it's more complicated than I thought, please respin
>> the current set with fixes to patch 1 and leave the patch 2 as-is.
>> That can be another follow up.
> 
> After adding this change:
> 
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index 15d71d2986d3..4565e5457754 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -6071,6 +6071,8 @@ bool btf_ctx_access(int off, int size, enum 
> bpf_access_type type,
>                          info->reg_type = ctx_arg_info->reg_type;
>                          info->btf = btf_vmlinux;
>                          info->btf_id = ctx_arg_info->btf_id;
> +                       if (prog_args_trusted(prog))
> +                               info->reg_type |= PTR_TRUSTED;
>                          return true;
>                  }
>          }
> 
> the task pointer would be recognized as 'trusted_ptr_or_null_task_struct'.
> 
> The VERIFIER LOG:
> =============
> reg type unsupported for arg#0 function dump_task#7
> 0: R1=ctx(off=0,imm=0) R10=fp0
> ; struct task_struct *task = ctx->task;
> 0: (79) r1 = *(u64 *)(r1 +8)          ; 
> R1_w=trusted_ptr_or_null_task_struct(id=1,off=0,imm=0)
> 
> The following BPF Prog works well.
> 
> SEC("iter/task")
> int dump_task(struct bpf_iter__task *ctx)
> {
>      struct task_struct *task = ctx->task;
>      struct task_struct *acq;
>      if (task == NULL)
>          return 0;
>      acq = bpf_task_acquire(task);
>      if (!acq)
>          return 0;
>      bpf_task_release(acq);
> 
>      return 0;
> }
> 

bpf_iter__task->meta would be recognized as a trusted ptr.
In btf_ctx_access(), we would add the PTR_TRUSTED flag if 
prog_args_trusted(prog) returns true.

It seems for BPF_TRACE_ITER, ctx access is always 'trusted'.

But I noticed that in task_iter.c, we actually explicitly declare that 
the type of bpf_iter__task->task is PTR_TO_BTF_ID_OR_NULL.

static struct bpf_iter_reg task_reg_info = {
	.target			= "task",
	.attach_target		= bpf_iter_attach_task,
	.feature		= BPF_ITER_RESCHED,
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__task, task),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &task_seq_info,
	.fill_link_info		= bpf_iter_fill_link_info,
	.show_fdinfo		= bpf_iter_task_show_fdinfo,
};

btf_ctx_access() would enforce that the reg_type is the 
ctx_arg_info->reg_type (PTR_TO_BTF_ID_OR_NULL) for bpf_iter__task->task:

bool btf_ctx_access()
	....
	/* this is a pointer to another type */
	for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
		const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];

		if (ctx_arg_info->offset == off) {
			...
			info->reg_type = ctx_arg_info->reg_type;
			info->btf = btf_vmlinux;
			info->btf_id = ctx_arg_info->btf_id;
			return true;
		}
	}


So, maybe another possible solution is:

diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
index 209e5135f9fb..72a6778e3fba 100644
--- a/kernel/bpf/cgroup_iter.c
+++ b/kernel/bpf/cgroup_iter.c
@@ -282,7 +282,7 @@ static struct bpf_iter_reg bpf_cgroup_reg_info = {
         .ctx_arg_info_size      = 1,
         .ctx_arg_info           = {
                 { offsetof(struct bpf_iter__cgroup, cgroup),
-                 PTR_TO_BTF_ID_OR_NULL },
+                 PTR_TO_BTF_ID_OR_NULL | MEM_RCU },
         },
         .seq_info               = &cgroup_iter_seq_info,
  };
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 59e747938bdb..4fd3f734dffd 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -706,7 +706,7 @@ static struct bpf_iter_reg task_reg_info = {
         .ctx_arg_info_size      = 1,
         .ctx_arg_info           = {
                 { offsetof(struct bpf_iter__task, task),
-                 PTR_TO_BTF_ID_OR_NULL },
+                 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
         },
         .seq_info               = &task_seq_info,
         .fill_link_info         = bpf_iter_fill_link_info,


Just some thoughts.

Thanks.
Alexei Starovoitov Oct. 31, 2023, 10:06 p.m. UTC | #6
On Tue, Oct 31, 2023 at 4:38 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
>
> So, maybe another possible solution is:
>
> diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
> index 209e5135f9fb..72a6778e3fba 100644
> --- a/kernel/bpf/cgroup_iter.c
> +++ b/kernel/bpf/cgroup_iter.c
> @@ -282,7 +282,7 @@ static struct bpf_iter_reg bpf_cgroup_reg_info = {
>          .ctx_arg_info_size      = 1,
>          .ctx_arg_info           = {
>                  { offsetof(struct bpf_iter__cgroup, cgroup),
> -                 PTR_TO_BTF_ID_OR_NULL },
> +                 PTR_TO_BTF_ID_OR_NULL | MEM_RCU },
>          },
>          .seq_info               = &cgroup_iter_seq_info,
>   };
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index 59e747938bdb..4fd3f734dffd 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -706,7 +706,7 @@ static struct bpf_iter_reg task_reg_info = {
>          .ctx_arg_info_size      = 1,
>          .ctx_arg_info           = {
>                  { offsetof(struct bpf_iter__task, task),
> -                 PTR_TO_BTF_ID_OR_NULL },
> +                 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },

Yep. That looks good.
bpf_cgroup_reg_info -> cgroup is probably PTR_TRUSTED too.
Not sure... why did you go with MEM_RCU there ?
Chuyi Zhou Nov. 1, 2023, 2:41 a.m. UTC | #7
Hello,

在 2023/11/1 06:06, Alexei Starovoitov 写道:
> On Tue, Oct 31, 2023 at 4:38 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>>
>>
>> So, maybe another possible solution is:
>>
>> diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
>> index 209e5135f9fb..72a6778e3fba 100644
>> --- a/kernel/bpf/cgroup_iter.c
>> +++ b/kernel/bpf/cgroup_iter.c
>> @@ -282,7 +282,7 @@ static struct bpf_iter_reg bpf_cgroup_reg_info = {
>>           .ctx_arg_info_size      = 1,
>>           .ctx_arg_info           = {
>>                   { offsetof(struct bpf_iter__cgroup, cgroup),
>> -                 PTR_TO_BTF_ID_OR_NULL },
>> +                 PTR_TO_BTF_ID_OR_NULL | MEM_RCU },
>>           },
>>           .seq_info               = &cgroup_iter_seq_info,
>>    };
>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
>> index 59e747938bdb..4fd3f734dffd 100644
>> --- a/kernel/bpf/task_iter.c
>> +++ b/kernel/bpf/task_iter.c
>> @@ -706,7 +706,7 @@ static struct bpf_iter_reg task_reg_info = {
>>           .ctx_arg_info_size      = 1,
>>           .ctx_arg_info           = {
>>                   { offsetof(struct bpf_iter__task, task),
>> -                 PTR_TO_BTF_ID_OR_NULL },
>> +                 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
> 
> Yep. That looks good.
> bpf_cgroup_reg_info -> cgroup is probably PTR_TRUSTED too.
> Not sure... why did you go with MEM_RCU there ?

hmm...

That is because in our previous discussion, you suggested we'd better 
add BTF_TYPE_SAFE_RCU_OR_NULL(struct bpf_iter__cgroup) {...}

I didn't think too much about it. But I noticed that we only use 
cgroup_mutex to protect the iteration in cgroup_iter.c.

Looking at cgroup_kn_lock_live() in kernel/cgroup/cgroup.c, it would use 
cgroup_tryget()/cgroup_is_dead() to check whether the cgrp is 'dead'. 
cgroup_tryget() seems to be equivalent to bpf_cgroup_acquire(). So, maybe 
let's return an 'rcu' cgrp pointer. If a BPF prog wants a stronger 
guarantee on the cgrp, it can just use bpf_cgroup_acquire().

Just some thoughts. Thanks.
Alexei Starovoitov Nov. 2, 2023, 6:07 a.m. UTC | #8
On Tue, Oct 31, 2023 at 7:41 PM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
> Hello,
>
> 在 2023/11/1 06:06, Alexei Starovoitov 写道:
> > On Tue, Oct 31, 2023 at 4:38 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
> >>
> >>
> >> So, maybe another possible solution is:
> >>
> >> diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
> >> index 209e5135f9fb..72a6778e3fba 100644
> >> --- a/kernel/bpf/cgroup_iter.c
> >> +++ b/kernel/bpf/cgroup_iter.c
> >> @@ -282,7 +282,7 @@ static struct bpf_iter_reg bpf_cgroup_reg_info = {
> >>           .ctx_arg_info_size      = 1,
> >>           .ctx_arg_info           = {
> >>                   { offsetof(struct bpf_iter__cgroup, cgroup),
> >> -                 PTR_TO_BTF_ID_OR_NULL },
> >> +                 PTR_TO_BTF_ID_OR_NULL | MEM_RCU },
> >>           },
> >>           .seq_info               = &cgroup_iter_seq_info,
> >>    };
> >> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> >> index 59e747938bdb..4fd3f734dffd 100644
> >> --- a/kernel/bpf/task_iter.c
> >> +++ b/kernel/bpf/task_iter.c
> >> @@ -706,7 +706,7 @@ static struct bpf_iter_reg task_reg_info = {
> >>           .ctx_arg_info_size      = 1,
> >>           .ctx_arg_info           = {
> >>                   { offsetof(struct bpf_iter__task, task),
> >> -                 PTR_TO_BTF_ID_OR_NULL },
> >> +                 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
> >
> > Yep. That looks good.
> > bpf_cgroup_reg_info -> cgroup is probably PTR_TRUSTED too.
> > Not sure... why did you go with MEM_RCU there ?
>
> hmm...
>
> That is because in our previous discussion, you suggested we'd better
> add BTF_TYPE_SAFE_RCU_OR_NULL(struct bpf_iter__cgroup) {...}

I mentioned that because we don't have
BTF_TYPE_SAFE_TRUSTED_OR_NULL.

and cgroup pointer can be NULL, but since you found a cleaner way
we can do PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED.

> I didn't think too much about it. But I noticed that we only use
> cgroup_mutex to protect the iteration in cgroup_iter.c.
>
> Looking at cgroup_kn_lock_live() in kernel/cgroup/cgroup.c, it would use
> cgroup_tryget()/cgroup_is_dead() to check whether the cgrp is 'dead'.
> cgroup_tryget() seems is equal to bpf_cgroup_acquire(). So, maybe let's
> return a 'rcu' cgrp pointer. If BPF Prog want stronger guarantee of
> cgrp, just use bpf_cgroup_acquire().

and that would be misleading.
MEM_RCU means that the pointer is valid, but it could have refcnt == 0,
while PTR_TRUSTED means that it's good to use as-is.

Here cgroup pointer is trusted. It's not a dead cgroup.
See kernel/bpf/cgroup_iter.c __cgroup_iter_seq_show().
bpf prog doesn't need to call bpf_cgroup_acquire.
Chuyi Zhou Nov. 2, 2023, 9:21 a.m. UTC | #9
在 2023/11/2 14:07, Alexei Starovoitov 写道:
> On Tue, Oct 31, 2023 at 7:41 PM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>>
>> Hello,
>>
>> 在 2023/11/1 06:06, Alexei Starovoitov 写道:
>>> On Tue, Oct 31, 2023 at 4:38 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>>>>
>>>>

[cut]

>>> Yep. That looks good.
>>> bpf_cgroup_reg_info -> cgroup is probably PTR_TRUSTED too.
>>> Not sure... why did you go with MEM_RCU there ?
>>
>> hmm...
>>
>> That is because in our previous discussion, you suggested we'd better
>> add BTF_TYPE_SAFE_RCU_OR_NULL(struct bpf_iter__cgroup) {...}
> 
> I mentioned that because we don't have
> BTF_TYPE_SAFE_TRUSTED_OR_NULL.
> 
> and cgroup pointer can be NULL, but since you found a cleaner way
> we can do PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED.
> 
>> I didn't think too much about it. But I noticed that we only use
>> cgroup_mutex to protect the iteration in cgroup_iter.c.
>>
>> Looking at cgroup_kn_lock_live() in kernel/cgroup/cgroup.c, it would use
>> cgroup_tryget()/cgroup_is_dead() to check whether the cgrp is 'dead'.
>> cgroup_tryget() seems is equal to bpf_cgroup_acquire(). So, maybe let's
>> return a 'rcu' cgrp pointer. If BPF Prog want stronger guarantee of
>> cgrp, just use bpf_cgroup_acquire().
> 
> and that would be misleading.
> MEM_RCU means that the pointer is valid, but it could have refcnt == 0,
> while PTR_TRUSTED means that it's good to use as-is.
> 
> Here cgroup pointer is trusted. It's not a dead cgroup.
> See kernel/bpf/cgroup_iter.c __cgroup_iter_seq_show().
> bpf prog doesn't need to call bpf_cgroup_acquire.

Thanks for the detailed explanation.

I will send a follow up.
diff mbox series

Patch

diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c
index e02feb5fae97..3679687a6927 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c
@@ -4,6 +4,7 @@ 
 #include <test_progs.h>
 #include <bpf/libbpf.h>
 #include <bpf/btf.h>
+#include "iters_css_task.skel.h"
 #include "cgroup_iter.skel.h"
 #include "cgroup_helpers.h"
 
@@ -263,6 +264,35 @@  static void test_walk_dead_self_only(struct cgroup_iter *skel)
 	close(cgrp_fd);
 }
 
+static void test_walk_self_only_css_task(void)
+{
+	struct iters_css_task *skel = NULL;
+	int err;
+
+	skel = iters_css_task__open();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		return;
+
+	bpf_program__set_autoload(skel->progs.cgroup_id_printer, true);
+
+	err = iters_css_task__load(skel);
+	if (!ASSERT_OK(err, "skel_load"))
+		goto cleanup;
+
+	err = join_cgroup(cg_path[CHILD2]);
+	if (!ASSERT_OK(err, "join_cgroup"))
+		goto cleanup;
+
+	skel->bss->target_pid = getpid();
+	snprintf(expected_output, sizeof(expected_output),
+		PROLOGUE "%8llu\n" EPILOGUE, cg_id[CHILD2]);
+	read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[CHILD2],
+		BPF_CGROUP_ITER_SELF_ONLY, "test_walk_self_only_css_task");
+	ASSERT_EQ(skel->bss->css_task_cnt, 1, "css_task_cnt");
+cleanup:
+	iters_css_task__destroy(skel);
+}
+
 void test_cgroup_iter(void)
 {
 	struct cgroup_iter *skel = NULL;
@@ -293,6 +323,9 @@  void test_cgroup_iter(void)
 		test_walk_self_only(skel);
 	if (test__start_subtest("cgroup_iter__dead_self_only"))
 		test_walk_dead_self_only(skel);
+	if (test__start_subtest("cgroup_iter__self_only_css_task"))
+		test_walk_self_only_css_task();
+
 out:
 	cgroup_iter__destroy(skel);
 	cleanup_cgroups();
diff --git a/tools/testing/selftests/bpf/progs/iters_css_task.c b/tools/testing/selftests/bpf/progs/iters_css_task.c
index 5089ce384a1c..0974e6f44328 100644
--- a/tools/testing/selftests/bpf/progs/iters_css_task.c
+++ b/tools/testing/selftests/bpf/progs/iters_css_task.c
@@ -10,6 +10,7 @@ 
 
 char _license[] SEC("license") = "GPL";
 
+struct cgroup *bpf_cgroup_acquire(struct cgroup *p) __ksym;
 struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym;
 void bpf_cgroup_release(struct cgroup *p) __ksym;
 
@@ -45,3 +46,43 @@  int BPF_PROG(iter_css_task_for_each, struct vm_area_struct *vma,
 
 	return -EPERM;
 }
+
+static inline u64 cgroup_id(struct cgroup *cgrp)
+{
+	return cgrp->kn->id;
+}
+
+SEC("?iter/cgroup")
+int cgroup_id_printer(struct bpf_iter__cgroup *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	struct cgroup *cgrp, *acquired;
+	struct cgroup_subsys_state *css;
+	struct task_struct *task;
+
+	cgrp = ctx->cgroup;
+
+	/* epilogue */
+	if (cgrp == NULL) {
+		BPF_SEQ_PRINTF(seq, "epilogue\n");
+		return 0;
+	}
+
+	/* prologue */
+	if (ctx->meta->seq_num == 0)
+		BPF_SEQ_PRINTF(seq, "prologue\n");
+
+	BPF_SEQ_PRINTF(seq, "%8llu\n", cgroup_id(cgrp));
+
+	acquired = bpf_cgroup_from_id(cgroup_id(cgrp));
+	if (!acquired)
+		return 0;
+	css = &acquired->self;
+	css_task_cnt = 0;
+	bpf_for_each(css_task, task, css, CSS_TASK_ITER_PROCS) {
+		if (task->pid == target_pid)
+			css_task_cnt++;
+	}
+	bpf_cgroup_release(acquired);
+	return 0;
+}