diff mbox series

[bpf-next,v3,4/7] bpf: Introduce css open-coded iterator kfuncs

Message ID 20230925105552.817513-5-zhouchuyi@bytedance.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series Add Open-coded task, css_task and css iters | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/apply fail Patch does not apply to bpf-next
bpf/vmtest-bpf-next-VM_Test-0 success Logs for ShellCheck
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-6 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-28 success Logs for veristat

Commit Message

Chuyi Zhou Sept. 25, 2023, 10:55 a.m. UTC
This Patch adds kfuncs bpf_iter_css_{new,next,destroy} which allow
creation and manipulation of struct bpf_iter_css in open-coded iterator
style. These kfuncs actually wrap css_next_descendant_{pre, post}.
css_iter can be used to:

1) iterating a specific cgroup tree with pre/post/up order

2) iterating cgroup_subsystem in BPF Prog, like
for_each_mem_cgroup_tree/cpuset_for_each_descendant_pre in kernel.

The API design is consistent with cgroup_iter. bpf_iter_css_new accepts
parameters defining iteration order and starting css. Here we also reuse
BPF_CGROUP_ITER_DESCENDANTS_PRE, BPF_CGROUP_ITER_DESCENDANTS_POST,
BPF_CGROUP_ITER_ANCESTORS_UP enums.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
---
 kernel/bpf/cgroup_iter.c                      | 57 +++++++++++++++++++
 kernel/bpf/helpers.c                          |  3 +
 .../testing/selftests/bpf/bpf_experimental.h  |  6 ++
 3 files changed, 66 insertions(+)

Comments

Andrii Nakryiko Sept. 27, 2023, 11:24 p.m. UTC | #1
On Mon, Sep 25, 2023 at 3:56 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
> This Patch adds kfuncs bpf_iter_css_{new,next,destroy} which allow
> creation and manipulation of struct bpf_iter_css in open-coded iterator
> style. These kfuncs actually wrapps css_next_descendant_{pre, post}.
> css_iter can be used to:
>
> 1) iterating a sepcific cgroup tree with pre/post/up order
>
> 2) iterating cgroup_subsystem in BPF Prog, like
> for_each_mem_cgroup_tree/cpuset_for_each_descendant_pre in kernel.
>
> The API design is consistent with cgroup_iter. bpf_iter_css_new accepts
> parameters defining iteration order and starting css. Here we also reuse
> BPF_CGROUP_ITER_DESCENDANTS_PRE, BPF_CGROUP_ITER_DESCENDANTS_POST,
> BPF_CGROUP_ITER_ANCESTORS_UP enums.
>
> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
> ---
>  kernel/bpf/cgroup_iter.c                      | 57 +++++++++++++++++++
>  kernel/bpf/helpers.c                          |  3 +
>  .../testing/selftests/bpf/bpf_experimental.h  |  6 ++
>  3 files changed, 66 insertions(+)
>
> diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
> index 810378f04fbc..ebc3d9471f52 100644
> --- a/kernel/bpf/cgroup_iter.c
> +++ b/kernel/bpf/cgroup_iter.c
> @@ -294,3 +294,60 @@ static int __init bpf_cgroup_iter_init(void)
>  }
>
>  late_initcall(bpf_cgroup_iter_init);
> +
> +struct bpf_iter_css {
> +       __u64 __opaque[2];
> +       __u32 __opaque_int[1];
> +} __attribute__((aligned(8)));
> +

same as before, __opaque[3] only


> +struct bpf_iter_css_kern {
> +       struct cgroup_subsys_state *start;
> +       struct cgroup_subsys_state *pos;
> +       int order;
> +} __attribute__((aligned(8)));
> +
> +__bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it,
> +               struct cgroup_subsys_state *start, enum bpf_cgroup_iter_order order)

Similarly, I wonder if we should go for a more generic "flags" argument?

> +{
> +       struct bpf_iter_css_kern *kit = (void *)it;

empty line

> +       kit->start = NULL;
> +       BUILD_BUG_ON(sizeof(struct bpf_iter_css_kern) != sizeof(struct bpf_iter_css));
> +       BUILD_BUG_ON(__alignof__(struct bpf_iter_css_kern) != __alignof__(struct bpf_iter_css));

please move this up before kit->start assignment, and separate by empty lines

> +       switch (order) {
> +       case BPF_CGROUP_ITER_DESCENDANTS_PRE:
> +       case BPF_CGROUP_ITER_DESCENDANTS_POST:
> +       case BPF_CGROUP_ITER_ANCESTORS_UP:
> +               break;
> +       default:
> +               return -EINVAL;
> +       }
> +
> +       kit->start = start;
> +       kit->pos = NULL;
> +       kit->order = order;
> +       return 0;
> +}
> +
> +__bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it)
> +{
> +       struct bpf_iter_css_kern *kit = (void *)it;

empty line

> +       if (!kit->start)
> +               return NULL;
> +
> +       switch (kit->order) {
> +       case BPF_CGROUP_ITER_DESCENDANTS_PRE:
> +               kit->pos = css_next_descendant_pre(kit->pos, kit->start);
> +               break;
> +       case BPF_CGROUP_ITER_DESCENDANTS_POST:
> +               kit->pos = css_next_descendant_post(kit->pos, kit->start);
> +               break;
> +       default:

we know it's BPF_CGROUP_ITER_ANCESTORS_UP, so why not have that here explicitly?

> +               kit->pos = kit->pos ? kit->pos->parent : kit->start;
> +       }
> +
> +       return kit->pos;

wouldn't this implementation never return the "start" css? is that intentional?

> +}
> +
> +__bpf_kfunc void bpf_iter_css_destroy(struct bpf_iter_css *it)
> +{
> +}
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 556262c27a75..9c3af36249a2 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -2510,6 +2510,9 @@ BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
>  BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
>  BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
>  BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
> +BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
> +BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL)
> +BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
>  BTF_ID_FLAGS(func, bpf_dynptr_adjust)
>  BTF_ID_FLAGS(func, bpf_dynptr_is_null)
>  BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
> diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
> index d989775dbdb5..aa247d1d81d1 100644
> --- a/tools/testing/selftests/bpf/bpf_experimental.h
> +++ b/tools/testing/selftests/bpf/bpf_experimental.h
> @@ -174,4 +174,10 @@ extern int bpf_iter_task_new(struct bpf_iter_task *it, struct task_struct *task,
>  extern struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) __weak __ksym;
>  extern void bpf_iter_task_destroy(struct bpf_iter_task *it) __weak __ksym;
>
> +struct bpf_iter_css;
> +extern int bpf_iter_css_new(struct bpf_iter_css *it,
> +                               struct cgroup_subsys_state *start, enum bpf_cgroup_iter_order order) __weak __ksym;
> +extern struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym;
> +extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym;
> +
>  #endif
> --
> 2.20.1
>
Chuyi Zhou Sept. 28, 2023, 2:51 a.m. UTC | #2
Hello,

在 2023/9/28 07:24, Andrii Nakryiko 写道:
> On Mon, Sep 25, 2023 at 3:56 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>>
>> This Patch adds kfuncs bpf_iter_css_{new,next,destroy} which allow
>> creation and manipulation of struct bpf_iter_css in open-coded iterator
>> style. These kfuncs actually wrapps css_next_descendant_{pre, post}.
>> css_iter can be used to:
>>
>> 1) iterating a sepcific cgroup tree with pre/post/up order
>>
>> 2) iterating cgroup_subsystem in BPF Prog, like
>> for_each_mem_cgroup_tree/cpuset_for_each_descendant_pre in kernel.
>>
>> The API design is consistent with cgroup_iter. bpf_iter_css_new accepts
>> parameters defining iteration order and starting css. Here we also reuse
>> BPF_CGROUP_ITER_DESCENDANTS_PRE, BPF_CGROUP_ITER_DESCENDANTS_POST,
>> BPF_CGROUP_ITER_ANCESTORS_UP enums.
>>
>> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
>> ---
>>   kernel/bpf/cgroup_iter.c                      | 57 +++++++++++++++++++
>>   kernel/bpf/helpers.c                          |  3 +
>>   .../testing/selftests/bpf/bpf_experimental.h  |  6 ++
>>   3 files changed, 66 insertions(+)
>>
>> diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
>> index 810378f04fbc..ebc3d9471f52 100644
>> --- a/kernel/bpf/cgroup_iter.c
>> +++ b/kernel/bpf/cgroup_iter.c
>> @@ -294,3 +294,60 @@ static int __init bpf_cgroup_iter_init(void)
>>   }
>>
>>   late_initcall(bpf_cgroup_iter_init);
>> +
>> +struct bpf_iter_css {
>> +       __u64 __opaque[2];
>> +       __u32 __opaque_int[1];
>> +} __attribute__((aligned(8)));
>> +
> 
> same as before, __opaque[3] only
> 
> 
>> +struct bpf_iter_css_kern {
>> +       struct cgroup_subsys_state *start;
>> +       struct cgroup_subsys_state *pos;
>> +       int order;
>> +} __attribute__((aligned(8)));
>> +
>> +__bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it,
>> +               struct cgroup_subsys_state *start, enum bpf_cgroup_iter_order order)
> 
> Similarly, I wonder if we should go for a more generic "flags" argument?
> 
>> +{
>> +       struct bpf_iter_css_kern *kit = (void *)it;
> 
> empty line
> 
>> +       kit->start = NULL;
>> +       BUILD_BUG_ON(sizeof(struct bpf_iter_css_kern) != sizeof(struct bpf_iter_css));
>> +       BUILD_BUG_ON(__alignof__(struct bpf_iter_css_kern) != __alignof__(struct bpf_iter_css));
> 
> please move this up before kit->start assignment, and separate by empty lines
> 
>> +       switch (order) {
>> +       case BPF_CGROUP_ITER_DESCENDANTS_PRE:
>> +       case BPF_CGROUP_ITER_DESCENDANTS_POST:
>> +       case BPF_CGROUP_ITER_ANCESTORS_UP:
>> +               break;
>> +       default:
>> +               return -EINVAL;
>> +       }
>> +
>> +       kit->start = start;
>> +       kit->pos = NULL;
>> +       kit->order = order;
>> +       return 0;
>> +}
>> +
>> +__bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it)
>> +{
>> +       struct bpf_iter_css_kern *kit = (void *)it;
> 
> empty line
> 
>> +       if (!kit->start)
>> +               return NULL;
>> +
>> +       switch (kit->order) {
>> +       case BPF_CGROUP_ITER_DESCENDANTS_PRE:
>> +               kit->pos = css_next_descendant_pre(kit->pos, kit->start);
>> +               break;
>> +       case BPF_CGROUP_ITER_DESCENDANTS_POST:
>> +               kit->pos = css_next_descendant_post(kit->pos, kit->start);
>> +               break;
>> +       default:
> 
> we know it's BPF_CGROUP_ITER_ANCESTORS_UP, so why not have that here explicitly?
> 
>> +               kit->pos = kit->pos ? kit->pos->parent : kit->start;
>> +       }
>> +
>> +       return kit->pos;
> 
> wouldn't this implementation never return the "start" css? is that intentional?
> 

Thanks for the review.

This implementation actually would return the "start" css.

1. BPF_CGROUP_ITER_DESCENDANTS_PRE:
1.1 when we first call next(), css_next_descendant_pre(NULL, kit->start) 
will return kit->start.
1.2 second call next(), css_next_descendant_pre(kit->start, kit->start) 
would return a first valid child under kit->start with pre-order
1.3 third call next, css_next_descendant_pre(last_valid_child, 
kit->start) would return the next valid child
...
until css_next_descendant_pre returns a NULL pointer, which means we have
visited all valid children, including the "start" css itself.

The above logic is equivalent to the macro 'css_for_each_descendant_pre' in the kernel.

Likewise, BPF_CGROUP_ITER_DESCENDANTS_POST is equivalent to the macro
'css_for_each_descendant_post', which returns the 'start' css after we
have visited all valid children.

2. BPF_CGROUP_ITER_ANCESTORS_UP
2.1 when we first call next(), kit->pos is NULL, and we would return
kit->start.


The selftest in patch 7 would check:
1. when we use BPF_CGROUP_ITER_DESCENDANTS_PRE to iterate a cgroup tree,
the first cgroup we visited should be the root ('start') cgroup.
2. when we use BPF_CGROUP_ITER_DESCENDANTS_POST to iterate a cgroup
tree, the last cgroup we visited should be the root ('start') cgroup.


Am I missing something important?


Thanks.
Andrii Nakryiko Sept. 29, 2023, 9:29 p.m. UTC | #3
On Wed, Sep 27, 2023 at 7:51 PM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
> Hello,
>
> 在 2023/9/28 07:24, Andrii Nakryiko 写道:
> > On Mon, Sep 25, 2023 at 3:56 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
> >>
> >> This Patch adds kfuncs bpf_iter_css_{new,next,destroy} which allow
> >> creation and manipulation of struct bpf_iter_css in open-coded iterator
> >> style. These kfuncs actually wrapps css_next_descendant_{pre, post}.
> >> css_iter can be used to:
> >>
> >> 1) iterating a sepcific cgroup tree with pre/post/up order
> >>
> >> 2) iterating cgroup_subsystem in BPF Prog, like
> >> for_each_mem_cgroup_tree/cpuset_for_each_descendant_pre in kernel.
> >>
> >> The API design is consistent with cgroup_iter. bpf_iter_css_new accepts
> >> parameters defining iteration order and starting css. Here we also reuse
> >> BPF_CGROUP_ITER_DESCENDANTS_PRE, BPF_CGROUP_ITER_DESCENDANTS_POST,
> >> BPF_CGROUP_ITER_ANCESTORS_UP enums.
> >>
> >> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
> >> ---
> >>   kernel/bpf/cgroup_iter.c                      | 57 +++++++++++++++++++
> >>   kernel/bpf/helpers.c                          |  3 +
> >>   .../testing/selftests/bpf/bpf_experimental.h  |  6 ++
> >>   3 files changed, 66 insertions(+)
> >>
> >> diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
> >> index 810378f04fbc..ebc3d9471f52 100644
> >> --- a/kernel/bpf/cgroup_iter.c
> >> +++ b/kernel/bpf/cgroup_iter.c
> >> @@ -294,3 +294,60 @@ static int __init bpf_cgroup_iter_init(void)
> >>   }
> >>
> >>   late_initcall(bpf_cgroup_iter_init);
> >> +
> >> +struct bpf_iter_css {
> >> +       __u64 __opaque[2];
> >> +       __u32 __opaque_int[1];
> >> +} __attribute__((aligned(8)));
> >> +
> >
> > same as before, __opaque[3] only
> >
> >
> >> +struct bpf_iter_css_kern {
> >> +       struct cgroup_subsys_state *start;
> >> +       struct cgroup_subsys_state *pos;
> >> +       int order;
> >> +} __attribute__((aligned(8)));
> >> +
> >> +__bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it,
> >> +               struct cgroup_subsys_state *start, enum bpf_cgroup_iter_order order)
> >
> > Similarly, I wonder if we should go for a more generic "flags" argument?
> >
> >> +{
> >> +       struct bpf_iter_css_kern *kit = (void *)it;
> >
> > empty line
> >
> >> +       kit->start = NULL;
> >> +       BUILD_BUG_ON(sizeof(struct bpf_iter_css_kern) != sizeof(struct bpf_iter_css));
> >> +       BUILD_BUG_ON(__alignof__(struct bpf_iter_css_kern) != __alignof__(struct bpf_iter_css));
> >
> > please move this up before kit->start assignment, and separate by empty lines
> >
> >> +       switch (order) {
> >> +       case BPF_CGROUP_ITER_DESCENDANTS_PRE:
> >> +       case BPF_CGROUP_ITER_DESCENDANTS_POST:
> >> +       case BPF_CGROUP_ITER_ANCESTORS_UP:
> >> +               break;
> >> +       default:
> >> +               return -EINVAL;
> >> +       }
> >> +
> >> +       kit->start = start;
> >> +       kit->pos = NULL;
> >> +       kit->order = order;
> >> +       return 0;
> >> +}
> >> +
> >> +__bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it)
> >> +{
> >> +       struct bpf_iter_css_kern *kit = (void *)it;
> >
> > empty line
> >
> >> +       if (!kit->start)
> >> +               return NULL;
> >> +
> >> +       switch (kit->order) {
> >> +       case BPF_CGROUP_ITER_DESCENDANTS_PRE:
> >> +               kit->pos = css_next_descendant_pre(kit->pos, kit->start);
> >> +               break;
> >> +       case BPF_CGROUP_ITER_DESCENDANTS_POST:
> >> +               kit->pos = css_next_descendant_post(kit->pos, kit->start);
> >> +               break;
> >> +       default:
> >
> > we know it's BPF_CGROUP_ITER_ANCESTORS_UP, so why not have that here explicitly?
> >
> >> +               kit->pos = kit->pos ? kit->pos->parent : kit->start;
> >> +       }
> >> +
> >> +       return kit->pos;
> >
> > wouldn't this implementation never return the "start" css? is that intentional?
> >
>
> Thanks for the review.
>
> This implementation actually would return the "start" css.
>
> 1. BPF_CGROUP_ITER_DESCENDANTS_PRE:
> 1.1 when we first call next(), css_next_descendant_pre(NULL, kit->start)
> will return kit->start.
> 1.2 second call next(), css_next_descendant_pre(kit->start, kit->start)
> would return a first valid child under kit->start with pre-order
> 1.3 third call next, css_next_descendant_pre(last_valid_child,
> kit->start) would return the next valid child
> ...
> util css_next_descendant_pre return a NULL pointer, which means we have
> visited all valid child including "start" css itself.
>
> The above logic is equal to macro 'css_for_each_descendant_pre' in kernel.
>
> Same, BPF_CGROUP_ITER_DESCENDANTS_POST is equal to macro
> 'css_for_each_descendant_post' which would return 'start' css when we
> have visited all valid child.
>
> 2. BPF_CGROUP_ITER_ANCESTORS_UP
> 2.1 when we fisrt call next(), kit->pos is NULL, and we would return
> kit->start.
>
>
> The selftest in patch7 whould check:
> 1. when we use BPF_CGROUP_ITER_DESCENDANTS_PRE to iterate a cgroup tree,
> the first cgroup we visted should be root('start') cgroup.
> 2. when we use BPF_CGROUP_ITER_DESCENDANTS_POST to iterate a cgroup
> tree, the last cgroup we visited should be root('start') cgroup.
>
>
> Am I miss something important?
>

No, again, my bad, I didn't trace the logic completely before asking.
All makes sense with kit->pos being initialized to NULL. Thanks for
elaborating!

>
> Thanks.
>
>
>
diff mbox series

Patch

diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
index 810378f04fbc..ebc3d9471f52 100644
--- a/kernel/bpf/cgroup_iter.c
+++ b/kernel/bpf/cgroup_iter.c
@@ -294,3 +294,60 @@  static int __init bpf_cgroup_iter_init(void)
 }
 
 late_initcall(bpf_cgroup_iter_init);
+
+struct bpf_iter_css {
+	__u64 __opaque[2];
+	__u32 __opaque_int[1];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_css_kern {
+	struct cgroup_subsys_state *start;
+	struct cgroup_subsys_state *pos;
+	int order;
+} __attribute__((aligned(8)));
+
+__bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it,
+		struct cgroup_subsys_state *start, enum bpf_cgroup_iter_order order)
+{
+	struct bpf_iter_css_kern *kit = (void *)it;
+	kit->start = NULL;
+	BUILD_BUG_ON(sizeof(struct bpf_iter_css_kern) != sizeof(struct bpf_iter_css));
+	BUILD_BUG_ON(__alignof__(struct bpf_iter_css_kern) != __alignof__(struct bpf_iter_css));
+	switch (order) {
+	case BPF_CGROUP_ITER_DESCENDANTS_PRE:
+	case BPF_CGROUP_ITER_DESCENDANTS_POST:
+	case BPF_CGROUP_ITER_ANCESTORS_UP:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	kit->start = start;
+	kit->pos = NULL;
+	kit->order = order;
+	return 0;
+}
+
+__bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it)
+{
+	struct bpf_iter_css_kern *kit = (void *)it;
+	if (!kit->start)
+		return NULL;
+
+	switch (kit->order) {
+	case BPF_CGROUP_ITER_DESCENDANTS_PRE:
+		kit->pos = css_next_descendant_pre(kit->pos, kit->start);
+		break;
+	case BPF_CGROUP_ITER_DESCENDANTS_POST:
+		kit->pos = css_next_descendant_post(kit->pos, kit->start);
+		break;
+	default:
+		kit->pos = kit->pos ? kit->pos->parent : kit->start;
+	}
+
+	return kit->pos;
+}
+
+__bpf_kfunc void bpf_iter_css_destroy(struct bpf_iter_css *it)
+{
+}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 556262c27a75..9c3af36249a2 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2510,6 +2510,9 @@  BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_dynptr_adjust)
 BTF_ID_FLAGS(func, bpf_dynptr_is_null)
 BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index d989775dbdb5..aa247d1d81d1 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -174,4 +174,10 @@  extern int bpf_iter_task_new(struct bpf_iter_task *it, struct task_struct *task,
 extern struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) __weak __ksym;
 extern void bpf_iter_task_destroy(struct bpf_iter_task *it) __weak __ksym;
 
+struct bpf_iter_css;
+extern int bpf_iter_css_new(struct bpf_iter_css *it,
+				struct cgroup_subsys_state *start, enum bpf_cgroup_iter_order order) __weak __ksym;
+extern struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym;
+extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym;
+
 #endif