diff mbox series

[bpf-next,v2,3/6] bpf: Introduce process open coded iterator kfuncs

Message ID 20230912070149.969939-4-zhouchuyi@bytedance.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series Add Open-coded process and css iters | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 3097 this patch: 3103
netdev/cc_maintainers warning 8 maintainers not CCed: martin.lau@linux.dev jolsa@kernel.org haoluo@google.com sdf@google.com john.fastabend@gmail.com yonghong.song@linux.dev kpsingh@kernel.org song@kernel.org
netdev/build_clang fail Errors and warnings before: 1535 this patch: 1538
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 3190 this patch: 3196
netdev/checkpatch warning CHECK: extern prototypes should be avoided in .h files WARNING: Prefer __aligned(8) over __attribute__((aligned(8))) WARNING: line length of 82 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns WARNING: line length of 94 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for ${{ matrix.test }} on ${{ matrix.arch }} with ${{ matrix.toolchain_full }}
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-5 fail Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for veristat
bpf/vmtest-bpf-next-VM_Test-6 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for s390x with gcc

Commit Message

Chuyi Zhou Sept. 12, 2023, 7:01 a.m. UTC
This patch adds kfuncs bpf_iter_process_{new,next,destroy} which allow
creation and manipulation of struct bpf_iter_process in open-coded iterator
style. BPF programs can use these kfuncs or through bpf_for_each macro to
iterate all processes in the system.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
---
 include/uapi/linux/bpf.h       |  4 ++++
 kernel/bpf/helpers.c           |  3 +++
 kernel/bpf/task_iter.c         | 29 +++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  4 ++++
 tools/lib/bpf/bpf_helpers.h    |  5 +++++
 5 files changed, 45 insertions(+)

Comments

Andrii Nakryiko Sept. 14, 2023, 11:26 p.m. UTC | #1
On Tue, Sep 12, 2023 at 12:02 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
> This patch adds kfuncs bpf_iter_process_{new,next,destroy} which allow
> creation and manipulation of struct bpf_iter_process in open-coded iterator
> style. BPF programs can use these kfuncs or through bpf_for_each macro to
> iterate all processes in the system.
>
> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
> ---
>  include/uapi/linux/bpf.h       |  4 ++++
>  kernel/bpf/helpers.c           |  3 +++
>  kernel/bpf/task_iter.c         | 29 +++++++++++++++++++++++++++++
>  tools/include/uapi/linux/bpf.h |  4 ++++
>  tools/lib/bpf/bpf_helpers.h    |  5 +++++
>  5 files changed, 45 insertions(+)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index de02c0971428..befa55b52e29 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -7322,4 +7322,8 @@ struct bpf_iter_css_task {
>         __u64 __opaque[1];
>  } __attribute__((aligned(8)));
>
> +struct bpf_iter_process {
> +       __u64 __opaque[1];
> +} __attribute__((aligned(8)));
> +
>  #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index d6a16becfbb9..9b7d2c6f99d1 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -2507,6 +2507,9 @@ BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
>  BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW)
>  BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
>  BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
> +BTF_ID_FLAGS(func, bpf_iter_process_new, KF_ITER_NEW)
> +BTF_ID_FLAGS(func, bpf_iter_process_next, KF_ITER_NEXT | KF_RET_NULL)
> +BTF_ID_FLAGS(func, bpf_iter_process_destroy, KF_ITER_DESTROY)
>  BTF_ID_FLAGS(func, bpf_dynptr_adjust)
>  BTF_ID_FLAGS(func, bpf_dynptr_is_null)
>  BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index d8539cc05ffd..9d1927dc3a06 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -851,6 +851,35 @@ __bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
>         kfree(kit->css_it);
>  }
>
> +struct bpf_iter_process_kern {
> +       struct task_struct *tsk;
> +} __attribute__((aligned(8)));
> +

Few high level thoughts. I think it would be good to follow
SEC("iter/task") naming and approach. Open-coded iterators in many
ways are in-kernel counterpart to iterator programs, so keeping them
close enough within reason is useful for knowledge transfer.

SEC("iter/task") allows to:
a) iterate all threads in the system
b) iterate all threads for a given TGID
c) it also allows to "iterate" a single thread or process, but that's
a bit less relevant for in-kernel iterator, but we can still support
them, why not?

I'm not sure if it supports iterating all processes (as in group
leaders of each task group) in the system, but if it's possible I
think we should support it at least for open-coded iterator, seems
like a very useful functionality.

So to that end, let's design a small set of input arguments for
bpf_iter_process_new() that would allow to specify this as flags +
either (optional) struct task_struct * pointer to represent
task/process or PID/TGID.


> +__bpf_kfunc int bpf_iter_process_new(struct bpf_iter_process *it)

Also, given iterator in the previous is called css_task, and that we
have iter/task, iter/task_vma, etc iterator programs, shouldn't this
one be called bpf_iter_task_new(), which also will be consistent with
Dave's task_vma open-coded iterator?

> +{
> +       struct bpf_iter_process_kern *kit = (void *)it;
> +
> +       BUILD_BUG_ON(sizeof(struct bpf_iter_process_kern) != sizeof(struct bpf_iter_process));
> +       BUILD_BUG_ON(__alignof__(struct bpf_iter_process_kern) !=
> +                                       __alignof__(struct bpf_iter_process));
> +
> +       kit->tsk = &init_task;
> +       return 0;
> +}
> +
> +__bpf_kfunc struct task_struct *bpf_iter_process_next(struct bpf_iter_process *it)
> +{
> +       struct bpf_iter_process_kern *kit = (void *)it;
> +
> +       kit->tsk = next_task(kit->tsk);
> +
> +       return kit->tsk == &init_task ? NULL : kit->tsk;
> +}
> +
> +__bpf_kfunc void bpf_iter_process_destroy(struct bpf_iter_process *it)
> +{
> +}
> +
>  DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
>
>  static void do_mmap_read_unlock(struct irq_work *entry)
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index de02c0971428..befa55b52e29 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -7322,4 +7322,8 @@ struct bpf_iter_css_task {
>         __u64 __opaque[1];
>  } __attribute__((aligned(8)));
>
> +struct bpf_iter_process {
> +       __u64 __opaque[1];
> +} __attribute__((aligned(8)));
> +
>  #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
> index f48723c6c593..858252c2641c 100644
> --- a/tools/lib/bpf/bpf_helpers.h
> +++ b/tools/lib/bpf/bpf_helpers.h
> @@ -310,6 +310,11 @@ extern int bpf_iter_css_task_new(struct bpf_iter_css_task *it,
>  extern struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it) __weak __ksym;
>  extern void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it) __weak __ksym;
>
> +struct bpf_iter_process;
> +extern int bpf_iter_process_new(struct bpf_iter_process *it) __weak __ksym;
> +extern struct task_struct *bpf_iter_process_next(struct bpf_iter_process *it) __weak __ksym;
> +extern void bpf_iter_process_destroy(struct bpf_iter_process *it) __weak __ksym;
> +

same, please add this to bpf_experimental, not bpf_helpers.h


>  #ifndef bpf_for_each
>  /* bpf_for_each(iter_type, cur_elem, args...) provides generic construct for
>   * using BPF open-coded iterators without having to write mundane explicit
> --
> 2.20.1
>
Chuyi Zhou Sept. 15, 2023, 4:48 a.m. UTC | #2
Hello.

在 2023/9/15 07:26, Andrii Nakryiko 写道:
> On Tue, Sep 12, 2023 at 12:02 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>>
>> This patch adds kfuncs bpf_iter_process_{new,next,destroy} which allow
>> creation and manipulation of struct bpf_iter_process in open-coded iterator
>> style. BPF programs can use these kfuncs or through bpf_for_each macro to
>> iterate all processes in the system.
>>
>> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
>> ---
>>   include/uapi/linux/bpf.h       |  4 ++++
>>   kernel/bpf/helpers.c           |  3 +++
>>   kernel/bpf/task_iter.c         | 29 +++++++++++++++++++++++++++++
>>   tools/include/uapi/linux/bpf.h |  4 ++++
>>   tools/lib/bpf/bpf_helpers.h    |  5 +++++
>>   5 files changed, 45 insertions(+)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index de02c0971428..befa55b52e29 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -7322,4 +7322,8 @@ struct bpf_iter_css_task {
>>          __u64 __opaque[1];
>>   } __attribute__((aligned(8)));
>>
>> +struct bpf_iter_process {
>> +       __u64 __opaque[1];
>> +} __attribute__((aligned(8)));
>> +
>>   #endif /* _UAPI__LINUX_BPF_H__ */
>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
>> index d6a16becfbb9..9b7d2c6f99d1 100644
>> --- a/kernel/bpf/helpers.c
>> +++ b/kernel/bpf/helpers.c
>> @@ -2507,6 +2507,9 @@ BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
>>   BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW)
>>   BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
>>   BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
>> +BTF_ID_FLAGS(func, bpf_iter_process_new, KF_ITER_NEW)
>> +BTF_ID_FLAGS(func, bpf_iter_process_next, KF_ITER_NEXT | KF_RET_NULL)
>> +BTF_ID_FLAGS(func, bpf_iter_process_destroy, KF_ITER_DESTROY)
>>   BTF_ID_FLAGS(func, bpf_dynptr_adjust)
>>   BTF_ID_FLAGS(func, bpf_dynptr_is_null)
>>   BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
>> index d8539cc05ffd..9d1927dc3a06 100644
>> --- a/kernel/bpf/task_iter.c
>> +++ b/kernel/bpf/task_iter.c
>> @@ -851,6 +851,35 @@ __bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
>>          kfree(kit->css_it);
>>   }
>>
>> +struct bpf_iter_process_kern {
>> +       struct task_struct *tsk;
>> +} __attribute__((aligned(8)));
>> +
> 
> Few high level thoughts. I think it would be good to follow
> SEC("iter/task") naming and approach. Open-coded iterators in many
> ways are in-kernel counterpart to iterator programs, so keeping them
> close enough within reason is useful for knowledge transfer.
> 
> SEC("iter/task") allows to:
> a) iterate all threads in the system
> b) iterate all threads for a given TGID
> c) it also allows to "iterate" a single thread or process, but that's
> a bit less relevant for in-kernel iterator, but we can still support
> them, why not?
> 
> I'm not sure if it supports iterating all processes (as in group
> leaders of each task group) in the system, but if it's possible I
> think we should support it at least for open-coded iterator, seems
> like a very useful functionality.
> 
> So to that end, let's design a small set of input arguments for
> bpf_iter_process_new() that would allow to specify this as flags +
> either (optional) struct task_struct * pointer to represent
> task/process or PID/TGID.
> 

IIUC, we should define the following task_new kfunc

struct bpf_iter_task_kern {
	struct task_struct *start;
	struct task_struct *cur;	
	unsigned int flag;
} __attribute__((aligned(8)));

__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_css_task *it,
		struct task_struct *start, unsigned int flags)

If we want to iterate all threads of a task, just pass it to *start*,
and if we want to iterating all process in the system, users may need to 
pass a nullptr to the *start*. But it seems current BPF verifier will 
reject the nullptr to task_struct. The error message meybe:
"Possibly NULL pointer passed to trusted argx"

I noticed the __opt annotation in kfunc document. It seems with 
following we can pass the nullptr to *start*

__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_css_task *it,
		void *start__opt, unsigned int flags__szk)

However, in this way, user can pass any invalid pointer to the kfunc 
without verifying. Besides, it seems __opt is only allowed to use with 
__szk together and flags__szk is ambiguous in semantics.

Do you have better ideas? Or I missing something ?

Thanks
Chuyi Zhou Sept. 15, 2023, 3:03 p.m. UTC | #3
在 2023/9/15 07:26, Andrii Nakryiko 写道:
> On Tue, Sep 12, 2023 at 12:02 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>>
>> This patch adds kfuncs bpf_iter_process_{new,next,destroy} which allow
>> creation and manipulation of struct bpf_iter_process in open-coded iterator
>> style. BPF programs can use these kfuncs or through bpf_for_each macro to
>> iterate all processes in the system.
>>
>> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
>> ---
>>   include/uapi/linux/bpf.h       |  4 ++++
>>   kernel/bpf/helpers.c           |  3 +++
>>   kernel/bpf/task_iter.c         | 29 +++++++++++++++++++++++++++++
>>   tools/include/uapi/linux/bpf.h |  4 ++++
>>   tools/lib/bpf/bpf_helpers.h    |  5 +++++
>>   5 files changed, 45 insertions(+)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index de02c0971428..befa55b52e29 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -7322,4 +7322,8 @@ struct bpf_iter_css_task {
>>          __u64 __opaque[1];
>>   } __attribute__((aligned(8)));
>>
>> +struct bpf_iter_process {
>> +       __u64 __opaque[1];
>> +} __attribute__((aligned(8)));
>> +
>>   #endif /* _UAPI__LINUX_BPF_H__ */
>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
>> index d6a16becfbb9..9b7d2c6f99d1 100644
>> --- a/kernel/bpf/helpers.c
>> +++ b/kernel/bpf/helpers.c
>> @@ -2507,6 +2507,9 @@ BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
>>   BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW)
>>   BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
>>   BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
>> +BTF_ID_FLAGS(func, bpf_iter_process_new, KF_ITER_NEW)
>> +BTF_ID_FLAGS(func, bpf_iter_process_next, KF_ITER_NEXT | KF_RET_NULL)
>> +BTF_ID_FLAGS(func, bpf_iter_process_destroy, KF_ITER_DESTROY)
>>   BTF_ID_FLAGS(func, bpf_dynptr_adjust)
>>   BTF_ID_FLAGS(func, bpf_dynptr_is_null)
>>   BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
>> index d8539cc05ffd..9d1927dc3a06 100644
>> --- a/kernel/bpf/task_iter.c
>> +++ b/kernel/bpf/task_iter.c
>> @@ -851,6 +851,35 @@ __bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
>>          kfree(kit->css_it);
>>   }
>>
>> +struct bpf_iter_process_kern {
>> +       struct task_struct *tsk;
>> +} __attribute__((aligned(8)));
>> +
> 
> Few high level thoughts. I think it would be good to follow
> SEC("iter/task") naming and approach. Open-coded iterators in many
> ways are in-kernel counterpart to iterator programs, so keeping them
> close enough within reason is useful for knowledge transfer.
> 
> SEC("iter/task") allows to:
> a) iterate all threads in the system
> b) iterate all threads for a given TGID
> c) it also allows to "iterate" a single thread or process, but that's
> a bit less relevant for in-kernel iterator, but we can still support
> them, why not?
> 
> I'm not sure if it supports iterating all processes (as in group
> leaders of each task group) in the system, but if it's possible I
> think we should support it at least for open-coded iterator, seems
> like a very useful functionality.
> 
> So to that end, let's design a small set of input arguments for
> bpf_iter_process_new() that would allow to specify this as flags +
> either (optional) struct task_struct * pointer to represent
> task/process or PID/TGID.
> 

Another concern from Alexei was the readability of the API of open-coded 
in BPF Program[1].

bpf_for_each(task, curr) is straightforward. Users can easily understand 
that this API does the same thing as 'for_each_process' in kernel.

However, if we keep the approach of SEC("iter/task")

enum ITER_ITEM {
	ITER_TASK,
	ITER_THREAD,
}

__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_process *it, struct 
task_struct *group_task, enum ITER_ITEM type)

the API have to chang:


bpf_for_each(task, curr, NULL, ITERATE_TASK) // iterate all process in 
the  system
bpf_for_each(task, curr, group_leader, ITERATE_THREAD) // iterate all 
thread of group_leader
bpf_for_each(task, curr, NULL, ITERATE_THREAD) //iterate all threads of 
all the process in the system

Useres may guess what are this API actually doing....

So, I'm thinking if we can add a layer of abstraction to hide the 
details from the users:

#define bpf_for_each_process(task) \
	bpf_for_each(task, curr, NULL, ITERATE_TASK)


It would be nice if you could give me some better suggestions.

Thanks!

[1] 
https://lore.kernel.org/lkml/CAADnVQLbDWUxFen-RS67C86sOE5DykEPD8xyihJ2RnG1WEnTQg@mail.gmail.com/
Andrii Nakryiko Sept. 15, 2023, 8:37 p.m. UTC | #4
On Fri, Sep 15, 2023 at 8:03 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
>
>
> 在 2023/9/15 07:26, Andrii Nakryiko 写道:
> > On Tue, Sep 12, 2023 at 12:02 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
> >>
> >> This patch adds kfuncs bpf_iter_process_{new,next,destroy} which allow
> >> creation and manipulation of struct bpf_iter_process in open-coded iterator
> >> style. BPF programs can use these kfuncs or through bpf_for_each macro to
> >> iterate all processes in the system.
> >>
> >> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
> >> ---
> >>   include/uapi/linux/bpf.h       |  4 ++++
> >>   kernel/bpf/helpers.c           |  3 +++
> >>   kernel/bpf/task_iter.c         | 29 +++++++++++++++++++++++++++++
> >>   tools/include/uapi/linux/bpf.h |  4 ++++
> >>   tools/lib/bpf/bpf_helpers.h    |  5 +++++
> >>   5 files changed, 45 insertions(+)
> >>
> >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >> index de02c0971428..befa55b52e29 100644
> >> --- a/include/uapi/linux/bpf.h
> >> +++ b/include/uapi/linux/bpf.h
> >> @@ -7322,4 +7322,8 @@ struct bpf_iter_css_task {
> >>          __u64 __opaque[1];
> >>   } __attribute__((aligned(8)));
> >>
> >> +struct bpf_iter_process {
> >> +       __u64 __opaque[1];
> >> +} __attribute__((aligned(8)));
> >> +
> >>   #endif /* _UAPI__LINUX_BPF_H__ */
> >> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> >> index d6a16becfbb9..9b7d2c6f99d1 100644
> >> --- a/kernel/bpf/helpers.c
> >> +++ b/kernel/bpf/helpers.c
> >> @@ -2507,6 +2507,9 @@ BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
> >>   BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW)
> >>   BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
> >>   BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
> >> +BTF_ID_FLAGS(func, bpf_iter_process_new, KF_ITER_NEW)
> >> +BTF_ID_FLAGS(func, bpf_iter_process_next, KF_ITER_NEXT | KF_RET_NULL)
> >> +BTF_ID_FLAGS(func, bpf_iter_process_destroy, KF_ITER_DESTROY)
> >>   BTF_ID_FLAGS(func, bpf_dynptr_adjust)
> >>   BTF_ID_FLAGS(func, bpf_dynptr_is_null)
> >>   BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
> >> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> >> index d8539cc05ffd..9d1927dc3a06 100644
> >> --- a/kernel/bpf/task_iter.c
> >> +++ b/kernel/bpf/task_iter.c
> >> @@ -851,6 +851,35 @@ __bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
> >>          kfree(kit->css_it);
> >>   }
> >>
> >> +struct bpf_iter_process_kern {
> >> +       struct task_struct *tsk;
> >> +} __attribute__((aligned(8)));
> >> +
> >
> > Few high level thoughts. I think it would be good to follow
> > SEC("iter/task") naming and approach. Open-coded iterators in many
> > ways are in-kernel counterpart to iterator programs, so keeping them
> > close enough within reason is useful for knowledge transfer.
> >
> > SEC("iter/task") allows to:
> > a) iterate all threads in the system
> > b) iterate all threads for a given TGID
> > c) it also allows to "iterate" a single thread or process, but that's
> > a bit less relevant for in-kernel iterator, but we can still support
> > them, why not?
> >
> > I'm not sure if it supports iterating all processes (as in group
> > leaders of each task group) in the system, but if it's possible I
> > think we should support it at least for open-coded iterator, seems
> > like a very useful functionality.
> >
> > So to that end, let's design a small set of input arguments for
> > bpf_iter_process_new() that would allow to specify this as flags +
> > either (optional) struct task_struct * pointer to represent
> > task/process or PID/TGID.
> >
>
> Another concern from Alexei was the readability of the API of open-coded
> in BPF Program[1].
>
> bpf_for_each(task, curr) is straightforward. Users can easily understand
> that this API does the same thing as 'for_each_process' in kernel.

In general, users might have no idea about for_each_process macro in
the kernel, so I don't find this particular argument very convincing.

We can add a separate set of iterator kfuncs for every useful
combination of conditions, of course, but it's a double-edged sword.
Needing to use a different iterator just to specify a different
direction of cgroup iteration (from the example you referred in [1])
also means that it's now harder to write some generic function that
needs to do something for all cgroups matching some criteria where the
order might be coming as an argument.

Similarly for task iterators. It's not hard to imagine some processing
that can be equivalently done per thread or per process in the system,
or on each thread of the process, depending on some conditions or
external configuration. Having to do three different
bpf_for_each(task_xxx, task, ...) for this seems suboptimal. If the
nature of the thing that is iterated over is the same, and it's just a
different set of filters to specify which subset of those items should
be iterated, I think it's better to try to stick to the same iterator
with few simple arguments. IMO, of course, there is no objectively
best approach.

>
> However, if we keep the approach of SEC("iter/task")
>
> enum ITER_ITEM {
>         ITER_TASK,
>         ITER_THREAD,
> }
>
> __bpf_kfunc int bpf_iter_task_new(struct bpf_iter_process *it, struct
> task_struct *group_task, enum ITER_ITEM type)
>
> the API have to chang:
>
>
> bpf_for_each(task, curr, NULL, ITERATE_TASK) // iterate all process in
> the  system
> bpf_for_each(task, curr, group_leader, ITERATE_THREAD) // iterate all
> thread of group_leader
> bpf_for_each(task, curr, NULL, ITERATE_THREAD) //iterate all threads of
> all the process in the system
>
> Useres may guess what are this API actually doing....

I'd expect users to consult documentation before trying to use an
unfamiliar cutting-edge functionality. So let's try to keep
documentation clear and up to the point. Extra flag argument doesn't
seem to be a big deal.


>
> So, I'm thinking if we can add a layer of abstraction to hide the
> details from the users:
>
> #define bpf_for_each_process(task) \
>         bpf_for_each(task, curr, NULL, ITERATE_TASK)
>
>
> It would be nice if you could give me some better suggestions.

No, please no. This macro wrapper is useless and just obfuscates what
is going on. If users think it's helpful for them, they can trivially
define it for their own code.

>
> Thanks!
>
> [1]
> https://lore.kernel.org/lkml/CAADnVQLbDWUxFen-RS67C86sOE5DykEPD8xyihJ2RnG1WEnTQg@mail.gmail.com/
Chuyi Zhou Sept. 16, 2023, 2:03 p.m. UTC | #5
Hello.

在 2023/9/16 04:37, Andrii Nakryiko 写道:
> On Fri, Sep 15, 2023 at 8:03 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>> 在 2023/9/15 07:26, Andrii Nakryiko 写道:
>>> On Tue, Sep 12, 2023 at 12:02 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>>>>
>>>> This patch adds kfuncs bpf_iter_process_{new,next,destroy} which allow
>>>> creation and manipulation of struct bpf_iter_process in open-coded iterator
>>>> style. BPF programs can use these kfuncs or through bpf_for_each macro to
>>>> iterate all processes in the system.
>>>>
[...cut...]
>>>
>>> Few high level thoughts. I think it would be good to follow
>>> SEC("iter/task") naming and approach. Open-coded iterators in many
>>> ways are in-kernel counterpart to iterator programs, so keeping them
>>> close enough within reason is useful for knowledge transfer.
>>>
>>> SEC("iter/task") allows to:
>>> a) iterate all threads in the system
>>> b) iterate all threads for a given TGID
>>> c) it also allows to "iterate" a single thread or process, but that's
>>> a bit less relevant for in-kernel iterator, but we can still support
>>> them, why not?
>>>
>>> I'm not sure if it supports iterating all processes (as in group
>>> leaders of each task group) in the system, but if it's possible I
>>> think we should support it at least for open-coded iterator, seems
>>> like a very useful functionality.
>>>
>>> So to that end, let's design a small set of input arguments for
>>> bpf_iter_process_new() that would allow to specify this as flags +
>>> either (optional) struct task_struct * pointer to represent
>>> task/process or PID/TGID.
>>>
>>
>> Another concern from Alexei was the readability of the API of open-coded
>> in BPF Program[1].
>>
>> bpf_for_each(task, curr) is straightforward. Users can easily understand
>> that this API does the same thing as 'for_each_process' in kernel.
> 
> In general, users might have no idea about for_each_process macro in
> the kernel, so I don't find this particular argument very convincing.
> 
> We can add a separate set of iterator kfuncs for every useful
> combination of conditions, of course, but it's a double-edged sword.
> Needing to use a different iterator just to specify a different
> direction of cgroup iteration (from the example you referred in [1])
> also means that it's now harder to write some generic function that
> needs to do something for all cgroups matching some criteria where the
> order might be coming as an argument.
> 
> Similarly for task iterators. It's not hard to imagine some processing
> that can be equivalently done per thread or per process in the system,
> or on each thread of the process, depending on some conditions or
> external configuration. Having to do three different
> bpf_for_each(task_xxx, task, ...) for this seems suboptimal. If the
> nature of the thing that is iterated over is the same, and it's just a
> different set of filters to specify which subset of those items should
> be iterated, I think it's better to try to stick to the same iterator
> with few simple arguments. IMO, of course, there is no objectively
> best approach.
> 
>>
>> However, if we keep the approach of SEC("iter/task")
>>
>> enum ITER_ITEM {
>>          ITER_TASK,
>>          ITER_THREAD,
>> }
>>
>> __bpf_kfunc int bpf_iter_task_new(struct bpf_iter_process *it, struct
>> task_struct *group_task, enum ITER_ITEM type)
>>
>> the API have to chang:
>>
>>
>> bpf_for_each(task, curr, NULL, ITERATE_TASK) // iterate all process in
>> the  system
>> bpf_for_each(task, curr, group_leader, ITERATE_THREAD) // iterate all
>> thread of group_leader
>> bpf_for_each(task, curr, NULL, ITERATE_THREAD) //iterate all threads of
>> all the process in the system
>>
>> Useres may guess what are this API actually doing....
> 
> I'd expect users to consult documentation before trying to use an
> unfamiliar cutting-edge functionality. So let's try to keep
> documentation clear and up to the point. Extra flag argument doesn't
> seem to be a big deal.

Thanks for your suggestion!

Before we begin working on the next version, I have outlined a detailed 
API design here:

1.task_iter

It will be used to iterate process/threads like SEC("iter/task"). Here 
we should better to follow the naming and approach SEC("iter/task"):

enum {
	ITERATE_PROCESS,
	ITERATE_THREAD,
}

__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it, struct 
task_struct *task, int flag);

If we want to iterate all processes in the system, the iteration will 
start from the *task* which is passed from user.(since process in the 
system are connected through a linked list)

Additionally, the *task* can allow users to specify iterating all 
threads within a task group.

SEC("xxx")
int xxxx(void *ctx)
{
	struct task_struct *pos;
	struct task_struct *cur_task = bpf_get_current_task_btf();

	bpf_rcu_read_lock();

	// iterating all process in the system start from cur_task
	bpf_for_each(task, pos, cur_task, ITERATE_PROCESS) {
		
	}

	// iterate all thread belongs to cur_task group.
	bpf_for_each(task, pos, cur_task, ITERATE_THREAD) {
	
	}
	
	bpf_rcu_read_unlock();
	return 0;
}

Iterating all thread of each process is great(ITERATE_ALL). But maybe 
let's break it down step by step and implement 
ITERATE_PROCESS/ITERATE_THREAD first? (I'm little worried about the cpu 
overhead of ITERATE_ALL, since we are doing a heavy job in BPF Prog)

I wanted to reuse BPF_TASK_ITER_ALL/BPF_TASK_ITER_TID/BPF_TASK_ITER_TGID 
insted of new enums like ITERATE_PROCESS/ITERATE_THREAD. But it seems 
necessary. In BPF Prog, we usually operate task_struct directly instead 
of pid/tgid. It's a little weird to use 
BPF_TASK_ITER_TID/BPF_TASK_ITER_TGID here:

bpf_for_each(task, pos, cur_task, BPF_TASK_ITER_TID) {
}

On the other hand, 
BPF_TASK_ITER_ALL/BPF_TASK_ITER_TID/BPF_TASK_ITER_TGID are inner flags 
that are hidden from the users.
Exposing ITERATE_PROCESS/ITERATE_THREAD will not cause confusion to user.


2. css_iter.

css_iter will be used to:
(1) iterating subsystem, like 
for_each_mem_cgroup_tree/cpuset_for_each_descendant_pre in kernel.
(2) iterating cgroup. (patch-6's selfetest has a basic example)

css(cgroup_subsys_state) is more fundamental than struct cgroup. I think 
we'd better operating css rather than cgroup, since it's can be hard for 
cgroup_iter to achive (2). So here we keep the name of "css_iter", 
BPF_CGROUP_ITER_DESCENDANTS_PRE/BPF_CGROUP_ITER_DESCENDANTS_POST/BPF_CGROUP_ITER_ANCESTORS_UP 
can be reused.


__bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it,
		struct cgroup_subsys_state *root, unsigned int flag)

bpf_for_each(css, root, BPF_CGROUP_ITER_DESCENDANTS_PRE)

Thanks.
Andrii Nakryiko Sept. 19, 2023, 11:30 p.m. UTC | #6
On Sat, Sep 16, 2023 at 7:03 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
> Hello.
>
> 在 2023/9/16 04:37, Andrii Nakryiko 写道:
> > On Fri, Sep 15, 2023 at 8:03 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
> >> 在 2023/9/15 07:26, Andrii Nakryiko 写道:
> >>> On Tue, Sep 12, 2023 at 12:02 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
> >>>>
> >>>> This patch adds kfuncs bpf_iter_process_{new,next,destroy} which allow
> >>>> creation and manipulation of struct bpf_iter_process in open-coded iterator
> >>>> style. BPF programs can use these kfuncs or through bpf_for_each macro to
> >>>> iterate all processes in the system.
> >>>>
> [...cut...]
> >>>
> >>> Few high level thoughts. I think it would be good to follow
> >>> SEC("iter/task") naming and approach. Open-coded iterators in many
> >>> ways are in-kernel counterpart to iterator programs, so keeping them
> >>> close enough within reason is useful for knowledge transfer.
> >>>
> >>> SEC("iter/task") allows to:
> >>> a) iterate all threads in the system
> >>> b) iterate all threads for a given TGID
> >>> c) it also allows to "iterate" a single thread or process, but that's
> >>> a bit less relevant for in-kernel iterator, but we can still support
> >>> them, why not?
> >>>
> >>> I'm not sure if it supports iterating all processes (as in group
> >>> leaders of each task group) in the system, but if it's possible I
> >>> think we should support it at least for open-coded iterator, seems
> >>> like a very useful functionality.
> >>>
> >>> So to that end, let's design a small set of input arguments for
> >>> bpf_iter_process_new() that would allow to specify this as flags +
> >>> either (optional) struct task_struct * pointer to represent
> >>> task/process or PID/TGID.
> >>>
> >>
> >> Another concern from Alexei was the readability of the API of open-coded
> >> in BPF Program[1].
> >>
> >> bpf_for_each(task, curr) is straightforward. Users can easily understand
> >> that this API does the same thing as 'for_each_process' in kernel.
> >
> > In general, users might have no idea about for_each_process macro in
> > the kernel, so I don't find this particular argument very convincing.
> >
> > We can add a separate set of iterator kfuncs for every useful
> > combination of conditions, of course, but it's a double-edged sword.
> > Needing to use a different iterator just to specify a different
> > direction of cgroup iteration (from the example you referred in [1])
> > also means that it's now harder to write some generic function that
> > needs to do something for all cgroups matching some criteria where the
> > order might be coming as an argument.
> >
> > Similarly for task iterators. It's not hard to imagine some processing
> > that can be equivalently done per thread or per process in the system,
> > or on each thread of the process, depending on some conditions or
> > external configuration. Having to do three different
> > bpf_for_each(task_xxx, task, ...) for this seems suboptimal. If the
> > nature of the thing that is iterated over is the same, and it's just a
> > different set of filters to specify which subset of those items should
> > be iterated, I think it's better to try to stick to the same iterator
> > with few simple arguments. IMO, of course, there is no objectively
> > best approach.
> >
> >>
> >> However, if we keep the approach of SEC("iter/task")
> >>
> >> enum ITER_ITEM {
> >>          ITER_TASK,
> >>          ITER_THREAD,
> >> }
> >>
> >> __bpf_kfunc int bpf_iter_task_new(struct bpf_iter_process *it, struct
> >> task_struct *group_task, enum ITER_ITEM type)
> >>
> >> the API have to chang:
> >>
> >>
> >> bpf_for_each(task, curr, NULL, ITERATE_TASK) // iterate all process in
> >> the  system
> >> bpf_for_each(task, curr, group_leader, ITERATE_THREAD) // iterate all
> >> thread of group_leader
> >> bpf_for_each(task, curr, NULL, ITERATE_THREAD) //iterate all threads of
> >> all the process in the system
> >>
> >> Useres may guess what are this API actually doing....
> >
> > I'd expect users to consult documentation before trying to use an
> > unfamiliar cutting-edge functionality. So let's try to keep
> > documentation clear and up to the point. Extra flag argument doesn't
> > seem to be a big deal.
>
> Thanks for your suggestion!
>
> Before we begin working on the next version, I have outlined a detailed
> API design here:
>
> 1.task_iter
>
> It will be used to iterate process/threads like SEC("iter/task"). Here
> we should better to follow the naming and approach SEC("iter/task"):
>
> enum {
>         ITERATE_PROCESS,
>         ITERATE_THREAD,
> }
>
> __bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it, struct
> task_struct *task, int flag);
>
> If we want to iterate all processes in the system, the iteration will
> start from the *task* which is passed from user.(since process in the
> system are connected through a linked list)

but will go through all of them anyways, right? it's kind of
surprising from usability standpoint to have to pass some task_struct
to iterate all of them, tbh. I wonder if it's hard to adjust kfunc
validation to allow "nullable" pointers? We can look at that
separately, of course.

>
> Additionally, the *task* can allow users to specify iterating all
> threads within a task group.
>
> SEC("xxx")
> int xxxx(void *ctx)
> {
>         struct task_struct *pos;
>         struct task_struct *cur_task = bpf_get_current_task_btf();
>
>         bpf_rcu_read_lock();
>
>         // iterating all process in the system start from cur_task
>         bpf_for_each(task, pos, cur_task, ITERATE_PROCESS) {
>
>         }
>
>         // iterate all thread belongs to cur_task group.
>         bpf_for_each(task, pos, cur_task, ITERATE_THREAD) {
>
>         }
>
>         bpf_rcu_read_unlock();
>         return 0;
> }
>
> Iterating all thread of each process is great(ITERATE_ALL). But maybe
> let's break it down step by step and implement
> ITERATE_PROCESS/ITERATE_THREAD first? (I'm little worried about the cpu
> overhead of ITERATE_ALL, since we are doing a heavy job in BPF Prog)
>

Hm... but if it was a sleepable BPF program and
bpf_rcu_read_{lock,unlock}() was only per task, then it shouldn't be
the problem? See enum bpf_cgroup_iter_order.


> I wanted to reuse BPF_TASK_ITER_ALL/BPF_TASK_ITER_TID/BPF_TASK_ITER_TGID
> insted of new enums like ITERATE_PROCESS/ITERATE_THREAD. But it seems
> necessary. In BPF Prog, we usually operate task_struct directly instead
> of pid/tgid. It's a little weird to use
> BPF_TASK_ITER_TID/BPF_TASK_ITER_TGID here:

enum bpf_iter_task_type is internal type, so we can rename
BPF_TASK_ITER_TID to BPF_TASK_ITER_THREAD and BPF_TASK_ITER_PROC (or
add them as aliases). At the very least, we should use consistent
BPF_TASK_ITER_xxx naming, instead of just ITERATE_PROCESS. See

>
> bpf_for_each(task, pos, cur_task, BPF_TASK_ITER_TID) {
> }
>
> On the other hand,
> BPF_TASK_ITER_ALL/BPF_TASK_ITER_TID/BPF_TASK_ITER_TGID are inner flags
> that are hidden from the users.
> Exposing ITERATE_PROCESS/ITERATE_THREAD will not cause confusion to user.
>

inner types are not a problem when used with vmlinux.h


>
> 2. css_iter.
>
> css_iter will be used to:
> (1) iterating subsystem, like
> for_each_mem_cgroup_tree/cpuset_for_each_descendant_pre in kernel.
> (2) iterating cgroup. (patch-6's selfetest has a basic example)
>
> css(cgroup_subsys_state) is more fundamental than struct cgroup. I think
> we'd better operating css rather than cgroup, since it's can be hard for
> cgroup_iter to achive (2). So here we keep the name of "css_iter",
> BPF_CGROUP_ITER_DESCENDANTS_PRE/BPF_CGROUP_ITER_DESCENDANTS_POST/BPF_CGROUP_ITER_ANCESTORS_UP
> can be reused.
>
>
> __bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it,
>                 struct cgroup_subsys_state *root, unsigned int flag)
>
> bpf_for_each(css, root, BPF_CGROUP_ITER_DESCENDANTS_PRE)
>

Makes sense, yep, thanks.

> Thanks.
>
>
>
>
Chuyi Zhou Sept. 20, 2023, 12:20 p.m. UTC | #7
在 2023/9/20 07:30, Andrii Nakryiko 写道:
> On Sat, Sep 16, 2023 at 7:03 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>>
>> Hello.
>>
>> 在 2023/9/16 04:37, Andrii Nakryiko 写道:
>>> On Fri, Sep 15, 2023 at 8:03 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>>>> 在 2023/9/15 07:26, Andrii Nakryiko 写道:
>>>>> On Tue, Sep 12, 2023 at 12:02 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>>>>>>
>>>>>> This patch adds kfuncs bpf_iter_process_{new,next,destroy} which allow
>>>>>> creation and manipulation of struct bpf_iter_process in open-coded iterator
>>>>>> style. BPF programs can use these kfuncs or through bpf_for_each macro to
>>>>>> iterate all processes in the system.
>>>>>>
>> [...cut...]
>>>>>
>>>>> Few high level thoughts. I think it would be good to follow
>>>>> SEC("iter/task") naming and approach. Open-coded iterators in many
>>>>> ways are in-kernel counterpart to iterator programs, so keeping them
>>>>> close enough within reason is useful for knowledge transfer.
>>>>>
>>>>> SEC("iter/task") allows to:
>>>>> a) iterate all threads in the system
>>>>> b) iterate all threads for a given TGID
>>>>> c) it also allows to "iterate" a single thread or process, but that's
>>>>> a bit less relevant for in-kernel iterator, but we can still support
>>>>> them, why not?
>>>>>
>>>>> I'm not sure if it supports iterating all processes (as in group
>>>>> leaders of each task group) in the system, but if it's possible I
>>>>> think we should support it at least for open-coded iterator, seems
>>>>> like a very useful functionality.
>>>>>
>>>>> So to that end, let's design a small set of input arguments for
>>>>> bpf_iter_process_new() that would allow to specify this as flags +
>>>>> either (optional) struct task_struct * pointer to represent
>>>>> task/process or PID/TGID.
>>>>>
>>>>
>>>> Another concern from Alexei was the readability of the API of open-coded
>>>> in BPF Program[1].
>>>>
>>>> bpf_for_each(task, curr) is straightforward. Users can easily understand
>>>> that this API does the same thing as 'for_each_process' in kernel.
>>>
>>> In general, users might have no idea about for_each_process macro in
>>> the kernel, so I don't find this particular argument very convincing.
>>>
>>> We can add a separate set of iterator kfuncs for every useful
>>> combination of conditions, of course, but it's a double-edged sword.
>>> Needing to use a different iterator just to specify a different
>>> direction of cgroup iteration (from the example you referred in [1])
>>> also means that it's now harder to write some generic function that
>>> needs to do something for all cgroups matching some criteria where the
>>> order might be coming as an argument.
>>>
>>> Similarly for task iterators. It's not hard to imagine some processing
>>> that can be equivalently done per thread or per process in the system,
>>> or on each thread of the process, depending on some conditions or
>>> external configuration. Having to do three different
>>> bpf_for_each(task_xxx, task, ...) for this seems suboptimal. If the
>>> nature of the thing that is iterated over is the same, and it's just a
>>> different set of filters to specify which subset of those items should
>>> be iterated, I think it's better to try to stick to the same iterator
>>> with few simple arguments. IMO, of course, there is no objectively
>>> best approach.
>>>
>>>>
>>>> However, if we keep the approach of SEC("iter/task")
>>>>
>>>> enum ITER_ITEM {
>>>>           ITER_TASK,
>>>>           ITER_THREAD,
>>>> }
>>>>
>>>> __bpf_kfunc int bpf_iter_task_new(struct bpf_iter_process *it, struct
>>>> task_struct *group_task, enum ITER_ITEM type)
>>>>
>>>> the API have to chang:
>>>>
>>>>
>>>> bpf_for_each(task, curr, NULL, ITERATE_TASK) // iterate all process in
>>>> the  system
>>>> bpf_for_each(task, curr, group_leader, ITERATE_THREAD) // iterate all
>>>> thread of group_leader
>>>> bpf_for_each(task, curr, NULL, ITERATE_THREAD) //iterate all threads of
>>>> all the process in the system
>>>>
>>>> Useres may guess what are this API actually doing....
>>>
>>> I'd expect users to consult documentation before trying to use an
>>> unfamiliar cutting-edge functionality. So let's try to keep
>>> documentation clear and up to the point. Extra flag argument doesn't
>>> seem to be a big deal.
>>
>> Thanks for your suggestion!
>>
>> Before we begin working on the next version, I have outlined a detailed
>> API design here:
>>
>> 1.task_iter
>>
>> It will be used to iterate process/threads like SEC("iter/task"). Here
>> we should better to follow the naming and approach SEC("iter/task"):
>>
>> enum {
>>          ITERATE_PROCESS,
>>          ITERATE_THREAD,
>> }
>>
>> __bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it, struct
>> task_struct *task, int flag);
>>
>> If we want to iterate all processes in the system, the iteration will
>> start from the *task* which is passed from user.(since process in the
>> system are connected through a linked list)
> 
> but will go through all of them anyways, right? it's kind of
> surprising from usability standpoint to have to pass some task_struct
> to iterate all of them, tbh. I wonder if it's hard to adjust kfunc
> validation to allow "nullable" pointers? We can look at that
> separately, of course.
> 

Yes, little subtle here. When we want to iterate threads of a task, we 
may want the verifier to ensure the *task* is a valid pointer, which 
would require KF_TRUSTED_ARGS. However, when we want to iterate all 
process/threads in the system, we want to accept a null pointer.

Anyway, I will try to explore a workaround in the verifier here.

Thanks.
diff mbox series

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index de02c0971428..befa55b52e29 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7322,4 +7322,8 @@  struct bpf_iter_css_task {
 	__u64 __opaque[1];
 } __attribute__((aligned(8)));
 
+struct bpf_iter_process {
+	__u64 __opaque[1];
+} __attribute__((aligned(8)));
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index d6a16becfbb9..9b7d2c6f99d1 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2507,6 +2507,9 @@  BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW)
 BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_process_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_process_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_process_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_dynptr_adjust)
 BTF_ID_FLAGS(func, bpf_dynptr_is_null)
 BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index d8539cc05ffd..9d1927dc3a06 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -851,6 +851,35 @@  __bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
 	kfree(kit->css_it);
 }
 
+struct bpf_iter_process_kern {
+	struct task_struct *tsk;
+} __attribute__((aligned(8)));
+
+__bpf_kfunc int bpf_iter_process_new(struct bpf_iter_process *it)
+{
+	struct bpf_iter_process_kern *kit = (void *)it;
+
+	BUILD_BUG_ON(sizeof(struct bpf_iter_process_kern) != sizeof(struct bpf_iter_process));
+	BUILD_BUG_ON(__alignof__(struct bpf_iter_process_kern) !=
+					__alignof__(struct bpf_iter_process));
+
+	kit->tsk = &init_task;
+	return 0;
+}
+
+__bpf_kfunc struct task_struct *bpf_iter_process_next(struct bpf_iter_process *it)
+{
+	struct bpf_iter_process_kern *kit = (void *)it;
+
+	kit->tsk = next_task(kit->tsk);
+
+	return kit->tsk == &init_task ? NULL : kit->tsk;
+}
+
+__bpf_kfunc void bpf_iter_process_destroy(struct bpf_iter_process *it)
+{
+}
+
 DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
 
 static void do_mmap_read_unlock(struct irq_work *entry)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index de02c0971428..befa55b52e29 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7322,4 +7322,8 @@  struct bpf_iter_css_task {
 	__u64 __opaque[1];
 } __attribute__((aligned(8)));
 
+struct bpf_iter_process {
+	__u64 __opaque[1];
+} __attribute__((aligned(8)));
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
index f48723c6c593..858252c2641c 100644
--- a/tools/lib/bpf/bpf_helpers.h
+++ b/tools/lib/bpf/bpf_helpers.h
@@ -310,6 +310,11 @@  extern int bpf_iter_css_task_new(struct bpf_iter_css_task *it,
 extern struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it) __weak __ksym;
 extern void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it) __weak __ksym;
 
+struct bpf_iter_process;
+extern int bpf_iter_process_new(struct bpf_iter_process *it) __weak __ksym;
+extern struct task_struct *bpf_iter_process_next(struct bpf_iter_process *it) __weak __ksym;
+extern void bpf_iter_process_destroy(struct bpf_iter_process *it) __weak __ksym;
+
 #ifndef bpf_for_each
 /* bpf_for_each(iter_type, cur_elem, args...) provides generic construct for
  * using BPF open-coded iterators without having to write mundane explicit