[RFC,bpf-next,2/4] bpf: Introduce process open coded iterator kfuncs

Message ID 20230827072057.1591929-3-zhouchuyi@bytedance.com (mailing list archive)
State RFC
Delegated to: BPF
Series: Add Open-coded process and css iters

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-7 success Logs for veristat
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-5 fail Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-0 success Logs for ${{ matrix.test }} on ${{ matrix.arch }} with ${{ matrix.toolchain_full }}
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 3090 this patch: 3098
netdev/cc_maintainers warning 8 maintainers not CCed: kpsingh@kernel.org martin.lau@linux.dev john.fastabend@gmail.com sdf@google.com song@kernel.org yonghong.song@linux.dev jolsa@kernel.org haoluo@google.com
netdev/build_clang fail Errors and warnings before: 1536 this patch: 1539
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 3122 this patch: 3130
netdev/checkpatch warning CHECK: extern prototypes should be avoided in .h files WARNING: Prefer __aligned(8) over __attribute__((aligned(8))) WARNING: line length of 82 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns WARNING: line length of 94 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Chuyi Zhou Aug. 27, 2023, 7:20 a.m. UTC
This patch adds kfuncs bpf_iter_process_{new,next,destroy} which allow
creation and manipulation of struct bpf_iter_process in open-coded iterator
style. BPF programs can use these kfuncs directly, or through the bpf_for_each
macro, to iterate over all processes in the system.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
---
 include/uapi/linux/bpf.h       |  4 ++++
 kernel/bpf/helpers.c           |  3 +++
 kernel/bpf/task_iter.c         | 31 +++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  4 ++++
 tools/lib/bpf/bpf_helpers.h    |  5 +++++
 5 files changed, 47 insertions(+)
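
For context, a minimal sketch of how a BPF program would consume these
kfuncs through the bpf_for_each() convenience macro for open-coded
iterators. The kfunc and macro names are the ones added by this series;
the attach point and program body are purely illustrative:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

SEC("tp_btf/sched_switch")		/* illustrative attach point */
int count_processes(u64 *ctx)
{
	struct task_struct *task;
	int n = 0;

	/* expands to bpf_iter_process_{new,next,destroy}() calls */
	bpf_for_each(process, task)
		n++;

	bpf_printk("%d processes in the system", n);
	return 0;
}

The same loop can also be written with explicit bpf_iter_process_new()/
next()/destroy() calls; the macro only hides that boilerplate.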

Comments

Alexei Starovoitov Sept. 5, 2023, 8:09 p.m. UTC | #1
On Sun, Aug 27, 2023 at 12:21 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
> This patch adds kfuncs bpf_iter_process_{new,next,destroy} which allow
> creation and manipulation of struct bpf_iter_process in open-coded iterator
> style. BPF programs can use these kfuncs directly, or through the bpf_for_each
> macro, to iterate over all processes in the system.
>
> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
> ---
>  include/uapi/linux/bpf.h       |  4 ++++
>  kernel/bpf/helpers.c           |  3 +++
>  kernel/bpf/task_iter.c         | 31 +++++++++++++++++++++++++++++++
>  tools/include/uapi/linux/bpf.h |  4 ++++
>  tools/lib/bpf/bpf_helpers.h    |  5 +++++
>  5 files changed, 47 insertions(+)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 2a6e9b99564b..cfbd527e3733 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -7199,4 +7199,8 @@ struct bpf_iter_css_task {
>         __u64 __opaque[1];
>  } __attribute__((aligned(8)));
>
> +struct bpf_iter_process {
> +       __u64 __opaque[1];
> +} __attribute__((aligned(8)));
> +
>  #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index cf113ad24837..81a2005edc26 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -2458,6 +2458,9 @@ BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
>  BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW)
>  BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
>  BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
> +BTF_ID_FLAGS(func, bpf_iter_process_new, KF_ITER_NEW)
> +BTF_ID_FLAGS(func, bpf_iter_process_next, KF_ITER_NEXT | KF_RET_NULL)
> +BTF_ID_FLAGS(func, bpf_iter_process_destroy, KF_ITER_DESTROY)
>  BTF_ID_FLAGS(func, bpf_dynptr_adjust)
>  BTF_ID_FLAGS(func, bpf_dynptr_is_null)
>  BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index b1bdba40b684..a6717a76c1e0 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -862,6 +862,37 @@ __bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
>         kfree(kit->css_it);
>  }
>
> +struct bpf_iter_process_kern {
> +       struct task_struct *tsk;
> +} __attribute__((aligned(8)));
> +
> +__bpf_kfunc int bpf_iter_process_new(struct bpf_iter_process *it)
> +{
> +       struct bpf_iter_process_kern *kit = (void *)it;
> +
> +       BUILD_BUG_ON(sizeof(struct bpf_iter_process_kern) != sizeof(struct bpf_iter_process));
> +       BUILD_BUG_ON(__alignof__(struct bpf_iter_process_kern) !=
> +                                       __alignof__(struct bpf_iter_process));
> +
> +       rcu_read_lock();
> +       kit->tsk = &init_task;
> +       return 0;
> +}
> +
> +__bpf_kfunc struct task_struct *bpf_iter_process_next(struct bpf_iter_process *it)
> +{
> +       struct bpf_iter_process_kern *kit = (void *)it;
> +
> +       kit->tsk = next_task(kit->tsk);
> +
> +       return kit->tsk == &init_task ? NULL : kit->tsk;
> +}
> +
> +__bpf_kfunc void bpf_iter_process_destroy(struct bpf_iter_process *it)
> +{
> +       rcu_read_unlock();
> +}

This iter can be used in all ctx-s, which is nice, but let's
make the verifier enforce rcu_read_lock/unlock done by the bpf prog
instead of doing it in the ctor/dtor of the iter, since
in sleepable progs the verifier won't recognize that the body is an RCU CS.
We'd need to teach the verifier to allow bpf_iter_process_new()
inside in_rcu_cs() and make sure there is no rcu_read_unlock
while BPF_ITER_STATE_ACTIVE.
bpf_iter_process_destroy() would become a nop.
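
(Concretely, under this scheme a sleepable program would be expected to
wrap the whole iteration in an explicit RCU section, roughly like the
sketch below; bpf_rcu_read_lock()/bpf_rcu_read_unlock() are the existing
kfuncs, and the hook point is illustrative.)

SEC("lsm.s/file_open")			/* illustrative sleepable hook */
int visit_tasks(void *ctx)
{
	struct task_struct *task;

	bpf_rcu_read_lock();		/* explicit RCU CS around the iteration */
	bpf_for_each(process, task) {
		/* task pointers are only valid inside this RCU CS */
	}
	bpf_rcu_read_unlock();
	return 0;
}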
Chuyi Zhou Sept. 6, 2023, 12:37 p.m. UTC | #2
Hello, Alexei.

On 2023/9/6 04:09, Alexei Starovoitov wrote:
> On Sun, Aug 27, 2023 at 12:21 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>> [...cut...]
> 
> This iter can be used in all ctx-s, which is nice, but let's
> make the verifier enforce rcu_read_lock/unlock done by the bpf prog
> instead of doing it in the ctor/dtor of the iter, since
> in sleepable progs the verifier won't recognize that the body is an RCU CS.
> We'd need to teach the verifier to allow bpf_iter_process_new()
> inside in_rcu_cs() and make sure there is no rcu_read_unlock
> while BPF_ITER_STATE_ACTIVE.
> bpf_iter_process_destroy() would become a nop.

Thanks for your review!

I think bpf_iter_process_{new, next, destroy} should be protected by
bpf_rcu_read_lock/unlock explicitly whether the prog is sleepable or
not, right? I'm not very familiar with the BPF verifier, but I believe
there is still a risk in directly calling these kfuncs even if
in_rcu_cs() is true.

Maybe what we actually need here is to make the BPF verifier check that
env->cur_state->active_rcu_lock is true when we want to call these kfuncs.

Thanks.
Alexei Starovoitov Sept. 6, 2023, 5:17 p.m. UTC | #3
On Wed, Sep 6, 2023 at 5:38 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
> Hello, Alexei.
>
> On 2023/9/6 04:09, Alexei Starovoitov wrote:
> > On Sun, Aug 27, 2023 at 12:21 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
> >> [...cut...]
> >
> > This iter can be used in all ctx-s, which is nice, but let's
> > make the verifier enforce rcu_read_lock/unlock done by the bpf prog
> > instead of doing it in the ctor/dtor of the iter, since
> > in sleepable progs the verifier won't recognize that the body is an RCU CS.
> > We'd need to teach the verifier to allow bpf_iter_process_new()
> > inside in_rcu_cs() and make sure there is no rcu_read_unlock
> > while BPF_ITER_STATE_ACTIVE.
> > bpf_iter_process_destroy() would become a nop.
>
> Thanks for your review!
>
> I think bpf_iter_process_{new, next, destroy} should be protected by
> bpf_rcu_read_lock/unlock explicitly whether the prog is sleepable or
> not, right?

Correct. By explicit bpf_rcu_read_lock() in case of sleepable progs
or just by using them in normal bpf progs that have implicit rcu_read_lock()
done before calling into them.

> I'm not very familiar with the BPF verifier, but I believe
> there is still a risk in directly calling these kfuncs even if
> in_rcu_cs() is true.
>
> Maybe what we actually need here is to make the BPF verifier check that
> env->cur_state->active_rcu_lock is true when we want to call these kfuncs.

active_rcu_lock means explicit bpf_rcu_read_lock.
Currently we do allow bpf_rcu_read_lock in non-sleepable, but it's pointless.

Technically we can extend the check:
                if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
                        verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
                        return -EACCES;
                }
to discourage their use in all non-sleepable, but it will break some progs.

I think it's ok to check in_rcu_cs() to allow bpf_iter_process_*().
If bpf prog adds explicit and unnecessary bpf_rcu_read_lock() around
the iter ops it won't do any harm.
Just need to make sure that rcu unlock logic:
                } else if (rcu_unlock) {
                        bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
                                if (reg->type & MEM_RCU) {
                                        reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
                                        reg->type |= PTR_UNTRUSTED;
                                }
                        }));
clears iter state that depends on rcu.

I thought about changing mark_stack_slots_iter() to do
st->type = PTR_TO_STACK | MEM_RCU;
so that the above clearing logic kicks in,
but it might be better to have something iter specific.
is_iter_reg_valid_init() should probably be changed to
make sure reg->type is not UNTRUSTED.

Andrii,
do you have better suggestions?
Chuyi Zhou Sept. 7, 2023, 12:02 p.m. UTC | #4
Hello,
On 2023/9/7 01:17, Alexei Starovoitov wrote:
[...cut...]
> I think it's ok to check in_rcu_cs() to allow bpf_iter_process_*().
> If bpf prog adds explicit and unnecessary bpf_rcu_read_lock() around
> the iter ops it won't do any harm.
> Just need to make sure that rcu unlock logic:
>                 } else if (rcu_unlock) {
>                         bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
>                                 if (reg->type & MEM_RCU) {
>                                         reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
>                                         reg->type |= PTR_UNTRUSTED;
>                                 }
>                         }));
> clears iter state that depends on rcu.
> 
> I thought about changing mark_stack_slots_iter() to do
> st->type = PTR_TO_STACK | MEM_RCU;
> so that the above clearing logic kicks in,
> but it might be better to have something iter specific.
> is_iter_reg_valid_init() should probably be changed to
> make sure reg->type is not UNTRUSTED.
> 
Maybe it's something like the following?

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bb78212fa5b2..9185c4a40a21 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1172,7 +1172,15 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg

 static void __mark_reg_known_zero(struct bpf_reg_state *reg);

+static bool in_rcu_cs(struct bpf_verifier_env *env);
+
+/* check whether we are using bpf_iter_process_*() or bpf_iter_css_*() */
+static bool is_iter_need_rcu(struct bpf_kfunc_call_arg_meta *meta)
+{
+
+}
 static int mark_stack_slots_iter(struct bpf_verifier_env *env,
+				 struct bpf_kfunc_call_arg_meta *meta,
 				 struct bpf_reg_state *reg, int insn_idx,
 				 struct btf *btf, u32 btf_id, int nr_slots)
 {
@@ -1193,6 +1201,12 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,

 		__mark_reg_known_zero(st);
 		st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+		if (is_iter_need_rcu(meta)) {
+			if (in_rcu_cs(env))
+				st->type |= MEM_RCU;
+			else
+				st->type |= PTR_UNTRUSTED;
+		}
 		st->live |= REG_LIVE_WRITTEN;
 		st->ref_obj_id = i == 0 ? id : 0;
 		st->iter.btf = btf;
@@ -1281,6 +1295,8 @@ static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_
 		struct bpf_stack_state *slot = &state->stack[spi - i];
 		struct bpf_reg_state *st = &slot->spilled_ptr;

+		if (st->type & PTR_UNTRUSTED)
+			return false;
 		/* only main (first) slot has ref_obj_id set */
 		if (i == 0 && !st->ref_obj_id)
 			return false;

> Andrii,
> do you have better suggestions?
Alexei Starovoitov Sept. 8, 2023, 6:10 p.m. UTC | #5
On Thu, Sep 7, 2023 at 5:02 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
> Hello,
> On 2023/9/7 01:17, Alexei Starovoitov wrote:
> [...cut...]

Yep. All makes sense to me.
Andrii Nakryiko Sept. 12, 2023, 10:12 p.m. UTC | #6
On Wed, Sep 6, 2023 at 10:18 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> [...cut...]
> I thought about changing mark_stack_slots_iter() to do
> st->type = PTR_TO_STACK | MEM_RCU;
> so that the above clearing logic kicks in,
> but it might be better to have something iter specific.
> is_iter_reg_valid_init() should probably be changed to
> make sure reg->type is not UNTRUSTED.
>
> Andrii,
> do you have better suggestions?

What if we just remember inside bpf_reg_state.iter state whether
iterator needs to be RCU protected (it's just one bit if we don't
allow nesting rcu_read_lock()/rcu_read_unlock(), or we'd need to
remember RCU nestedness level), and then when validating iter_next and
iter_destroy() kfuncs, check that we are still in RCU-protected region
(if we have nestedness, then iter->rcu_nest_level <=
cur_rcu_nest_level, if I understand correctly). And if not, provide a
clear and nice message.

That seems straightforward enough, but am I missing anything subtle?
Kumar Kartikeya Dwivedi Sept. 12, 2023, 10:20 p.m. UTC | #7
On Wed, 13 Sept 2023 at 00:12, Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> [...cut...]
>
> What if we just remember inside bpf_reg_state.iter state whether
> iterator needs to be RCU protected (it's just one bit if we don't
> allow nesting rcu_read_lock()/rcu_read_unlock(), or we'd need to
> remember RCU nestedness level), and then when validating iter_next and
> iter_destroy() kfuncs, check that we are still in RCU-protected region
> (if we have nestedness, then iter->rcu_nest_level <=
> cur_rcu_nest_level, if I understand correctly). And if not, provide a
> clear and nice message.
>
> That seems straightforward enough, but am I missing anything subtle?
>

We also need to ensure one does not do a bpf_rcu_read_unlock and
bpf_rcu_read_lock again between the iter_new and
iter_next/iter_destroy calls. Simply checking that we are in an
RCU-protected region will pass the verifier in such a case.

A simple solution might be associating an ID with the RCU CS, i.e. make
active_rcu_lock a 32-bit ID which is monotonically increasing for each
new RCU region. Of course, all of this only matters for sleepable
programs. Then check that the ID recorded in the iter state is the same
on next and destroy.
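
(A rough sketch of that check at iter_next/iter_destroy time; every name
below is hypothetical, not an existing verifier field or helper, and
active_rcu_lock is assumed to be widened from a bool to the proposed
32-bit CS id:)

static int check_iter_rcu_scope(struct bpf_verifier_env *env,
				struct bpf_reg_state *iter_reg)
{
	u32 iter_cs_id = iter_reg->iter.rcu_cs_id;	 /* recorded at iter_new() */
	u32 cur_cs_id = env->cur_state->active_rcu_lock; /* 0 outside any RCU CS */

	if (iter_cs_id != cur_cs_id)
		return -EINVAL;	/* iter escaped the RCU CS it was created in */
	return 0;
}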
Andrii Nakryiko Sept. 14, 2023, 5:16 p.m. UTC | #8
On Tue, Sep 12, 2023 at 3:21 PM Kumar Kartikeya Dwivedi
<memxor@gmail.com> wrote:
>
> On Wed, 13 Sept 2023 at 00:12, Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> > [...cut...]
> > What if we just remember inside bpf_reg_state.iter state whether
> > iterator needs to be RCU protected (it's just one bit if we don't
> > allow nesting rcu_read_lock()/rcu_read_unlock(), or we'd need to
> > remember RCU nestedness level), and then when validating iter_next and
> > iter_destroy() kfuncs, check that we are still in RCU-protected region
> > (if we have nestedness, then iter->rcu_nest_level <=
> > cur_rcu_nest_level, if I understand correctly). And if not, provide a
> > clear and nice message.
> >
> > That seems straightforward enough, but am I missing anything subtle?
> >
>
> We also need to ensure one does not do a bpf_rcu_read_unlock and
> bpf_rcu_read_lock again between the iter_new and
> iter_next/iter_destroy calls. Simply checking that we are in an
> RCU-protected region will pass the verifier in such a case.

Yep, you are right, what I proposed is too naive, of course.

>
> A simple solution might be associating an ID with the RCU CS, i.e. make
> active_rcu_lock a 32-bit ID which is monotonically increasing for each
> new RCU region. Of course, all of this only matters for sleepable
> programs. Then check that the ID recorded in the iter state is the same
> on next and destroy.

Yep, I think each RCU region should ideally be tracked separately and
get a unique ID. Kind of like a ref. It is some lifetime/scope, not
necessarily an actual kernel object. And if/when we have it, we can
grab the ID of the most nested RCU scope, associate it with the
RCU-protected iter, and then make sure that this RCU scope is active
at every next/destroy invocation.

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2a6e9b99564b..cfbd527e3733 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7199,4 +7199,8 @@  struct bpf_iter_css_task {
 	__u64 __opaque[1];
 } __attribute__((aligned(8)));
 
+struct bpf_iter_process {
+	__u64 __opaque[1];
+} __attribute__((aligned(8)));
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index cf113ad24837..81a2005edc26 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2458,6 +2458,9 @@  BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW)
 BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_process_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_process_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_process_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_dynptr_adjust)
 BTF_ID_FLAGS(func, bpf_dynptr_is_null)
 BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index b1bdba40b684..a6717a76c1e0 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -862,6 +862,37 @@  __bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
 	kfree(kit->css_it);
 }
 
+struct bpf_iter_process_kern {
+	struct task_struct *tsk;
+} __attribute__((aligned(8)));
+
+__bpf_kfunc int bpf_iter_process_new(struct bpf_iter_process *it)
+{
+	struct bpf_iter_process_kern *kit = (void *)it;
+
+	BUILD_BUG_ON(sizeof(struct bpf_iter_process_kern) != sizeof(struct bpf_iter_process));
+	BUILD_BUG_ON(__alignof__(struct bpf_iter_process_kern) !=
+					__alignof__(struct bpf_iter_process));
+
+	rcu_read_lock();
+	kit->tsk = &init_task;
+	return 0;
+}
+
+__bpf_kfunc struct task_struct *bpf_iter_process_next(struct bpf_iter_process *it)
+{
+	struct bpf_iter_process_kern *kit = (void *)it;
+
+	kit->tsk = next_task(kit->tsk);
+
+	return kit->tsk == &init_task ? NULL : kit->tsk;
+}
+
+__bpf_kfunc void bpf_iter_process_destroy(struct bpf_iter_process *it)
+{
+	rcu_read_unlock();
+}
+
 DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
 
 static void do_mmap_read_unlock(struct irq_work *entry)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2a6e9b99564b..cfbd527e3733 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7199,4 +7199,8 @@  struct bpf_iter_css_task {
 	__u64 __opaque[1];
 } __attribute__((aligned(8)));
 
+struct bpf_iter_process {
+	__u64 __opaque[1];
+} __attribute__((aligned(8)));
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
index f4d74b2aaddd..7d6a828d98b5 100644
--- a/tools/lib/bpf/bpf_helpers.h
+++ b/tools/lib/bpf/bpf_helpers.h
@@ -309,6 +309,11 @@  extern int bpf_iter_css_task_new(struct bpf_iter_css_task *it,
 extern struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it) __weak __ksym;
 extern void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it) __weak __ksym;
 
+struct bpf_iter_process;
+extern int bpf_iter_process_new(struct bpf_iter_process *it) __weak __ksym;
+extern struct task_struct *bpf_iter_process_next(struct bpf_iter_process *it) __weak __ksym;
+extern void bpf_iter_process_destroy(struct bpf_iter_process *it) __weak __ksym;
+
 #ifndef bpf_for_each
 /* bpf_for_each(iter_type, cur_elem, args...) provides generic construct for
  * using BPF open-coded iterators without having to write mundane explicit