Message ID | 20230814172825.1363378-1-yonghong.song@linux.dev
---|---
State | Changes Requested
Delegated to | BPF
Series | Add support for local percpu kptr
On Mon, Aug 14, 2023 at 10:28:25AM -0700, Yonghong Song wrote:
> @@ -4997,13 +4997,20 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
>     if (kptr_field->type == BPF_KPTR_UNREF)
>         perm_flags |= PTR_UNTRUSTED;
>
> +   if (kptr_field->type == BPF_KPTR_PERCPU_REF)
> +       perm_flags |= MEM_PERCPU | MEM_ALLOC;

this bit doesn't look right and ...

> +
>     if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
>         goto bad_type;
>
> -   if (!btf_is_kernel(reg->btf)) {
> +   if (kptr_field->type != BPF_KPTR_PERCPU_REF && !btf_is_kernel(reg->btf)) {
>         verbose(env, "R%d must point to kernel BTF\n", regno);
>         return -EINVAL;
>     }
> +   if (kptr_field->type == BPF_KPTR_PERCPU_REF && btf_is_kernel(reg->btf)) {
> +       verbose(env, "R%d must point to prog BTF\n", regno);
> +       return -EINVAL;
> +   }

.. here it really doesn't look right.
The map_kptr_match_type() should have been used for kptrs pointing to kernel objects only.
But you're calling it for MEM_ALLOC object with prog's BTF...

> +   case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC:
> +       if (meta->func_id != BPF_FUNC_kptr_xchg) {
> +           verbose(env, "verifier internal error: unimplemented handling of MEM_PERCPU | MEM_ALLOC\n");
> +           return -EFAULT;
> +       }

this part should be handling it, but ...

> +       if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
> +           return -EACCES;

why call this here?

Existing:
    case PTR_TO_BTF_ID | MEM_ALLOC:
        if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
            meta->func_id != BPF_FUNC_kptr_xchg) {
            verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
            return -EFAULT;
        }
doesn't call map_kptr_match_type().
Where do we check that btf of arg1 and arg2 matches for kptr_xchg of MEM_ALLOC objs? Do we have a bug?

Yep. We do :(

diff --git a/tools/testing/selftests/bpf/progs/local_kptr_stash.c b/tools/testing/selftests/bpf/progs/local_kptr_stash.c
index 06838083079c..a6f546f4da9a 100644
--- a/tools/testing/selftests/bpf/progs/local_kptr_stash.c
+++ b/tools/testing/selftests/bpf/progs/local_kptr_stash.c
@@ -14,10 +14,12 @@ struct node_data {
 	struct bpf_rb_node node;
 };
 
+struct node_data2 { long foo[4];};
+
 struct map_value {
 	struct prog_test_ref_kfunc *not_kptr;
 	struct prog_test_ref_kfunc __kptr *val;
-	struct node_data __kptr *node;
+	struct node_data2 __kptr *node;
 };
 
 /* This is necessary so that LLVM generates BTF for node_data struct
@@ -32,6 +34,7 @@ struct map_value {
  * Had to do the same w/ bpf_kfunc_call_test_release below
  */
 struct node_data *just_here_because_btf_bug;
+struct node_data2 *just_here_because_btf_bug2;

passes the verifier and runs into kernel WARN_ONCE.

Let's fix this issue first before proceeding with this series.
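For readers following along, the program shape that slips through is essentially the existing stash pattern with the field retyped. The sketch below is a distilled, hypothetical reconstruction (map name, section and surrounding fields are illustrative, not the actual selftest source): with the hunk above applied, the verifier accepts storing a node_data into a field whose BTF says node_data2, and the mismatch only surfaces later, when the map frees the stashed object against the wrong (here: missing) btf_record, which is where the reported WARN_ONCE comes from.

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_experimental.h"

struct node_data {
    long key;
    long data;
    struct bpf_rb_node node;
};

struct node_data2 {
    long foo[4];
};

struct map_value {
    struct node_data2 __kptr *node;     /* field BTF: node_data2 ... */
};

struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __type(key, int);
    __type(value, struct map_value);
    __uint(max_entries, 1);
} some_nodes SEC(".maps");

SEC("tc")
int stash_wrong_local_type(void *ctx)
{
    struct map_value *mapval;
    struct node_data *res;              /* ... but the object stored is node_data */
    int idx = 0;

    res = bpf_obj_new(typeof(*res));
    if (!res)
        return 1;

    mapval = bpf_map_lookup_elem(&some_nodes, &idx);
    if (!mapval) {
        bpf_obj_drop(res);
        return 1;
    }

    /* No BTF identity check is applied to the MEM_ALLOC argument of
     * bpf_kptr_xchg(), so this store is (incorrectly) accepted as-is.
     */
    res = bpf_kptr_xchg(&mapval->node, res);
    if (res)
        bpf_obj_drop(res);
    return 0;
}

char _license[] SEC("license") = "GPL";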
On Mon, 14 Aug 2023 at 22:59, Yonghong Song <yonghong.song@linux.dev> wrote: > > Add two new kfunc's, bpf_percpu_obj_new_impl() and > bpf_percpu_obj_drop_impl(), to allocate a percpu obj. > Two functions are very similar to bpf_obj_new_impl() > and bpf_obj_drop_impl(). The major difference is related > to percpu handling. > > bpf_rcu_read_lock() > struct val_t __percpu *v = map_val->percpu_data; > ... > bpf_rcu_read_unlock() > > For a percpu data map_val like above 'v', the reg->type > is set as > PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU > if inside rcu critical section. > > MEM_RCU marking here is similar to NON_OWN_REF as 'v' > is not a owning referenace. But NON_OWN_REF is typo: reference > trusted and typically inside the spinlock while > MEM_RCU is under rcu read lock. RCU is preferred here > since percpu data structures mean potential concurrent > access into its contents. > > Also, bpf_percpu_obj_new_impl() is restricted to only accept > scalar struct which means nested kptr's are not allowed > but some other special field, e.g., bpf_list_head, bpf_spin_lock, etc. > could be nested (nested 'struct'). Later patch will improve verifier to > handle such nested special fields. > > Signed-off-by: Yonghong Song <yonghong.song@linux.dev> > --- > include/linux/bpf.h | 3 +- > kernel/bpf/helpers.c | 49 +++++++++++++++++++++++ > kernel/bpf/syscall.c | 21 +++++++--- > kernel/bpf/verifier.c | 90 ++++++++++++++++++++++++++++++++++--------- > 4 files changed, 137 insertions(+), 26 deletions(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index e6348fd0a785..a2cb380c43c7 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -197,7 +197,8 @@ struct btf_field_kptr { > struct btf *btf; > struct module *module; > /* dtor used if btf_is_kernel(btf), otherwise the type is > - * program-allocated, dtor is NULL, and __bpf_obj_drop_impl is used > + * program-allocated, dtor is NULL, and __bpf_obj_drop_impl > + * or __bpf_percpu_drop_impl is used > */ > btf_dtor_kfunc_t dtor; > u32 btf_id; > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c > index eb91cae0612a..dd14cb7da4af 100644 > --- a/kernel/bpf/helpers.c > +++ b/kernel/bpf/helpers.c > @@ -1900,6 +1900,29 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) > return p; > } > > +__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign) > +{ > + struct btf_struct_meta *meta = meta__ign; > + const struct btf_record *rec; > + u64 size = local_type_id__k; > + void __percpu *pptr; > + void *p; > + int cpu; > + > + p = bpf_mem_alloc(&bpf_global_percpu_ma, size); > + if (!p) > + return NULL; > + if (meta) { > + pptr = *((void __percpu **)p); > + rec = meta->record; > + for_each_possible_cpu(cpu) { > + bpf_obj_init(rec, per_cpu_ptr(pptr, cpu)); > + } > + } > + > + return p; > +} > + > /* Must be called under migrate_disable(), as required by bpf_mem_free */ > void __bpf_obj_drop_impl(void *p, const struct btf_record *rec) > { > @@ -1924,6 +1947,30 @@ __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) > __bpf_obj_drop_impl(p, meta ? meta->record : NULL); > } > > +/* Must be called under migrate_disable(), as required by bpf_mem_free_rcu */ > +void __bpf_percpu_obj_drop_impl(void *p, const struct btf_record *rec) > +{ > + void __percpu *pptr; > + int cpu; > + > + if (rec) { > + pptr = *((void __percpu **)p); > + for_each_possible_cpu(cpu) { > + bpf_obj_free_fields(rec, per_cpu_ptr(pptr, cpu)); Should this loop be done after we have waited for the RCU grace period? 
Otherwise any other CPU can reinitialize a field after this is done, move objects into lists/rbtree, and leak memory. Please correct me if I'm mistaken. > + } > + } > + > + bpf_mem_free_rcu(&bpf_global_percpu_ma, p); > +} > + > +__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign) > +{ > + struct btf_struct_meta *meta = meta__ign; > + void *p = p__alloc; > + > + __bpf_percpu_obj_drop_impl(p, meta ? meta->record : NULL); > +} > + > __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign) > { > struct btf_struct_meta *meta = meta__ign; > @@ -2436,7 +2483,9 @@ BTF_SET8_START(generic_btf_ids) > BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) > #endif > BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) > +BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) > BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) > +BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE) > BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL) > BTF_ID_FLAGS(func, bpf_list_push_front_impl) > BTF_ID_FLAGS(func, bpf_list_push_back_impl) > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index 1c30b6ee84d4..9ceb6fd9a0e2 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -627,6 +627,7 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj) > } > > extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); > +extern void __bpf_percpu_obj_drop_impl(void *p, const struct btf_record *rec); > > void bpf_obj_free_fields(const struct btf_record *rec, void *obj) > { > @@ -660,13 +661,21 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) > if (!btf_is_kernel(field->kptr.btf)) { > pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, > field->kptr.btf_id); > - if (field->type != BPF_KPTR_PERCPU_REF) > + > + if (field->type == BPF_KPTR_PERCPU_REF) { > + migrate_disable(); > + __bpf_percpu_obj_drop_impl(xchgd_field, pointee_struct_meta ? > + pointee_struct_meta->record : > + NULL); > + migrate_enable(); > + } else { > WARN_ON_ONCE(!pointee_struct_meta); > - migrate_disable(); > - __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? > - pointee_struct_meta->record : > - NULL); > - migrate_enable(); > + migrate_disable(); > + __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? > + pointee_struct_meta->record : > + NULL); > + migrate_enable(); > + } > } else { > field->kptr.dtor(xchgd_field); > } > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c > index 4ccca1f6c998..a985fbf18a11 100644 > --- a/kernel/bpf/verifier.c > +++ b/kernel/bpf/verifier.c > @@ -304,7 +304,7 @@ struct bpf_kfunc_call_arg_meta { > /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling, > * generally to pass info about user-defined local kptr types to later > * verification logic > - * bpf_obj_drop > + * bpf_obj_drop/bpf_percpu_obj_drop > * Record the local kptr type to be drop'd > * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type) > * Record the local kptr type to be refcount_incr'd and use > @@ -4997,13 +4997,20 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, > if (kptr_field->type == BPF_KPTR_UNREF) > perm_flags |= PTR_UNTRUSTED; > > + if (kptr_field->type == BPF_KPTR_PERCPU_REF) > + perm_flags |= MEM_PERCPU | MEM_ALLOC; > + I think just this would permit PTR_TO_BTF_ID | MEM_ALLOC for percpu kptr? It would probably be good to include negative selftests for kptr_xchg type matching with percpu_kptr to prevent things like these. 
Alexei already said map_kptr_match_type is not being invoked for MEM_ALLOC kptr_xchg, so that is also an existing bug. > if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) > goto bad_type; > > [...] > /* We need to verify reg->type and reg->btf, before accessing reg->btf */ > reg_name = btf_type_name(reg->btf, reg->btf_id); > > @@ -5084,7 +5091,17 @@ static bool rcu_safe_kptr(const struct btf_field *field) > { > const struct btf_field_kptr *kptr = &field->kptr; > > - return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id); > + return field->type == BPF_KPTR_PERCPU_REF || > + (field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id)); > +} > + > +static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field) > +{ > + if (!rcu_safe_kptr(kptr_field) || !in_rcu_cs(env)) > + return PTR_MAYBE_NULL | PTR_UNTRUSTED; > + if (kptr_field->type != BPF_KPTR_PERCPU_REF) > + return PTR_MAYBE_NULL | MEM_RCU; > + return PTR_MAYBE_NULL | MEM_RCU | MEM_PERCPU; The inverted conditions are a bit hard to follow. Maybe better to explicitly check for both RCU cases, and default to untrusted otherwise? > } > > [...] >
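A negative selftest of the kind Kumar suggests could look roughly like the following. This is only a sketch: the __percpu_kptr field tag and the bpf_percpu_obj_new()/bpf_percpu_obj_drop() wrappers are assumptions about how later patches in the series expose the new kfuncs, and no specific verifier message is asserted because the exact wording depends on the eventual type-match fix.

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_misc.h"
#include "bpf_experimental.h"

#ifndef __percpu_kptr
#define __percpu_kptr __attribute__((btf_type_tag("percpu_kptr"))) /* assumed tag name */
#endif

struct val_t {
    long cnt;
};

struct other_t {
    long a;
    long b;
};

struct map_value {
    struct val_t __percpu_kptr *pc;
};

struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __type(key, int);
    __type(value, struct map_value);
    __uint(max_entries, 1);
} amap SEC(".maps");

/* Expect rejection: the stored object's BTF (other_t) does not match the
 * destination percpu kptr field's BTF (val_t).
 */
SEC("?tc")
__failure
int percpu_kptr_xchg_wrong_type(void *ctx)
{
    struct map_value *v;
    struct other_t *p;
    int key = 0;

    v = bpf_map_lookup_elem(&amap, &key);
    if (!v)
        return 0;

    p = bpf_percpu_obj_new(struct other_t);  /* assumed wrapper around bpf_percpu_obj_new_impl() */
    if (!p)
        return 0;

    p = bpf_kptr_xchg(&v->pc, p);
    if (p)
        bpf_percpu_obj_drop(p);              /* assumed wrapper around bpf_percpu_obj_drop_impl() */
    return 0;
}

char _license[] SEC("license") = "GPL";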
On 8/18/23 5:29 PM, Alexei Starovoitov wrote: > On Mon, Aug 14, 2023 at 10:28:25AM -0700, Yonghong Song wrote: >> @@ -4997,13 +4997,20 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, >> if (kptr_field->type == BPF_KPTR_UNREF) >> perm_flags |= PTR_UNTRUSTED; >> >> + if (kptr_field->type == BPF_KPTR_PERCPU_REF) >> + perm_flags |= MEM_PERCPU | MEM_ALLOC; > > this bit doesn't look right and ... > >> + >> if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) >> goto bad_type; >> >> - if (!btf_is_kernel(reg->btf)) { >> + if (kptr_field->type != BPF_KPTR_PERCPU_REF && !btf_is_kernel(reg->btf)) { >> verbose(env, "R%d must point to kernel BTF\n", regno); >> return -EINVAL; >> } >> + if (kptr_field->type == BPF_KPTR_PERCPU_REF && btf_is_kernel(reg->btf)) { >> + verbose(env, "R%d must point to prog BTF\n", regno); >> + return -EINVAL; >> + } > > .. here it really doesn't look right. > The map_kptr_match_type() should have been used for kptrs pointing to kernel objects only. > But you're calling it for MEM_ALLOC object with prog's BTF... > >> + case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC: >> + if (meta->func_id != BPF_FUNC_kptr_xchg) { >> + verbose(env, "verifier internal error: unimplemented handling of MEM_PERCPU | MEM_ALLOC\n"); >> + return -EFAULT; >> + } > > this part should be handling it, but ... > >> + if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) >> + return -EACCES; > > why call this here? > > Existing: > case PTR_TO_BTF_ID | MEM_ALLOC: > if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock && > meta->func_id != BPF_FUNC_kptr_xchg) { > verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n"); > return -EFAULT; > } > doesn't call map_kptr_match_type(). > Where do we check that btf of arg1 and arg2 matches for kptr_xchg of MEM_ALLOC objs? Do we have a bug? > > Yep. We do :( > > diff --git a/tools/testing/selftests/bpf/progs/local_kptr_stash.c b/tools/testing/selftests/bpf/progs/local_kptr_stash.c > index 06838083079c..a6f546f4da9a 100644 > --- a/tools/testing/selftests/bpf/progs/local_kptr_stash.c > +++ b/tools/testing/selftests/bpf/progs/local_kptr_stash.c > @@ -14,10 +14,12 @@ struct node_data { > struct bpf_rb_node node; > }; > > +struct node_data2 { long foo[4];}; > + > struct map_value { > struct prog_test_ref_kfunc *not_kptr; > struct prog_test_ref_kfunc __kptr *val; > - struct node_data __kptr *node; > + struct node_data2 __kptr *node; > }; > > /* This is necessary so that LLVM generates BTF for node_data struct > @@ -32,6 +34,7 @@ struct map_value { > * Had to do the same w/ bpf_kfunc_call_test_release below > */ > struct node_data *just_here_because_btf_bug; > +struct node_data2 *just_here_because_btf_bug2; > > passes the verifier and runs into kernel WARN_ONCE. > > Let's fix this issue first before proceeding with this series. Sounds good. I will investigate and fix this issue before sending out v2.
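To make the gap concrete: for the MEM_ALLOC argument of bpf_kptr_xchg(), the verifier needs to compare the argument register's program-local BTF and type id against the ones recorded for the destination kptr field. Below is a minimal, hypothetical sketch of such a check; it is illustrative only and not necessarily the shape of the fix that lands upstream, and the verbose() message is made up for the sketch.

/* Hypothetical helper; the kptr_xchg() argument path for program-allocated
 * (MEM_ALLOC) objects could call this with meta->kptr_field and the arg2
 * register state.
 */
static int check_local_kptr_xchg_type(struct bpf_verifier_env *env,
                                      const struct btf_field *kptr_field,
                                      const struct bpf_reg_state *reg, u32 regno)
{
    /* The destination field records the local BTF and type id of the kptr;
     * the object being stored must carry exactly the same identity,
     * otherwise a node_data can be stashed into a node_data2 field as in
     * the selftest hunk above.
     */
    if (reg->btf != kptr_field->kptr.btf || reg->btf_id != kptr_field->kptr.btf_id) {
        verbose(env, "R%d local kptr type mismatch\n", regno);
        return -EACCES;
    }
    return 0;
}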
On 8/18/23 6:24 PM, Kumar Kartikeya Dwivedi wrote: > On Mon, 14 Aug 2023 at 22:59, Yonghong Song <yonghong.song@linux.dev> wrote: >> >> Add two new kfunc's, bpf_percpu_obj_new_impl() and >> bpf_percpu_obj_drop_impl(), to allocate a percpu obj. >> Two functions are very similar to bpf_obj_new_impl() >> and bpf_obj_drop_impl(). The major difference is related >> to percpu handling. >> >> bpf_rcu_read_lock() >> struct val_t __percpu *v = map_val->percpu_data; >> ... >> bpf_rcu_read_unlock() >> >> For a percpu data map_val like above 'v', the reg->type >> is set as >> PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU >> if inside rcu critical section. >> >> MEM_RCU marking here is similar to NON_OWN_REF as 'v' >> is not a owning referenace. But NON_OWN_REF is > > typo: reference Ack. > >> trusted and typically inside the spinlock while >> MEM_RCU is under rcu read lock. RCU is preferred here >> since percpu data structures mean potential concurrent >> access into its contents. >> >> Also, bpf_percpu_obj_new_impl() is restricted to only accept >> scalar struct which means nested kptr's are not allowed >> but some other special field, e.g., bpf_list_head, bpf_spin_lock, etc. >> could be nested (nested 'struct'). Later patch will improve verifier to >> handle such nested special fields. >> >> Signed-off-by: Yonghong Song <yonghong.song@linux.dev> >> --- >> include/linux/bpf.h | 3 +- >> kernel/bpf/helpers.c | 49 +++++++++++++++++++++++ >> kernel/bpf/syscall.c | 21 +++++++--- >> kernel/bpf/verifier.c | 90 ++++++++++++++++++++++++++++++++++--------- >> 4 files changed, 137 insertions(+), 26 deletions(-) >> >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h >> index e6348fd0a785..a2cb380c43c7 100644 >> --- a/include/linux/bpf.h >> +++ b/include/linux/bpf.h >> @@ -197,7 +197,8 @@ struct btf_field_kptr { >> struct btf *btf; >> struct module *module; >> /* dtor used if btf_is_kernel(btf), otherwise the type is >> - * program-allocated, dtor is NULL, and __bpf_obj_drop_impl is used >> + * program-allocated, dtor is NULL, and __bpf_obj_drop_impl >> + * or __bpf_percpu_drop_impl is used >> */ >> btf_dtor_kfunc_t dtor; >> u32 btf_id; >> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c >> index eb91cae0612a..dd14cb7da4af 100644 >> --- a/kernel/bpf/helpers.c >> +++ b/kernel/bpf/helpers.c >> @@ -1900,6 +1900,29 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) >> return p; >> } >> >> +__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign) >> +{ >> + struct btf_struct_meta *meta = meta__ign; >> + const struct btf_record *rec; >> + u64 size = local_type_id__k; >> + void __percpu *pptr; >> + void *p; >> + int cpu; >> + >> + p = bpf_mem_alloc(&bpf_global_percpu_ma, size); >> + if (!p) >> + return NULL; >> + if (meta) { >> + pptr = *((void __percpu **)p); >> + rec = meta->record; >> + for_each_possible_cpu(cpu) { >> + bpf_obj_init(rec, per_cpu_ptr(pptr, cpu)); >> + } >> + } >> + >> + return p; >> +} >> + >> /* Must be called under migrate_disable(), as required by bpf_mem_free */ >> void __bpf_obj_drop_impl(void *p, const struct btf_record *rec) >> { >> @@ -1924,6 +1947,30 @@ __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) >> __bpf_obj_drop_impl(p, meta ? 
meta->record : NULL); >> } >> >> +/* Must be called under migrate_disable(), as required by bpf_mem_free_rcu */ >> +void __bpf_percpu_obj_drop_impl(void *p, const struct btf_record *rec) >> +{ >> + void __percpu *pptr; >> + int cpu; >> + >> + if (rec) { >> + pptr = *((void __percpu **)p); >> + for_each_possible_cpu(cpu) { >> + bpf_obj_free_fields(rec, per_cpu_ptr(pptr, cpu)); > > Should this loop be done after we have waited for the RCU grace period? > Otherwise any other CPU can reinitialize a field after this is done, > move objects into lists/rbtree, and leak memory. > Please correct me if I'm mistaken. Thanks for spotting this. I think you are correct. The above scenario is indeed possible. one cpu takes a direct reference of __percpu_kptr and do a bunch of stuff, and the other cpu is doing a bpf_kptr_xchg to get the __percpu_kptr and drops it. We should really drop the __percpu_kptr itself and the fields in its record after a rcu grace period so the exist direct reference operation won't be affected. Will fix it in the v2. > >> + } >> + } >> + >> + bpf_mem_free_rcu(&bpf_global_percpu_ma, p); >> +} >> + >> +__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign) >> +{ >> + struct btf_struct_meta *meta = meta__ign; >> + void *p = p__alloc; >> + >> + __bpf_percpu_obj_drop_impl(p, meta ? meta->record : NULL); >> +} >> + >> __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign) >> { >> struct btf_struct_meta *meta = meta__ign; >> @@ -2436,7 +2483,9 @@ BTF_SET8_START(generic_btf_ids) >> BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) >> #endif >> BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) >> +BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) >> BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) >> +BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE) >> BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL) >> BTF_ID_FLAGS(func, bpf_list_push_front_impl) >> BTF_ID_FLAGS(func, bpf_list_push_back_impl) >> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c >> index 1c30b6ee84d4..9ceb6fd9a0e2 100644 >> --- a/kernel/bpf/syscall.c >> +++ b/kernel/bpf/syscall.c >> @@ -627,6 +627,7 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj) >> } >> >> extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); >> +extern void __bpf_percpu_obj_drop_impl(void *p, const struct btf_record *rec); >> >> void bpf_obj_free_fields(const struct btf_record *rec, void *obj) >> { >> @@ -660,13 +661,21 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) >> if (!btf_is_kernel(field->kptr.btf)) { >> pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, >> field->kptr.btf_id); >> - if (field->type != BPF_KPTR_PERCPU_REF) >> + >> + if (field->type == BPF_KPTR_PERCPU_REF) { >> + migrate_disable(); >> + __bpf_percpu_obj_drop_impl(xchgd_field, pointee_struct_meta ? >> + pointee_struct_meta->record : >> + NULL); >> + migrate_enable(); >> + } else { >> WARN_ON_ONCE(!pointee_struct_meta); >> - migrate_disable(); >> - __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? >> - pointee_struct_meta->record : >> - NULL); >> - migrate_enable(); >> + migrate_disable(); >> + __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? 
>> + pointee_struct_meta->record : >> + NULL); >> + migrate_enable(); >> + } >> } else { >> field->kptr.dtor(xchgd_field); >> } >> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c >> index 4ccca1f6c998..a985fbf18a11 100644 >> --- a/kernel/bpf/verifier.c >> +++ b/kernel/bpf/verifier.c >> @@ -304,7 +304,7 @@ struct bpf_kfunc_call_arg_meta { >> /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling, >> * generally to pass info about user-defined local kptr types to later >> * verification logic >> - * bpf_obj_drop >> + * bpf_obj_drop/bpf_percpu_obj_drop >> * Record the local kptr type to be drop'd >> * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type) >> * Record the local kptr type to be refcount_incr'd and use >> @@ -4997,13 +4997,20 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, >> if (kptr_field->type == BPF_KPTR_UNREF) >> perm_flags |= PTR_UNTRUSTED; >> >> + if (kptr_field->type == BPF_KPTR_PERCPU_REF) >> + perm_flags |= MEM_PERCPU | MEM_ALLOC; >> + > > I think just this would permit PTR_TO_BTF_ID | MEM_ALLOC for percpu kptr? > It would probably be good to include negative selftests for kptr_xchg > type matching with percpu_kptr to prevent things like these. > > Alexei already said map_kptr_match_type is not being invoked for > MEM_ALLOC kptr_xchg, so that is also an existing bug. I will fix that bug first and this part of change probably not needed any more. > >> if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) >> goto bad_type; >> >> [...] >> /* We need to verify reg->type and reg->btf, before accessing reg->btf */ >> reg_name = btf_type_name(reg->btf, reg->btf_id); >> >> @@ -5084,7 +5091,17 @@ static bool rcu_safe_kptr(const struct btf_field *field) >> { >> const struct btf_field_kptr *kptr = &field->kptr; >> >> - return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id); >> + return field->type == BPF_KPTR_PERCPU_REF || >> + (field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id)); >> +} >> + >> +static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field) >> +{ >> + if (!rcu_safe_kptr(kptr_field) || !in_rcu_cs(env)) >> + return PTR_MAYBE_NULL | PTR_UNTRUSTED; >> + if (kptr_field->type != BPF_KPTR_PERCPU_REF) >> + return PTR_MAYBE_NULL | MEM_RCU; >> + return PTR_MAYBE_NULL | MEM_RCU | MEM_PERCPU; > > The inverted conditions are a bit hard to follow. Maybe better to > explicitly check for both RCU cases, and default to untrusted > otherwise? Okay. Will do. > >> } >> >> [...] >>
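For reference, the restructuring Kumar asks for could read roughly as follows, with the two RCU-trusted cases spelled out and untrusted as the fallback. This is a sketch with the same logic as the patch above; v2 may phrase it differently.

static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field)
{
    if (rcu_safe_kptr(kptr_field) && in_rcu_cs(env)) {
        /* local percpu kptr: trusted via RCU, and marked per-CPU */
        if (kptr_field->type == BPF_KPTR_PERCPU_REF)
            return PTR_MAYBE_NULL | MEM_RCU | MEM_PERCPU;
        /* RCU-protected kernel kptr */
        return PTR_MAYBE_NULL | MEM_RCU;
    }
    /* everything else may only be loaded as an untrusted pointer */
    return PTR_MAYBE_NULL | PTR_UNTRUSTED;
}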
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e6348fd0a785..a2cb380c43c7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -197,7 +197,8 @@ struct btf_field_kptr { struct btf *btf; struct module *module; /* dtor used if btf_is_kernel(btf), otherwise the type is - * program-allocated, dtor is NULL, and __bpf_obj_drop_impl is used + * program-allocated, dtor is NULL, and __bpf_obj_drop_impl + * or __bpf_percpu_drop_impl is used */ btf_dtor_kfunc_t dtor; u32 btf_id; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index eb91cae0612a..dd14cb7da4af 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1900,6 +1900,29 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) return p; } +__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign) +{ + struct btf_struct_meta *meta = meta__ign; + const struct btf_record *rec; + u64 size = local_type_id__k; + void __percpu *pptr; + void *p; + int cpu; + + p = bpf_mem_alloc(&bpf_global_percpu_ma, size); + if (!p) + return NULL; + if (meta) { + pptr = *((void __percpu **)p); + rec = meta->record; + for_each_possible_cpu(cpu) { + bpf_obj_init(rec, per_cpu_ptr(pptr, cpu)); + } + } + + return p; +} + /* Must be called under migrate_disable(), as required by bpf_mem_free */ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec) { @@ -1924,6 +1947,30 @@ __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) __bpf_obj_drop_impl(p, meta ? meta->record : NULL); } +/* Must be called under migrate_disable(), as required by bpf_mem_free_rcu */ +void __bpf_percpu_obj_drop_impl(void *p, const struct btf_record *rec) +{ + void __percpu *pptr; + int cpu; + + if (rec) { + pptr = *((void __percpu **)p); + for_each_possible_cpu(cpu) { + bpf_obj_free_fields(rec, per_cpu_ptr(pptr, cpu)); + } + } + + bpf_mem_free_rcu(&bpf_global_percpu_ma, p); +} + +__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign) +{ + struct btf_struct_meta *meta = meta__ign; + void *p = p__alloc; + + __bpf_percpu_obj_drop_impl(p, meta ? 
meta->record : NULL); +} + __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign) { struct btf_struct_meta *meta = meta__ign; @@ -2436,7 +2483,9 @@ BTF_SET8_START(generic_btf_ids) BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE) BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back_impl) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1c30b6ee84d4..9ceb6fd9a0e2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -627,6 +627,7 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj) } extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); +extern void __bpf_percpu_obj_drop_impl(void *p, const struct btf_record *rec); void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { @@ -660,13 +661,21 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) if (!btf_is_kernel(field->kptr.btf)) { pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, field->kptr.btf_id); - if (field->type != BPF_KPTR_PERCPU_REF) + + if (field->type == BPF_KPTR_PERCPU_REF) { + migrate_disable(); + __bpf_percpu_obj_drop_impl(xchgd_field, pointee_struct_meta ? + pointee_struct_meta->record : + NULL); + migrate_enable(); + } else { WARN_ON_ONCE(!pointee_struct_meta); - migrate_disable(); - __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? - pointee_struct_meta->record : - NULL); - migrate_enable(); + migrate_disable(); + __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? 
+ pointee_struct_meta->record : + NULL); + migrate_enable(); + } } else { field->kptr.dtor(xchgd_field); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4ccca1f6c998..a985fbf18a11 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -304,7 +304,7 @@ struct bpf_kfunc_call_arg_meta { /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling, * generally to pass info about user-defined local kptr types to later * verification logic - * bpf_obj_drop + * bpf_obj_drop/bpf_percpu_obj_drop * Record the local kptr type to be drop'd * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type) * Record the local kptr type to be refcount_incr'd and use @@ -4997,13 +4997,20 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, if (kptr_field->type == BPF_KPTR_UNREF) perm_flags |= PTR_UNTRUSTED; + if (kptr_field->type == BPF_KPTR_PERCPU_REF) + perm_flags |= MEM_PERCPU | MEM_ALLOC; + if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) goto bad_type; - if (!btf_is_kernel(reg->btf)) { + if (kptr_field->type != BPF_KPTR_PERCPU_REF && !btf_is_kernel(reg->btf)) { verbose(env, "R%d must point to kernel BTF\n", regno); return -EINVAL; } + if (kptr_field->type == BPF_KPTR_PERCPU_REF && btf_is_kernel(reg->btf)) { + verbose(env, "R%d must point to prog BTF\n", regno); + return -EINVAL; + } /* We need to verify reg->type and reg->btf, before accessing reg->btf */ reg_name = btf_type_name(reg->btf, reg->btf_id); @@ -5084,7 +5091,17 @@ static bool rcu_safe_kptr(const struct btf_field *field) { const struct btf_field_kptr *kptr = &field->kptr; - return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id); + return field->type == BPF_KPTR_PERCPU_REF || + (field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id)); +} + +static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field) +{ + if (!rcu_safe_kptr(kptr_field) || !in_rcu_cs(env)) + return PTR_MAYBE_NULL | PTR_UNTRUSTED; + if (kptr_field->type != BPF_KPTR_PERCPU_REF) + return PTR_MAYBE_NULL | MEM_RCU; + return PTR_MAYBE_NULL | MEM_RCU | MEM_PERCPU; } static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, @@ -5110,7 +5127,8 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, /* We only allow loading referenced kptr, since it will be marked as * untrusted, similar to unreferenced kptr. */ - if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) { + if (class != BPF_LDX && + (kptr_field->type == BPF_KPTR_REF || kptr_field->type == BPF_KPTR_PERCPU_REF)) { verbose(env, "store to referenced kptr disallowed\n"); return -EACCES; } @@ -5121,10 +5139,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, * value from map as PTR_TO_BTF_ID, with the correct type. */ mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, - kptr_field->kptr.btf_id, - rcu_safe_kptr(kptr_field) && in_rcu_cs(env) ? 
- PTR_MAYBE_NULL | MEM_RCU : - PTR_MAYBE_NULL | PTR_UNTRUSTED); + kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field)); /* For mark_ptr_or_null_reg */ val_reg->id = ++env->id_gen; } else if (class == BPF_STX) { @@ -5178,6 +5193,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, switch (field->type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU_REF: if (src != ACCESS_DIRECT) { verbose(env, "kptr cannot be accessed indirectly by helper\n"); return -EACCES; @@ -7316,7 +7332,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, verbose(env, "off=%d doesn't point to kptr\n", kptr_off); return -EACCES; } - if (kptr_field->type != BPF_KPTR_REF) { + if (kptr_field->type != BPF_KPTR_REF && kptr_field->type != BPF_KPTR_PERCPU_REF) { verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off); return -EACCES; } @@ -7827,8 +7843,10 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, if (base_type(arg_type) == ARG_PTR_TO_MEM) type &= ~DYNPTR_TYPE_FLAG_MASK; - if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) + if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) { type &= ~MEM_ALLOC; + type &= ~MEM_PERCPU; + } for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { expected = compatible->types[i]; @@ -7918,6 +7936,14 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, } /* Handled by helper specific checks */ break; + case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC: + if (meta->func_id != BPF_FUNC_kptr_xchg) { + verbose(env, "verifier internal error: unimplemented handling of MEM_PERCPU | MEM_ALLOC\n"); + return -EFAULT; + } + if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) + return -EACCES; + break; case PTR_TO_BTF_ID | MEM_PERCPU: case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED: /* Handled by helper specific checks */ @@ -9885,8 +9911,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (func_id == BPF_FUNC_kptr_xchg) { ret_btf = meta.kptr_field->kptr.btf; ret_btf_id = meta.kptr_field->kptr.btf_id; - if (!btf_is_kernel(ret_btf)) + if (!btf_is_kernel(ret_btf)) { regs[BPF_REG_0].type |= MEM_ALLOC; + if (meta.kptr_field->type == BPF_KPTR_PERCPU_REF) + regs[BPF_REG_0].type |= MEM_PERCPU; + } } else { if (fn->ret_btf_id == BPF_PTR_POISON) { verbose(env, "verifier internal error:"); @@ -10271,6 +10300,8 @@ enum special_kfunc_type { KF_bpf_dynptr_slice, KF_bpf_dynptr_slice_rdwr, KF_bpf_dynptr_clone, + KF_bpf_percpu_obj_new_impl, + KF_bpf_percpu_obj_drop_impl, }; BTF_SET_START(special_kfunc_set) @@ -10291,6 +10322,8 @@ BTF_ID(func, bpf_dynptr_from_xdp) BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) BTF_ID(func, bpf_dynptr_clone) +BTF_ID(func, bpf_percpu_obj_new_impl) +BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_SET_END(special_kfunc_set) BTF_ID_LIST(special_kfunc_list) @@ -10313,6 +10346,8 @@ BTF_ID(func, bpf_dynptr_from_xdp) BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) BTF_ID(func, bpf_dynptr_clone) +BTF_ID(func, bpf_percpu_obj_new_impl) +BTF_ID(func, bpf_percpu_obj_drop_impl) static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -11003,7 +11038,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } break; case KF_ARG_PTR_TO_ALLOC_BTF_ID: - if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC) && + reg->type != (PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC)) { verbose(env, "arg#%d expected pointer to allocated 
object\n", i); return -EINVAL; } @@ -11012,7 +11048,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } if (meta->btf == btf_vmlinux && - meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { + (meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] || + meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl])) { meta->arg_btf = reg->btf; meta->arg_btf_id = reg->btf_id; } @@ -11410,6 +11447,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Only exception is bpf_obj_new_impl */ if (meta.btf != btf_vmlinux || (meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] && + meta.func_id != special_kfunc_list[KF_bpf_percpu_obj_new_impl] && meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) { verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); return -EINVAL; @@ -11423,11 +11461,15 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { - if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl]) { + if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] || + meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { struct btf *ret_btf; u32 ret_btf_id; - if (unlikely(!bpf_global_ma_set)) + if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) + return -ENOMEM; + + if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && !bpf_global_percpu_ma_set) return -ENOMEM; if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) { @@ -11440,13 +11482,18 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* This may be NULL due to user not supplying a BTF */ if (!ret_btf) { - verbose(env, "bpf_obj_new requires prog BTF\n"); + verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n"); return -EINVAL; } ret_t = btf_type_by_id(ret_btf, ret_btf_id); if (!ret_t || !__btf_type_is_struct(ret_t)) { - verbose(env, "bpf_obj_new type ID argument must be of a struct\n"); + verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n"); + return -EINVAL; + } + if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && + !__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) { + verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n"); return -EINVAL; } @@ -11454,6 +11501,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; regs[BPF_REG_0].btf = ret_btf; regs[BPF_REG_0].btf_id = ret_btf_id; + if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) + regs[BPF_REG_0].type |= MEM_PERCPU; insn_aux->obj_new_size = ret_t->size; insn_aux->kptr_struct_meta = @@ -11594,7 +11643,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].id = ++env->id_gen; } else if (btf_type_is_void(t)) { if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { - if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { + if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] || + meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) { insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); @@ -18266,7 +18316,8 @@ static int 
fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn->imm = BPF_CALL_IMM(desc->addr); if (insn->off) return 0; - if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) { + if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] || + desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size; @@ -18277,6 +18328,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_buf[3] = *insn; *cnt = 4; } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] || + desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] || desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
Add two new kfunc's, bpf_percpu_obj_new_impl() and
bpf_percpu_obj_drop_impl(), to allocate a percpu obj.
Two functions are very similar to bpf_obj_new_impl()
and bpf_obj_drop_impl(). The major difference is related
to percpu handling.

  bpf_rcu_read_lock()
  struct val_t __percpu *v = map_val->percpu_data;
  ...
  bpf_rcu_read_unlock()

For a percpu data map_val like above 'v', the reg->type
is set as
  PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU
if inside rcu critical section.

MEM_RCU marking here is similar to NON_OWN_REF as 'v'
is not a owning referenace. But NON_OWN_REF is
trusted and typically inside the spinlock while
MEM_RCU is under rcu read lock. RCU is preferred here
since percpu data structures mean potential concurrent
access into its contents.

Also, bpf_percpu_obj_new_impl() is restricted to only accept
scalar struct which means nested kptr's are not allowed
but some other special field, e.g., bpf_list_head, bpf_spin_lock, etc.
could be nested (nested 'struct'). Later patch will improve verifier to
handle such nested special fields.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
 include/linux/bpf.h   |  3 +-
 kernel/bpf/helpers.c  | 49 +++++++++++++++++++++++
 kernel/bpf/syscall.c  | 21 +++++++---
 kernel/bpf/verifier.c | 90 ++++++++++++++++++++++++++++++++++---------
 4 files changed, 137 insertions(+), 26 deletions(-)
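To round out the commit message's snippet, an end-to-end usage flow might look like the sketch below. The __percpu_kptr field tag, the bpf_percpu_obj_new()/bpf_percpu_obj_drop() wrappers, and per-CPU access through bpf_this_cpu_ptr() are assumptions about what the rest of the series provides on top of this patch; only the *_impl kfuncs are introduced here.

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_experimental.h"

#ifndef __percpu_kptr
#define __percpu_kptr __attribute__((btf_type_tag("percpu_kptr"))) /* assumed tag name */
#endif

void bpf_rcu_read_lock(void) __ksym;
void bpf_rcu_read_unlock(void) __ksym;

struct val_t {
    long cnt;
};

struct map_value {
    struct val_t __percpu_kptr *percpu_data;
};

struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __type(key, int);
    __type(value, struct map_value);
    __uint(max_entries, 1);
} amap SEC(".maps");

SEC("tc")
int percpu_alloc_and_use(void *ctx)
{
    struct val_t __percpu_kptr *p, *old, *v;
    struct map_value *map_val;
    int key = 0;

    map_val = bpf_map_lookup_elem(&amap, &key);
    if (!map_val)
        return 0;

    /* allocate a per-CPU object and stash it into the map value */
    p = bpf_percpu_obj_new(struct val_t);   /* assumed wrapper around bpf_percpu_obj_new_impl() */
    if (!p)
        return 0;
    old = bpf_kptr_xchg(&map_val->percpu_data, p);
    if (old)
        bpf_percpu_obj_drop(old);           /* assumed wrapper around bpf_percpu_obj_drop_impl() */

    /* later: load the kptr under RCU and touch this CPU's copy; the
     * verifier sees PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU for 'v'
     */
    bpf_rcu_read_lock();
    v = map_val->percpu_data;
    if (v) {
        struct val_t *this_cpu = bpf_this_cpu_ptr(v);

        this_cpu->cnt += 1;
    }
    bpf_rcu_read_unlock();
    return 0;
}

char _license[] SEC("license") = "GPL";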