[bpf-next,03/15] bpf: Add alloc/xchg/direct_access support for local percpu kptr

Message ID: 20230814172825.1363378-1-yonghong.song@linux.dev (mailing list archive)
State: Changes Requested
Delegated to: BPF
Series: Add support for local percpu kptr

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 2812 this patch: 2815
netdev/cc_maintainers warning 7 maintainers not CCed: kpsingh@kernel.org martin.lau@linux.dev john.fastabend@gmail.com sdf@google.com song@kernel.org jolsa@kernel.org haoluo@google.com
netdev/build_clang success Errors and warnings before: 1526 this patch: 1526
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 2840 this patch: 2843
netdev/checkpatch warning WARNING: 'referenace' may be misspelled - perhaps 'reference'? WARNING: externs should be avoided in .c files WARNING: line length of 101 exceeds 80 columns WARNING: line length of 102 exceeds 80 columns WARNING: line length of 107 exceeds 80 columns WARNING: line length of 109 exceeds 80 columns WARNING: line length of 114 exceeds 80 columns WARNING: line length of 116 exceeds 80 columns WARNING: line length of 125 exceeds 80 columns WARNING: line length of 126 exceeds 80 columns WARNING: line length of 128 exceeds 80 columns WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns WARNING: line length of 93 exceeds 80 columns WARNING: line length of 94 exceeds 80 columns WARNING: line length of 97 exceeds 80 columns WARNING: line length of 99 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-30 fail Logs for veristat
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-12 fail Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-26 fail Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 fail Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for veristat
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-6 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-7 fail Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 fail Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 fail Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-17 fail Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 fail Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-25 fail Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 fail Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 fail Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 fail Logs for veristat
bpf/vmtest-bpf-next-VM_Test-11 fail Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 fail Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 fail Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for test_progs_no_alu32 on aarch64 with gcc

Commit Message

Yonghong Song Aug. 14, 2023, 5:28 p.m. UTC
Add two new kfunc's, bpf_percpu_obj_new_impl() and
bpf_percpu_obj_drop_impl(), to allocate a percpu obj.
Two functions are very similar to bpf_obj_new_impl()
and bpf_obj_drop_impl(). The major difference is related
to percpu handling.

    bpf_rcu_read_lock()
    struct val_t __percpu *v = map_val->percpu_data;
    ...
    bpf_rcu_read_unlock()

For a percpu data map_val like above 'v', the reg->type
is set as
	PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU
if inside rcu critical section.

MEM_RCU marking here is similar to NON_OWN_REF as 'v'
is not a owning referenace. But NON_OWN_REF is
trusted and typically inside the spinlock while
MEM_RCU is under rcu read lock. RCU is preferred here
since percpu data structures mean potential concurrent
access into its contents.

Also, bpf_percpu_obj_new_impl() is restricted to only accept
scalar struct which means nested kptr's are not allowed
but some other special field, e.g., bpf_list_head, bpf_spin_lock, etc.
could be nested (nested 'struct'). Later patch will improve verifier to
handle such nested special fields.
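
For illustration, a minimal usage sketch from the BPF program side. It
assumes the bpf_percpu_obj_new()/bpf_percpu_obj_drop() wrapper macros,
the __percpu_kptr type tag, and the bpf_rcu_read_lock()/unlock()
declarations expected from the selftest side of this series (none of
them are defined in this patch), plus bpf_this_cpu_ptr() accepting the
percpu pointer:

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "bpf_experimental.h"

struct val_t {
	long cnt;
};

struct map_val_t {
	struct val_t __percpu_kptr *percpu_data;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct map_val_t);
} arraymap SEC(".maps");

SEC("fentry/bpf_fentry_test1")
int BPF_PROG(percpu_kptr_usage, int x)
{
	struct val_t __percpu_kptr *p;
	struct map_val_t *map_val;
	struct val_t *v;
	int key = 0;

	map_val = bpf_map_lookup_elem(&arraymap, &key);
	if (!map_val)
		return 0;

	/* Allocate a percpu obj; the type must be a struct of scalars. */
	p = bpf_percpu_obj_new(struct val_t);
	if (!p)
		return 0;

	/* Stash it in the map value; drop whatever was stashed before. */
	p = bpf_kptr_xchg(&map_val->percpu_data, p);
	if (p)
		bpf_percpu_obj_drop(p);

	bpf_rcu_read_lock();
	/* Here p is PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU. */
	p = map_val->percpu_data;
	if (p) {
		v = bpf_this_cpu_ptr(p);
		v->cnt++;
	}
	bpf_rcu_read_unlock();
	return 0;
}

char _license[] SEC("license") = "GPL";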

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
 include/linux/bpf.h   |  3 +-
 kernel/bpf/helpers.c  | 49 +++++++++++++++++++++++
 kernel/bpf/syscall.c  | 21 +++++++---
 kernel/bpf/verifier.c | 90 ++++++++++++++++++++++++++++++++++---------
 4 files changed, 137 insertions(+), 26 deletions(-)

Comments

Alexei Starovoitov Aug. 19, 2023, 12:29 a.m. UTC | #1
On Mon, Aug 14, 2023 at 10:28:25AM -0700, Yonghong Song wrote:
> @@ -4997,13 +4997,20 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
>  	if (kptr_field->type == BPF_KPTR_UNREF)
>  		perm_flags |= PTR_UNTRUSTED;
>  
> +	if (kptr_field->type == BPF_KPTR_PERCPU_REF)
> +		perm_flags |= MEM_PERCPU | MEM_ALLOC;

this bit doesn't look right and ...

> +
>  	if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
>  		goto bad_type;
>  
> -	if (!btf_is_kernel(reg->btf)) {
> +	if (kptr_field->type != BPF_KPTR_PERCPU_REF && !btf_is_kernel(reg->btf)) {
>  		verbose(env, "R%d must point to kernel BTF\n", regno);
>  		return -EINVAL;
>  	}
> +	if (kptr_field->type == BPF_KPTR_PERCPU_REF && btf_is_kernel(reg->btf)) {
> +		verbose(env, "R%d must point to prog BTF\n", regno);
> +		return -EINVAL;
> +	}

.. here it really doesn't look right.
The map_kptr_match_type() should have been used for kptrs pointing to kernel objects only.
But you're calling it for MEM_ALLOC object with prog's BTF...

> +	case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC:
> +		if (meta->func_id != BPF_FUNC_kptr_xchg) {
> +			verbose(env, "verifier internal error: unimplemented handling of MEM_PERCPU | MEM_ALLOC\n");
> +			return -EFAULT;
> +		}

this part should be handling it, but ...

> +		if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
> +			return -EACCES;

why call this here?

Existing:
        case PTR_TO_BTF_ID | MEM_ALLOC:
                if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
                    meta->func_id != BPF_FUNC_kptr_xchg) {
                        verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
                        return -EFAULT;
                }
doesn't call map_kptr_match_type().
Where do we check that btf of arg1 and arg2 matches for kptr_xchg of MEM_ALLOC objs? Do we have a bug?

Yep. We do :(

diff --git a/tools/testing/selftests/bpf/progs/local_kptr_stash.c b/tools/testing/selftests/bpf/progs/local_kptr_stash.c
index 06838083079c..a6f546f4da9a 100644
--- a/tools/testing/selftests/bpf/progs/local_kptr_stash.c
+++ b/tools/testing/selftests/bpf/progs/local_kptr_stash.c
@@ -14,10 +14,12 @@ struct node_data {
        struct bpf_rb_node node;
 };

+struct node_data2 { long foo[4];};
+
 struct map_value {
        struct prog_test_ref_kfunc *not_kptr;
        struct prog_test_ref_kfunc __kptr *val;
-       struct node_data __kptr *node;
+       struct node_data2 __kptr *node;
 };

 /* This is necessary so that LLVM generates BTF for node_data struct
@@ -32,6 +34,7 @@ struct map_value {
  * Had to do the same w/ bpf_kfunc_call_test_release below
  */
 struct node_data *just_here_because_btf_bug;
+struct node_data2 *just_here_because_btf_bug2;

passes the verifier and runs into kernel WARN_ONCE.

Let's fix this issue first before proceeding with this series.
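
What presumably happens with the snippet above (an inference from the
bpf_obj_free_fields() code quoted later in this thread, not something
spelled out in the message): the stash field is now typed node_data2, a
plain struct of scalars, while the object actually stored is a
node_data carrying a bpf_rb_node.

/* In bpf_obj_free_fields(), when the map value is destroyed: */
pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
					   field->kptr.btf_id);
/* node_data2 has no special fields, so there is no struct_meta ... */
WARN_ON_ONCE(!pointee_struct_meta);	/* <- presumably what fires */
migrate_disable();
/* ... and the stored node_data is dropped with a NULL btf_record, so
 * its bpf_rb_node is never cleaned up.
 */
__bpf_obj_drop_impl(xchgd_field, NULL);
migrate_enable();
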
Kumar Kartikeya Dwivedi Aug. 19, 2023, 1:24 a.m. UTC | #2
On Mon, 14 Aug 2023 at 22:59, Yonghong Song <yonghong.song@linux.dev> wrote:
>
> Add two new kfunc's, bpf_percpu_obj_new_impl() and
> bpf_percpu_obj_drop_impl(), to allocate a percpu obj.
> Two functions are very similar to bpf_obj_new_impl()
> and bpf_obj_drop_impl(). The major difference is related
> to percpu handling.
>
>     bpf_rcu_read_lock()
>     struct val_t __percpu *v = map_val->percpu_data;
>     ...
>     bpf_rcu_read_unlock()
>
> For a percpu data map_val like above 'v', the reg->type
> is set as
>         PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU
> if inside rcu critical section.
>
> MEM_RCU marking here is similar to NON_OWN_REF as 'v'
> is not a owning referenace. But NON_OWN_REF is

typo: reference

> trusted and typically inside the spinlock while
> MEM_RCU is under rcu read lock. RCU is preferred here
> since percpu data structures mean potential concurrent
> access into its contents.
>
> Also, bpf_percpu_obj_new_impl() is restricted to only accept
> scalar struct which means nested kptr's are not allowed
> but some other special field, e.g., bpf_list_head, bpf_spin_lock, etc.
> could be nested (nested 'struct'). Later patch will improve verifier to
> handle such nested special fields.
>
> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
> ---
>  include/linux/bpf.h   |  3 +-
>  kernel/bpf/helpers.c  | 49 +++++++++++++++++++++++
>  kernel/bpf/syscall.c  | 21 +++++++---
>  kernel/bpf/verifier.c | 90 ++++++++++++++++++++++++++++++++++---------
>  4 files changed, 137 insertions(+), 26 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index e6348fd0a785..a2cb380c43c7 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -197,7 +197,8 @@ struct btf_field_kptr {
>         struct btf *btf;
>         struct module *module;
>         /* dtor used if btf_is_kernel(btf), otherwise the type is
> -        * program-allocated, dtor is NULL,  and __bpf_obj_drop_impl is used
> +        * program-allocated, dtor is NULL,  and __bpf_obj_drop_impl
> +        * or __bpf_percpu_obj_drop_impl is used
>          */
>         btf_dtor_kfunc_t dtor;
>         u32 btf_id;
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index eb91cae0612a..dd14cb7da4af 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -1900,6 +1900,29 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
>         return p;
>  }
>
> +__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
> +{
> +       struct btf_struct_meta *meta = meta__ign;
> +       const struct btf_record *rec;
> +       u64 size = local_type_id__k;
> +       void __percpu *pptr;
> +       void *p;
> +       int cpu;
> +
> +       p = bpf_mem_alloc(&bpf_global_percpu_ma, size);
> +       if (!p)
> +               return NULL;
> +       if (meta) {
> +               pptr = *((void __percpu **)p);
> +               rec = meta->record;
> +               for_each_possible_cpu(cpu) {
> +                       bpf_obj_init(rec, per_cpu_ptr(pptr, cpu));
> +               }
> +       }
> +
> +       return p;
> +}
> +
>  /* Must be called under migrate_disable(), as required by bpf_mem_free */
>  void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
>  {
> @@ -1924,6 +1947,30 @@ __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
>         __bpf_obj_drop_impl(p, meta ? meta->record : NULL);
>  }
>
> +/* Must be called under migrate_disable(), as required by bpf_mem_free_rcu */
> +void __bpf_percpu_obj_drop_impl(void *p, const struct btf_record *rec)
> +{
> +       void __percpu *pptr;
> +       int cpu;
> +
> +       if (rec) {
> +               pptr = *((void __percpu **)p);
> +               for_each_possible_cpu(cpu) {
> +                       bpf_obj_free_fields(rec, per_cpu_ptr(pptr, cpu));

Should this loop be done after we have waited for the RCU grace period?
Otherwise any other CPU can reinitialize a field after this is done,
move objects into lists/rbtree, and leak memory.
Please correct me if I'm mistaken.

> +               }
> +       }
> +
> +       bpf_mem_free_rcu(&bpf_global_percpu_ma, p);
> +}
> +
> +__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
> +{
> +       struct btf_struct_meta *meta = meta__ign;
> +       void *p = p__alloc;
> +
> +       __bpf_percpu_obj_drop_impl(p, meta ? meta->record : NULL);
> +}
> +
>  __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
>  {
>         struct btf_struct_meta *meta = meta__ign;
> @@ -2436,7 +2483,9 @@ BTF_SET8_START(generic_btf_ids)
>  BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
>  #endif
>  BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
> +BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
>  BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
> +BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE)
>  BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL)
>  BTF_ID_FLAGS(func, bpf_list_push_front_impl)
>  BTF_ID_FLAGS(func, bpf_list_push_back_impl)
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 1c30b6ee84d4..9ceb6fd9a0e2 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -627,6 +627,7 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
>  }
>
>  extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
> +extern void __bpf_percpu_obj_drop_impl(void *p, const struct btf_record *rec);
>
>  void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
>  {
> @@ -660,13 +661,21 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
>                         if (!btf_is_kernel(field->kptr.btf)) {
>                                 pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
>                                                                            field->kptr.btf_id);
> -                               if (field->type != BPF_KPTR_PERCPU_REF)
> +
> +                               if (field->type == BPF_KPTR_PERCPU_REF) {
> +                                       migrate_disable();
> +                                       __bpf_percpu_obj_drop_impl(xchgd_field, pointee_struct_meta ?
> +                                                                               pointee_struct_meta->record :
> +                                                                               NULL);
> +                                       migrate_enable();
> +                               } else {
>                                         WARN_ON_ONCE(!pointee_struct_meta);
> -                               migrate_disable();
> -                               __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
> -                                                                pointee_struct_meta->record :
> -                                                                NULL);
> -                               migrate_enable();
> +                                       migrate_disable();
> +                                       __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
> +                                                                        pointee_struct_meta->record :
> +                                                                        NULL);
> +                                       migrate_enable();
> +                               }
>                         } else {
>                                 field->kptr.dtor(xchgd_field);
>                         }
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 4ccca1f6c998..a985fbf18a11 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -304,7 +304,7 @@ struct bpf_kfunc_call_arg_meta {
>         /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling,
>          * generally to pass info about user-defined local kptr types to later
>          * verification logic
> -        *   bpf_obj_drop
> +        *   bpf_obj_drop/bpf_percpu_obj_drop
>          *     Record the local kptr type to be drop'd
>          *   bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type)
>          *     Record the local kptr type to be refcount_incr'd and use
> @@ -4997,13 +4997,20 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
>         if (kptr_field->type == BPF_KPTR_UNREF)
>                 perm_flags |= PTR_UNTRUSTED;
>
> +       if (kptr_field->type == BPF_KPTR_PERCPU_REF)
> +               perm_flags |= MEM_PERCPU | MEM_ALLOC;
> +

I think just this would permit PTR_TO_BTF_ID | MEM_ALLOC for percpu kptr?
It would probably be good to include negative selftests for kptr_xchg
type matching with percpu_kptr to prevent things like these.

Alexei already said map_kptr_match_type is not being invoked for
MEM_ALLOC kptr_xchg, so that is also an existing bug.

>         if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
>                 goto bad_type;
>
> [...]
>         /* We need to verify reg->type and reg->btf, before accessing reg->btf */
>         reg_name = btf_type_name(reg->btf, reg->btf_id);
>
> @@ -5084,7 +5091,17 @@ static bool rcu_safe_kptr(const struct btf_field *field)
>  {
>         const struct btf_field_kptr *kptr = &field->kptr;
>
> -       return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id);
> +       return field->type == BPF_KPTR_PERCPU_REF ||
> +              (field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id));
> +}
> +
> +static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field)
> +{
> +       if (!rcu_safe_kptr(kptr_field) || !in_rcu_cs(env))
> +               return PTR_MAYBE_NULL | PTR_UNTRUSTED;
> +       if (kptr_field->type != BPF_KPTR_PERCPU_REF)
> +               return PTR_MAYBE_NULL | MEM_RCU;
> +       return PTR_MAYBE_NULL | MEM_RCU | MEM_PERCPU;

The inverted conditions are a bit hard to follow. Maybe better to
explicitly check for both RCU cases, and default to untrusted
otherwise?

>  }
>
> [...]
>
Yonghong Song Aug. 20, 2023, 3:47 a.m. UTC | #3
On 8/18/23 5:29 PM, Alexei Starovoitov wrote:
> On Mon, Aug 14, 2023 at 10:28:25AM -0700, Yonghong Song wrote:
>> @@ -4997,13 +4997,20 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
>>   	if (kptr_field->type == BPF_KPTR_UNREF)
>>   		perm_flags |= PTR_UNTRUSTED;
>>   
>> +	if (kptr_field->type == BPF_KPTR_PERCPU_REF)
>> +		perm_flags |= MEM_PERCPU | MEM_ALLOC;
> 
> this bit doesn't look right and ...
> 
>> +
>>   	if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
>>   		goto bad_type;
>>   
>> -	if (!btf_is_kernel(reg->btf)) {
>> +	if (kptr_field->type != BPF_KPTR_PERCPU_REF && !btf_is_kernel(reg->btf)) {
>>   		verbose(env, "R%d must point to kernel BTF\n", regno);
>>   		return -EINVAL;
>>   	}
>> +	if (kptr_field->type == BPF_KPTR_PERCPU_REF && btf_is_kernel(reg->btf)) {
>> +		verbose(env, "R%d must point to prog BTF\n", regno);
>> +		return -EINVAL;
>> +	}
> 
> .. here it really doesn't look right.
> The map_kptr_match_type() should have been used for kptrs pointing to kernel objects only.
> But you're calling it for MEM_ALLOC object with prog's BTF...
> 
>> +	case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC:
>> +		if (meta->func_id != BPF_FUNC_kptr_xchg) {
>> +			verbose(env, "verifier internal error: unimplemented handling of MEM_PERCPU | MEM_ALLOC\n");
>> +			return -EFAULT;
>> +		}
> 
> this part should be handling it, but ...
> 
>> +		if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
>> +			return -EACCES;
> 
> why call this here?
> 
> Existing:
>          case PTR_TO_BTF_ID | MEM_ALLOC:
>                  if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
>                      meta->func_id != BPF_FUNC_kptr_xchg) {
>                          verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
>                          return -EFAULT;
>                  }
> doesn't call map_kptr_match_type().
> Where do we check that btf of arg1 and arg2 matches for kptr_xchg of MEM_ALLOC objs? Do we have a bug?
> 
> Yep. We do :(
> 
> diff --git a/tools/testing/selftests/bpf/progs/local_kptr_stash.c b/tools/testing/selftests/bpf/progs/local_kptr_stash.c
> index 06838083079c..a6f546f4da9a 100644
> --- a/tools/testing/selftests/bpf/progs/local_kptr_stash.c
> +++ b/tools/testing/selftests/bpf/progs/local_kptr_stash.c
> @@ -14,10 +14,12 @@ struct node_data {
>          struct bpf_rb_node node;
>   };
> 
> +struct node_data2 { long foo[4];};
> +
>   struct map_value {
>          struct prog_test_ref_kfunc *not_kptr;
>          struct prog_test_ref_kfunc __kptr *val;
> -       struct node_data __kptr *node;
> +       struct node_data2 __kptr *node;
>   };
> 
>   /* This is necessary so that LLVM generates BTF for node_data struct
> @@ -32,6 +34,7 @@ struct map_value {
>    * Had to do the same w/ bpf_kfunc_call_test_release below
>    */
>   struct node_data *just_here_because_btf_bug;
> +struct node_data2 *just_here_because_btf_bug2;
> 
> passes the verifier and runs into kernel WARN_ONCE.
> 
> Let's fix this issue first before proceeding with this series.

Sounds good. I will investigate and fix this issue before sending
out v2.
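
For reference, one possible shape of the missing check (a sketch, not
necessarily what v2 will do): let the existing PTR_TO_BTF_ID | MEM_ALLOC
case in check_reg_type() run the BTF match for kptr_xchg, with
map_kptr_match_type() taught to accept program BTF for MEM_ALLOC
objects:

	case PTR_TO_BTF_ID | MEM_ALLOC:
		if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
		    meta->func_id != BPF_FUNC_kptr_xchg) {
			verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
			return -EFAULT;
		}
		/* Reject kptr_xchg() when the local kptr's BTF does not match
		 * the map value's kptr field (node_data vs node_data2 above).
		 */
		if (meta->func_id == BPF_FUNC_kptr_xchg &&
		    map_kptr_match_type(env, meta->kptr_field, reg, regno))
			return -EACCES;
		break;
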
Yonghong Song Aug. 20, 2023, 4:04 a.m. UTC | #4
On 8/18/23 6:24 PM, Kumar Kartikeya Dwivedi wrote:
> On Mon, 14 Aug 2023 at 22:59, Yonghong Song <yonghong.song@linux.dev> wrote:
>>
>> Add two new kfunc's, bpf_percpu_obj_new_impl() and
>> bpf_percpu_obj_drop_impl(), to allocate a percpu obj.
>> Two functions are very similar to bpf_obj_new_impl()
>> and bpf_obj_drop_impl(). The major difference is related
>> to percpu handling.
>>
>>      bpf_rcu_read_lock()
>>      struct val_t __percpu *v = map_val->percpu_data;
>>      ...
>>      bpf_rcu_read_unlock()
>>
>> For a percpu data map_val like above 'v', the reg->type
>> is set as
>>          PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU
>> if inside rcu critical section.
>>
>> MEM_RCU marking here is similar to NON_OWN_REF as 'v'
>> is not a owning referenace. But NON_OWN_REF is
> 
> typo: reference

Ack.

> 
>> trusted and typically inside the spinlock while
>> MEM_RCU is under rcu read lock. RCU is preferred here
>> since percpu data structures mean potential concurrent
>> access into its contents.
>>
>> Also, bpf_percpu_obj_new_impl() is restricted to only accept
>> scalar struct which means nested kptr's are not allowed
>> but some other special field, e.g., bpf_list_head, bpf_spin_lock, etc.
>> could be nested (nested 'struct'). Later patch will improve verifier to
>> handle such nested special fields.
>>
>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>> ---
>>   include/linux/bpf.h   |  3 +-
>>   kernel/bpf/helpers.c  | 49 +++++++++++++++++++++++
>>   kernel/bpf/syscall.c  | 21 +++++++---
>>   kernel/bpf/verifier.c | 90 ++++++++++++++++++++++++++++++++++---------
>>   4 files changed, 137 insertions(+), 26 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index e6348fd0a785..a2cb380c43c7 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -197,7 +197,8 @@ struct btf_field_kptr {
>>          struct btf *btf;
>>          struct module *module;
>>          /* dtor used if btf_is_kernel(btf), otherwise the type is
>> -        * program-allocated, dtor is NULL,  and __bpf_obj_drop_impl is used
>> +        * program-allocated, dtor is NULL,  and __bpf_obj_drop_impl
>> +        * or __bpf_percpu_obj_drop_impl is used
>>           */
>>          btf_dtor_kfunc_t dtor;
>>          u32 btf_id;
>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
>> index eb91cae0612a..dd14cb7da4af 100644
>> --- a/kernel/bpf/helpers.c
>> +++ b/kernel/bpf/helpers.c
>> @@ -1900,6 +1900,29 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
>>          return p;
>>   }
>>
>> +__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
>> +{
>> +       struct btf_struct_meta *meta = meta__ign;
>> +       const struct btf_record *rec;
>> +       u64 size = local_type_id__k;
>> +       void __percpu *pptr;
>> +       void *p;
>> +       int cpu;
>> +
>> +       p = bpf_mem_alloc(&bpf_global_percpu_ma, size);
>> +       if (!p)
>> +               return NULL;
>> +       if (meta) {
>> +               pptr = *((void __percpu **)p);
>> +               rec = meta->record;
>> +               for_each_possible_cpu(cpu) {
>> +                       bpf_obj_init(rec, per_cpu_ptr(pptr, cpu));
>> +               }
>> +       }
>> +
>> +       return p;
>> +}
>> +
>>   /* Must be called under migrate_disable(), as required by bpf_mem_free */
>>   void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
>>   {
>> @@ -1924,6 +1947,30 @@ __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
>>          __bpf_obj_drop_impl(p, meta ? meta->record : NULL);
>>   }
>>
>> +/* Must be called under migrate_disable(), as required by bpf_mem_free_rcu */
>> +void __bpf_percpu_obj_drop_impl(void *p, const struct btf_record *rec)
>> +{
>> +       void __percpu *pptr;
>> +       int cpu;
>> +
>> +       if (rec) {
>> +               pptr = *((void __percpu **)p);
>> +               for_each_possible_cpu(cpu) {
>> +                       bpf_obj_free_fields(rec, per_cpu_ptr(pptr, cpu));
> 
> Should this loop be done after we have waited for the RCU grace period?
> Otherwise any other CPU can reinitialize a field after this is done,
> move objects into lists/rbtree, and leak memory.
> Please correct me if I'm mistaken.

Thanks for spotting this. I think you are correct. The above scenario is
indeed possible: one cpu takes a direct reference to the __percpu_kptr and
does a bunch of stuff with it, while the other cpu does a bpf_kptr_xchg to
get the __percpu_kptr and drops it. We should really drop the
__percpu_kptr itself and the fields in its record only after an rcu
grace period, so that existing direct-reference operations won't be
affected.

Will fix it in the v2.
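
As a timeline, the window looks roughly like this (an illustration of
the scenario above, not code from the patch):

/*
 *   CPU 0 (dropper)                         CPU 1 (RCU reader)
 *   ---------------                         ------------------
 *                                           bpf_rcu_read_lock();
 *                                           v = map_val->percpu_data;
 *   p = bpf_kptr_xchg(&map_val->percpu_data, NULL);
 *   __bpf_percpu_obj_drop_impl(p, rec)
 *     bpf_obj_free_fields() on each cpu     <- special fields freed
 *                                           bpf_list_push_back(&v->head, n);
 *                                           <- freed field repopulated; n leaks
 *     bpf_mem_free_rcu(p)
 *                                           bpf_rcu_read_unlock();
 */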

> 
>> +               }
>> +       }
>> +
>> +       bpf_mem_free_rcu(&bpf_global_percpu_ma, p);
>> +}
>> +
>> +__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
>> +{
>> +       struct btf_struct_meta *meta = meta__ign;
>> +       void *p = p__alloc;
>> +
>> +       __bpf_percpu_obj_drop_impl(p, meta ? meta->record : NULL);
>> +}
>> +
>>   __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
>>   {
>>          struct btf_struct_meta *meta = meta__ign;
>> @@ -2436,7 +2483,9 @@ BTF_SET8_START(generic_btf_ids)
>>   BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
>>   #endif
>>   BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
>> +BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
>>   BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
>> +BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE)
>>   BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL)
>>   BTF_ID_FLAGS(func, bpf_list_push_front_impl)
>>   BTF_ID_FLAGS(func, bpf_list_push_back_impl)
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index 1c30b6ee84d4..9ceb6fd9a0e2 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -627,6 +627,7 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
>>   }
>>
>>   extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
>> +extern void __bpf_percpu_obj_drop_impl(void *p, const struct btf_record *rec);
>>
>>   void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
>>   {
>> @@ -660,13 +661,21 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
>>                          if (!btf_is_kernel(field->kptr.btf)) {
>>                                  pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
>>                                                                             field->kptr.btf_id);
>> -                               if (field->type != BPF_KPTR_PERCPU_REF)
>> +
>> +                               if (field->type == BPF_KPTR_PERCPU_REF) {
>> +                                       migrate_disable();
>> +                                       __bpf_percpu_obj_drop_impl(xchgd_field, pointee_struct_meta ?
>> +                                                                               pointee_struct_meta->record :
>> +                                                                               NULL);
>> +                                       migrate_enable();
>> +                               } else {
>>                                          WARN_ON_ONCE(!pointee_struct_meta);
>> -                               migrate_disable();
>> -                               __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
>> -                                                                pointee_struct_meta->record :
>> -                                                                NULL);
>> -                               migrate_enable();
>> +                                       migrate_disable();
>> +                                       __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
>> +                                                                        pointee_struct_meta->record :
>> +                                                                        NULL);
>> +                                       migrate_enable();
>> +                               }
>>                          } else {
>>                                  field->kptr.dtor(xchgd_field);
>>                          }
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 4ccca1f6c998..a985fbf18a11 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -304,7 +304,7 @@ struct bpf_kfunc_call_arg_meta {
>>          /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling,
>>           * generally to pass info about user-defined local kptr types to later
>>           * verification logic
>> -        *   bpf_obj_drop
>> +        *   bpf_obj_drop/bpf_percpu_obj_drop
>>           *     Record the local kptr type to be drop'd
>>           *   bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type)
>>           *     Record the local kptr type to be refcount_incr'd and use
>> @@ -4997,13 +4997,20 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
>>          if (kptr_field->type == BPF_KPTR_UNREF)
>>                  perm_flags |= PTR_UNTRUSTED;
>>
>> +       if (kptr_field->type == BPF_KPTR_PERCPU_REF)
>> +               perm_flags |= MEM_PERCPU | MEM_ALLOC;
>> +
> 
> I think just this would permit PTR_TO_BTF_ID | MEM_ALLOC for percpu kptr?
> It would probably be good to include negative selftests for kptr_xchg
> type matching with percpu_kptr to prevent things like these.
> 
> Alexei already said map_kptr_match_type is not being invoked for
> MEM_ALLOC kptr_xchg, so that is also an existing bug.

I will fix that bug first; this part of the change is probably not
needed any more.
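
A sketch of the kind of negative selftest suggested here (hypothetical;
it reuses the map_val_t/arraymap shape from the usage sketch in the
commit message and the __failure annotation from selftests'
bpf_misc.h):

SEC("?fentry/bpf_fentry_test1")
__failure
int BPF_PROG(percpu_kptr_type_mismatch, int x)
{
	struct map_val_t *map_val;
	struct val_t *p;
	int key = 0;

	map_val = bpf_map_lookup_elem(&arraymap, &key);
	if (!map_val)
		return 0;

	p = bpf_obj_new(struct val_t);	/* plain, non-percpu allocation */
	if (!p)
		return 0;

	/* map_val->percpu_data holds a percpu kptr: must not verify. */
	p = bpf_kptr_xchg(&map_val->percpu_data, p);
	if (p)
		bpf_obj_drop(p);
	return 0;
}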

> 
>>          if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
>>                  goto bad_type;
>>
>> [...]
>>          /* We need to verify reg->type and reg->btf, before accessing reg->btf */
>>          reg_name = btf_type_name(reg->btf, reg->btf_id);
>>
>> @@ -5084,7 +5091,17 @@ static bool rcu_safe_kptr(const struct btf_field *field)
>>   {
>>          const struct btf_field_kptr *kptr = &field->kptr;
>>
>> -       return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id);
>> +       return field->type == BPF_KPTR_PERCPU_REF ||
>> +              (field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id));
>> +}
>> +
>> +static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field)
>> +{
>> +       if (!rcu_safe_kptr(kptr_field) || !in_rcu_cs(env))
>> +               return PTR_MAYBE_NULL | PTR_UNTRUSTED;
>> +       if (kptr_field->type != BPF_KPTR_PERCPU_REF)
>> +               return PTR_MAYBE_NULL | MEM_RCU;
>> +       return PTR_MAYBE_NULL | MEM_RCU | MEM_PERCPU;
> 
> The inverted conditions are a bit hard to follow. Maybe better to
> explicitly check for both RCU cases, and default to untrusted
> otherwise?

Okay. Will do.

> 
>>   }
>>
>> [...]
>>
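
One way to de-invert btf_ld_kptr_type() along the lines suggested above
(a sketch; the v2 code may differ):

static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field)
{
	/* Spell out the two RCU-protected cases explicitly ... */
	if (rcu_safe_kptr(kptr_field) && in_rcu_cs(env)) {
		if (kptr_field->type == BPF_KPTR_PERCPU_REF)
			return PTR_MAYBE_NULL | MEM_RCU | MEM_PERCPU;
		return PTR_MAYBE_NULL | MEM_RCU;
	}
	/* ... and default to untrusted otherwise. */
	return PTR_MAYBE_NULL | PTR_UNTRUSTED;
}
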
Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e6348fd0a785..a2cb380c43c7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -197,7 +197,8 @@  struct btf_field_kptr {
 	struct btf *btf;
 	struct module *module;
 	/* dtor used if btf_is_kernel(btf), otherwise the type is
-	 * program-allocated, dtor is NULL,  and __bpf_obj_drop_impl is used
+	 * program-allocated, dtor is NULL,  and __bpf_obj_drop_impl
+	 * or __bpf_percpu_obj_drop_impl is used
 	 */
 	btf_dtor_kfunc_t dtor;
 	u32 btf_id;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index eb91cae0612a..dd14cb7da4af 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1900,6 +1900,29 @@  __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
 	return p;
 }
 
+__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+{
+	struct btf_struct_meta *meta = meta__ign;
+	const struct btf_record *rec;
+	u64 size = local_type_id__k;
+	void __percpu *pptr;
+	void *p;
+	int cpu;
+
+	p = bpf_mem_alloc(&bpf_global_percpu_ma, size);
+	if (!p)
+		return NULL;
+	if (meta) {
+		pptr = *((void __percpu **)p);
+		rec = meta->record;
+		for_each_possible_cpu(cpu) {
+			bpf_obj_init(rec, per_cpu_ptr(pptr, cpu));
+		}
+	}
+
+	return p;
+}
+
 /* Must be called under migrate_disable(), as required by bpf_mem_free */
 void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
 {
@@ -1924,6 +1947,30 @@  __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
 	__bpf_obj_drop_impl(p, meta ? meta->record : NULL);
 }
 
+/* Must be called under migrate_disable(), as required by bpf_mem_free_rcu */
+void __bpf_percpu_obj_drop_impl(void *p, const struct btf_record *rec)
+{
+	void __percpu *pptr;
+	int cpu;
+
+	if (rec) {
+		pptr = *((void __percpu **)p);
+		for_each_possible_cpu(cpu) {
+			bpf_obj_free_fields(rec, per_cpu_ptr(pptr, cpu));
+		}
+	}
+
+	bpf_mem_free_rcu(&bpf_global_percpu_ma, p);
+}
+
+__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
+{
+	struct btf_struct_meta *meta = meta__ign;
+	void *p = p__alloc;
+
+	__bpf_percpu_obj_drop_impl(p, meta ? meta->record : NULL);
+}
+
 __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
 {
 	struct btf_struct_meta *meta = meta__ign;
@@ -2436,7 +2483,9 @@  BTF_SET8_START(generic_btf_ids)
 BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
 #endif
 BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_list_push_front_impl)
 BTF_ID_FLAGS(func, bpf_list_push_back_impl)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1c30b6ee84d4..9ceb6fd9a0e2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -627,6 +627,7 @@  void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
 }
 
 extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
+extern void __bpf_percpu_obj_drop_impl(void *p, const struct btf_record *rec);
 
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 {
@@ -660,13 +661,21 @@  void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 			if (!btf_is_kernel(field->kptr.btf)) {
 				pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
 									   field->kptr.btf_id);
-				if (field->type != BPF_KPTR_PERCPU_REF)
+
+				if (field->type == BPF_KPTR_PERCPU_REF) {
+					migrate_disable();
+					__bpf_percpu_obj_drop_impl(xchgd_field, pointee_struct_meta ?
+										pointee_struct_meta->record :
+										NULL);
+					migrate_enable();
+				} else {
 					WARN_ON_ONCE(!pointee_struct_meta);
-				migrate_disable();
-				__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
-								 pointee_struct_meta->record :
-								 NULL);
-				migrate_enable();
+					migrate_disable();
+					__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
+									 pointee_struct_meta->record :
+									 NULL);
+					migrate_enable();
+				}
 			} else {
 				field->kptr.dtor(xchgd_field);
 			}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4ccca1f6c998..a985fbf18a11 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -304,7 +304,7 @@  struct bpf_kfunc_call_arg_meta {
 	/* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling,
 	 * generally to pass info about user-defined local kptr types to later
 	 * verification logic
-	 *   bpf_obj_drop
+	 *   bpf_obj_drop/bpf_percpu_obj_drop
 	 *     Record the local kptr type to be drop'd
 	 *   bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type)
 	 *     Record the local kptr type to be refcount_incr'd and use
@@ -4997,13 +4997,20 @@  static int map_kptr_match_type(struct bpf_verifier_env *env,
 	if (kptr_field->type == BPF_KPTR_UNREF)
 		perm_flags |= PTR_UNTRUSTED;
 
+	if (kptr_field->type == BPF_KPTR_PERCPU_REF)
+		perm_flags |= MEM_PERCPU | MEM_ALLOC;
+
 	if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
 		goto bad_type;
 
-	if (!btf_is_kernel(reg->btf)) {
+	if (kptr_field->type != BPF_KPTR_PERCPU_REF && !btf_is_kernel(reg->btf)) {
 		verbose(env, "R%d must point to kernel BTF\n", regno);
 		return -EINVAL;
 	}
+	if (kptr_field->type == BPF_KPTR_PERCPU_REF && btf_is_kernel(reg->btf)) {
+		verbose(env, "R%d must point to prog BTF\n", regno);
+		return -EINVAL;
+	}
 	/* We need to verify reg->type and reg->btf, before accessing reg->btf */
 	reg_name = btf_type_name(reg->btf, reg->btf_id);
 
@@ -5084,7 +5091,17 @@  static bool rcu_safe_kptr(const struct btf_field *field)
 {
 	const struct btf_field_kptr *kptr = &field->kptr;
 
-	return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id);
+	return field->type == BPF_KPTR_PERCPU_REF ||
+	       (field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id));
+}
+
+static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field)
+{
+	if (!rcu_safe_kptr(kptr_field) || !in_rcu_cs(env))
+		return PTR_MAYBE_NULL | PTR_UNTRUSTED;
+	if (kptr_field->type != BPF_KPTR_PERCPU_REF)
+		return PTR_MAYBE_NULL | MEM_RCU;
+	return PTR_MAYBE_NULL | MEM_RCU | MEM_PERCPU;
 }
 
 static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
@@ -5110,7 +5127,8 @@  static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
 	/* We only allow loading referenced kptr, since it will be marked as
 	 * untrusted, similar to unreferenced kptr.
 	 */
-	if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) {
+	if (class != BPF_LDX &&
+	    (kptr_field->type == BPF_KPTR_REF || kptr_field->type == BPF_KPTR_PERCPU_REF)) {
 		verbose(env, "store to referenced kptr disallowed\n");
 		return -EACCES;
 	}
@@ -5121,10 +5139,7 @@  static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
 		 * value from map as PTR_TO_BTF_ID, with the correct type.
 		 */
 		mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
-				kptr_field->kptr.btf_id,
-				rcu_safe_kptr(kptr_field) && in_rcu_cs(env) ?
-				PTR_MAYBE_NULL | MEM_RCU :
-				PTR_MAYBE_NULL | PTR_UNTRUSTED);
+				kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field));
 		/* For mark_ptr_or_null_reg */
 		val_reg->id = ++env->id_gen;
 	} else if (class == BPF_STX) {
@@ -5178,6 +5193,7 @@  static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 			switch (field->type) {
 			case BPF_KPTR_UNREF:
 			case BPF_KPTR_REF:
+			case BPF_KPTR_PERCPU_REF:
 				if (src != ACCESS_DIRECT) {
 					verbose(env, "kptr cannot be accessed indirectly by helper\n");
 					return -EACCES;
@@ -7316,7 +7332,7 @@  static int process_kptr_func(struct bpf_verifier_env *env, int regno,
 		verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
 		return -EACCES;
 	}
-	if (kptr_field->type != BPF_KPTR_REF) {
+	if (kptr_field->type != BPF_KPTR_REF && kptr_field->type != BPF_KPTR_PERCPU_REF) {
 		verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off);
 		return -EACCES;
 	}
@@ -7827,8 +7843,10 @@  static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
 	if (base_type(arg_type) == ARG_PTR_TO_MEM)
 		type &= ~DYNPTR_TYPE_FLAG_MASK;
 
-	if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type))
+	if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) {
 		type &= ~MEM_ALLOC;
+		type &= ~MEM_PERCPU;
+	}
 
 	for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
 		expected = compatible->types[i];
@@ -7918,6 +7936,14 @@  static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
 		}
 		/* Handled by helper specific checks */
 		break;
+	case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC:
+		if (meta->func_id != BPF_FUNC_kptr_xchg) {
+			verbose(env, "verifier internal error: unimplemented handling of MEM_PERCPU | MEM_ALLOC\n");
+			return -EFAULT;
+		}
+		if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
+			return -EACCES;
+		break;
 	case PTR_TO_BTF_ID | MEM_PERCPU:
 	case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED:
 		/* Handled by helper specific checks */
@@ -9885,8 +9911,11 @@  static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		if (func_id == BPF_FUNC_kptr_xchg) {
 			ret_btf = meta.kptr_field->kptr.btf;
 			ret_btf_id = meta.kptr_field->kptr.btf_id;
-			if (!btf_is_kernel(ret_btf))
+			if (!btf_is_kernel(ret_btf)) {
 				regs[BPF_REG_0].type |= MEM_ALLOC;
+				if (meta.kptr_field->type == BPF_KPTR_PERCPU_REF)
+					regs[BPF_REG_0].type |= MEM_PERCPU;
+			}
 		} else {
 			if (fn->ret_btf_id == BPF_PTR_POISON) {
 				verbose(env, "verifier internal error:");
@@ -10271,6 +10300,8 @@  enum special_kfunc_type {
 	KF_bpf_dynptr_slice,
 	KF_bpf_dynptr_slice_rdwr,
 	KF_bpf_dynptr_clone,
+	KF_bpf_percpu_obj_new_impl,
+	KF_bpf_percpu_obj_drop_impl,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -10291,6 +10322,8 @@  BTF_ID(func, bpf_dynptr_from_xdp)
 BTF_ID(func, bpf_dynptr_slice)
 BTF_ID(func, bpf_dynptr_slice_rdwr)
 BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_percpu_obj_new_impl)
+BTF_ID(func, bpf_percpu_obj_drop_impl)
 BTF_SET_END(special_kfunc_set)
 
 BTF_ID_LIST(special_kfunc_list)
@@ -10313,6 +10346,8 @@  BTF_ID(func, bpf_dynptr_from_xdp)
 BTF_ID(func, bpf_dynptr_slice)
 BTF_ID(func, bpf_dynptr_slice_rdwr)
 BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_percpu_obj_new_impl)
+BTF_ID(func, bpf_percpu_obj_drop_impl)
 
 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -11003,7 +11038,8 @@  static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			}
 			break;
 		case KF_ARG_PTR_TO_ALLOC_BTF_ID:
-			if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+			if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC) &&
+			    reg->type != (PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC)) {
 				verbose(env, "arg#%d expected pointer to allocated object\n", i);
 				return -EINVAL;
 			}
@@ -11012,7 +11048,8 @@  static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 				return -EINVAL;
 			}
 			if (meta->btf == btf_vmlinux &&
-			    meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+			    (meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
+			     meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl])) {
 				meta->arg_btf = reg->btf;
 				meta->arg_btf_id = reg->btf_id;
 			}
@@ -11410,6 +11447,7 @@  static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		/* Only exception is bpf_obj_new_impl */
 		if (meta.btf != btf_vmlinux ||
 		    (meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] &&
+		     meta.func_id != special_kfunc_list[KF_bpf_percpu_obj_new_impl] &&
 		     meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) {
 			verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
 			return -EINVAL;
@@ -11423,11 +11461,15 @@  static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id);
 
 		if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
-			if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
+			if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
+			    meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
 				struct btf *ret_btf;
 				u32 ret_btf_id;
 
-				if (unlikely(!bpf_global_ma_set))
+				if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
+					return -ENOMEM;
+
+				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && !bpf_global_percpu_ma_set)
 					return -ENOMEM;
 
 				if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
@@ -11440,13 +11482,18 @@  static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 
 				/* This may be NULL due to user not supplying a BTF */
 				if (!ret_btf) {
-					verbose(env, "bpf_obj_new requires prog BTF\n");
+					verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n");
 					return -EINVAL;
 				}
 
 				ret_t = btf_type_by_id(ret_btf, ret_btf_id);
 				if (!ret_t || !__btf_type_is_struct(ret_t)) {
-					verbose(env, "bpf_obj_new type ID argument must be of a struct\n");
+					verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n");
+					return -EINVAL;
+				}
+				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] &&
+				    !__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {
+					verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n");
 					return -EINVAL;
 				}
 
@@ -11454,6 +11501,8 @@  static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 				regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
 				regs[BPF_REG_0].btf = ret_btf;
 				regs[BPF_REG_0].btf_id = ret_btf_id;
+				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl])
+					regs[BPF_REG_0].type |= MEM_PERCPU;
 
 				insn_aux->obj_new_size = ret_t->size;
 				insn_aux->kptr_struct_meta =
@@ -11594,7 +11643,8 @@  static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			regs[BPF_REG_0].id = ++env->id_gen;
 	} else if (btf_type_is_void(t)) {
 		if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
-			if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+			if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
+			    meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
 				insn_aux->kptr_struct_meta =
 					btf_find_struct_meta(meta.arg_btf,
 							     meta.arg_btf_id);
@@ -18266,7 +18316,8 @@  static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		insn->imm = BPF_CALL_IMM(desc->addr);
 	if (insn->off)
 		return 0;
-	if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
+	if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
+	    desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
 		struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
 		struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
 		u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size;
@@ -18277,6 +18328,7 @@  static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		insn_buf[3] = *insn;
 		*cnt = 4;
 	} else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
+		   desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] ||
 		   desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
 		struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
 		struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };