diff mbox series

[v3,bpf-next,3/5] bpf: Introduce kptr_rcu.

Message ID 20230228040121.94253-4-alexei.starovoitov@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series bpf: Introduce kptr_rcu. | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1427 this patch: 1427
netdev/cc_maintainers warning 17 maintainers not CCed: pabeni@redhat.com linux-kselftest@vger.kernel.org jolsa@kernel.org john.fastabend@gmail.com mykolal@fb.com martin.lau@linux.dev corbet@lwn.net yhs@fb.com kpsingh@kernel.org song@kernel.org linux-doc@vger.kernel.org eddyz87@gmail.com haoluo@google.com edumazet@google.com kuba@kernel.org shuah@kernel.org sdf@google.com
netdev/build_clang success Errors and warnings before: 152 this patch: 152
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1422 this patch: 1422
netdev/checkpatch warning CHECK: Prefer using the BIT macro WARNING: line length of 100 exceeds 80 columns WARNING: line length of 104 exceeds 80 columns WARNING: line length of 81 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 3 this patch: 3
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_progs_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-7 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for test_progs on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 fail Logs for test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32 on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 fail Logs for test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_progs_parallel on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for test_verifier on x86_64 with llvm-17

Commit Message

Alexei Starovoitov Feb. 28, 2023, 4:01 a.m. UTC
From: Alexei Starovoitov <ast@kernel.org>

The lifetime of certain kernel structures like 'struct cgroup' is protected by RCU.
Hence it's safe to dereference them directly from __kptr tagged pointers in bpf maps.
The resulting pointer is MEM_RCU and can be passed to kfuncs that expect KF_RCU.
Dereference of other kptr-s returns PTR_UNTRUSTED.

For example:
struct map_value {
   struct cgroup __kptr *cgrp;
};

SEC("tp_btf/cgroup_mkdir")
int BPF_PROG(test_cgrp_get_ancestors, struct cgroup *cgrp_arg, const char *path)
{
  struct cgroup *cg, *cg2;

  cg = bpf_cgroup_acquire(cgrp_arg); // cg is PTR_TRUSTED and ref_obj_id > 0
  bpf_kptr_xchg(&v->cgrp, cg);

  cg2 = v->cgrp; // This is new feature introduced by this patch.
  // cg2 is PTR_MAYBE_NULL | MEM_RCU.
  // When cg2 != NULL, it's a valid cgroup, but its percpu_ref could be zero

  bpf_cgroup_ancestor(cg2, level); // safe to do.
}

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/kfuncs.rst                  | 11 ++++---
 include/linux/bpf.h                           | 15 ++++++---
 include/linux/btf.h                           |  2 +-
 kernel/bpf/btf.c                              | 16 +++++++++
 kernel/bpf/helpers.c                          |  7 ++--
 kernel/bpf/syscall.c                          |  4 +++
 kernel/bpf/verifier.c                         | 33 ++++++++++++-------
 net/bpf/test_run.c                            |  3 +-
 .../selftests/bpf/progs/map_kptr_fail.c       |  4 +--
 tools/testing/selftests/bpf/verifier/calls.c  |  2 +-
 .../testing/selftests/bpf/verifier/map_kptr.c |  2 +-
 11 files changed, 69 insertions(+), 30 deletions(-)

Comments

David Vernet Feb. 28, 2023, 4:45 p.m. UTC | #1
On Mon, Feb 27, 2023 at 08:01:19PM -0800, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
> 
> The life time of certain kernel structures like 'struct cgroup' is protected by RCU.
> Hence it's safe to dereference them directly from __kptr tagged pointers in bpf maps.
> The resulting pointer is MEM_RCU and can be passed to kfuncs that expect KF_RCU.
> Derefrence of other kptr-s returns PTR_UNTRUSTED.
> 
> For example:
> struct map_value {
>    struct cgroup __kptr *cgrp;
> };
> 
> SEC("tp_btf/cgroup_mkdir")
> int BPF_PROG(test_cgrp_get_ancestors, struct cgroup *cgrp_arg, const char *path)
> {
>   struct cgroup *cg, *cg2;
> 
>   cg = bpf_cgroup_acquire(cgrp_arg); // cg is PTR_TRUSTED and ref_obj_id > 0
>   bpf_kptr_xchg(&v->cgrp, cg);
> 
>   cg2 = v->cgrp; // This is new feature introduced by this patch.
>   // cg2 is PTR_MAYBE_NULL | MEM_RCU.
>   // When cg2 != NULL, it's a valid cgroup, but its percpu_ref could be zero
> 
>   bpf_cgroup_ancestor(cg2, level); // safe to do.
> }
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  Documentation/bpf/kfuncs.rst                  | 11 ++++---
>  include/linux/bpf.h                           | 15 ++++++---
>  include/linux/btf.h                           |  2 +-
>  kernel/bpf/btf.c                              | 16 +++++++++
>  kernel/bpf/helpers.c                          |  7 ++--
>  kernel/bpf/syscall.c                          |  4 +++
>  kernel/bpf/verifier.c                         | 33 ++++++++++++-------
>  net/bpf/test_run.c                            |  3 +-
>  .../selftests/bpf/progs/map_kptr_fail.c       |  4 +--
>  tools/testing/selftests/bpf/verifier/calls.c  |  2 +-
>  .../testing/selftests/bpf/verifier/map_kptr.c |  2 +-
>  11 files changed, 69 insertions(+), 30 deletions(-)
> 
> diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
> index 7d7c1144372a..49c5cb6f46e7 100644
> --- a/Documentation/bpf/kfuncs.rst
> +++ b/Documentation/bpf/kfuncs.rst
> @@ -232,11 +232,12 @@ added later.
>  2.4.8 KF_RCU flag
>  -----------------
>  
> -The KF_RCU flag is used for kfuncs which have a rcu ptr as its argument.
> -When used together with KF_ACQUIRE, it indicates the kfunc should have a
> -single argument which must be a trusted argument or a MEM_RCU pointer.
> -The argument may have reference count of 0 and the kfunc must take this
> -into consideration.
> +The KF_RCU flag is a weaker version of KF_TRUSTED_ARGS. The kfuncs marked with
> +KF_RCU expect either PTR_TRUSTED or MEM_RCU arguments. The verifier guarantees
> +that the objects are valid and there is no use-after-free, but the pointers
> +maybe NULL and pointee object's reference count could have reached zero, hence

s/maybe/may be

> +kfuncs must do != NULL check and consider refcnt==0 case when accessing such
> +arguments.

Hmmm, given that it's only necessary to check refcnt==0 if the kfunc is
KF_ACQUIRE, wdyt about amending this paragraph with something like the
following (note as well the addition of the KF_RET_NULL suggestion):

...the pointers may be NULL, and the object's refcount could have
reached zero. The kfuncs must therefore do a != NULL check, and if
returning a KF_ACQUIRE pointer, also check that refcnt != 0. Note as
well that a KF_ACQUIRE kfunc that is KF_RCU should **very** likely also
be KF_RET_NULL, for both of these reasons.

>  .. _KF_deprecated_flag:
>  
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 520b238abd5a..d4b5faa0a777 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -178,11 +178,12 @@ enum btf_field_type {
>  	BPF_TIMER      = (1 << 1),
>  	BPF_KPTR_UNREF = (1 << 2),
>  	BPF_KPTR_REF   = (1 << 3),
> -	BPF_KPTR       = BPF_KPTR_UNREF | BPF_KPTR_REF,
> -	BPF_LIST_HEAD  = (1 << 4),
> -	BPF_LIST_NODE  = (1 << 5),
> -	BPF_RB_ROOT    = (1 << 6),
> -	BPF_RB_NODE    = (1 << 7),
> +	BPF_KPTR_RCU   = (1 << 4), /* kernel internal. not exposed to bpf prog */
> +	BPF_KPTR       = BPF_KPTR_UNREF | BPF_KPTR_REF | BPF_KPTR_RCU,
> +	BPF_LIST_HEAD  = (1 << 5),
> +	BPF_LIST_NODE  = (1 << 6),
> +	BPF_RB_ROOT    = (1 << 7),
> +	BPF_RB_NODE    = (1 << 8),
>  	BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD |
>  				 BPF_RB_NODE | BPF_RB_ROOT,
>  };
> @@ -284,6 +285,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type)
>  	case BPF_KPTR_UNREF:
>  	case BPF_KPTR_REF:
>  		return "kptr";
> +	case BPF_KPTR_RCU:
> +		return "kptr_rcu";
>  	case BPF_LIST_HEAD:
>  		return "bpf_list_head";
>  	case BPF_LIST_NODE:
> @@ -307,6 +310,7 @@ static inline u32 btf_field_type_size(enum btf_field_type type)
>  		return sizeof(struct bpf_timer);
>  	case BPF_KPTR_UNREF:
>  	case BPF_KPTR_REF:
> +	case BPF_KPTR_RCU:
>  		return sizeof(u64);
>  	case BPF_LIST_HEAD:
>  		return sizeof(struct bpf_list_head);
> @@ -331,6 +335,7 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
>  		return __alignof__(struct bpf_timer);
>  	case BPF_KPTR_UNREF:
>  	case BPF_KPTR_REF:
> +	case BPF_KPTR_RCU:
>  		return __alignof__(u64);
>  	case BPF_LIST_HEAD:
>  		return __alignof__(struct bpf_list_head);
> diff --git a/include/linux/btf.h b/include/linux/btf.h
> index 49e0fe6d8274..556b3e2e7471 100644
> --- a/include/linux/btf.h
> +++ b/include/linux/btf.h
> @@ -70,7 +70,7 @@
>  #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */
>  #define KF_SLEEPABLE    (1 << 5) /* kfunc may sleep */
>  #define KF_DESTRUCTIVE  (1 << 6) /* kfunc performs destructive actions */
> -#define KF_RCU          (1 << 7) /* kfunc only takes rcu pointer arguments */
> +#define KF_RCU          (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */
>  
>  /*
>   * Tag marking a kernel function as a kfunc. This is meant to minimize the
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index 01dee7d48e6d..a44ea1f6164b 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -3552,6 +3552,18 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
>  	return -EINVAL;
>  }
>  

Could you please add a comment here that once gcc has tag support, we
can replace this mechanism with just checking the type's BTF tag? I like
this a lot in the interim though -- it's a very easy way to add kfuncs
for new RCU-protected types, and will be trivially easy to remove and
cleanup later.

> +BTF_SET_START(rcu_protected_types)
> +BTF_ID(struct, prog_test_ref_kfunc)
> +BTF_ID(struct, cgroup)
> +BTF_SET_END(rcu_protected_types)
> +
> +static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
> +{
> +	if (!btf_is_kernel(btf))
> +		return false;
> +	return btf_id_set_contains(&rcu_protected_types, btf_id);
> +}
> +
>  static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
>  			  struct btf_field_info *info)
>  {
> @@ -3615,6 +3627,10 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
>  		field->kptr.dtor = (void *)addr;
>  	}
>  
> +	if (info->type == BPF_KPTR_REF && rcu_protected_object(kernel_btf, id))
> +		/* rcu dereference of this field will return MEM_RCU instead of PTR_UNTRUSTED */
> +		field->type = BPF_KPTR_RCU;

Can you move this into the if block above, and update the conditional to
just be:

if (rcu_protected_object(kernel_btf, id))

Also, outside the scope of your patch and subjective, but IMO it's a bit
confusing that we're looking at info->type, when field->type already ==
info->type. When reading the code it looks like field->type is unset
unless we set it to BPF_KPTR_RCU, but in reality we're just overwriting
it from being BPF_KPTR_REF. Might be worth tidying up at some point (I
can do that in a follow-on patch once this series lands).

>  	field->kptr.btf_id = id;
>  	field->kptr.btf = kernel_btf;
>  	field->kptr.module = mod;
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index a784be6f8bac..fed74afd45d1 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -2094,11 +2094,12 @@ __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
>  {
>  	struct cgroup *ancestor;
>  
> -	if (level > cgrp->level || level < 0)
> +	if (!cgrp || level > cgrp->level || level < 0)
>  		return NULL;
>  
>  	ancestor = cgrp->ancestors[level];
> -	cgroup_get(ancestor);
> +	if (!cgroup_tryget(ancestor))
> +		return NULL;
>  	return ancestor;
>  }
>  
> @@ -2183,7 +2184,7 @@ BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
>  BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
>  BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
>  BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
> -BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL)
> +BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
>  BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
>  #endif
>  BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index e3fcdc9836a6..2e730918911c 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -539,6 +539,7 @@ void btf_record_free(struct btf_record *rec)
>  		switch (rec->fields[i].type) {
>  		case BPF_KPTR_UNREF:
>  		case BPF_KPTR_REF:
> +		case BPF_KPTR_RCU:
>  			if (rec->fields[i].kptr.module)
>  				module_put(rec->fields[i].kptr.module);
>  			btf_put(rec->fields[i].kptr.btf);
> @@ -584,6 +585,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
>  		switch (fields[i].type) {
>  		case BPF_KPTR_UNREF:
>  		case BPF_KPTR_REF:
> +		case BPF_KPTR_RCU:
>  			btf_get(fields[i].kptr.btf);
>  			if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
>  				ret = -ENXIO;
> @@ -669,6 +671,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
>  			WRITE_ONCE(*(u64 *)field_ptr, 0);
>  			break;
>  		case BPF_KPTR_REF:
> +		case BPF_KPTR_RCU:

The fact that we're adding this case is IMO a sign that we're arguably
breaking abstractions a bit. BPF_KPTR_REF should really be the kptr type
that holds a reference and for which we should be firing the destructor,
and RCU protection should ideally be something we could just derive
later in the verifier. Not a huge problem given that this complexity is
completely hidden from the user, but I'm not fully understanding why
the extra complexity of BPF_KPTR_RCU is necessary. See below in another
comment in verifier.c.

>  			field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0));

Also completely unrelated to your patch set, but we should probably only
invoke field->kptr.dtor() if the value in field_ptr ends up being
non-NULL after the xchg. Otherwise, all KF_RELEASE kfuncs have to check
for NULL, even though they expect inherently trusted args. I can also do
that in a follow-on patch.

>  			break;
>  		case BPF_LIST_HEAD:
> @@ -1058,6 +1061,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
>  				break;
>  			case BPF_KPTR_UNREF:
>  			case BPF_KPTR_REF:
> +			case BPF_KPTR_RCU:
>  				if (map->map_type != BPF_MAP_TYPE_HASH &&
>  				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
>  				    map->map_type != BPF_MAP_TYPE_ARRAY &&
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index e4234266e76d..0b728ce0dde9 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -4183,7 +4183,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
>  			       struct bpf_reg_state *reg, u32 regno)
>  {
>  	const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
> -	int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED;
> +	int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
>  	const char *reg_name = "";
>  
>  	/* Only unreferenced case accepts untrusted pointers */
> @@ -4230,12 +4230,12 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
>  	 * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off
>  	 * is zero. We must also ensure that btf_struct_ids_match does not walk
>  	 * the struct to match type against first member of struct, i.e. reject
> -	 * second case from above. Hence, when type is BPF_KPTR_REF, we set
> +	 * second case from above. Hence, when type is BPF_KPTR_REF | BPF_KPTR_RCU, we set
>  	 * strict mode to true for type match.
>  	 */
>  	if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
>  				  kptr_field->kptr.btf, kptr_field->kptr.btf_id,
> -				  kptr_field->type == BPF_KPTR_REF))
> +				  kptr_field->type == BPF_KPTR_REF || kptr_field->type == BPF_KPTR_RCU))
>  		goto bad_type;
>  	return 0;
>  bad_type:
> @@ -4250,6 +4250,14 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
>  	return -EINVAL;
>  }
>  
> +/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
> + * can dereference RCU protected pointers and result is PTR_TRUSTED.
> + */
> +static bool in_rcu_cs(struct bpf_verifier_env *env)
> +{
> +	return env->cur_state->active_rcu_lock || !env->prog->aux->sleepable;
> +}
> +
>  static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
>  				 int value_regno, int insn_idx,
>  				 struct btf_field *kptr_field)
> @@ -4273,7 +4281,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
>  	/* We only allow loading referenced kptr, since it will be marked as
>  	 * untrusted, similar to unreferenced kptr.
>  	 */
> -	if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) {
> +	if (class != BPF_LDX && kptr_field->type != BPF_KPTR_UNREF) {
>  		verbose(env, "store to referenced kptr disallowed\n");
>  		return -EACCES;
>  	}
> @@ -4284,7 +4292,10 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
>  		 * value from map as PTR_TO_BTF_ID, with the correct type.
>  		 */
>  		mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
> -				kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED);
> +				kptr_field->kptr.btf_id,
> +				kptr_field->type == BPF_KPTR_RCU && in_rcu_cs(env) ?

If we replaced this kptr_field->type == BPF_KPTR_RCU check with
something like btf_rcu_safe_kptr(kptr_field), corresponding to:

bool btf_rcu_safe_kptr(const struct btf_field *field)
{
	const struct btf_field_kptr *kptr = &field->kptr;

	return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id);
}

Wouldn't that allow us to avoid having to define BPF_KPTR_RCU at all?
Given that BPF_KPTR_RCU is really just an instance of BPF_KPTR_REF which
may also derive safety from RCU protection, this seems both simpler and
more thematic. Or am I missing something?

> +				PTR_MAYBE_NULL | MEM_RCU :
> +				PTR_MAYBE_NULL | PTR_UNTRUSTED);
>  		/* For mark_ptr_or_null_reg */
>  		val_reg->id = ++env->id_gen;
>  	} else if (class == BPF_STX) {
> @@ -4338,6 +4349,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
>  			switch (field->type) {
>  			case BPF_KPTR_UNREF:
>  			case BPF_KPTR_REF:
> +			case BPF_KPTR_RCU:
>  				if (src != ACCESS_DIRECT) {
>  					verbose(env, "kptr cannot be accessed indirectly by helper\n");
>  					return -EACCES;
> @@ -5139,11 +5151,10 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
>  		 * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since
>  		 * it could be null in some cases.
>  		 */
> -		if (!env->cur_state->active_rcu_lock ||
> -		    !(is_trusted_reg(reg) || is_rcu_reg(reg)))
> -			flag &= ~MEM_RCU;
> -		else
> +		if (in_rcu_cs(env) && (is_trusted_reg(reg) || is_rcu_reg(reg)))
>  			flag |= PTR_MAYBE_NULL;
> +		else
> +			flag &= ~MEM_RCU;
>  	} else if (reg->type & MEM_RCU) {
>  		/* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged
>  		 * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively.
> @@ -6187,7 +6198,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
>  		verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
>  		return -EACCES;
>  	}
> -	if (kptr_field->type != BPF_KPTR_REF) {
> +	if (kptr_field->type != BPF_KPTR_REF && kptr_field->type != BPF_KPTR_RCU) {
>  		verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off);
>  		return -EACCES;
>  	}
> @@ -9111,7 +9122,7 @@ static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env,
>  	}
>  
>  	kptr_field = btf_record_find(reg->map_ptr->record, reg->off + reg->var_off.value, BPF_KPTR);
> -	if (!kptr_field || kptr_field->type != BPF_KPTR_REF) {
> +	if (!kptr_field || (kptr_field->type != BPF_KPTR_REF && kptr_field->type != BPF_KPTR_RCU)) {
>  		verbose(env, "arg#0 no referenced kptr at map value offset=%llu\n",
>  			reg->off + reg->var_off.value);
>  		return -EINVAL;
> diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
> index 6f3d654b3339..73e5029ab5c9 100644
> --- a/net/bpf/test_run.c
> +++ b/net/bpf/test_run.c
> @@ -737,6 +737,7 @@ __bpf_kfunc void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len)
>  
>  __bpf_kfunc void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p)
>  {
> +	/* p could be NULL and p->cnt could be 0 */
>  }
>  
>  __bpf_kfunc void bpf_kfunc_call_test_destructive(void)
> @@ -784,7 +785,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail3)
>  BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_pass1)
>  BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail1)
>  BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2)
> -BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS)
> +BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU)
>  BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE)
>  BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg)
>  BTF_SET8_END(test_sk_check_kfunc_ids)
> diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c
> index e19e2a5f38cf..08f9ec18c345 100644
> --- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c
> +++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c
> @@ -281,7 +281,7 @@ int reject_kptr_get_bad_type_match(struct __sk_buff *ctx)
>  }
>  
>  SEC("?tc")
> -__failure __msg("R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_")
> +__failure __msg("R1 type=rcu_ptr_or_null_ expected=percpu_ptr_")
>  int mark_ref_as_untrusted_or_null(struct __sk_buff *ctx)
>  {
>  	struct map_value *v;
> @@ -316,7 +316,7 @@ int reject_untrusted_store_to_ref(struct __sk_buff *ctx)
>  }
>  
>  SEC("?tc")
> -__failure __msg("R2 type=untrusted_ptr_ expected=ptr_")
> +__failure __msg("R2 must be referenced")
>  int reject_untrusted_xchg(struct __sk_buff *ctx)
>  {
>  	struct prog_test_ref_kfunc *p;
> diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
> index 289ed202ec66..9a326a800e5c 100644
> --- a/tools/testing/selftests/bpf/verifier/calls.c
> +++ b/tools/testing/selftests/bpf/verifier/calls.c
> @@ -243,7 +243,7 @@
>  	},
>  	.result_unpriv = REJECT,
>  	.result = REJECT,
> -	.errstr = "R1 must be referenced",
> +	.errstr = "R1 must be",
>  },
>  {
>  	"calls: valid kfunc call: referenced arg needs refcounted PTR_TO_BTF_ID",
> diff --git a/tools/testing/selftests/bpf/verifier/map_kptr.c b/tools/testing/selftests/bpf/verifier/map_kptr.c
> index 6914904344c0..d775ccb01989 100644
> --- a/tools/testing/selftests/bpf/verifier/map_kptr.c
> +++ b/tools/testing/selftests/bpf/verifier/map_kptr.c
> @@ -336,7 +336,7 @@
>  	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
>  	.fixup_map_kptr = { 1 },
>  	.result = REJECT,
> -	.errstr = "R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_",
> +	.errstr = "R1 type=rcu_ptr_or_null_ expected=percpu_ptr_",
>  },
>  {
>  	"map_kptr: ref: reject off != 0",
> -- 
> 2.30.2
>
Tejun Heo Feb. 28, 2023, 7:39 p.m. UTC | #2
On Mon, Feb 27, 2023 at 08:01:19PM -0800, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
> 
> The life time of certain kernel structures like 'struct cgroup' is protected by RCU.
> Hence it's safe to dereference them directly from __kptr tagged pointers in bpf maps.
> The resulting pointer is MEM_RCU and can be passed to kfuncs that expect KF_RCU.
> Derefrence of other kptr-s returns PTR_UNTRUSTED.
> 
> For example:
> struct map_value {
>    struct cgroup __kptr *cgrp;
> };
> 
> SEC("tp_btf/cgroup_mkdir")
> int BPF_PROG(test_cgrp_get_ancestors, struct cgroup *cgrp_arg, const char *path)
> {
>   struct cgroup *cg, *cg2;
> 
>   cg = bpf_cgroup_acquire(cgrp_arg); // cg is PTR_TRUSTED and ref_obj_id > 0
>   bpf_kptr_xchg(&v->cgrp, cg);
> 
>   cg2 = v->cgrp; // This is new feature introduced by this patch.
>   // cg2 is PTR_MAYBE_NULL | MEM_RCU.
>   // When cg2 != NULL, it's a valid cgroup, but its percpu_ref could be zero
> 
>   bpf_cgroup_ancestor(cg2, level); // safe to do.
> }
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>

From cgroup POV:

 Acked-by: Tejun Heo <tj@kernel.org>

Thanks.
Alexei Starovoitov Feb. 28, 2023, 7:49 p.m. UTC | #3
On Tue, Feb 28, 2023 at 10:45:16AM -0600, David Vernet wrote:
> On Mon, Feb 27, 2023 at 08:01:19PM -0800, Alexei Starovoitov wrote:
> > From: Alexei Starovoitov <ast@kernel.org>
> > 
> > The life time of certain kernel structures like 'struct cgroup' is protected by RCU.
> > Hence it's safe to dereference them directly from __kptr tagged pointers in bpf maps.
> > The resulting pointer is MEM_RCU and can be passed to kfuncs that expect KF_RCU.
> > Derefrence of other kptr-s returns PTR_UNTRUSTED.
> > 
> > For example:
> > struct map_value {
> >    struct cgroup __kptr *cgrp;
> > };
> > 
> > SEC("tp_btf/cgroup_mkdir")
> > int BPF_PROG(test_cgrp_get_ancestors, struct cgroup *cgrp_arg, const char *path)
> > {
> >   struct cgroup *cg, *cg2;
> > 
> >   cg = bpf_cgroup_acquire(cgrp_arg); // cg is PTR_TRUSTED and ref_obj_id > 0
> >   bpf_kptr_xchg(&v->cgrp, cg);
> > 
> >   cg2 = v->cgrp; // This is new feature introduced by this patch.
> >   // cg2 is PTR_MAYBE_NULL | MEM_RCU.
> >   // When cg2 != NULL, it's a valid cgroup, but its percpu_ref could be zero
> > 
> >   bpf_cgroup_ancestor(cg2, level); // safe to do.
> > }
> > 
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> > ---
> >  Documentation/bpf/kfuncs.rst                  | 11 ++++---
> >  include/linux/bpf.h                           | 15 ++++++---
> >  include/linux/btf.h                           |  2 +-
> >  kernel/bpf/btf.c                              | 16 +++++++++
> >  kernel/bpf/helpers.c                          |  7 ++--
> >  kernel/bpf/syscall.c                          |  4 +++
> >  kernel/bpf/verifier.c                         | 33 ++++++++++++-------
> >  net/bpf/test_run.c                            |  3 +-
> >  .../selftests/bpf/progs/map_kptr_fail.c       |  4 +--
> >  tools/testing/selftests/bpf/verifier/calls.c  |  2 +-
> >  .../testing/selftests/bpf/verifier/map_kptr.c |  2 +-
> >  11 files changed, 69 insertions(+), 30 deletions(-)
> > 
> > diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
> > index 7d7c1144372a..49c5cb6f46e7 100644
> > --- a/Documentation/bpf/kfuncs.rst
> > +++ b/Documentation/bpf/kfuncs.rst
> > @@ -232,11 +232,12 @@ added later.
> >  2.4.8 KF_RCU flag
> >  -----------------
> >  
> > -The KF_RCU flag is used for kfuncs which have a rcu ptr as its argument.
> > -When used together with KF_ACQUIRE, it indicates the kfunc should have a
> > -single argument which must be a trusted argument or a MEM_RCU pointer.
> > -The argument may have reference count of 0 and the kfunc must take this
> > -into consideration.
> > +The KF_RCU flag is a weaker version of KF_TRUSTED_ARGS. The kfuncs marked with
> > +KF_RCU expect either PTR_TRUSTED or MEM_RCU arguments. The verifier guarantees
> > +that the objects are valid and there is no use-after-free, but the pointers
> > +maybe NULL and pointee object's reference count could have reached zero, hence
> 
> s/maybe/may be
> 
> > +kfuncs must do != NULL check and consider refcnt==0 case when accessing such
> > +arguments.
> 
> Hmmm, given that it's only necessary to check refcnt==0 if the kfunc is
> KF_ACQUIRE, wdyt about addending this paragraph with something like the
> following (note as well the addition of the KF_RET_NULL suggestion):
> 
> ...the pointers may be NULL, and the object's refcount could have
> reached zero. The kfuncs must therefore do a != NULL check, and if
> returning a KF_ACQUIRE pointer, also check that refcnt != 0. Note as
> well that a KF_ACQUIRE kfunc that is KF_RCU should **very** likely also
> be KF_RET_NULL, for both of these reasons.

Good suggestion.

> >  .. _KF_deprecated_flag:
> >  
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 520b238abd5a..d4b5faa0a777 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -178,11 +178,12 @@ enum btf_field_type {
> >  	BPF_TIMER      = (1 << 1),
> >  	BPF_KPTR_UNREF = (1 << 2),
> >  	BPF_KPTR_REF   = (1 << 3),
> > -	BPF_KPTR       = BPF_KPTR_UNREF | BPF_KPTR_REF,
> > -	BPF_LIST_HEAD  = (1 << 4),
> > -	BPF_LIST_NODE  = (1 << 5),
> > -	BPF_RB_ROOT    = (1 << 6),
> > -	BPF_RB_NODE    = (1 << 7),
> > +	BPF_KPTR_RCU   = (1 << 4), /* kernel internal. not exposed to bpf prog */
> > +	BPF_KPTR       = BPF_KPTR_UNREF | BPF_KPTR_REF | BPF_KPTR_RCU,
> > +	BPF_LIST_HEAD  = (1 << 5),
> > +	BPF_LIST_NODE  = (1 << 6),
> > +	BPF_RB_ROOT    = (1 << 7),
> > +	BPF_RB_NODE    = (1 << 8),
> >  	BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD |
> >  				 BPF_RB_NODE | BPF_RB_ROOT,
> >  };
> > @@ -284,6 +285,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type)
> >  	case BPF_KPTR_UNREF:
> >  	case BPF_KPTR_REF:
> >  		return "kptr";
> > +	case BPF_KPTR_RCU:
> > +		return "kptr_rcu";
> >  	case BPF_LIST_HEAD:
> >  		return "bpf_list_head";
> >  	case BPF_LIST_NODE:
> > @@ -307,6 +310,7 @@ static inline u32 btf_field_type_size(enum btf_field_type type)
> >  		return sizeof(struct bpf_timer);
> >  	case BPF_KPTR_UNREF:
> >  	case BPF_KPTR_REF:
> > +	case BPF_KPTR_RCU:
> >  		return sizeof(u64);
> >  	case BPF_LIST_HEAD:
> >  		return sizeof(struct bpf_list_head);
> > @@ -331,6 +335,7 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
> >  		return __alignof__(struct bpf_timer);
> >  	case BPF_KPTR_UNREF:
> >  	case BPF_KPTR_REF:
> > +	case BPF_KPTR_RCU:
> >  		return __alignof__(u64);
> >  	case BPF_LIST_HEAD:
> >  		return __alignof__(struct bpf_list_head);
> > diff --git a/include/linux/btf.h b/include/linux/btf.h
> > index 49e0fe6d8274..556b3e2e7471 100644
> > --- a/include/linux/btf.h
> > +++ b/include/linux/btf.h
> > @@ -70,7 +70,7 @@
> >  #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */
> >  #define KF_SLEEPABLE    (1 << 5) /* kfunc may sleep */
> >  #define KF_DESTRUCTIVE  (1 << 6) /* kfunc performs destructive actions */
> > -#define KF_RCU          (1 << 7) /* kfunc only takes rcu pointer arguments */
> > +#define KF_RCU          (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */
> >  
> >  /*
> >   * Tag marking a kernel function as a kfunc. This is meant to minimize the
> > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> > index 01dee7d48e6d..a44ea1f6164b 100644
> > --- a/kernel/bpf/btf.c
> > +++ b/kernel/bpf/btf.c
> > @@ -3552,6 +3552,18 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
> >  	return -EINVAL;
> >  }
> >  
> 
> Could you please add a comment here that once gcc has tag support, we
> can replace this mechanism with just checking the type's BTF tag? I like
> this a lot in the interim though -- it's a very easy way to add kfuncs
> for new RCU-protected types, and will be trivially easy to remove and
> cleanup later.

+1

> > +BTF_SET_START(rcu_protected_types)
> > +BTF_ID(struct, prog_test_ref_kfunc)
> > +BTF_ID(struct, cgroup)
> > +BTF_SET_END(rcu_protected_types)
> > +
> > +static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
> > +{
> > +	if (!btf_is_kernel(btf))
> > +		return false;
> > +	return btf_id_set_contains(&rcu_protected_types, btf_id);
> > +}
> > +
> >  static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
> >  			  struct btf_field_info *info)
> >  {
> > @@ -3615,6 +3627,10 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
> >  		field->kptr.dtor = (void *)addr;
> >  	}
> >  
> > +	if (info->type == BPF_KPTR_REF && rcu_protected_object(kernel_btf, id))
> > +		/* rcu dereference of this field will return MEM_RCU instead of PTR_UNTRUSTED */
> > +		field->type = BPF_KPTR_RCU;
> 
> Can you move this into the if block above, and update the conditional to
> just be:
> 
> if (rcu_protected_object(kernel_btf, id))

good idea.

> Also, outside the scope of your patch and subjective, but IMO it's a bit
> confusing that we're looking at info->type, when field->type already ==
> info->type. When reading the code it looks like field->type is unset
> unless we set it to BPF_KPTR_RCU, but in reality we're just overwriting
> it from being BPF_KPTR_REF. Might be worth tidying up at some point (I
> can do that in a follow-on patch once this series lands).

The caller of btf_parse_kptr() provided temporary btf_field_info array.
Since there is only one caller it's easy to see. Not sure what clean up you have in mind.

> >  	field->kptr.btf_id = id;
> >  	field->kptr.btf = kernel_btf;
> >  	field->kptr.module = mod;
> > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > index a784be6f8bac..fed74afd45d1 100644
> > --- a/kernel/bpf/helpers.c
> > +++ b/kernel/bpf/helpers.c
> > @@ -2094,11 +2094,12 @@ __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
> >  {
> >  	struct cgroup *ancestor;
> >  
> > -	if (level > cgrp->level || level < 0)
> > +	if (!cgrp || level > cgrp->level || level < 0)
> >  		return NULL;
> >  
> >  	ancestor = cgrp->ancestors[level];
> > -	cgroup_get(ancestor);
> > +	if (!cgroup_tryget(ancestor))
> > +		return NULL;
> >  	return ancestor;
> >  }
> >  
> > @@ -2183,7 +2184,7 @@ BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
> >  BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
> >  BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
> >  BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
> > -BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL)
> > +BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
> >  BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
> >  #endif
> >  BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
> > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > index e3fcdc9836a6..2e730918911c 100644
> > --- a/kernel/bpf/syscall.c
> > +++ b/kernel/bpf/syscall.c
> > @@ -539,6 +539,7 @@ void btf_record_free(struct btf_record *rec)
> >  		switch (rec->fields[i].type) {
> >  		case BPF_KPTR_UNREF:
> >  		case BPF_KPTR_REF:
> > +		case BPF_KPTR_RCU:
> >  			if (rec->fields[i].kptr.module)
> >  				module_put(rec->fields[i].kptr.module);
> >  			btf_put(rec->fields[i].kptr.btf);
> > @@ -584,6 +585,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
> >  		switch (fields[i].type) {
> >  		case BPF_KPTR_UNREF:
> >  		case BPF_KPTR_REF:
> > +		case BPF_KPTR_RCU:
> >  			btf_get(fields[i].kptr.btf);
> >  			if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
> >  				ret = -ENXIO;
> > @@ -669,6 +671,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
> >  			WRITE_ONCE(*(u64 *)field_ptr, 0);
> >  			break;
> >  		case BPF_KPTR_REF:
> > +		case BPF_KPTR_RCU:
> 
> The fact that we're adding this case is IMO a sign that we're arguably
> breaking abstractions a bit. BPF_KPTR_REF should really be the kptr type
> that holds a reference and for which we should be firing the destructor,
> and RCU protection should ideally be something we could just derive
> later in the verifier. 

I've considered keeping BPF_KPTR_REF as-is and just add a "bool is_kptr_rcu;"
to indicate it's a BPF_KPTR_REF with extra RCU properties, but they are different
enough. So it's cleaner to make them stand out.
With BPF_KPTR_RCU being different type it's impossible for other bits
in the verifier to silently accept BPF_KPTR_REF that shouldn't have RCU property.

> Not a huge problem given that this complexity is
> completely hidden from the user, but I'm not fully understanding why
> the extra complexity of BPF_KPTR_RCU is necessary. See below in another
> comment in verifier.c.
> 
> >  			field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0));
> 
> Also completely unrelated to your patch set, but we should probably only
> invoke field->kptr.dtor() if the value in field_ptr ends up being
> non-NULL after the xchg. Otherwise, all KF_RELEASE kfuncs have to check
> for NULL, even though they expect inherently trusted args. I can also do
> that in a follow-on patch.

Good point. The verifier forces bpf progs to do if (ptr != NULL) bpf_..__release(ptr);
but we still have duplicated !=NULL check inside dtor-s,
because both BPF_KPTR_RCU and BPF_KPTR_REF can be NULL here.
It would be good to clean up indeed.

> 
> >  			break;
> >  		case BPF_LIST_HEAD:
> > @@ -1058,6 +1061,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
> >  				break;
> >  			case BPF_KPTR_UNREF:
> >  			case BPF_KPTR_REF:
> > +			case BPF_KPTR_RCU:
> >  				if (map->map_type != BPF_MAP_TYPE_HASH &&
> >  				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
> >  				    map->map_type != BPF_MAP_TYPE_ARRAY &&
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index e4234266e76d..0b728ce0dde9 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -4183,7 +4183,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
> >  			       struct bpf_reg_state *reg, u32 regno)
> >  {
> >  	const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
> > -	int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED;
> > +	int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
> >  	const char *reg_name = "";
> >  
> >  	/* Only unreferenced case accepts untrusted pointers */
> > @@ -4230,12 +4230,12 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
> >  	 * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off
> >  	 * is zero. We must also ensure that btf_struct_ids_match does not walk
> >  	 * the struct to match type against first member of struct, i.e. reject
> > -	 * second case from above. Hence, when type is BPF_KPTR_REF, we set
> > +	 * second case from above. Hence, when type is BPF_KPTR_REF | BPF_KPTR_RCU, we set
> >  	 * strict mode to true for type match.
> >  	 */
> >  	if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
> >  				  kptr_field->kptr.btf, kptr_field->kptr.btf_id,
> > -				  kptr_field->type == BPF_KPTR_REF))
> > +				  kptr_field->type == BPF_KPTR_REF || kptr_field->type == BPF_KPTR_RCU))
> >  		goto bad_type;
> >  	return 0;
> >  bad_type:
> > @@ -4250,6 +4250,14 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
> >  	return -EINVAL;
> >  }
> >  
> > +/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
> > + * can dereference RCU protected pointers and result is PTR_TRUSTED.
> > + */
> > +static bool in_rcu_cs(struct bpf_verifier_env *env)
> > +{
> > +	return env->cur_state->active_rcu_lock || !env->prog->aux->sleepable;
> > +}
> > +
> >  static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
> >  				 int value_regno, int insn_idx,
> >  				 struct btf_field *kptr_field)
> > @@ -4273,7 +4281,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
> >  	/* We only allow loading referenced kptr, since it will be marked as
> >  	 * untrusted, similar to unreferenced kptr.
> >  	 */
> > -	if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) {
> > +	if (class != BPF_LDX && kptr_field->type != BPF_KPTR_UNREF) {
> >  		verbose(env, "store to referenced kptr disallowed\n");
> >  		return -EACCES;
> >  	}
> > @@ -4284,7 +4292,10 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
> >  		 * value from map as PTR_TO_BTF_ID, with the correct type.
> >  		 */
> >  		mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
> > -				kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED);
> > +				kptr_field->kptr.btf_id,
> > +				kptr_field->type == BPF_KPTR_RCU && in_rcu_cs(env) ?
> 
> If we replaced this kptr_field->type == BPF_KPTR_RCU check with
> something like btf_rcu_safe_kptr(kptr_field), corresponding to:
> 
> bool btf_rcu_safe_kptr(const struct btf_field *field)
> {
> 	const struct btf_field_kptr *kptr = &field->kptr;
> 
> 	return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id);
> }
> 
> Wouldn't that allow us to avoid having to define BPF_KPTR_RCU at all?
> Given that BPF_KPTR_RCU is really just an instance of BPF_KPTR_REF which
> may also derive safety from RCU protection, this seems both simpler and
> more thematic. Or am I missing something?

See my earlier reply. It felt cleaner to keep them separate so that
BPF_KPTR_RCU won't be accepted in places where only BPF_KPTR_REF is ok.
I'm probably overthinking.
Looking at the code again all places with BPF_KPTR_REF were appended with BPF_KPTR_RCU.
So, yeah, let's go with your suggestion above. A lot less code to maintain and
if it turns out to be an issue we can go back to separate types.
David Vernet Feb. 28, 2023, 8:05 p.m. UTC | #4
On Tue, Feb 28, 2023 at 11:49:23AM -0800, Alexei Starovoitov wrote:
> On Tue, Feb 28, 2023 at 10:45:16AM -0600, David Vernet wrote:
> > On Mon, Feb 27, 2023 at 08:01:19PM -0800, Alexei Starovoitov wrote:
> > > From: Alexei Starovoitov <ast@kernel.org>
> > > 
> > > The lifetime of certain kernel structures like 'struct cgroup' is protected by RCU.
> > > Hence it's safe to dereference them directly from __kptr tagged pointers in bpf maps.
> > > The resulting pointer is MEM_RCU and can be passed to kfuncs that expect KF_RCU.
> > > Dereference of other kptr-s returns PTR_UNTRUSTED.
> > > 
> > > For example:
> > > struct map_value {
> > >    struct cgroup __kptr *cgrp;
> > > };
> > > 
> > > SEC("tp_btf/cgroup_mkdir")
> > > int BPF_PROG(test_cgrp_get_ancestors, struct cgroup *cgrp_arg, const char *path)
> > > {
> > >   struct cgroup *cg, *cg2;
> > > 
> > >   cg = bpf_cgroup_acquire(cgrp_arg); // cg is PTR_TRUSTED and ref_obj_id > 0
> > >   bpf_kptr_xchg(&v->cgrp, cg);
> > > 
> > >   cg2 = v->cgrp; // This is new feature introduced by this patch.
> > >   // cg2 is PTR_MAYBE_NULL | MEM_RCU.
> > >   // When cg2 != NULL, it's a valid cgroup, but its percpu_ref could be zero
> > > 
> > >   bpf_cgroup_ancestor(cg2, level); // safe to do.
> > > }
> > > 
> > > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> > > ---
> > >  Documentation/bpf/kfuncs.rst                  | 11 ++++---
> > >  include/linux/bpf.h                           | 15 ++++++---
> > >  include/linux/btf.h                           |  2 +-
> > >  kernel/bpf/btf.c                              | 16 +++++++++
> > >  kernel/bpf/helpers.c                          |  7 ++--
> > >  kernel/bpf/syscall.c                          |  4 +++
> > >  kernel/bpf/verifier.c                         | 33 ++++++++++++-------
> > >  net/bpf/test_run.c                            |  3 +-
> > >  .../selftests/bpf/progs/map_kptr_fail.c       |  4 +--
> > >  tools/testing/selftests/bpf/verifier/calls.c  |  2 +-
> > >  .../testing/selftests/bpf/verifier/map_kptr.c |  2 +-
> > >  11 files changed, 69 insertions(+), 30 deletions(-)
> > > 
> > > diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
> > > index 7d7c1144372a..49c5cb6f46e7 100644
> > > --- a/Documentation/bpf/kfuncs.rst
> > > +++ b/Documentation/bpf/kfuncs.rst
> > > @@ -232,11 +232,12 @@ added later.
> > >  2.4.8 KF_RCU flag
> > >  -----------------
> > >  
> > > -The KF_RCU flag is used for kfuncs which have a rcu ptr as its argument.
> > > -When used together with KF_ACQUIRE, it indicates the kfunc should have a
> > > -single argument which must be a trusted argument or a MEM_RCU pointer.
> > > -The argument may have reference count of 0 and the kfunc must take this
> > > -into consideration.
> > > +The KF_RCU flag is a weaker version of KF_TRUSTED_ARGS. The kfuncs marked with
> > > +KF_RCU expect either PTR_TRUSTED or MEM_RCU arguments. The verifier guarantees
> > > +that the objects are valid and there is no use-after-free, but the pointers
> > > +maybe NULL and pointee object's reference count could have reached zero, hence
> > 
> > s/maybe/may be
> > 
> > > +kfuncs must do != NULL check and consider refcnt==0 case when accessing such
> > > +arguments.
> > 
> > Hmmm, given that it's only necessary to check refcnt==0 if the kfunc is
> > KF_ACQUIRE, wdyt about appending this paragraph with something like the
> > following (note as well the addition of the KF_RET_NULL suggestion):
> > 
> > ...the pointers may be NULL, and the object's refcount could have
> > reached zero. The kfuncs must therefore do a != NULL check, and if
> > returning a KF_ACQUIRE pointer, also check that refcnt != 0. Note as
> > well that a KF_ACQUIRE kfunc that is KF_RCU should **very** likely also
> > be KF_RET_NULL, for both of these reasons.
> 
> Good suggestion.
> 
> > >  .. _KF_deprecated_flag:
> > >  
> > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > index 520b238abd5a..d4b5faa0a777 100644
> > > --- a/include/linux/bpf.h
> > > +++ b/include/linux/bpf.h
> > > @@ -178,11 +178,12 @@ enum btf_field_type {
> > >  	BPF_TIMER      = (1 << 1),
> > >  	BPF_KPTR_UNREF = (1 << 2),
> > >  	BPF_KPTR_REF   = (1 << 3),
> > > -	BPF_KPTR       = BPF_KPTR_UNREF | BPF_KPTR_REF,
> > > -	BPF_LIST_HEAD  = (1 << 4),
> > > -	BPF_LIST_NODE  = (1 << 5),
> > > -	BPF_RB_ROOT    = (1 << 6),
> > > -	BPF_RB_NODE    = (1 << 7),
> > > +	BPF_KPTR_RCU   = (1 << 4), /* kernel internal. not exposed to bpf prog */
> > > +	BPF_KPTR       = BPF_KPTR_UNREF | BPF_KPTR_REF | BPF_KPTR_RCU,
> > > +	BPF_LIST_HEAD  = (1 << 5),
> > > +	BPF_LIST_NODE  = (1 << 6),
> > > +	BPF_RB_ROOT    = (1 << 7),
> > > +	BPF_RB_NODE    = (1 << 8),
> > >  	BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD |
> > >  				 BPF_RB_NODE | BPF_RB_ROOT,
> > >  };
> > > @@ -284,6 +285,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type)
> > >  	case BPF_KPTR_UNREF:
> > >  	case BPF_KPTR_REF:
> > >  		return "kptr";
> > > +	case BPF_KPTR_RCU:
> > > +		return "kptr_rcu";
> > >  	case BPF_LIST_HEAD:
> > >  		return "bpf_list_head";
> > >  	case BPF_LIST_NODE:
> > > @@ -307,6 +310,7 @@ static inline u32 btf_field_type_size(enum btf_field_type type)
> > >  		return sizeof(struct bpf_timer);
> > >  	case BPF_KPTR_UNREF:
> > >  	case BPF_KPTR_REF:
> > > +	case BPF_KPTR_RCU:
> > >  		return sizeof(u64);
> > >  	case BPF_LIST_HEAD:
> > >  		return sizeof(struct bpf_list_head);
> > > @@ -331,6 +335,7 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
> > >  		return __alignof__(struct bpf_timer);
> > >  	case BPF_KPTR_UNREF:
> > >  	case BPF_KPTR_REF:
> > > +	case BPF_KPTR_RCU:
> > >  		return __alignof__(u64);
> > >  	case BPF_LIST_HEAD:
> > >  		return __alignof__(struct bpf_list_head);
> > > diff --git a/include/linux/btf.h b/include/linux/btf.h
> > > index 49e0fe6d8274..556b3e2e7471 100644
> > > --- a/include/linux/btf.h
> > > +++ b/include/linux/btf.h
> > > @@ -70,7 +70,7 @@
> > >  #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */
> > >  #define KF_SLEEPABLE    (1 << 5) /* kfunc may sleep */
> > >  #define KF_DESTRUCTIVE  (1 << 6) /* kfunc performs destructive actions */
> > > -#define KF_RCU          (1 << 7) /* kfunc only takes rcu pointer arguments */
> > > +#define KF_RCU          (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */
> > >  
> > >  /*
> > >   * Tag marking a kernel function as a kfunc. This is meant to minimize the
> > > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> > > index 01dee7d48e6d..a44ea1f6164b 100644
> > > --- a/kernel/bpf/btf.c
> > > +++ b/kernel/bpf/btf.c
> > > @@ -3552,6 +3552,18 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
> > >  	return -EINVAL;
> > >  }
> > >  
> > 
> > Could you please add a comment here that once gcc has tag support, we
> > can replace this mechanism with just checking the type's BTF tag? I like
> > this a lot in the interim though -- it's a very easy way to add kfuncs
> > for new RCU-protected types, and will be trivially easy to remove and
> > cleanup later.
> 
> +1
> 
> > > +BTF_SET_START(rcu_protected_types)
> > > +BTF_ID(struct, prog_test_ref_kfunc)
> > > +BTF_ID(struct, cgroup)
> > > +BTF_SET_END(rcu_protected_types)
> > > +
> > > +static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
> > > +{
> > > +	if (!btf_is_kernel(btf))
> > > +		return false;
> > > +	return btf_id_set_contains(&rcu_protected_types, btf_id);
> > > +}
> > > +
> > >  static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
> > >  			  struct btf_field_info *info)
> > >  {
> > > @@ -3615,6 +3627,10 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
> > >  		field->kptr.dtor = (void *)addr;
> > >  	}
> > >  
> > > +	if (info->type == BPF_KPTR_REF && rcu_protected_object(kernel_btf, id))
> > > +		/* rcu dereference of this field will return MEM_RCU instead of PTR_UNTRUSTED */
> > > +		field->type = BPF_KPTR_RCU;
> > 
> > Can you move this into the if block above, and update the conditional to
> > just be:
> > 
> > if (rcu_protected_object(kernel_btf, id))
> 
> good idea.
> 
> > Also, outside the scope of your patch and subjective, but IMO it's a bit
> > confusing that we're looking at info->type, when field->type already ==
> > info->type. When reading the code it looks like field->type is unset
> > unless we set it to BPF_KPTR_RCU, but in reality we're just overwriting
> > it from being BPF_KPTR_REF. Might be worth tidying up at some point (I
> > can do that in a follow-on patch once this series lands).
> 
> The caller of btf_parse_kptr() provided temporary btf_field_info array.
> Since there is only one caller it's easy to see. Not sure what clean up you have in mind.

I was thinking of checking field->type instead of info->type in the if
statements to make it more clear that field->type is already set. This
is really just a subjective point about readability. Probably wasn't
worth the additional noise on the patch.

> > >  	field->kptr.btf_id = id;
> > >  	field->kptr.btf = kernel_btf;
> > >  	field->kptr.module = mod;
> > > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > > index a784be6f8bac..fed74afd45d1 100644
> > > --- a/kernel/bpf/helpers.c
> > > +++ b/kernel/bpf/helpers.c
> > > @@ -2094,11 +2094,12 @@ __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
> > >  {
> > >  	struct cgroup *ancestor;
> > >  
> > > -	if (level > cgrp->level || level < 0)
> > > +	if (!cgrp || level > cgrp->level || level < 0)
> > >  		return NULL;
> > >  
> > >  	ancestor = cgrp->ancestors[level];
> > > -	cgroup_get(ancestor);
> > > +	if (!cgroup_tryget(ancestor))
> > > +		return NULL;
> > >  	return ancestor;
> > >  }
> > >  
> > > @@ -2183,7 +2184,7 @@ BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
> > >  BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
> > >  BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
> > >  BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
> > > -BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL)
> > > +BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
> > >  BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
> > >  #endif
> > >  BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
> > > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > > index e3fcdc9836a6..2e730918911c 100644
> > > --- a/kernel/bpf/syscall.c
> > > +++ b/kernel/bpf/syscall.c
> > > @@ -539,6 +539,7 @@ void btf_record_free(struct btf_record *rec)
> > >  		switch (rec->fields[i].type) {
> > >  		case BPF_KPTR_UNREF:
> > >  		case BPF_KPTR_REF:
> > > +		case BPF_KPTR_RCU:
> > >  			if (rec->fields[i].kptr.module)
> > >  				module_put(rec->fields[i].kptr.module);
> > >  			btf_put(rec->fields[i].kptr.btf);
> > > @@ -584,6 +585,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
> > >  		switch (fields[i].type) {
> > >  		case BPF_KPTR_UNREF:
> > >  		case BPF_KPTR_REF:
> > > +		case BPF_KPTR_RCU:
> > >  			btf_get(fields[i].kptr.btf);
> > >  			if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
> > >  				ret = -ENXIO;
> > > @@ -669,6 +671,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
> > >  			WRITE_ONCE(*(u64 *)field_ptr, 0);
> > >  			break;
> > >  		case BPF_KPTR_REF:
> > > +		case BPF_KPTR_RCU:
> > 
> > The fact that we're adding this case is IMO a sign that we're arguably
> > breaking abstractions a bit. BPF_KPTR_REF should really be the kptr type
> > that holds a reference and for which we should be firing the destructor,
> > and RCU protection should ideally be something we could just derive
> > later in the verifier. 
> 
> I've considered keeping BPF_KPTR_REF as-is and just add a "bool is_kptr_rcu;"
> to indicate it's a BPF_KPTR_REF with extra RCU properties, but they are different
> enough. So it's cleaner to make them stand out.
> With BPF_KPTR_RCU being different type it's impossible for other bits
> in the verifier to silently accept BPF_KPTR_REF that shouldn't have RCU property.

Makes sense, will reply below.

> > Not a huge problem given that this complexity is
> > completely hidden from the user, but I'm not fully understanding why
> > the extra complexity of BPF_KPTR_RCU is necessary. See below in another
> > comment in verifier.c.
> > 
> > >  			field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0));
> > 
> > Also completely unrelated to your patch set, but we should probably only
> > invoke field->kptr.dtor() if the value in field_ptr ends up being
> > non-NULL after the xchg. Otherwise, all KF_RELEASE kfuncs have to check
> > for NULL, even though they expect inherently trusted args. I can also do
> > that in a follow-on patch.
> 
> Good point. The verifier forces bpf progs to do if (ptr != NULL) bpf_..__release(ptr);
> but we still have duplicated !=NULL check inside dtor-s,
> because both BPF_KPTR_RCU and BPF_KPTR_REF can be NULL here.
> It would be good to clean up indeed.

Great, I'll send a patch after this set lands.

> > 
> > >  			break;
> > >  		case BPF_LIST_HEAD:
> > > @@ -1058,6 +1061,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
> > >  				break;
> > >  			case BPF_KPTR_UNREF:
> > >  			case BPF_KPTR_REF:
> > > +			case BPF_KPTR_RCU:
> > >  				if (map->map_type != BPF_MAP_TYPE_HASH &&
> > >  				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
> > >  				    map->map_type != BPF_MAP_TYPE_ARRAY &&
> > > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > > index e4234266e76d..0b728ce0dde9 100644
> > > --- a/kernel/bpf/verifier.c
> > > +++ b/kernel/bpf/verifier.c
> > > @@ -4183,7 +4183,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
> > >  			       struct bpf_reg_state *reg, u32 regno)
> > >  {
> > >  	const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
> > > -	int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED;
> > > +	int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
> > >  	const char *reg_name = "";
> > >  
> > >  	/* Only unreferenced case accepts untrusted pointers */
> > > @@ -4230,12 +4230,12 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
> > >  	 * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off
> > >  	 * is zero. We must also ensure that btf_struct_ids_match does not walk
> > >  	 * the struct to match type against first member of struct, i.e. reject
> > > -	 * second case from above. Hence, when type is BPF_KPTR_REF, we set
> > > +	 * second case from above. Hence, when type is BPF_KPTR_REF | BPF_KPTR_RCU, we set
> > >  	 * strict mode to true for type match.
> > >  	 */
> > >  	if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
> > >  				  kptr_field->kptr.btf, kptr_field->kptr.btf_id,
> > > -				  kptr_field->type == BPF_KPTR_REF))
> > > +				  kptr_field->type == BPF_KPTR_REF || kptr_field->type == BPF_KPTR_RCU))
> > >  		goto bad_type;
> > >  	return 0;
> > >  bad_type:
> > > @@ -4250,6 +4250,14 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
> > >  	return -EINVAL;
> > >  }
> > >  
> > > +/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
> > > + * can dereference RCU protected pointers and result is PTR_TRUSTED.
> > > + */
> > > +static bool in_rcu_cs(struct bpf_verifier_env *env)
> > > +{
> > > +	return env->cur_state->active_rcu_lock || !env->prog->aux->sleepable;
> > > +}
> > > +
> > >  static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
> > >  				 int value_regno, int insn_idx,
> > >  				 struct btf_field *kptr_field)
> > > @@ -4273,7 +4281,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
> > >  	/* We only allow loading referenced kptr, since it will be marked as
> > >  	 * untrusted, similar to unreferenced kptr.
> > >  	 */
> > > -	if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) {
> > > +	if (class != BPF_LDX && kptr_field->type != BPF_KPTR_UNREF) {
> > >  		verbose(env, "store to referenced kptr disallowed\n");
> > >  		return -EACCES;
> > >  	}
> > > @@ -4284,7 +4292,10 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
> > >  		 * value from map as PTR_TO_BTF_ID, with the correct type.
> > >  		 */
> > >  		mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
> > > -				kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED);
> > > +				kptr_field->kptr.btf_id,
> > > +				kptr_field->type == BPF_KPTR_RCU && in_rcu_cs(env) ?
> > 
> > If we replaced this kptr_field->type == BPF_KPTR_RCU check with
> > something like btf_rcu_safe_kptr(kptr_field), corresponding to:
> > 
> > bool btf_rcu_safe_kptr(const struct btf_field *field)
> > {
> > 	const struct btf_field_kptr *kptr = &field->kptr;
> > 
> > 	return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id);
> > }
> > 
> > Wouldn't that allow us to avoid having to define BPF_KPTR_RCU at all?
> > Given that BPF_KPTR_RCU is really just an instance of BPF_KPTR_REF which
> > may also derive safety from RCU protection, this seems both simpler and
> > more thematic. Or am I missing something?
> 
> See my earlier reply. It felt cleaner to keep them separate so that
> BPF_KPTR_RCU won't be accepted in places where only BPF_KPTR_REF is ok.
> I'm probably overthinking.
> Looking at the code again all places with BPF_KPTR_REF were appended with BPF_KPTR_RCU.
> So, yeah, let's go with your suggestion above. A lot less code to maintain and

Yep, that's what I was thinking. The fact that there are so many places
where we're appending BPF_KPTR_RCU to BPF_KPTR_REF suggests to me
they're more similar than they are different, so either a bool
rcu_protected field as you suggested above, or a dynamic check as I
suggested, seems like the preferable tradeoff.

> if it turns out to be an issue we can go back to separate types.

Sounds good to me.
diff mbox series

Patch

diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
index 7d7c1144372a..49c5cb6f46e7 100644
--- a/Documentation/bpf/kfuncs.rst
+++ b/Documentation/bpf/kfuncs.rst
@@ -232,11 +232,12 @@  added later.
 2.4.8 KF_RCU flag
 -----------------
 
-The KF_RCU flag is used for kfuncs which have a rcu ptr as its argument.
-When used together with KF_ACQUIRE, it indicates the kfunc should have a
-single argument which must be a trusted argument or a MEM_RCU pointer.
-The argument may have reference count of 0 and the kfunc must take this
-into consideration.
+The KF_RCU flag is a weaker version of KF_TRUSTED_ARGS. The kfuncs marked with
+KF_RCU expect either PTR_TRUSTED or MEM_RCU arguments. The verifier guarantees
+that the objects are valid and there is no use-after-free, but the pointers
+maybe NULL and pointee object's reference count could have reached zero, hence
+kfuncs must do != NULL check and consider refcnt==0 case when accessing such
+arguments.
 
 .. _KF_deprecated_flag:
 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 520b238abd5a..d4b5faa0a777 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -178,11 +178,12 @@  enum btf_field_type {
 	BPF_TIMER      = (1 << 1),
 	BPF_KPTR_UNREF = (1 << 2),
 	BPF_KPTR_REF   = (1 << 3),
-	BPF_KPTR       = BPF_KPTR_UNREF | BPF_KPTR_REF,
-	BPF_LIST_HEAD  = (1 << 4),
-	BPF_LIST_NODE  = (1 << 5),
-	BPF_RB_ROOT    = (1 << 6),
-	BPF_RB_NODE    = (1 << 7),
+	BPF_KPTR_RCU   = (1 << 4), /* kernel internal. not exposed to bpf prog */
+	BPF_KPTR       = BPF_KPTR_UNREF | BPF_KPTR_REF | BPF_KPTR_RCU,
+	BPF_LIST_HEAD  = (1 << 5),
+	BPF_LIST_NODE  = (1 << 6),
+	BPF_RB_ROOT    = (1 << 7),
+	BPF_RB_NODE    = (1 << 8),
 	BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD |
 				 BPF_RB_NODE | BPF_RB_ROOT,
 };
@@ -284,6 +285,8 @@  static inline const char *btf_field_type_name(enum btf_field_type type)
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
 		return "kptr";
+	case BPF_KPTR_RCU:
+		return "kptr_rcu";
 	case BPF_LIST_HEAD:
 		return "bpf_list_head";
 	case BPF_LIST_NODE:
@@ -307,6 +310,7 @@  static inline u32 btf_field_type_size(enum btf_field_type type)
 		return sizeof(struct bpf_timer);
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
+	case BPF_KPTR_RCU:
 		return sizeof(u64);
 	case BPF_LIST_HEAD:
 		return sizeof(struct bpf_list_head);
@@ -331,6 +335,7 @@  static inline u32 btf_field_type_align(enum btf_field_type type)
 		return __alignof__(struct bpf_timer);
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
+	case BPF_KPTR_RCU:
 		return __alignof__(u64);
 	case BPF_LIST_HEAD:
 		return __alignof__(struct bpf_list_head);
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 49e0fe6d8274..556b3e2e7471 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -70,7 +70,7 @@ 
 #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */
 #define KF_SLEEPABLE    (1 << 5) /* kfunc may sleep */
 #define KF_DESTRUCTIVE  (1 << 6) /* kfunc performs destructive actions */
-#define KF_RCU          (1 << 7) /* kfunc only takes rcu pointer arguments */
+#define KF_RCU          (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */
 
 /*
  * Tag marking a kernel function as a kfunc. This is meant to minimize the
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 01dee7d48e6d..a44ea1f6164b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3552,6 +3552,18 @@  static int btf_find_field(const struct btf *btf, const struct btf_type *t,
 	return -EINVAL;
 }
 
+BTF_SET_START(rcu_protected_types)
+BTF_ID(struct, prog_test_ref_kfunc)
+BTF_ID(struct, cgroup)
+BTF_SET_END(rcu_protected_types)
+
+static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
+{
+	if (!btf_is_kernel(btf))
+		return false;
+	return btf_id_set_contains(&rcu_protected_types, btf_id);
+}
+
 static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 			  struct btf_field_info *info)
 {
@@ -3615,6 +3627,10 @@  static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 		field->kptr.dtor = (void *)addr;
 	}
 
+	if (info->type == BPF_KPTR_REF && rcu_protected_object(kernel_btf, id))
+		/* rcu dereference of this field will return MEM_RCU instead of PTR_UNTRUSTED */
+		field->type = BPF_KPTR_RCU;
+
 	field->kptr.btf_id = id;
 	field->kptr.btf = kernel_btf;
 	field->kptr.module = mod;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a784be6f8bac..fed74afd45d1 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2094,11 +2094,12 @@  __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
 {
 	struct cgroup *ancestor;
 
-	if (level > cgrp->level || level < 0)
+	if (!cgrp || level > cgrp->level || level < 0)
 		return NULL;
 
 	ancestor = cgrp->ancestors[level];
-	cgroup_get(ancestor);
+	if (!cgroup_tryget(ancestor))
+		return NULL;
 	return ancestor;
 }
 
@@ -2183,7 +2184,7 @@  BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
 #endif
 BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e3fcdc9836a6..2e730918911c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -539,6 +539,7 @@  void btf_record_free(struct btf_record *rec)
 		switch (rec->fields[i].type) {
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
+		case BPF_KPTR_RCU:
 			if (rec->fields[i].kptr.module)
 				module_put(rec->fields[i].kptr.module);
 			btf_put(rec->fields[i].kptr.btf);
@@ -584,6 +585,7 @@  struct btf_record *btf_record_dup(const struct btf_record *rec)
 		switch (fields[i].type) {
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
+		case BPF_KPTR_RCU:
 			btf_get(fields[i].kptr.btf);
 			if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
 				ret = -ENXIO;
@@ -669,6 +671,7 @@  void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 			WRITE_ONCE(*(u64 *)field_ptr, 0);
 			break;
 		case BPF_KPTR_REF:
+		case BPF_KPTR_RCU:
 			field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0));
 			break;
 		case BPF_LIST_HEAD:
@@ -1058,6 +1061,7 @@  static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 				break;
 			case BPF_KPTR_UNREF:
 			case BPF_KPTR_REF:
+			case BPF_KPTR_RCU:
 				if (map->map_type != BPF_MAP_TYPE_HASH &&
 				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
 				    map->map_type != BPF_MAP_TYPE_ARRAY &&
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e4234266e76d..0b728ce0dde9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4183,7 +4183,7 @@  static int map_kptr_match_type(struct bpf_verifier_env *env,
 			       struct bpf_reg_state *reg, u32 regno)
 {
 	const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
-	int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED;
+	int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
 	const char *reg_name = "";
 
 	/* Only unreferenced case accepts untrusted pointers */
@@ -4230,12 +4230,12 @@  static int map_kptr_match_type(struct bpf_verifier_env *env,
 	 * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off
 	 * is zero. We must also ensure that btf_struct_ids_match does not walk
 	 * the struct to match type against first member of struct, i.e. reject
-	 * second case from above. Hence, when type is BPF_KPTR_REF, we set
+	 * second case from above. Hence, when type is BPF_KPTR_REF or BPF_KPTR_RCU, we set
 	 * strict mode to true for type match.
 	 */
 	if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
 				  kptr_field->kptr.btf, kptr_field->kptr.btf_id,
-				  kptr_field->type == BPF_KPTR_REF))
+				  kptr_field->type == BPF_KPTR_REF || kptr_field->type == BPF_KPTR_RCU))
 		goto bad_type;
 	return 0;
 bad_type:
@@ -4250,6 +4250,14 @@  static int map_kptr_match_type(struct bpf_verifier_env *env,
 	return -EINVAL;
 }
 
+/* Non-sleepable programs, and sleepable programs with an explicit bpf_rcu_read_lock(),
+ * can dereference RCU protected pointers and the result is PTR_TRUSTED.
+ */
+static bool in_rcu_cs(struct bpf_verifier_env *env)
+{
+	return env->cur_state->active_rcu_lock || !env->prog->aux->sleepable;
+}
+
 static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
 				 int value_regno, int insn_idx,
 				 struct btf_field *kptr_field)
@@ -4273,7 +4281,7 @@  static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
 	/* We only allow loading referenced kptr, since it will be marked as
 	 * untrusted, similar to unreferenced kptr.
 	 */
-	if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) {
+	if (class != BPF_LDX && kptr_field->type != BPF_KPTR_UNREF) {
 		verbose(env, "store to referenced kptr disallowed\n");
 		return -EACCES;
 	}
@@ -4284,7 +4292,10 @@  static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
 		 * value from map as PTR_TO_BTF_ID, with the correct type.
 		 */
 		mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
-				kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED);
+				kptr_field->kptr.btf_id,
+				kptr_field->type == BPF_KPTR_RCU && in_rcu_cs(env) ?
+				PTR_MAYBE_NULL | MEM_RCU :
+				PTR_MAYBE_NULL | PTR_UNTRUSTED);
 		/* For mark_ptr_or_null_reg */
 		val_reg->id = ++env->id_gen;
 	} else if (class == BPF_STX) {
@@ -4338,6 +4349,7 @@  static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 			switch (field->type) {
 			case BPF_KPTR_UNREF:
 			case BPF_KPTR_REF:
+			case BPF_KPTR_RCU:
 				if (src != ACCESS_DIRECT) {
 					verbose(env, "kptr cannot be accessed indirectly by helper\n");
 					return -EACCES;
@@ -5139,11 +5151,10 @@  static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 		 * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since
 		 * it could be null in some cases.
 		 */
-		if (!env->cur_state->active_rcu_lock ||
-		    !(is_trusted_reg(reg) || is_rcu_reg(reg)))
-			flag &= ~MEM_RCU;
-		else
+		if (in_rcu_cs(env) && (is_trusted_reg(reg) || is_rcu_reg(reg)))
 			flag |= PTR_MAYBE_NULL;
+		else
+			flag &= ~MEM_RCU;
 	} else if (reg->type & MEM_RCU) {
 		/* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged
 		 * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively.
@@ -6187,7 +6198,7 @@  static int process_kptr_func(struct bpf_verifier_env *env, int regno,
 		verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
 		return -EACCES;
 	}
-	if (kptr_field->type != BPF_KPTR_REF) {
+	if (kptr_field->type != BPF_KPTR_REF && kptr_field->type != BPF_KPTR_RCU) {
 		verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off);
 		return -EACCES;
 	}
@@ -9111,7 +9122,7 @@  static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env,
 	}
 
 	kptr_field = btf_record_find(reg->map_ptr->record, reg->off + reg->var_off.value, BPF_KPTR);
-	if (!kptr_field || kptr_field->type != BPF_KPTR_REF) {
+	if (!kptr_field || (kptr_field->type != BPF_KPTR_REF && kptr_field->type != BPF_KPTR_RCU)) {
 		verbose(env, "arg#0 no referenced kptr at map value offset=%llu\n",
 			reg->off + reg->var_off.value);
 		return -EINVAL;
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 6f3d654b3339..73e5029ab5c9 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -737,6 +737,7 @@  __bpf_kfunc void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len)
 
 __bpf_kfunc void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p)
 {
+	/* p could be NULL and p->cnt could be 0 */
 }
 
 __bpf_kfunc void bpf_kfunc_call_test_destructive(void)
@@ -784,7 +785,7 @@  BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail3)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_pass1)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail1)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg)
 BTF_SET8_END(test_sk_check_kfunc_ids)
diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c
index e19e2a5f38cf..08f9ec18c345 100644
--- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c
@@ -281,7 +281,7 @@  int reject_kptr_get_bad_type_match(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
-__failure __msg("R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_")
+__failure __msg("R1 type=rcu_ptr_or_null_ expected=percpu_ptr_")
 int mark_ref_as_untrusted_or_null(struct __sk_buff *ctx)
 {
 	struct map_value *v;
@@ -316,7 +316,7 @@  int reject_untrusted_store_to_ref(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
-__failure __msg("R2 type=untrusted_ptr_ expected=ptr_")
+__failure __msg("R2 must be referenced")
 int reject_untrusted_xchg(struct __sk_buff *ctx)
 {
 	struct prog_test_ref_kfunc *p;
diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
index 289ed202ec66..9a326a800e5c 100644
--- a/tools/testing/selftests/bpf/verifier/calls.c
+++ b/tools/testing/selftests/bpf/verifier/calls.c
@@ -243,7 +243,7 @@ 
 	},
 	.result_unpriv = REJECT,
 	.result = REJECT,
-	.errstr = "R1 must be referenced",
+	.errstr = "R1 must be",
 },
 {
 	"calls: valid kfunc call: referenced arg needs refcounted PTR_TO_BTF_ID",
diff --git a/tools/testing/selftests/bpf/verifier/map_kptr.c b/tools/testing/selftests/bpf/verifier/map_kptr.c
index 6914904344c0..d775ccb01989 100644
--- a/tools/testing/selftests/bpf/verifier/map_kptr.c
+++ b/tools/testing/selftests/bpf/verifier/map_kptr.c
@@ -336,7 +336,7 @@ 
 	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	.fixup_map_kptr = { 1 },
 	.result = REJECT,
-	.errstr = "R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_",
+	.errstr = "R1 type=rcu_ptr_or_null_ expected=percpu_ptr_",
 },
 {
 	"map_kptr: ref: reject off != 0",