[bpf-next,v2,5/8] bpf: Add bpf_rcu_read_lock() verifier support

Message ID 20221108074114.264485-1-yhs@fb.com (mailing list archive)
State Superseded
Delegated to: BPF
Series bpf: Add bpf_rcu_read_lock() support

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-1 pending Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-5 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-6 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-7 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-11 fail Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 fail Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs_no_alu32_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-PR fail PR summary
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1364 this patch: 1364
netdev/cc_maintainers warning 7 maintainers not CCed: sdf@google.com kpsingh@kernel.org haoluo@google.com jolsa@kernel.org martin.lau@linux.dev song@kernel.org john.fastabend@gmail.com
netdev/build_clang success Errors and warnings before: 157 this patch: 157
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1354 this patch: 1354
netdev/checkpatch warning WARNING: line length exceeds 80 columns (reported lengths: 82, 83, 84, 85, 87, 88, 89, 91, 92, 95, 96, 97, 98, 101)
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Yonghong Song Nov. 8, 2022, 7:41 a.m. UTC
To simplify the design and match common practice, nested
bpf_rcu_read_lock() is not allowed. During verification,
each paired bpf_rcu_read_lock()/unlock() has a unique
region id, starting from 1. Each rcu ptr register also
remembers the region id in which the ptr reg was initialized.
The following is a simple example to illustrate the
rcu lock regions and the usage of rcu pointers.

     ...                    <=== rcu lock region 0
     bpf_rcu_read_lock()    <=== rcu lock region 1
     rcu_ptr1 = ...         <=== rcu_ptr1 with region 1
     ... using rcu_ptr1 ...
     bpf_rcu_read_unlock()
     ...                    <=== rcu lock region -1
     bpf_rcu_read_lock()    <=== rcu lock region 2
     rcu_ptr2 = ...         <=== rcu_ptr2 with region 2
     ... using rcu_ptr2 ...
     ... using rcu_ptr1 ... <=== wrong, region 1 rcu_ptr in region 2
     bpf_rcu_read_unlock()

Outside any rcu lock region, the rcu lock region id is 0 or the
negative of the previous valid rcu lock region id, so the next valid
rcu lock region id can be computed easily.
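
The id transitions can be traced as follows (illustration only; this
mirrors the region id update done in btf_check_func_arg_match() in
this patch):

     int id = 0;    /* region 0: before any bpf_rcu_read_lock()    */
     id = -id + 1;  /* bpf_rcu_read_lock():   enter region 1       */
     id = -id;      /* bpf_rcu_read_unlock(): leave, id becomes -1 */
     id = -id + 1;  /* bpf_rcu_read_lock():   enter region 2       */
     id = -id;      /* bpf_rcu_read_unlock(): leave, id becomes -2 */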

Note that rcu protection is not needed for non-sleepable programs. But
it is supported to make cross sleepable/non-sleepable development
easier. For a non-sleepable program, the following insns can be inside
the rcu lock region:
  - any non-call insns except BPF_ABS/BPF_IND
  - non-sleepable helpers or kfuncs
Also, the bpf_*_storage_get() helpers' 5th hidden argument (the memory
allocation flag) must be GFP_ATOMIC.
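
To illustrate the hidden argument (this mirrors the do_misc_fixups()
rewrite in the verifier.c hunk below):

  /* A bpf_task_storage_get(&map, task, 0, 0) call is patched to set a
   * 5th, hidden gfp flag in R5 right before the call insn:
   *   r5 = GFP_ATOMIC   non-sleepable prog, or the call sits inside
   *                     an rcu lock region of a sleepable prog
   *   r5 = GFP_KERNEL   sleepable prog, outside any rcu lock region
   *   call bpf_task_storage_get
   */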

If a pointer (PTR_TO_BTF_ID) is marked as rcu, then any use of
this pointer, and the load which gets this pointer, need to be
protected by bpf_rcu_read_lock(). The following shows a couple
of examples:
  struct task_struct {
        ...
        struct task_struct __rcu        *real_parent;
        struct css_set __rcu            *cgroups;
        ...
  };
  struct css_set {
        ...
        struct cgroup *dfl_cgrp;
        ...
  };
  ...
  task = bpf_get_current_task_btf();
  cgroups = task->cgroups;
  dfl_cgroup = cgroups->dfl_cgrp;
  ... using dfl_cgroup ...

bpf_rcu_read_lock()/unlock() should be added as below to
avoid verification failures.
  task = bpf_get_current_task_btf();
  bpf_rcu_read_lock();
  cgroups = task->cgroups;
  dfl_cgroup = cgroups->dfl_cgrp;
  bpf_rcu_read_unlock();
  ... using dfl_cgroup ...

The following is another example for task->real_parent.
  task = bpf_get_current_task_btf();
  bpf_rcu_read_lock();
  real_parent = task->real_parent;
  ... bpf_task_storage_get(&map, real_parent, 0, 0);
  bpf_rcu_read_unlock();
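
Putting it together, here is a minimal end-to-end sketch (not part of
this patch; the map definition, attach point and program name are
invented for illustration, and the kfuncs are declared with the usual
__ksym convention):

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  /* kfuncs added by this series */
  void bpf_rcu_read_lock(void) __ksym;
  void bpf_rcu_read_unlock(void) __ksym;

  struct {
          __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
          __uint(map_flags, BPF_F_NO_PREALLOC);
          __type(key, int);
          __type(value, long);
  } map SEC(".maps");

  char _license[] SEC("license") = "GPL";

  /* sleepable fentry program; x86-64 attach point, illustrative */
  SEC("fentry.s/__x64_sys_getpgid")
  int BPF_PROG(on_getpgid)
  {
          struct task_struct *task = bpf_get_current_task_btf();
          struct task_struct *real_parent;

          bpf_rcu_read_lock();
          /* load of an __rcu-tagged field: must be inside the region */
          real_parent = task->real_parent;
          bpf_task_storage_get(&map, real_parent, 0, 0);
          bpf_rcu_read_unlock();
          return 0;
  }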

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h          |  1 +
 include/linux/bpf_verifier.h |  7 +++
 kernel/bpf/btf.c             | 32 ++++++++++++-
 kernel/bpf/verifier.c        | 92 +++++++++++++++++++++++++++++++-----
 4 files changed, 120 insertions(+), 12 deletions(-)

Comments

Kumar Kartikeya Dwivedi Nov. 8, 2022, 5:04 p.m. UTC | #1
On Tue, Nov 08, 2022 at 01:11:14PM IST, Yonghong Song wrote:
> To simplify the design and match common practice, nested
> bpf_rcu_read_lock() is not allowed. During verification,
> each paired bpf_rcu_read_lock()/unlock() has a unique
> region id, starting from 1. Each rcu ptr register also
> remembers the region id in which the ptr reg was initialized.
> The following is a simple example to illustrate the
> rcu lock regions and the usage of rcu pointers.
>
>      ...                    <=== rcu lock region 0
>      bpf_rcu_read_lock()    <=== rcu lock region 1
>      rcu_ptr1 = ...         <=== rcu_ptr1 with region 1
>      ... using rcu_ptr1 ...
>      bpf_rcu_read_unlock()
>      ...                    <=== rcu lock region -1
>      bpf_rcu_read_lock()    <=== rcu lock region 2
>      rcu_ptr2 = ...         <=== rcu_ptr2 with region 2
>      ... using rcu_ptr2 ...
>      ... using rcu_ptr1 ... <=== wrong, region 1 rcu_ptr in region 2
>      bpf_rcu_read_unlock()
>
> Outside any rcu lock region, the rcu lock region id is 0 or the
> negative of the previous valid rcu lock region id, so the next valid
> rcu lock region id can be computed easily.
>
> [...]
>
> @@ -6317,7 +6322,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
>  {
>  	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
>  	bool rel = false, kptr_get = false, trusted_args = false;
> -	bool sleepable = false;
> +	bool sleepable = false, rcu_lock = false, rcu_unlock = false;
>  	struct bpf_verifier_log *log = &env->log;
>  	u32 i, nargs, ref_id, ref_obj_id = 0;
>  	bool is_kfunc = btf_is_kernel(btf);
> @@ -6356,6 +6361,31 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
>  		kptr_get = kfunc_meta->flags & KF_KPTR_GET;
>  		trusted_args = kfunc_meta->flags & KF_TRUSTED_ARGS;
>  		sleepable = kfunc_meta->flags & KF_SLEEPABLE;
> +		rcu_lock = kfunc_meta->flags & KF_RCU_LOCK;
> +		rcu_unlock = kfunc_meta->flags & KF_RCU_UNLOCK;
> +	}
> +
> +	/* checking rcu read lock/unlock */
> +	if (env->cur_state->active_rcu_lock > 0) {
> +		if (rcu_lock) {
> +			bpf_log(log, "nested rcu read lock (kernel function %s)\n", func_name);
> +			return -EINVAL;
> +		} else if (rcu_unlock) {
> +			/* change active_rcu_lock to its corresponding negative value to
> +			 * preserve the previous lock region id.
> +			 */
> +			env->cur_state->active_rcu_lock = -env->cur_state->active_rcu_lock;
> +		} else if (sleepable) {
> +			bpf_log(log, "kernel func %s is sleepable within rcu_read_lock region\n",
> +				func_name);
> +			return -EINVAL;
> +		}
> +	} else if (rcu_lock) {
> +		/* a new lock region started, increase the region id. */
> +		env->cur_state->active_rcu_lock = (-env->cur_state->active_rcu_lock) + 1;
> +	} else if (rcu_unlock) {
> +		bpf_log(log, "unmatched rcu read unlock (kernel function %s)\n", func_name);
> +		return -EINVAL;
>  	}
>

Can you provide more context on why having ids is better than simply
invalidating the registers when the section ends, and making active_rcu_lock a
boolean instead? You can use bpf_for_each_reg_in_vstate to find every reg having
MEM_RCU and mark it unknown.
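
Something like this (untested sketch; MEM_RCU here stands for the
proposed reg type flag marking rcu pointers):

	struct bpf_func_state *state;
	struct bpf_reg_state *reg;

	bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
		if (reg->type & MEM_RCU)
			__mark_reg_unknown(env, reg);
	}));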

You won't have to match the id in btf_struct_access as such registers won't ever
reach that function (if marked unknown on invalidation, they become scalars).
The reg state won't need another active_rcu_lock member either, it is simply
part of reg->type.

It seems to me that simply invalidating registers when rcu_read_unlock is called
is both less code to write and simpler to understand.

Having ids also makes the pruning algorithm unnecessarily conservative.
Later in states_equal, the check is:

> +	if (old->active_rcu_lock != cur->active_rcu_lock)
> +		return false;

which means that even though the current state merely holding the RCU read
lock would be enough to prune the search, pruning would now be rejected due to
distinct ids (e.g. when the current path didn't make exactly the same number
of rcu_read_lock calls as the old state).
Yonghong Song Nov. 8, 2022, 8:03 p.m. UTC | #2
On 11/8/22 9:04 AM, Kumar Kartikeya Dwivedi wrote:
> On Tue, Nov 08, 2022 at 01:11:14PM IST, Yonghong Song wrote:
>> [...]
> 
> Can you provide more context on why having ids is better than simply
> invalidating the registers when the section ends, and making active_rcu_lock a
> boolean instead? You can use bpf_for_each_reg_in_vstate to find every reg having
> MEM_RCU and mark it unknown.

I think we also need to invalidate rcu-ptr related states in spills as well.

I also tried to support cases like:
	bpf_rcu_read_lock();
	rcu_ptr = ...
	   ... rcu_ptr ...
	bpf_rcu_read_unlock();
	... rcu_ptr ... /* no load, just use the rcu_ptr somehow */

In the above case, outside the rcu read lock region there is no
load through rcu_ptr, but the value can still be used for other
purposes as a pointer.

But on second thought, it should be okay to invalidate
rcu_ptr into a scalar during bpf_rcu_read_unlock(). This should
satisfy almost all (if not all) cases.

> 
> You won't have to match the id in btf_struct_access as such registers won't ever
> reach that function (if marked unknown on invalidation, they become scalars).
> The reg state won't need another active_rcu_lock member either, it is simply
> part of reg->type.

Right, if I don't maintain region ids, there is no need for
reg->active_rcu_lock; using MEM_RCU should be enough.

> 
> It seems to me that simply invalidating registers when rcu_read_unlock is called
> is both less code to write and simpler to understand.

invalidating rcu_ptr in registers and spills.

Let me try to implement this and compare to my current approach. I guess
MEM_RCU + invalidation at bpf_rcu_read_unlock() should be simpler as you
suggested.

> 
> Having ids also makes the pruning algorithm unnecessarily conservative.
> Later in states_equal, the check is:
> 
>> +	if (old->active_rcu_lock != cur->active_rcu_lock)
>> +		return false;
> 
> which means that even though the current state merely holding the RCU read
> lock would be enough to prune the search, pruning would now be rejected due to
> distinct ids (e.g. when the current path didn't make exactly the same number
> of rcu_read_lock calls as the old state).

That is true. We should also check the old/new verifier states: if
both are outside any rcu read lock region, the above check can be
skipped. But I agree this becomes a little bit more complicated.
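
For example, something like the following (untested sketch) would keep
pruning working when both states are outside any rcu lock region:

	if ((old->active_rcu_lock > 0 || cur->active_rcu_lock > 0) &&
	    old->active_rcu_lock != cur->active_rcu_lock)
		return false;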
Kumar Kartikeya Dwivedi Nov. 8, 2022, 8:19 p.m. UTC | #3
On Wed, Nov 09, 2022 at 01:33:04AM IST, Yonghong Song wrote:
>
>
> On 11/8/22 9:04 AM, Kumar Kartikeya Dwivedi wrote:
> > On Tue, Nov 08, 2022 at 01:11:14PM IST, Yonghong Song wrote:
> > > [...]
> >
> > Can you provide more context on why having ids is better than simply
> > invalidating the registers when the section ends, and making active_rcu_lock a
> > boolean instead? You can use bpf_for_each_reg_in_vstate to find every reg having
> > MEM_RCU and mark it unknown.
>
> I think we also need to invalidate rcu-ptr related states in spills as well.
>
> I also tried to support cases like:
> 	bpf_rcu_read_lock();
> 	rcu_ptr = ...
> 	   ... rcu_ptr ...
> 	bpf_rcu_read_unlock();
> 	... rcu_ptr ... /* no load, just use the rcu_ptr somehow */
>
> In the above case, outside the rcu read lock region there is no
> load through rcu_ptr, but the value can still be used for other
> purposes as a pointer.
>
> But on second thought, it should be okay to invalidate
> rcu_ptr into a scalar during bpf_rcu_read_unlock(). This should
> satisfy almost all (if not all) cases.
>
> >
> > You won't have to match the id in btf_struct_access as such registers won't ever
> > reach that function (if marked unknown on invalidation, they become scalars).
> > The reg state won't need another active_rcu_lock member either, it is simply
> > part of reg->type.
>
> Right, if I don't maintain region ids, there is no need for
> reg->active_rcu_lock; using MEM_RCU should be enough.
>
> >
> > It seems to me that simply invalidating registers when rcu_read_unlock is called
> > is both less code to write and simpler to understand.
>
> invalidating rcu_ptr in registers and spills.
>

If you use bpf_for_each_reg_in_vstate, it should cover both.
Yonghong Song Nov. 8, 2022, 8:40 p.m. UTC | #4
On 11/8/22 12:19 PM, Kumar Kartikeya Dwivedi wrote:
> On Wed, Nov 09, 2022 at 01:33:04AM IST, Yonghong Song wrote:
>>
>>
>> On 11/8/22 9:04 AM, Kumar Kartikeya Dwivedi wrote:
>>> On Tue, Nov 08, 2022 at 01:11:14PM IST, Yonghong Song wrote:
>>>> [...]
>>>
>>> Can you provide more context on why having ids is better than simply
>>> invalidating the registers when the section ends, and making active_rcu_lock a
>>> boolean instead? You can use bpf_for_each_reg_in_vstate to find every reg having
>>> MEM_RCU and mark it unknown.
>>
>> I think we also need to invalidate rcu-ptr related states in spills as well.
>>
>> I also tried to support cases like:
>> 	bpf_rcu_read_lock();
>> 	rcu_ptr = ...
>> 	   ... rcu_ptr ...
>> 	bpf_rcu_read_unlock();
>> 	... rcu_ptr ... /* no load, just use the rcu_ptr somehow */
>>
>> In the above case, outside the rcu read lock region there is no
>> load through rcu_ptr, but the value can still be used for other
>> purposes as a pointer.
>>
>> But on second thought, it should be okay to invalidate
>> rcu_ptr into a scalar during bpf_rcu_read_unlock(). This should
>> satisfy almost all (if not all) cases.
>>
>>>
>>> You won't have to match the id in btf_struct_access as such registers won't ever
>>> reach that function (if marked unknown on invalidation, they become scalars).
>>> The reg state won't need another active_rcu_lock member either, it is simply
>>> part of reg->type.
>>
>> Right, if I don't maintain region ids, there is no need for
>> reg->active_rcu_lock; using MEM_RCU should be enough.
>>
>>>
>>> It seems to me that simply invalidating registers when rcu_read_unlock is called
>>> is both less code to write and simpler to understand.
>>
>> invalidating rcu_ptr in registers and spills.
>>
> 
> If you use bpf_for_each_reg_in_vstate, it should cover both.

Just checked the macro implementation. Yes, it covers both regs and
spills. Thanks for mentioning bpf_for_each_reg_in_vstate, which I
was not aware of.
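
For reference, the structure of that macro, paraphrased (see
include/linux/bpf_verifier.h for the real definition):

  /* bpf_for_each_reg_in_vstate(vst, state, reg, expr):
   * for each frame in vst:
   *     run expr on each of the frame's MAX_BPF_REG registers;
   *     run expr on each spilled reg in the frame's stack slots.
   */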

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b4bbcafd1c9b..98af0c9ec721 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -761,6 +761,7 @@  struct bpf_prog_ops {
 struct btf_struct_access_info {
 	u32 next_btf_id;
 	enum bpf_type_flag flag;
+	bool is_rcu;
 };
 
 struct bpf_verifier_ops {
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 1a32baa78ce2..5d703637bb12 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -179,6 +179,10 @@  struct bpf_reg_state {
 	 */
 	s32 subreg_def;
 	enum bpf_reg_liveness live;
+	/* 0: not rcu ptr; > 0: rcu ptr, id of the rcu read lock region where
+	 * the rcu ptr reg is initialized.
+	 */
+	int active_rcu_lock;
 	/* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */
 	bool precise;
 };
@@ -324,6 +328,8 @@  struct bpf_verifier_state {
 	u32 insn_idx;
 	u32 curframe;
 	u32 active_spin_lock;
+	/* <= 0: not in rcu read lock region; > 0: the rcu lock region id */
+	int active_rcu_lock;
 	bool speculative;
 
 	/* first and last insn idx of this verifier state */
@@ -424,6 +430,7 @@  struct bpf_insn_aux_data {
 	u32 seen; /* this insn was processed by the verifier at env->pass_cnt */
 	bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
 	bool zext_dst; /* this insn zero extends dst reg */
+	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
 	u8 alu_state; /* used in combination with alu_limit */
 
 	/* below fields are initialized once */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index d2ee1669a2f3..c5a9569f2ae0 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5831,6 +5831,7 @@  static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
 		if (btf_type_is_ptr(mtype)) {
 			const struct btf_type *stype, *t;
 			enum bpf_type_flag tmp_flag = 0;
+			bool is_rcu = false;
 			u32 id;
 
 			if (msize != size || off != moff) {
@@ -5850,12 +5851,16 @@  static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
 				/* check __percpu tag */
 				if (strcmp(tag_value, "percpu") == 0)
 					tmp_flag = MEM_PERCPU;
+				/* check __rcu tag */
+				if (strcmp(tag_value, "rcu") == 0)
+					is_rcu = true;
 			}
 
 			stype = btf_type_skip_modifiers(btf, mtype->type, &id);
 			if (btf_type_is_struct(stype)) {
 				info->next_btf_id = id;
 				info->flag = tmp_flag;
+				info->is_rcu = is_rcu;
 				return WALK_PTR;
 			}
 		}
@@ -6317,7 +6322,7 @@  static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 {
 	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
 	bool rel = false, kptr_get = false, trusted_args = false;
-	bool sleepable = false;
+	bool sleepable = false, rcu_lock = false, rcu_unlock = false;
 	struct bpf_verifier_log *log = &env->log;
 	u32 i, nargs, ref_id, ref_obj_id = 0;
 	bool is_kfunc = btf_is_kernel(btf);
@@ -6356,6 +6361,31 @@  static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 		kptr_get = kfunc_meta->flags & KF_KPTR_GET;
 		trusted_args = kfunc_meta->flags & KF_TRUSTED_ARGS;
 		sleepable = kfunc_meta->flags & KF_SLEEPABLE;
+		rcu_lock = kfunc_meta->flags & KF_RCU_LOCK;
+		rcu_unlock = kfunc_meta->flags & KF_RCU_UNLOCK;
+	}
+
+	/* checking rcu read lock/unlock */
+	if (env->cur_state->active_rcu_lock > 0) {
+		if (rcu_lock) {
+			bpf_log(log, "nested rcu read lock (kernel function %s)\n", func_name);
+			return -EINVAL;
+		} else if (rcu_unlock) {
+			/* change active_rcu_lock to its corresponding negative value to
+			 * preserve the previous lock region id.
+			 */
+			env->cur_state->active_rcu_lock = -env->cur_state->active_rcu_lock;
+		} else if (sleepable) {
+			bpf_log(log, "kernel func %s is sleepable within rcu_read_lock region\n",
+				func_name);
+			return -EINVAL;
+		}
+	} else if (rcu_lock) {
+		/* a new lock region started, increase the region id. */
+		env->cur_state->active_rcu_lock = (-env->cur_state->active_rcu_lock) + 1;
+	} else if (rcu_unlock) {
+		bpf_log(log, "unmatched rcu read unlock (kernel function %s)\n", func_name);
+		return -EINVAL;
 	}
 
 	/* check that BTF function arguments match actual types that the
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4d50f9568245..85151f2a655a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23,6 +23,7 @@ 
 #include <linux/error-injection.h>
 #include <linux/bpf_lsm.h>
 #include <linux/btf_ids.h>
+#include <linux/trace_events.h>
 #include <linux/poison.h>
 
 #include "disasm.h"
@@ -513,6 +514,14 @@  static bool is_callback_calling_function(enum bpf_func_id func_id)
 	       func_id == BPF_FUNC_user_ringbuf_drain;
 }
 
+static bool is_storage_get_function(enum bpf_func_id func_id)
+{
+	return func_id == BPF_FUNC_sk_storage_get ||
+	       func_id == BPF_FUNC_inode_storage_get ||
+	       func_id == BPF_FUNC_task_storage_get ||
+	       func_id == BPF_FUNC_cgrp_storage_get;
+}
+
 static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
 					const struct bpf_map *map)
 {
@@ -1203,6 +1212,7 @@  static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->speculative = src->speculative;
 	dst_state->curframe = src->curframe;
 	dst_state->active_spin_lock = src->active_spin_lock;
+	dst_state->active_rcu_lock = src->active_rcu_lock;
 	dst_state->branches = src->branches;
 	dst_state->parent = src->parent;
 	dst_state->first_insn_idx = src->first_insn_idx;
@@ -1687,6 +1697,7 @@  static void __mark_reg_unknown(const struct bpf_verifier_env *env,
 	reg->var_off = tnum_unknown;
 	reg->frameno = 0;
 	reg->precise = !env->bpf_capable;
+	reg->active_rcu_lock = 0;
 	__mark_reg_unbounded(reg);
 }
 
@@ -1727,7 +1738,7 @@  static void mark_btf_ld_reg(struct bpf_verifier_env *env,
 			    struct bpf_reg_state *regs, u32 regno,
 			    enum bpf_reg_type reg_type,
 			    struct btf *btf, u32 btf_id,
-			    enum bpf_type_flag flag)
+			    enum bpf_type_flag flag, bool set_rcu_lock)
 {
 	if (reg_type == SCALAR_VALUE) {
 		mark_reg_unknown(env, regs, regno);
@@ -1737,6 +1748,9 @@  static void mark_btf_ld_reg(struct bpf_verifier_env *env,
 	regs[regno].type = PTR_TO_BTF_ID | flag;
 	regs[regno].btf = btf;
 	regs[regno].btf_id = btf_id;
+	/* the reg rcu lock region id equals the current rcu lock region id */
+	if (set_rcu_lock)
+		regs[regno].active_rcu_lock = env->cur_state->active_rcu_lock;
 }
 
 #define DEF_NOT_SUBREG	(0)
@@ -3940,7 +3954,7 @@  static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
 		 * value from map as PTR_TO_BTF_ID, with the correct type.
 		 */
 		mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
-				kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED);
+				kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED, false);
 		/* For mark_ptr_or_null_reg */
 		val_reg->id = ++env->id_gen;
 	} else if (class == BPF_STX) {
@@ -4681,6 +4695,18 @@  static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 		return -EACCES;
 	}
 
+	/* If reg valid rcu region id does not equal to the current rcu region id,
+	 * rcu access is not protected properly, either out of a valid rcu region,
+	 * or in a different rcu region.
+	 */
+	if (env->prog->aux->sleepable && reg->active_rcu_lock &&
+	    reg->active_rcu_lock != env->cur_state->active_rcu_lock) {
+		verbose(env,
+			"R%d is ptr_%s access rcu-protected memory with off=%d, not rcu protected\n",
+			regno, tname, off);
+		return -EACCES;
+	}
+
 	if (env->ops->btf_struct_access) {
 		ret = env->ops->btf_struct_access(&env->log, reg->btf, t,
 						  off, size, atype, &info);
@@ -4697,6 +4723,16 @@  static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 	if (ret < 0)
 		return ret;
 
+	/* The value is a rcu pointer. For a sleepable program, the load needs to be
+	 * in a rcu lock region, similar to rcu_dereference().
+	 */
+	if (info.is_rcu && env->prog->aux->sleepable && env->cur_state->active_rcu_lock <= 0) {
+		verbose(env,
+			"R%d is rcu dereference ptr_%s with off=%d, not in rcu_read_lock region\n",
+			regno, tname, off);
+		return -EACCES;
+	}
+
 	/* If this is an untrusted pointer, all pointers formed by walking it
 	 * also inherit the untrusted flag.
 	 */
@@ -4705,7 +4741,7 @@  static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 
 	if (atype == BPF_READ && value_regno >= 0)
 		mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, info.next_btf_id,
-				info.flag);
+				info.flag, info.is_rcu && env->prog->aux->sleepable);
 
 	return 0;
 }
@@ -4761,7 +4797,7 @@  static int check_ptr_to_map_access(struct bpf_verifier_env *env,
 
 	if (value_regno >= 0)
 		mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, info.next_btf_id,
-				info.flag);
+				info.flag, info.is_rcu && env->prog->aux->sleepable);
 
 	return 0;
 }
@@ -5874,6 +5910,17 @@  static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
 	if (arg_type & PTR_MAYBE_NULL)
 		type &= ~PTR_MAYBE_NULL;
 
+	/* If a rcu pointer is a helper argument, the helper should be protected in
+	 * the same rcu lock region where the rcu pointer reg is initialized.
+	 */
+	if (env->prog->aux->sleepable && reg->active_rcu_lock &&
+	    reg->active_rcu_lock != env->cur_state->active_rcu_lock) {
+		verbose(env,
+			"R%d is arg type %s needs rcu protection\n",
+			regno, reg_type_str(env, reg->type));
+		return -EACCES;
+	}
+
 	for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
 		expected = compatible->types[i];
 		if (expected == NOT_INIT)
@@ -7418,6 +7465,18 @@  static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		return err;
 	}
 
+	if (env->cur_state->active_rcu_lock > 0) {
+		if (bpf_lsm_sleepable_func_proto(func_id) ||
+		    bpf_tracing_sleepable_func_proto(func_id)) {
+			verbose(env, "sleepable helper %s#%din rcu_read_lock region\n",
+				func_id_name(func_id), func_id);
+			return -EINVAL;
+		}
+
+		if (env->prog->aux->sleepable && is_storage_get_function(func_id))
+			env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
+	}
+
 	meta.func_id = func_id;
 	/* check args */
 	for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
@@ -10605,6 +10664,11 @@  static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		return -EINVAL;
 	}
 
+	if (env->prog->aux->sleepable && env->cur_state->active_rcu_lock > 0) {
+		verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n");
+		return -EINVAL;
+	}
+
 	if (regs[ctx_reg].type != PTR_TO_CTX) {
 		verbose(env,
 			"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
@@ -11869,6 +11933,9 @@  static bool states_equal(struct bpf_verifier_env *env,
 	if (old->active_spin_lock != cur->active_spin_lock)
 		return false;
 
+	if (old->active_rcu_lock != cur->active_rcu_lock)
+		return false;
+
 	/* for states to be equal callsites have to be the same
 	 * and all frame states need to be equivalent
 	 */
@@ -12553,6 +12620,11 @@  static int do_check(struct bpf_verifier_env *env)
 					return -EINVAL;
 				}
 
+				if (env->cur_state->active_rcu_lock > 0) {
+					verbose(env, "bpf_rcu_read_unlock is missing\n");
+					return -EINVAL;
+				}
+
 				/* We must do check_reference_leak here before
 				 * prepare_func_exit to handle the case when
 				 * state->curframe > 0, it may be a callback
@@ -14289,14 +14361,12 @@  static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto patch_call_imm;
 		}
 
-		if (insn->imm == BPF_FUNC_task_storage_get ||
-		    insn->imm == BPF_FUNC_sk_storage_get ||
-		    insn->imm == BPF_FUNC_inode_storage_get ||
-		    insn->imm == BPF_FUNC_cgrp_storage_get) {
-			if (env->prog->aux->sleepable)
-				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
-			else
+		if (is_storage_get_function(insn->imm)) {
+			if (!env->prog->aux->sleepable ||
+			    env->insn_aux_data[i + delta].storage_get_func_atomic)
 				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
+			else
+				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
 			insn_buf[1] = *insn;
 			cnt = 2;