diff mbox series

[v1,bpf-next,5/7] bpf: Consider non-owning refs to refcounted nodes RCU protected

Message ID 20230801203630.3581291-6-davemarchevsky@fb.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series BPF Refcount followups 3: bpf_mem_free_rcu refcounted nodes | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 2788 this patch: 2788
netdev/cc_maintainers warning 8 maintainers not CCed: kpsingh@kernel.org martin.lau@linux.dev john.fastabend@gmail.com sdf@google.com song@kernel.org yonghong.song@linux.dev jolsa@kernel.org haoluo@google.com
netdev/build_clang success Errors and warnings before: 1530 this patch: 1530
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 2806 this patch: 2806
netdev/checkpatch warning WARNING: line length of 82 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ${{ matrix.test }} on ${{ matrix.arch }} with ${{ matrix.toolchain_full }}
bpf/vmtest-bpf-next-VM_Test-2 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 fail Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-7 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-8 success Logs for veristat

Commit Message

Dave Marchevsky Aug. 1, 2023, 8:36 p.m. UTC
The previous patch in the series ensures that the underlying memory of
nodes with bpf_refcount - which can have multiple owners - is not reused
until RCU Tasks Trace grace period has elapsed. This prevents
use-after-free with non-owning references that may point to
recently-freed memory. While RCU read lock is held, it's safe to
dereference such a non-owning ref, as by definition RCU GP couldn't have
elapsed and therefore underlying memory couldn't have been reused.

From the perspective of verifier "trustedness" non-owning refs to
refcounted nodes are now trusted only in RCU CS and therefore should no
longer pass is_trusted_reg, but rather is_rcu_reg. Let's mark them
MEM_RCU in order to reflect this new state.

Similarly to bpf_spin_unlock being a non-owning ref invalidation point,
where non-owning ref reg states are clobbered so that they cannot be
used outside of the critical section, currently all MEM_RCU regs are
marked untrusted after bpf_rcu_read_unlock. This patch makes
bpf_rcu_read_unlock a non-owning ref invalidation point as well,
clobbering the non-owning refs instead of marking untrusted. In the
future we may want to allow untrusted non-owning refs in which case we
can remove this custom logic without breaking BPF programs as it's more
restrictive than the default. That's a big change in semantics, though,
and this series is focused on fixing the use-after-free in most
straightforward way.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
---
 include/linux/bpf.h   |  3 ++-
 kernel/bpf/verifier.c | 17 +++++++++++++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

Comments

Yonghong Song Aug. 2, 2023, 5:59 a.m. UTC | #1
On 8/1/23 1:36 PM, Dave Marchevsky wrote:
> The previous patch in the series ensures that the underlying memory of
> nodes with bpf_refcount - which can have multiple owners - is not reused
> until RCU Tasks Trace grace period has elapsed. This prevents
> use-after-free with non-owning references that may point to
> recently-freed memory. While RCU read lock is held, it's safe to
> dereference such a non-owning ref, as by definition RCU GP couldn't have
> elapsed and therefore underlying memory couldn't have been reused.
> 
>  From the perspective of verifier "trustedness" non-owning refs to
> refcounted nodes are now trusted only in RCU CS and therefore should no
> longer pass is_trusted_reg, but rather is_rcu_reg. Let's mark them
> MEM_RCU in order to reflect this new state.
> 
> Similarly to bpf_spin_unlock being a non-owning ref invalidation point,
> where non-owning ref reg states are clobbered so that they cannot be
> used outside of the critical section, currently all MEM_RCU regs are
> marked untrusted after bpf_rcu_read_unlock. This patch makes
> bpf_rcu_read_unlock a non-owning ref invalidation point as well,
> clobbering the non-owning refs instead of marking untrusted. In the
> future we may want to allow untrusted non-owning refs in which case we
> can remove this custom logic without breaking BPF programs as it's more
> restrictive than the default. That's a big change in semantics, though,
> and this series is focused on fixing the use-after-free in most
> straightforward way.
> 
> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> ---
>   include/linux/bpf.h   |  3 ++-
>   kernel/bpf/verifier.c | 17 +++++++++++++++--
>   2 files changed, 17 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index ceaa8c23287f..37fba01b061a 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -653,7 +653,8 @@ enum bpf_type_flag {
>   	MEM_RCU			= BIT(13 + BPF_BASE_TYPE_BITS),
>   
>   	/* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning.
> -	 * Currently only valid for linked-list and rbtree nodes.
> +	 * Currently only valid for linked-list and rbtree nodes. If the nodes
> +	 * have a bpf_refcount_field, they must be tagged MEM_RCU as well.

What does 'must' here mean?

>   	 */
>   	NON_OWN_REF		= BIT(14 + BPF_BASE_TYPE_BITS),
>   
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 9014b469dd9d..4bda365000d3 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -469,7 +469,8 @@ static bool type_is_ptr_alloc_obj(u32 type)
>   
>   static bool type_is_non_owning_ref(u32 type)
>   {
> -	return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF;
> +	return type_is_ptr_alloc_obj(type) &&
> +		type_flag(type) & NON_OWN_REF;

There is no code change here.

>   }
>   
>   static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
> @@ -8012,6 +8013,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
>   	case PTR_TO_BTF_ID | PTR_TRUSTED:
>   	case PTR_TO_BTF_ID | MEM_RCU:
>   	case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
> +	case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
>   		/* When referenced PTR_TO_BTF_ID is passed to release function,
>   		 * its fixed offset must be 0. In the other cases, fixed offset
>   		 * can be non-zero. This was already checked above. So pass
> @@ -10478,6 +10480,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
>   static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
>   {
>   	struct bpf_verifier_state *state = env->cur_state;
> +	struct btf_record *rec = reg_btf_record(reg);
>   
>   	if (!state->active_lock.ptr) {
>   		verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
> @@ -10490,6 +10493,9 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
>   	}
>   
>   	reg->type |= NON_OWN_REF;
> +	if (rec->refcount_off >= 0)
> +		reg->type |= MEM_RCU;

Should we check whether the state is in rcu cs before marking MEM_RCU?

> +
>   	return 0;
>   }
>   
> @@ -11327,10 +11333,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>   		struct bpf_func_state *state;
>   		struct bpf_reg_state *reg;
>   
> +		if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
> +			verbose(env, "can't rcu read {lock,unlock} in rbtree cb\n");
> +			return -EACCES;
> +		}
> +
>   		if (rcu_lock) {
>   			verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
>   			return -EINVAL;
>   		} else if (rcu_unlock) {
> +			invalidate_non_owning_refs(env);

If we have both spin lock and rcu like

      bpf_rcu_read_lock()
      ...
      bpf_spin_lock()
      ...
      bpf_spin_unlock()  <=== invalidate all non_owning_refs
      ...                <=== MEM_RCU type is gone
      bpf_rcu_read_unlock()

Maybe we could fine tune here to preserve MEM_RCU after bpf_spin_unlock()?

>   			bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
>   				if (reg->type & MEM_RCU) {
>   					reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
> @@ -16679,7 +16691,8 @@ static int do_check(struct bpf_verifier_env *env)
>   					return -EINVAL;
>   				}
>   
> -				if (env->cur_state->active_rcu_lock) {
> +				if (env->cur_state->active_rcu_lock &&
> +				    !in_rbtree_lock_required_cb(env)) {
>   					verbose(env, "bpf_rcu_read_unlock is missing\n");
>   					return -EINVAL;
>   				}
Alexei Starovoitov Aug. 2, 2023, 10:50 p.m. UTC | #2
On Tue, Aug 01, 2023 at 01:36:28PM -0700, Dave Marchevsky wrote:
> The previous patch in the series ensures that the underlying memory of
> nodes with bpf_refcount - which can have multiple owners - is not reused
> until RCU Tasks Trace grace period has elapsed. This prevents

Here and in the cover letter... above should probably be "RCU grace period"
and not "RCU tasks trace grace period".
bpf progs will reuse objects after normal RCU.
We're waiting for RCU tasks trace GP to free into slab.

> use-after-free with non-owning references that may point to
> recently-freed memory. While RCU read lock is held, it's safe to
> dereference such a non-owning ref, as by definition RCU GP couldn't have
> elapsed and therefore underlying memory couldn't have been reused.
> 
> From the perspective of verifier "trustedness" non-owning refs to
> refcounted nodes are now trusted only in RCU CS and therefore should no
> longer pass is_trusted_reg, but rather is_rcu_reg. Let's mark them
> MEM_RCU in order to reflect this new state.
> 
> Similarly to bpf_spin_unlock being a non-owning ref invalidation point,
> where non-owning ref reg states are clobbered so that they cannot be
> used outside of the critical section, currently all MEM_RCU regs are
> marked untrusted after bpf_rcu_read_unlock. This patch makes
> bpf_rcu_read_unlock a non-owning ref invalidation point as well,
> clobbering the non-owning refs instead of marking untrusted. In the
> future we may want to allow untrusted non-owning refs in which case we
> can remove this custom logic without breaking BPF programs as it's more
> restrictive than the default. That's a big change in semantics, though,
> and this series is focused on fixing the use-after-free in most
> straightforward way.
> 
> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> ---
>  include/linux/bpf.h   |  3 ++-
>  kernel/bpf/verifier.c | 17 +++++++++++++++--
>  2 files changed, 17 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index ceaa8c23287f..37fba01b061a 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -653,7 +653,8 @@ enum bpf_type_flag {
>  	MEM_RCU			= BIT(13 + BPF_BASE_TYPE_BITS),
>  
>  	/* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning.
> -	 * Currently only valid for linked-list and rbtree nodes.
> +	 * Currently only valid for linked-list and rbtree nodes. If the nodes
> +	 * have a bpf_refcount_field, they must be tagged MEM_RCU as well.
>  	 */
>  	NON_OWN_REF		= BIT(14 + BPF_BASE_TYPE_BITS),
>  
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 9014b469dd9d..4bda365000d3 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -469,7 +469,8 @@ static bool type_is_ptr_alloc_obj(u32 type)
>  
>  static bool type_is_non_owning_ref(u32 type)
>  {
> -	return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF;
> +	return type_is_ptr_alloc_obj(type) &&
> +		type_flag(type) & NON_OWN_REF;
>  }
>  
>  static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
> @@ -8012,6 +8013,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
>  	case PTR_TO_BTF_ID | PTR_TRUSTED:
>  	case PTR_TO_BTF_ID | MEM_RCU:
>  	case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
> +	case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
>  		/* When referenced PTR_TO_BTF_ID is passed to release function,
>  		 * its fixed offset must be 0. In the other cases, fixed offset
>  		 * can be non-zero. This was already checked above. So pass
> @@ -10478,6 +10480,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
>  static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
>  {
>  	struct bpf_verifier_state *state = env->cur_state;
> +	struct btf_record *rec = reg_btf_record(reg);
>  
>  	if (!state->active_lock.ptr) {
>  		verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
> @@ -10490,6 +10493,9 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
>  	}
>  
>  	reg->type |= NON_OWN_REF;
> +	if (rec->refcount_off >= 0)
> +		reg->type |= MEM_RCU;
> +
>  	return 0;
>  }
>  
> @@ -11327,10 +11333,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>  		struct bpf_func_state *state;
>  		struct bpf_reg_state *reg;
>  
> +		if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
> +			verbose(env, "can't rcu read {lock,unlock} in rbtree cb\n");
> +			return -EACCES;
> +		}

I guess it's ok to prevent cb from calling bpf_rcu_read_lock(), since it's unnecessary,
but pls make the message more verbose. Like:
 verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");

so that users know why the verifier complains.
Technically it's ok to do so. Unnecessary is not a safety issue.

> +
>  		if (rcu_lock) {
>  			verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
>  			return -EINVAL;
>  		} else if (rcu_unlock) {
> +			invalidate_non_owning_refs(env);

I agree with Yonghong. It probably doesn't belong here.
rcu lock/unlock and spin_lock/unlock are separate critical sections.
Since ref_set_non_owning() adds extra MEM_RCU flag nothing extra needs to be done here.
Below code will make the pointers untrusted.

>  			bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
>  				if (reg->type & MEM_RCU) {
>  					reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
> @@ -16679,7 +16691,8 @@ static int do_check(struct bpf_verifier_env *env)
>  					return -EINVAL;
>  				}
>  
> -				if (env->cur_state->active_rcu_lock) {
> +				if (env->cur_state->active_rcu_lock &&
> +				    !in_rbtree_lock_required_cb(env)) {

I'm not following here.
Didn't you want to prevent bpf_rcu_read_lock/unlock inside cb? Why this change?

>  					verbose(env, "bpf_rcu_read_unlock is missing\n");
>  					return -EINVAL;
>  				}
> -- 
> 2.34.1
>
David Marchevsky Aug. 4, 2023, 6:47 a.m. UTC | #3
On 8/2/23 1:59 AM, Yonghong Song wrote:
> 
> 
> On 8/1/23 1:36 PM, Dave Marchevsky wrote:
>> The previous patch in the series ensures that the underlying memory of
>> nodes with bpf_refcount - which can have multiple owners - is not reused
>> until RCU Tasks Trace grace period has elapsed. This prevents
>> use-after-free with non-owning references that may point to
>> recently-freed memory. While RCU read lock is held, it's safe to
>> dereference such a non-owning ref, as by definition RCU GP couldn't have
>> elapsed and therefore underlying memory couldn't have been reused.
>>
>>  From the perspective of verifier "trustedness" non-owning refs to
>> refcounted nodes are now trusted only in RCU CS and therefore should no
>> longer pass is_trusted_reg, but rather is_rcu_reg. Let's mark them
>> MEM_RCU in order to reflect this new state.
>>
>> Similarly to bpf_spin_unlock being a non-owning ref invalidation point,
>> where non-owning ref reg states are clobbered so that they cannot be
>> used outside of the critical section, currently all MEM_RCU regs are
>> marked untrusted after bpf_rcu_read_unlock. This patch makes
>> bpf_rcu_read_unlock a non-owning ref invalidation point as well,
>> clobbering the non-owning refs instead of marking untrusted. In the
>> future we may want to allow untrusted non-owning refs in which case we
>> can remove this custom logic without breaking BPF programs as it's more
>> restrictive than the default. That's a big change in semantics, though,
>> and this series is focused on fixing the use-after-free in most
>> straightforward way.
>>
>> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
>> ---
>>   include/linux/bpf.h   |  3 ++-
>>   kernel/bpf/verifier.c | 17 +++++++++++++++--
>>   2 files changed, 17 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index ceaa8c23287f..37fba01b061a 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -653,7 +653,8 @@ enum bpf_type_flag {
>>       MEM_RCU            = BIT(13 + BPF_BASE_TYPE_BITS),
>>         /* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning.
>> -     * Currently only valid for linked-list and rbtree nodes.
>> +     * Currently only valid for linked-list and rbtree nodes. If the nodes
>> +     * have a bpf_refcount_field, they must be tagged MEM_RCU as well.
> 
> What does 'must' here mean?
> 

Meaning that if there's any NON_OWN_REF-flagged
PTR_TO_BTF_ID which points to a struct with a bpf_refcount field,
it should also be flagged with MEM_RCU. If it isn't, it's a
verifier error. 

>>        */
>>       NON_OWN_REF        = BIT(14 + BPF_BASE_TYPE_BITS),
>>   diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 9014b469dd9d..4bda365000d3 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -469,7 +469,8 @@ static bool type_is_ptr_alloc_obj(u32 type)
>>     static bool type_is_non_owning_ref(u32 type)
>>   {
>> -    return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF;
>> +    return type_is_ptr_alloc_obj(type) &&
>> +        type_flag(type) & NON_OWN_REF;
> 
> There is no code change here.
> 

Yep, will undo in v2.

>>   }
>>     static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
>> @@ -8012,6 +8013,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
>>       case PTR_TO_BTF_ID | PTR_TRUSTED:
>>       case PTR_TO_BTF_ID | MEM_RCU:
>>       case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
>> +    case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
>>           /* When referenced PTR_TO_BTF_ID is passed to release function,
>>            * its fixed offset must be 0. In the other cases, fixed offset
>>            * can be non-zero. This was already checked above. So pass
>> @@ -10478,6 +10480,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
>>   static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
>>   {
>>       struct bpf_verifier_state *state = env->cur_state;
>> +    struct btf_record *rec = reg_btf_record(reg);
>>         if (!state->active_lock.ptr) {
>>           verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
>> @@ -10490,6 +10493,9 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
>>       }
>>         reg->type |= NON_OWN_REF;
>> +    if (rec->refcount_off >= 0)
>> +        reg->type |= MEM_RCU;
> 
> Should we check whether the state is in rcu cs before marking MEM_RCU?
> 

I think this is implicitly being enforced.
Rbtree/list kfuncs must be called under bpf_spin_lock,
and this series requires bpf_spin_{lock,unlock} helpers
to called in RCU CS if the BPF prog is sleepable.

>> +
>>       return 0;
>>   }
>>   @@ -11327,10 +11333,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>>           struct bpf_func_state *state;
>>           struct bpf_reg_state *reg;
>>   +        if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
>> +            verbose(env, "can't rcu read {lock,unlock} in rbtree cb\n");
>> +            return -EACCES;
>> +        }
>> +
>>           if (rcu_lock) {
>>               verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
>>               return -EINVAL;
>>           } else if (rcu_unlock) {
>> +            invalidate_non_owning_refs(env);
> 
> If we have both spin lock and rcu like
> 
>      bpf_rcu_read_lock()
>      ...
>      bpf_spin_lock()
>      ...
>      bpf_spin_unlock()  <=== invalidate all non_owning_refs
>      ...                <=== MEM_RCU type is gone
>      bpf_rcu_read_unlock()
> 
> Maybe we could fine tune here to preserve MEM_RCU after bpf_spin_unlock()?
> 

IIUC, you're saying that we should no longer
have non-owning refs get clobbered after bpf_spin_unlock,
and instead just have rcu_read_unlock do its default
"MEM_RCU refs become PTR_UNTRUSTED" logic.

In the cover letter I mention that this is probably
the direction we want to go in in the long term, on
the comments on patch 3:

  This might
  allow custom non-owning ref lifetime + invalidation logic to be
  entirely subsumed by MEM_RCU handling.

But I'm hesitant to do that in this fixes series
as I'd like to minimize changes that could introduce
additional bugs. This series' current changes keep the
clobbering rules effectively unchanged - can always
loosen them in the future. Also, I think we should
make this change for _all_ non-owning refs, (w/ and w/o
bpf_refcount field). Otherwise the verifier lifetime
of non-owning refs would change if BPF program writer
adds bpf_refcount field to their struct, or removes it.
 

>>               bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
>>                   if (reg->type & MEM_RCU) {
>>                       reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
>> @@ -16679,7 +16691,8 @@ static int do_check(struct bpf_verifier_env *env)
>>                       return -EINVAL;
>>                   }
>>   -                if (env->cur_state->active_rcu_lock) {
>> +                if (env->cur_state->active_rcu_lock &&
>> +                    !in_rbtree_lock_required_cb(env)) {
>>                       verbose(env, "bpf_rcu_read_unlock is missing\n");
>>                       return -EINVAL;
>>                   }
David Marchevsky Aug. 4, 2023, 6:55 a.m. UTC | #4
On 8/2/23 6:50 PM, Alexei Starovoitov wrote:
> On Tue, Aug 01, 2023 at 01:36:28PM -0700, Dave Marchevsky wrote:
>> The previous patch in the series ensures that the underlying memory of
>> nodes with bpf_refcount - which can have multiple owners - is not reused
>> until RCU Tasks Trace grace period has elapsed. This prevents
> 
> Here and in the cover letter... above should probably be "RCU grace period"
> and not "RCU tasks trace grace period".
> bpf progs will reuse objects after normal RCU.
> We're waiting for RCU tasks trace GP to free into slab.
> 

Will fix.

>> use-after-free with non-owning references that may point to
>> recently-freed memory. While RCU read lock is held, it's safe to
>> dereference such a non-owning ref, as by definition RCU GP couldn't have
>> elapsed and therefore underlying memory couldn't have been reused.
>>
>> From the perspective of verifier "trustedness" non-owning refs to
>> refcounted nodes are now trusted only in RCU CS and therefore should no
>> longer pass is_trusted_reg, but rather is_rcu_reg. Let's mark them
>> MEM_RCU in order to reflect this new state.
>>
>> Similarly to bpf_spin_unlock being a non-owning ref invalidation point,
>> where non-owning ref reg states are clobbered so that they cannot be
>> used outside of the critical section, currently all MEM_RCU regs are
>> marked untrusted after bpf_rcu_read_unlock. This patch makes
>> bpf_rcu_read_unlock a non-owning ref invalidation point as well,
>> clobbering the non-owning refs instead of marking untrusted. In the
>> future we may want to allow untrusted non-owning refs in which case we
>> can remove this custom logic without breaking BPF programs as it's more
>> restrictive than the default. That's a big change in semantics, though,
>> and this series is focused on fixing the use-after-free in most
>> straightforward way.
>>
>> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
>> ---
>>  include/linux/bpf.h   |  3 ++-
>>  kernel/bpf/verifier.c | 17 +++++++++++++++--
>>  2 files changed, 17 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index ceaa8c23287f..37fba01b061a 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -653,7 +653,8 @@ enum bpf_type_flag {
>>  	MEM_RCU			= BIT(13 + BPF_BASE_TYPE_BITS),
>>  
>>  	/* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning.
>> -	 * Currently only valid for linked-list and rbtree nodes.
>> +	 * Currently only valid for linked-list and rbtree nodes. If the nodes
>> +	 * have a bpf_refcount_field, they must be tagged MEM_RCU as well.
>>  	 */
>>  	NON_OWN_REF		= BIT(14 + BPF_BASE_TYPE_BITS),
>>  
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 9014b469dd9d..4bda365000d3 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -469,7 +469,8 @@ static bool type_is_ptr_alloc_obj(u32 type)
>>  
>>  static bool type_is_non_owning_ref(u32 type)
>>  {
>> -	return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF;
>> +	return type_is_ptr_alloc_obj(type) &&
>> +		type_flag(type) & NON_OWN_REF;
>>  }
>>  
>>  static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
>> @@ -8012,6 +8013,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
>>  	case PTR_TO_BTF_ID | PTR_TRUSTED:
>>  	case PTR_TO_BTF_ID | MEM_RCU:
>>  	case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
>> +	case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
>>  		/* When referenced PTR_TO_BTF_ID is passed to release function,
>>  		 * its fixed offset must be 0. In the other cases, fixed offset
>>  		 * can be non-zero. This was already checked above. So pass
>> @@ -10478,6 +10480,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
>>  static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
>>  {
>>  	struct bpf_verifier_state *state = env->cur_state;
>> +	struct btf_record *rec = reg_btf_record(reg);
>>  
>>  	if (!state->active_lock.ptr) {
>>  		verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
>> @@ -10490,6 +10493,9 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
>>  	}
>>  
>>  	reg->type |= NON_OWN_REF;
>> +	if (rec->refcount_off >= 0)
>> +		reg->type |= MEM_RCU;
>> +
>>  	return 0;
>>  }
>>  
>> @@ -11327,10 +11333,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>>  		struct bpf_func_state *state;
>>  		struct bpf_reg_state *reg;
>>  
>> +		if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
>> +			verbose(env, "can't rcu read {lock,unlock} in rbtree cb\n");
>> +			return -EACCES;
>> +		}
> 
> I guess it's ok to prevent cb from calling bpf_rcu_read_lock(), since it's unnecessary,
> but pls make the message more verbose. Like:
>  verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
> 
> so that users know why the verifier complains.
> Technically it's ok to do so. Unnecessary is not a safety issue.
> 

Well, for rcu_read_unlock it would be a safety issue, no?
Feels easier to reason about if we can just say "RCU lock is
held for the duration of the callback".

>> +
>>  		if (rcu_lock) {
>>  			verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
>>  			return -EINVAL;
>>  		} else if (rcu_unlock) {
>> +			invalidate_non_owning_refs(env);
> 
> I agree with Yonghong. It probably doesn't belong here.
> rcu lock/unlock and spin_lock/unlock are separate critical sections.
> Since ref_set_non_owning() adds extra MEM_RCU flag nothing extra needs to be done here.
> Below code will make the pointers untrusted.
> 

Will change. The desire here was to not loosen constraints
on non-owning ref lifetime in this series. As I mention in
my thoughts on Patch 3 in the cover letter, I do think we can
loosen that in the future, but would like to avoid doing so in this
fixes series. Regardless, because bpf_spin_unlock will
happen before this executes, this line can be removed.

>>  			bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
>>  				if (reg->type & MEM_RCU) {
>>  					reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
>> @@ -16679,7 +16691,8 @@ static int do_check(struct bpf_verifier_env *env)
>>  					return -EINVAL;
>>  				}
>>  
>> -				if (env->cur_state->active_rcu_lock) {
>> +				if (env->cur_state->active_rcu_lock &&
>> +				    !in_rbtree_lock_required_cb(env)) {
> 
> I'm not following here.
> Didn't you want to prevent bpf_rcu_read_lock/unlock inside cb? Why this change?
> >>  					verbose(env, "bpf_rcu_read_unlock is missing\n");
>>  					return -EINVAL;
>>  				}
>> -- 
>> 2.34.1
>>
Yonghong Song Aug. 4, 2023, 3:43 p.m. UTC | #5
On 8/3/23 11:47 PM, David Marchevsky wrote:
> On 8/2/23 1:59 AM, Yonghong Song wrote:
>>
>>
>> On 8/1/23 1:36 PM, Dave Marchevsky wrote:
>>> The previous patch in the series ensures that the underlying memory of
>>> nodes with bpf_refcount - which can have multiple owners - is not reused
>>> until RCU Tasks Trace grace period has elapsed. This prevents
>>> use-after-free with non-owning references that may point to
>>> recently-freed memory. While RCU read lock is held, it's safe to
>>> dereference such a non-owning ref, as by definition RCU GP couldn't have
>>> elapsed and therefore underlying memory couldn't have been reused.
>>>
>>>   From the perspective of verifier "trustedness" non-owning refs to
>>> refcounted nodes are now trusted only in RCU CS and therefore should no
>>> longer pass is_trusted_reg, but rather is_rcu_reg. Let's mark them
>>> MEM_RCU in order to reflect this new state.
>>>
>>> Similarly to bpf_spin_unlock being a non-owning ref invalidation point,
>>> where non-owning ref reg states are clobbered so that they cannot be
>>> used outside of the critical section, currently all MEM_RCU regs are
>>> marked untrusted after bpf_rcu_read_unlock. This patch makes
>>> bpf_rcu_read_unlock a non-owning ref invalidation point as well,
>>> clobbering the non-owning refs instead of marking untrusted. In the
>>> future we may want to allow untrusted non-owning refs in which case we
>>> can remove this custom logic without breaking BPF programs as it's more
>>> restrictive than the default. That's a big change in semantics, though,
>>> and this series is focused on fixing the use-after-free in most
>>> straightforward way.
>>>
>>> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
>>> ---
>>>    include/linux/bpf.h   |  3 ++-
>>>    kernel/bpf/verifier.c | 17 +++++++++++++++--
>>>    2 files changed, 17 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>> index ceaa8c23287f..37fba01b061a 100644
>>> --- a/include/linux/bpf.h
>>> +++ b/include/linux/bpf.h
>>> @@ -653,7 +653,8 @@ enum bpf_type_flag {
>>>        MEM_RCU            = BIT(13 + BPF_BASE_TYPE_BITS),
>>>          /* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning.
>>> -     * Currently only valid for linked-list and rbtree nodes.
>>> +     * Currently only valid for linked-list and rbtree nodes. If the nodes
>>> +     * have a bpf_refcount_field, they must be tagged MEM_RCU as well.
>>
>> What does 'must' here mean?
>>
> 
> Meaning that if there's any NON_OWN_REF-flagged
> PTR_TO_BTF_ID which points to a struct with a bpf_refcount field,
> it should also be flagged with MEM_RCU. If it isn't, it's a
> verifier error.
> 
>>>         */
>>>        NON_OWN_REF        = BIT(14 + BPF_BASE_TYPE_BITS),
>>>    diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>>> index 9014b469dd9d..4bda365000d3 100644
>>> --- a/kernel/bpf/verifier.c
>>> +++ b/kernel/bpf/verifier.c
>>> @@ -469,7 +469,8 @@ static bool type_is_ptr_alloc_obj(u32 type)
>>>      static bool type_is_non_owning_ref(u32 type)
>>>    {
>>> -    return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF;
>>> +    return type_is_ptr_alloc_obj(type) &&
>>> +        type_flag(type) & NON_OWN_REF;
>>
>> There is no code change here.
>>
> 
> Yep, will undo in v2.
> 
>>>    }
>>>      static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
>>> @@ -8012,6 +8013,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
>>>        case PTR_TO_BTF_ID | PTR_TRUSTED:
>>>        case PTR_TO_BTF_ID | MEM_RCU:
>>>        case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
>>> +    case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
>>>            /* When referenced PTR_TO_BTF_ID is passed to release function,
>>>             * its fixed offset must be 0. In the other cases, fixed offset
>>>             * can be non-zero. This was already checked above. So pass
>>> @@ -10478,6 +10480,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
>>>    static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
>>>    {
>>>        struct bpf_verifier_state *state = env->cur_state;
>>> +    struct btf_record *rec = reg_btf_record(reg);
>>>          if (!state->active_lock.ptr) {
>>>            verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
>>> @@ -10490,6 +10493,9 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
>>>        }
>>>          reg->type |= NON_OWN_REF;
>>> +    if (rec->refcount_off >= 0)
>>> +        reg->type |= MEM_RCU;
>>
>> Should we check whether the state is in rcu cs before marking MEM_RCU?
>>
> 
> I think this is implicitly being enforced.
> Rbtree/list kfuncs must be called under bpf_spin_lock,
> and this series requires bpf_spin_{lock,unlock} helpers
> to called in RCU CS if the BPF prog is sleepable.

I see.

Alexei early mentioned that
there is no need to put bpf_spin_lock inside RCU CS
if for sleepable program, we do preempt_disable before
real arch_spin_lock(). This is similar to to regular
spin_lock/raw_spin_lock which does preempt_disable
so spin_lock/spin_unlock region becomes an ATOMIC
region which prevents any blocking.

Not sure whether you want to implement in this
patch set or in later patch set.

> 
>>> +
>>>        return 0;
>>>    }
>>>    @@ -11327,10 +11333,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>>>            struct bpf_func_state *state;
>>>            struct bpf_reg_state *reg;
>>>    +        if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
>>> +            verbose(env, "can't rcu read {lock,unlock} in rbtree cb\n");
>>> +            return -EACCES;
>>> +        }
>>> +
>>>            if (rcu_lock) {
>>>                verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
>>>                return -EINVAL;
>>>            } else if (rcu_unlock) {
>>> +            invalidate_non_owning_refs(env);
>>
>> If we have both spin lock and rcu like
>>
>>       bpf_rcu_read_lock()
>>       ...
>>       bpf_spin_lock()
>>       ...
>>       bpf_spin_unlock()  <=== invalidate all non_owning_refs
>>       ...                <=== MEM_RCU type is gone
>>       bpf_rcu_read_unlock()
>>
>> Maybe we could fine tune here to preserve MEM_RCU after bpf_spin_unlock()?
>>
> 
> IIUC, you're saying that we should no longer
> have non-owning refs get clobbered after bpf_spin_unlock,
> and instead just have rcu_read_unlock do its default
> "MEM_RCU refs become PTR_UNTRUSTED" logic.
> 
> In the cover letter I mention that this is probably
> the direction we want to go in in the long term, on
> the comments on patch 3:
> 
>    This might
>    allow custom non-owning ref lifetime + invalidation logic to be
>    entirely subsumed by MEM_RCU handling.
> 
> But I'm hesitant to do that in this fixes series
> as I'd like to minimize changes that could introduce
> additional bugs. This series' current changes keep the
> clobbering rules effectively unchanged - can always
> loosen them in the future. Also, I think we should
> make this change for _all_ non-owning refs, (w/ and w/o
> bpf_refcount field). Otherwise the verifier lifetime
> of non-owning refs would change if BPF program writer
> adds bpf_refcount field to their struct, or removes it.
>   
> 
>>>                bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
>>>                    if (reg->type & MEM_RCU) {
>>>                        reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
>>> @@ -16679,7 +16691,8 @@ static int do_check(struct bpf_verifier_env *env)
>>>                        return -EINVAL;
>>>                    }
>>>    -                if (env->cur_state->active_rcu_lock) {
>>> +                if (env->cur_state->active_rcu_lock &&
>>> +                    !in_rbtree_lock_required_cb(env)) {
>>>                        verbose(env, "bpf_rcu_read_unlock is missing\n");
>>>                        return -EINVAL;
>>>                    }
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ceaa8c23287f..37fba01b061a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -653,7 +653,8 @@  enum bpf_type_flag {
 	MEM_RCU			= BIT(13 + BPF_BASE_TYPE_BITS),
 
 	/* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning.
-	 * Currently only valid for linked-list and rbtree nodes.
+	 * Currently only valid for linked-list and rbtree nodes. If the nodes
+	 * have a bpf_refcount_field, they must be tagged MEM_RCU as well.
 	 */
 	NON_OWN_REF		= BIT(14 + BPF_BASE_TYPE_BITS),
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9014b469dd9d..4bda365000d3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -469,7 +469,8 @@  static bool type_is_ptr_alloc_obj(u32 type)
 
 static bool type_is_non_owning_ref(u32 type)
 {
-	return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF;
+	return type_is_ptr_alloc_obj(type) &&
+		type_flag(type) & NON_OWN_REF;
 }
 
 static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
@@ -8012,6 +8013,7 @@  int check_func_arg_reg_off(struct bpf_verifier_env *env,
 	case PTR_TO_BTF_ID | PTR_TRUSTED:
 	case PTR_TO_BTF_ID | MEM_RCU:
 	case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
+	case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
 		/* When referenced PTR_TO_BTF_ID is passed to release function,
 		 * its fixed offset must be 0. In the other cases, fixed offset
 		 * can be non-zero. This was already checked above. So pass
@@ -10478,6 +10480,7 @@  static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
 static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
 	struct bpf_verifier_state *state = env->cur_state;
+	struct btf_record *rec = reg_btf_record(reg);
 
 	if (!state->active_lock.ptr) {
 		verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
@@ -10490,6 +10493,9 @@  static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
 	}
 
 	reg->type |= NON_OWN_REF;
+	if (rec->refcount_off >= 0)
+		reg->type |= MEM_RCU;
+
 	return 0;
 }
 
@@ -11327,10 +11333,16 @@  static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		struct bpf_func_state *state;
 		struct bpf_reg_state *reg;
 
+		if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
+			verbose(env, "can't rcu read {lock,unlock} in rbtree cb\n");
+			return -EACCES;
+		}
+
 		if (rcu_lock) {
 			verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
 			return -EINVAL;
 		} else if (rcu_unlock) {
+			invalidate_non_owning_refs(env);
 			bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
 				if (reg->type & MEM_RCU) {
 					reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
@@ -16679,7 +16691,8 @@  static int do_check(struct bpf_verifier_env *env)
 					return -EINVAL;
 				}
 
-				if (env->cur_state->active_rcu_lock) {
+				if (env->cur_state->active_rcu_lock &&
+				    !in_rbtree_lock_required_cb(env)) {
 					verbose(env, "bpf_rcu_read_unlock is missing\n");
 					return -EINVAL;
 				}